In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the e-mail corpus from the CSV in the working directory, decoding the
# file as latin-1.
data = pd.read_csv(
    'spam_assassin.csv',
    encoding='latin-1',
)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Single stemmer instance reused by preprocess_text for every token.
ps = PorterStemmer()

# Fetch the NLTK stop-word corpus, then hold the English list as a set so the
# per-token membership test in preprocess_text is a hash lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one raw e-mail into a space-separated string of stemmed tokens.

    Steps, in order: replace every non-word character with a space,
    lowercase, split on whitespace, drop English stop words, and Porter-stem
    the remaining tokens.

    Args:
        text: Raw message text. Missing values (``None`` or a float NaN, e.g.
            an empty CSV cell loaded by pandas) are treated as an empty
            message; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when
        nothing survives the filtering.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids pulling numpy/pandas into a pure text helper.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Same order as before: strip punctuation first, then lowercase, then
    # tokenize, so stop-word matching happens on lowercase tokens.
    tokens = re.sub(r'\W', ' ', text).lower().split()
    return ' '.join(ps.stem(tok) for tok in tokens if tok not in stop_words)

# Clean every message element-wise. The raw column is overwritten, so running
# this cell a second time would preprocess already-cleaned text again.
data['text'] = data['text'].map(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned messages into TF-IDF features, capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split that follows, so its vocabulary and IDF weights also see messages that
# end up in the test set; the reported metrics may be slightly optimistic.
vectorizer = TfidfVectorizer(max_features=5000)

# Keep the sparse matrix that fit_transform returns instead of densifying it
# with .toarray(): train_test_split and MultinomialNB both accept sparse
# input, and a dense (n_messages x 5000) float array wastes memory on zeros.
X = vectorizer.fit_transform(data['text'])

# Labels as a plain array; the prediction cell treats 1 as spam.
y = data['target'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Build and train the multinomial Naive Bayes classifier in one expression
# (fit() hands back the estimator it was called on).
model = MultinomialNB().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
model

In [None]:
# Score the held-out messages.
yPred = model.predict(X_test)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy_score(y_test, yPred)}")

# Display names are matched to the sorted labels, i.e. 0 -> Ham, 1 -> Spam.
class_names = ['Ham', 'Spam']
print(classification_report(y_test, yPred, target_names=class_names))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example email text
new_email = ["MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.2.5i In-Reply-To: <0D443C91DCE9CD40B1C795BA222A729E018854FA@milexc01.maxtor.com>; from conor_wynne@maxtor.com on Fri, Jul 26, 2002 at 03:56:22PM +0100 Sender: ilug-admin@linux.ie Errors-To: ilug-admin@linux.ie X-Mailman-Version: 1.1 Precedence: bulk List-Id: Irish Linux Users' Group <ilug.linux.ie> X-Beenthere: ilug@linux.ie On Fri, Jul 26, 2002 at 03:56:22PM +0100 or so it is rumoured hereabouts"]

# Step 1: Clean the new email exactly like the training corpus (punctuation
# stripped, lowercased, stop words removed, Porter-stemmed), then project it
# with the already-fitted TF-IDF vectorizer. Feeding the raw text straight to
# the vectorizer would leave tokens unstemmed, so many of them would not match
# the stemmed vocabulary the model was trained on.
new_email_clean = [preprocess_text(email) for email in new_email]
new_email_tfidf = vectorizer.transform(new_email_clean).toarray()

# Step 2: Predict if it's spam or ham using the trained model
prediction = model.predict(new_email_tfidf)

# Step 3: Display the result (label 1 is treated as spam)
if prediction[0] == 1:
    print("This email is spam.")
else:
    print("This email is ham (not spam).")