In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string
import joblib

# Fetch the NLTK 'stopwords' corpus so stopwords.words() below can read it.
nltk.download('stopwords')

# English stop words held as a set: preprocess_text tests every token for
# membership in it, and set lookups are hash-based.
stop_words = set(stopwords.words('english'))

# Load the dataset. The rest of the script reads column 'v2' as the message
# text and column 'v1' as the label.
# NOTE(review): ISO-8859-1 is presumably required because the file is not
# valid UTF-8 -- confirm against the actual spam.csv.
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise a message for bag-of-words vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. delete all ``string.punctuation`` characters;
      3. split on whitespace and drop tokens found in the module-level
         ``stop_words`` set;
      4. re-join the surviving tokens with single spaces.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned message as a single space-separated string (empty if no
        tokens survive).
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    # NOTE(review): punctuation is removed *before* the stop-word check, so any
    # stop word that contains an apostrophe (e.g. a contraction) no longer
    # matches its entry in stop_words -- confirm whether that is intended.
    kept_tokens = [
        token for token in without_punctuation.split() if token not in stop_words
    ]
    return ' '.join(kept_tokens)

# --- Clean the message text (overwrites column 'v2' in place) ---------------
data['v2'] = data['v2'].apply(preprocess_text)

# --- Features / labels and an 80/20 train/test split ------------------------
X, y = data['v2'], data['v1']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

# --- Bag-of-words features ---------------------------------------------------
# The vocabulary is learned from the training split only; the test split is
# merely transformed with that vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# --- Train the classifier and score it on the held-out split -----------------
model = MultinomialNB().fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


# --- Persist the fitted model and its vectorizer -----------------------------
# Both are needed at inference time: the model only understands the feature
# columns produced by this exact vectorizer.
joblib.dump(model, 'spam_detector.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


# --- Reload from disk to check the saved artifacts work ----------------------
# NOTE(review): joblib files are pickle-based; only load artifacts you trust.
loaded_model, loaded_vectorizer = (
    joblib.load('spam_detector.pkl'),
    joblib.load('vectorizer.pkl'),
)


# --- Classify one sample message with the reloaded artifacts -----------------
# New text must go through the same preprocessing as the training data.
new_email = ["Congratulations! You've won a $1000 gift card!"]
new_email_processed = list(map(preprocess_text, new_email))
new_email_vectorized = loaded_vectorizer.transform(new_email_processed)
prediction = loaded_model.predict(new_email_vectorized)
print("Prediction:", prediction)


Accuracy: 0.9802690582959641
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.97      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Prediction: ['spam']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
