In [2]:
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
data = pd.read_table(
    "SMSSpamCollection",
    header=None,
    names=["label", "text"]
)
print(data['label'].value_counts())


label
ham     4825
spam     747
Name: count, dtype: int64


In [4]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})


In [5]:
stop_words = stopwords.words('english')

def clean_text(text):
    text = text.lower()                        # Small letters
    text = re.sub('[^a-zA-Z]', ' ', text)     # Symbols remove
    text = ' '.join(text.split())              # Extra spaces remove
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Stopwords remove
    return text

data['text'] = data['text'].apply(clean_text)


In [6]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text'])
y = data['label']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, random_state=42
)


In [8]:
model = MultinomialNB()
model.fit(X_train, y_train)
print("Model training complete!")


Model training complete!


In [9]:
y_pred = model.predict(X_test)


In [10]:
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9748878923766816


In [11]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.81      0.90       149

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [12]:
print(confusion_matrix(y_test, y_pred))


[[966   0]
 [ 28 121]]


In [13]:
new_email = ["""Congratulations! Click now to claim your exclusive reward before it expires tonight—don’t miss out! — Huzaifa Baig"""]

# Clean text same way
clean_email = [clean_text(new_email[0])]

new_email_vector = vectorizer.transform(clean_email)
prediction = model.predict(new_email_vector)

if prediction[0] == 1:
    print("SPAM EMAIL")
else:
    print("NOT SPAM")


SPAM EMAIL


In [14]:
pickle.dump(model, open("spam_model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))
