In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score 
import zipfile
import io
import requests
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

with zipfile.ZipFile(io.BytesIO(requests.get(url).content))as z:
    with z.open("SMSSpamCollection") as f:
        df = pd.read_csv(f, sep = '\t', names = ["label", "message"])
df.head(20)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
df.label.value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
for i in range(10):
    print("Message: {}\n, Label : {}",df.message[i], df.label[i])

Message: {}
, Label : {} Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... ham
Message: {}
, Label : {} Ok lar... Joking wif u oni... ham
Message: {}
, Label : {} Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's spam
Message: {}
, Label : {} U dun say so early hor... U c already then say... ham
Message: {}
, Label : {} Nah I don't think he goes to usf, he lives around here though ham
Message: {}
, Label : {} FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv spam
Message: {}
, Label : {} Even my brother is not like to speak with me. They treat me like aids patent. ham
Message: {}
, Label : {} As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to co

In [6]:
#checking for missing values 
df.isnull().sum()

label      0
message    0
dtype: int64

In [7]:
#converting labels to binary values 
df['label'] = df['label'].map({'ham' : 0, 'spam' : 1})
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [8]:
## train single model 
#algorithm = MultinomialNB()
#model = algorithm.fit(X_train_tfidf, X_train)

In [9]:
# Step 3: Split data
x = df['message']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Step 4: TF-IDF Vectorization
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

# Step 5: Define models
models = {
    'MultinomialNB': MultinomialNB(),
    'GaussianNB': GaussianNB()
}

# Step 6: Train and evaluate models
for model_name, model in models.items():
    print(f"\nModel: {model_name}")
    
    if model_name == 'GaussianNB':
        # GaussianNB needs dense array input
        x_train_dense = x_train_tfidf.toarray()
        x_test_dense = x_test_tfidf.toarray()
        model.fit(x_train_dense, y_train)
        y_pred = model.predict(x_test_dense)
    else:
        model.fit(x_train_tfidf, y_train)
        y_pred = model.predict(x_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")



Model: MultinomialNB
Accuracy: 96.68%

Model: GaussianNB
Accuracy: 90.49%


In [10]:
for model in models:
    print(model)

MultinomialNB
GaussianNB


In [11]:
models.items()

dict_items([('MultinomialNB', MultinomialNB()), ('GaussianNB', GaussianNB())])

In [12]:
sample_inputs = [
    "Urgent, you are the lucky winner of our today's draw. Click here to claim it",
    "Hey , are we still meeting for lunch tomorrow?",
    "Your account has been suspended. Click here to verify account information.",
    "Lucky winner ! You have been choosed to recieve the $500 gift card"
]

#transforming the sample inputs into tf-idf features
sample_text = vectorizer.transform(sample_inputs)

#loop over the models to predict for each one
for model_name , model in models.items():
    if model_name == 'GaussianNB':
        sample_preds = model.predict(sample_text.toarray())
    else :
        sample_preds = model.predict(sample_text)
        
    # print predictions for each model
    for i, sample in enumerate(sample_inputs):
        print(f"Model: {model_name}")
        print(f"Message: {sample}")
        print(f" Predicted : {'Spam' if sample_preds[i] == 1 else 'Ham'}")
        print("-" * 50)

Model: MultinomialNB
Message: Urgent, you are the lucky winner of our today's draw. Click here to claim it
 Predicted : Ham
--------------------------------------------------
Model: MultinomialNB
Message: Hey , are we still meeting for lunch tomorrow?
 Predicted : Ham
--------------------------------------------------
Model: MultinomialNB
Message: Your account has been suspended. Click here to verify account information.
 Predicted : Ham
--------------------------------------------------
Model: MultinomialNB
Message: Lucky winner ! You have been choosed to recieve the $500 gift card
 Predicted : Ham
--------------------------------------------------
Model: GaussianNB
Message: Urgent, you are the lucky winner of our today's draw. Click here to claim it
 Predicted : Spam
--------------------------------------------------
Model: GaussianNB
Message: Hey , are we still meeting for lunch tomorrow?
 Predicted : Ham
--------------------------------------------------
Model: GaussianNB
Message: 

In [13]:
import joblib 

joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']