In [1]:
import pandas as pd
import numpy as np


In [2]:
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
data = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

data.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.shape


(5572, 2)

In [4]:
data['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [5]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
data.head()


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
import re

def clean_text(text):
    text = text.lower()                       # lowercase
    text = re.sub(r'\d+', '', text)           # remove numbers
    text = re.sub(r'[^\w\s]', '', text)       # remove punctuation
    text = re.sub(r'\s+', ' ', text)          # remove extra spaces
    return text

data['clean_message'] = data['message'].apply(clean_text)
data[['message', 'clean_message']].head()


Unnamed: 0,message,clean_message
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=3000)

X = tfidf.fit_transform(data['clean_message'])
y = data['label']

X.shape


(5572, 3000)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

X_train.shape, X_test.shape


((4457, 3000), (1115, 3000))

In [10]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)


In [11]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)


0.9802690582959641

In [12]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['message'])



In [15]:
from sklearn.model_selection import train_test_split

y = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [16]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)


In [17]:
test_msg = ["Congratulations! You won a free gift card. Claim now"]
test_msg_vec = vectorizer.transform(test_msg)

model.predict(test_msg_vec)


array([1])

In [18]:
test_msgs = [
    "Congratulations! You won a free gift card. Claim now",
    "Hi, can we meet tomorrow for the project discussion?",
    "Exclusive offer! Buy 1 get 1 free. Hurry!",
]

test_msgs_vec = vectorizer.transform(test_msgs)
predictions = model.predict(test_msgs_vec)

for msg, pred in zip(test_msgs, predictions):
    label = "SPAM ðŸš¨" if pred == 1 else "HAM âœ…"
    print(f"{msg} â†’ {label}")


Congratulations! You won a free gift card. Claim now â†’ SPAM ðŸš¨
Hi, can we meet tomorrow for the project discussion? â†’ HAM âœ…
Exclusive offer! Buy 1 get 1 free. Hurry! â†’ SPAM ðŸš¨


In [20]:
import joblib

# Save model
joblib.dump(model, 'spam_classifier_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# Load model later
# model = joblib.load('spam_classifier_model.pkl')
# vectorizer = joblib.load('vectorizer.pkl')


['vectorizer.pkl']