In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_csv('../data/spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['message']).toarray()
y = df['label']

In [6]:
import joblib
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')

['../models/tfidf_vectorizer.pkl']

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [10]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [11]:
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
param_grid = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END .....................................C=0.1, gamma=1; total time=  54.0s
[CV] END .....................................C=0.1, gamma=1; total time=  54.4s
[CV] END .....................................C=0.1, gamma=1; total time=  52.4s
[CV] END .....................................C=0.1, gamma=1; total time=  52.5s
[CV] END .....................................C=0.1, gamma=1; total time=  53.8s
[CV] END ...................................C=0.1, gamma=0.1; total time=  37.7s
[CV] END ...................................C=0.1, gamma=0.1; total time=  38.7s
[CV] END ...................................C=0.1, gamma=0.1; total time=  38.6s
[CV] END ...................................C=0.1, gamma=0.1; total time=  37.8s
[CV] END ...................................C=0.1, gamma=0.1; total time=  39.0s
[CV] END ..................................C=0.1, gamma=0.01; total time=  34.3s
[CV] END ..................................C=0.1,

In [27]:
y_pred_lr = lr.predict(X_test)
y_pred_nb = nb.predict(X_test)
y_pred_svc = grid.predict(X_test)

In [29]:
print("Logistic Regression")
print(f'Accuracy: {accuracy_score(y_test, y_pred_lr)}')
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression
Accuracy: 0.9632286995515695
[[965   0]
 [ 41 109]]
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [31]:
print("Support Vector Machine")
print(f'Accuracy: {accuracy_score(y_test, y_pred_svc)}')
print(confusion_matrix(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))

Support Vector Machine
Accuracy: 0.979372197309417
[[962   3]
 [ 20 130]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.98      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [33]:
best_model = grid.best_estimator_

In [35]:
joblib.dump(best_model, '../models/spam_sms_model.pkl')

['../models/spam_sms_model.pkl']