In [5]:
import re, os
import unicodedata
import json

import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime

from wordcloud import WordCloud

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


from requests import get
from bs4 import BeautifulSoup

from wordcloud import WordCloud



plt.rc('figure', figsize=(13, 7))
plt.style.use('seaborn-darkgrid')

pd.set_option('display.max_rows', 200)


from prepare import basic_clean, lemmatize
import prepare_spam


In [6]:
# Import spam data

df = pd.read_csv('spam_clean.csv')
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Prepare spam data

df.rename(columns={'text': 'original'}, inplace=True)
df = prepare_spam.prep_article_data(df, 'original', extra_words = ['u', '2', 'ur', "'", '4'])


In [8]:
# Make word lists
lemmatized_ham_words =(' '.join(df[df.label == 'ham'].lemmatized))
lemmatized_spam_words =(' '.join(df[df.label == 'spam'].lemmatized))
lemmatized_all_words = (' '.join(df.lemmatized))

In [9]:
# Look at frequency
lemmatized_ham_freq = pd.Series(lemmatized_ham_words.split()).value_counts()
lemmatized_spam_freq = pd.Series(lemmatized_spam_words.split()).value_counts()
lemmatized_all_freq = pd.Series(lemmatized_all_words.split()).value_counts()

In [10]:
# Look at top 20 words
word_counts = (pd.concat([lemmatized_all_freq, lemmatized_ham_freq, lemmatized_spam_freq], axis=1, sort=True)
                .set_axis(['all', 'ham', 'spam'], axis=1, inplace=False)
                .fillna(0)
                .apply(lambda s: s.astype(int)))

word_counts.sort_values(by='all', ascending=False).head(20)

Unnamed: 0,all,ham,spam
call,600,241,359
get,397,314,83
go,304,273,31
ok,277,272,5
ltgt,276,276,0
free,275,59,216
know,267,241,26
day,252,225,27
come,247,242,5
like,245,232,13


In [11]:
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [13]:
# Establish baseline - ham mode

df[df['label']=='ham'].label.value_counts()/sum(df.label.value_counts())

ham    0.865937
Name: label, dtype: float64

In [14]:
# Split XY and vectorize
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.lemmatized)
y = df.label


In [15]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)


In [18]:
#Create evaluation dataframe
train_eval = pd.DataFrame(dict(actual=y_train))
train_eval['baseline']='ham'

In [20]:
from sklearn.naive_bayes import GaussianNB

In [21]:
X_train = X_train.toarray()

In [22]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

train_eval['gnb_predicted'] = gnb.predict(X_train)


In [23]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_eval.actual, train_eval.gnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_eval.gnb_predicted, train_eval.actual))
print('---')
print(classification_report(train_eval.actual, train_eval.gnb_predicted))

Accuracy: 94.53%
---
Confusion Matrix
actual          ham  spam
gnb_predicted            
ham            3615     0
spam            244   598
---
              precision    recall  f1-score   support

         ham       1.00      0.94      0.97      3859
        spam       0.71      1.00      0.83       598

    accuracy                           0.95      4457
   macro avg       0.86      0.97      0.90      4457
weighted avg       0.96      0.95      0.95      4457



In [30]:
from sklearn.model_selection import cross_val_score


In [31]:
cross_val_score(gnb, X_train, y_train, cv = 5)


array([0.8867713 , 0.90134529, 0.89450056, 0.88215488, 0.88103255])

In [24]:
# Baseline eval
# Calculate baseline model performance
print('Accuracy: {:.2%}'.format(accuracy_score(train_eval.actual, train_eval.baseline)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_eval.baseline, train_eval.actual))
print('---')
print(classification_report(train_eval.actual, train_eval.baseline))

Accuracy: 86.58%
---
Confusion Matrix
actual     ham  spam
baseline            
ham       3859   598
---
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      3859
        spam       0.00      0.00      0.00       598

    accuracy                           0.87      4457
   macro avg       0.43      0.50      0.46      4457
weighted avg       0.75      0.87      0.80      4457



  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
train = train_eval.copy()

In [26]:
train.head()

Unnamed: 0,actual,baseline,gnb_predicted
4620,ham,ham,ham
4143,ham,ham,ham
576,ham,ham,ham
239,spam,ham,spam
2103,ham,ham,ham


In [27]:
# Logistic regression model

# Make and fit the object
lm = LogisticRegression().fit(X_train, y_train)
# Use it to make predictions
train['lm_predicted'] = lm.predict(X_train)
# Asssess accuracy
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.lm_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.lm_predicted))

Accuracy: 96.48%
---
Confusion Matrix
actual         ham  spam
lm_predicted            
ham           3852   150
spam             7   448
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3859
        spam       0.98      0.75      0.85       598

    accuracy                           0.96      4457
   macro avg       0.97      0.87      0.92      4457
weighted avg       0.97      0.96      0.96      4457



In [32]:
cross_val_score(lm, X_train, y_train, cv = 5)


array([0.9338565 , 0.94058296, 0.95286195, 0.9382716 , 0.93602694])

In [28]:
# Decision tree model
# Make and fit the object
dtc = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train)
# Use the object
train['dt_predicted'] = dtc.predict(X_train)
# Determine performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.dt_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.dt_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.dt_predicted))

Accuracy: 92.87%
---
Confusion Matrix
actual         ham  spam
dt_predicted            
ham           3834   293
spam            25   305
---
              precision    recall  f1-score   support

         ham       0.93      0.99      0.96      3859
        spam       0.92      0.51      0.66       598

    accuracy                           0.93      4457
   macro avg       0.93      0.75      0.81      4457
weighted avg       0.93      0.93      0.92      4457



In [33]:
cross_val_score(dtc, X_train, y_train, cv = 5)

array([0.92825112, 0.9293722 , 0.93378227, 0.92480359, 0.92031425])

In [34]:
# Random forest
# Make and fit object
rf = RandomForestClassifier(bootstrap = True, 
                            class_weight = None, 
                            criterion = 'gini',
                            min_samples_leaf = 3,
                            n_estimators = 100,
                            max_depth = 8, 
                            random_state = 123).fit(X_train, y_train)
# Use it to make predictions
train['rf_predicted'] = rf.predict(X_train)
# Assess performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.rf_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.rf_predicted))

Accuracy: 87.70%
---
Confusion Matrix
actual         ham  spam
rf_predicted            
ham           3859   548
spam             0    50
---
              precision    recall  f1-score   support

         ham       0.88      1.00      0.93      3859
        spam       1.00      0.08      0.15       598

    accuracy                           0.88      4457
   macro avg       0.94      0.54      0.54      4457
weighted avg       0.89      0.88      0.83      4457



In [35]:
cross_val_score(rf, X_train, y_train, cv = 5)


array([0.87668161, 0.87780269, 0.87991021, 0.87429854, 0.87542088])

In [36]:
# KNN
# Make and fit the object
knn = KNeighborsClassifier(n_neighbors = 4).fit(X_train, y_train)
# Use the object 
train['knn_predicted'] = knn.predict(X_train)
# Evaluate performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.knn_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.knn_predicted))

Accuracy: 91.32%
---
Confusion Matrix
actual          ham  spam
knn_predicted            
ham            3859   387
spam              0   211
---
              precision    recall  f1-score   support

         ham       0.91      1.00      0.95      3859
        spam       1.00      0.35      0.52       598

    accuracy                           0.91      4457
   macro avg       0.95      0.68      0.74      4457
weighted avg       0.92      0.91      0.89      4457



In [None]:
#cross_val_score(knn, X_train, y_train, cv = 5)

In [38]:
from sklearn.svm import LinearSVC

In [39]:
# SVC
# Make and fit the object
svc = LinearSVC(random_state=0).fit(X_train, y_train)
# Use the object
train['svc_predicted'] = svc.predict(X_train)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.svc_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.svc_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.svc_predicted))

Accuracy: 99.96%
---
Confusion Matrix
actual          ham  spam
svc_predicted            
ham            3858     1
spam              1   597
---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3859
        spam       1.00      1.00      1.00       598

    accuracy                           1.00      4457
   macro avg       1.00      1.00      1.00      4457
weighted avg       1.00      1.00      1.00      4457



In [40]:
cross_val_score(svc, X_train, y_train, cv = 5)

array([0.97309417, 0.97869955, 0.97755331, 0.97643098, 0.96969697])

# TEst best model


In [41]:
# Create testing dataframe
test = pd.DataFrame(dict(actual=y_test))
test['svc_predicted'] = svc.predict(X_test)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.svc_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.svc_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.svc_predicted))

Accuracy: 97.67%
---
Confusion Matrix
actual         ham  spam
svc_predicted           
ham            964    24
spam             2   125
---
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [42]:
(97.67-86.56)/86.56

0.12835027726432532

SVC performs well on in and out of sample data.

98% accurate - 13% improvement from baseline