In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

In [68]:
# Read data/spam.csv, discarding the file's own header row (header=0) in favour
# of explicit names: the label column 'y' plus four text-fragment columns.
text_cols = ['x1', 'x2', 'x3', 'x4']
df = pd.read_csv('data/spam.csv', encoding='latin', names=['y'] + text_cols, header=0)
# Missing fragments become empty strings so they can be concatenated below.
df = df.fillna('')
y = df['y']

# Glue the fragments back together, left to right and without a separator,
# into a single text column 'X'.
message = df[text_cols[0]]
for col in text_cols[1:]:
    message = message + df[col]
df['X'] = message

# Keep only the label and the combined text.
df = df.drop(columns=text_cols)
df

Unnamed: 0,y,X
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [72]:
# Strip every character that is not an ASCII letter or whitespace, then lowercase.
# regex=True must be explicit: from pandas 2.0 onward Series.str.replace treats the
# pattern as a literal string by default, which would silently leave the text
# unchanged. The raw string avoids the invalid '\s' escape warning.
df['X'] = df['X'].str.replace(r'[^A-Za-z\s]', '', regex=True)
df['X'] = df['X'].str.lower()
df['X']

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in  a wkly comp to win fa cup final...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the nd time we have tried  contact u u...
5568                   will  b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: X, Length: 5572, dtype: object

In [93]:
# Bag-of-words counts over the cleaned messages, with English stop words removed.
# min_df is a float, which scikit-learn interprets as a fraction of documents:
# terms appearing in fewer than 0.05% of messages are dropped from the vocabulary.
cv = CountVectorizer(stop_words='english', min_df=0.0005)
# Dense array: one row per message, one column per vocabulary term.
X = cv.fit_transform(df['X']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on installations that predate it.
if hasattr(cv, 'get_feature_names_out'):
    cv_terms = cv.get_feature_names_out()
else:
    cv_terms = cv.get_feature_names()

X_df = pd.DataFrame(X, columns=cv_terms)
X_df

Unnamed: 0,aah,aathilove,aathiwhere,abi,abiola,able,abt,abta,ac,acc,...,youd,youll,youre,youve,yr,yrs,yummy,yun,yup,zed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# 10-fold cross-validated accuracy of a multinomial Naive Bayes classifier on the
# count features, keeping the per-fold training scores as well.
# NOTE(review): X was produced by fitting the vectorizer on every message before
# the folds are split, so the vocabulary has seen the held-out folds - consider
# wrapping vectorizer + classifier in a Pipeline if a stricter estimate is needed.
score = cross_validate(
    MultinomialNB(),
    X,
    y,
    cv=10,
    scoring='accuracy',
    return_train_score=True,
)
for template, key in (
    ('Train accuracy: {:.4f}', 'train_score'),
    ('Test accuracy: {:.4f}', 'test_score'),
):
    print(template.format(score[key].mean()))

Train accuracy: 0.9853
Test accuracy: 0.9779


In [121]:
# TF-IDF weighted unigram features, with English stop words removed and the
# vocabulary capped at 2500 terms via max_features.
tf_idf = TfidfVectorizer(max_features=2500, stop_words='english', ngram_range=(1,1))
# Dense array: one row per message, one column per retained term.
X2 = tf_idf.fit_transform(df['X']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on installations that predate it.
if hasattr(tf_idf, 'get_feature_names_out'):
    tfidf_terms = tf_idf.get_feature_names_out()
else:
    tfidf_terms = tf_idf.get_feature_names()

X2_df = pd.DataFrame(X2, columns=tfidf_terms)
X2_df

Unnamed: 0,aah,aathilove,aathiwhere,abi,abiola,able,abt,abta,ac,acc,...,youre,youve,yr,yrs,yummy,yun,yuo,yup,zed,zoe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
# Same 10-fold accuracy evaluation as for the count features, now on the
# TF-IDF matrix X2.
nb_tfidf = MultinomialNB()
score2 = cross_validate(nb_tfidf, X2, y, cv=10, scoring='accuracy', return_train_score=True)

# Average the per-fold accuracies before reporting.
train_acc = score2['train_score'].mean()
test_acc = score2['test_score'].mean()
print('Train accuracy: {:.4f}'.format(train_acc))
print('Test accuracy: {:.4f}'.format(test_acc))

Train accuracy: 0.9827
Test accuracy: 0.9772


In [123]:
# Repeat the TF-IDF experiment with unigrams and bigrams and a vocabulary cap of
# 7500 terms. This rebinds tf_idf, X2 and score2 from the earlier cells.
tf_idf = TfidfVectorizer(
    max_features=7500,
    stop_words='english',
    ngram_range=(1,2),
)
tfidf_sparse = tf_idf.fit_transform(df['X'])
X2 = tfidf_sparse.toarray()

# 10-fold cross-validated accuracy of multinomial Naive Bayes on the new features.
score2 = cross_validate(
    MultinomialNB(), X2, y,
    cv=10, scoring='accuracy', return_train_score=True,
)
mean_train = score2['train_score'].mean()
mean_test = score2['test_score'].mean()
print('Train accuracy: {:.4f}'.format(mean_train))
print('Test accuracy: {:.4f}'.format(mean_test))

Train accuracy: 0.9802
Test accuracy: 0.9725
