In [61]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix

In [31]:
df = pd.read_csv('smss.csv', names = ['message','label'])
df

Unnamed: 0,message,label
0,"ham\tGo until jurong point, crazy.. Available ...",
1,ham\tOk lar... Joking wif u oni...,
2,spam\tFree entry in 2 a wkly comp to win FA Cu...,
3,ham\tU dun say so early hor... U c already the...,
4,"ham\tNah I don't think he goes to usf, he live...",
...,...,...
995,"ham\tI can't, I don't have her number!",
996,ham\tChange again... It's e one next to escala...,
997,ham\tYetunde i'm in class can you not run wate...,
998,ham\tNot a lot has happened here. Feels very q...,


In [32]:
message = []
label = []
for i in df.message:
    if str(i).startswith('ham'):
        m = i[4:]
        l = i[:3]
        message.append(m)
        label.append(l)
    elif str(i).startswith('spam'):
        m = i[5:]
        l = i[:4]
        message.append(m)
        label.append(l)
                   
df['message']=message
df['label']=label
df

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
995,"I can't, I don't have her number!",ham
996,Change again... It's e one next to escalator...,ham
997,Yetunde i'm in class can you not run water on ...,ham
998,Not a lot has happened here. Feels very quiet....,ham


In [33]:
df['label'] = df.label.map({'ham':0,'spam':1})
df['message'] = df.message.map(lambda x: x.lower())
df['message'] = df.message.str.replace('^\w\s', '')
df['message'] = df['message'].apply(nltk.word_tokenize)
stemmer = PorterStemmer()
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])
df

  df['message'] = df.message.str.replace('^\w\s', '')


Unnamed: 0,message,label
0,"[go, until, jurong, point, ,, crazi, .., avail...",0
1,"[ok, lar, ..., joke, wif, u, oni, ...]",0
2,"[free, entri, in, 2, a, wkli, comp, to, win, f...",1
3,"[dun, say, so, earli, hor, ..., u, c, alreadi,...",0
4,"[nah, i, do, n't, think, he, goe, to, usf, ,, ...",0
...,...,...
995,"[ca, n't, ,, i, do, n't, have, her, number, !]",0
996,"[chang, again, ..., it, 's, e, one, next, to, ...",0
997,"[yetund, i, 'm, in, class, can, you, not, run,...",0
998,"[not, a, lot, ha, happen, here, ., feel, veri,...",0


In [34]:
df['message'] = df['message'].apply(lambda x: ' '.join(x))
df

Unnamed: 0,message,label
0,"go until jurong point , crazi .. avail onli in...",0
1,ok lar ... joke wif u oni ...,0
2,free entri in 2 a wkli comp to win fa cup fina...,1
3,dun say so earli hor ... u c alreadi then say ...,0
4,"nah i do n't think he goe to usf , he live aro...",0
...,...,...
995,"ca n't , i do n't have her number !",0
996,chang again ... it 's e one next to escal ...,0
997,yetund i 'm in class can you not run water on ...,0
998,not a lot ha happen here . feel veri quiet . b...,0


tf-idf

In [42]:
vec = TfidfVectorizer()

In [43]:
features = vec.fit_transform(df['message'])

In [44]:
pd.DataFrame(features.toarray(),columns=vec.vocabulary_.keys())

Unnamed: 0,go,until,jurong,point,crazi,avail,onli,in,bugi,great,...,stripe,skirt,miracl,jesu,escal,quiet,beth,aunt,charli,helen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
x_train,x_test,y_train,y_test = train_test_split(features,df['label'] ,test_size=0.2)

In [50]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train,y_train)

MultinomialNB()

In [56]:
pred = model.predict(x_test)
#print (np.mean(pred == y_test))

In [57]:
accuracy_score(y_test, pred)

0.925

In [59]:
print(confusion_matrix(y_test, pred))

[[175   0]
 [ 15  10]]


In [60]:
#0.925

counting

In [63]:
vec2 = CountVectorizer(binary=False)
features2 = vec2.fit_transform(df['message'])

In [64]:
pd.DataFrame(features2.toarray(),columns=vec.vocabulary_.keys())

Unnamed: 0,go,until,jurong,point,crazi,avail,onli,in,bugi,great,...,stripe,skirt,miracl,jesu,escal,quiet,beth,aunt,charli,helen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
x2_train,x2_test,y2_train,y2_test = train_test_split(features2,df['label'] ,test_size=0.2)
model2 = MultinomialNB()
model2.fit(x2_train,y2_train)

MultinomialNB()

In [67]:
pred2 = model2.predict(x2_test)
accuracy_score(y2_test, pred2)

0.965

In [68]:
print(confusion_matrix(y2_test, pred2))

[[159   4]
 [  3  34]]


binary encoding

In [70]:
vec = CountVectorizer(binary=True)
features3 = vec.fit_transform(df['message'])

In [71]:
pd.DataFrame(features3.toarray(),columns=vec.vocabulary_.keys())

Unnamed: 0,go,until,jurong,point,crazi,avail,onli,in,bugi,great,...,stripe,skirt,miracl,jesu,escal,quiet,beth,aunt,charli,helen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
x3_train,x3_test,y3_train,y3_test = train_test_split(features3,df['label'] ,test_size=0.2)
model3 = MultinomialNB()
model3.fit(x3_train,y3_train)

MultinomialNB()

In [73]:
pred3 = model3.predict(x3_test)
accuracy_score(y3_test, pred3)

0.985

In [74]:
print(confusion_matrix(y3_test, pred3))

[[165   2]
 [  1  32]]
