In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
# Read the tab-separated SMS corpus. The file carries no header row, so the two
# columns are named here: the class name first, the raw message text second.
df = pd.read_csv(
    'data-files/SMSSpamCollection.tsv',
    sep="\t",
    header=None,
    names=['target', 'message'],
)

In [3]:
# Rich display of the frame loaded in the previous cell.
df

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
from sklearn.preprocessing import LabelEncoder

# Turn the string class names in 'target' into integer codes stored in 'label'.
# In the frame displayed below, 'ham' comes out as 0 and 'spam' as 1.
df['label'] = LabelEncoder().fit(df['target']).transform(df['target'])
df.head()

Unnamed: 0,target,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
import re

In [6]:
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s).
#
# The pattern is written as a raw string so that \w and \s reach the regex
# engine verbatim. In a normal string literal they are invalid escape
# sequences, which Python flags with a warning when the cell is compiled.
#
# NOTE(review): characters are removed, not replaced by a space, so words that
# were separated only by punctuation get fused (e.g. "So...any" -> "Soany").
# Left as-is to keep the downstream vocabulary unchanged.
df['message2'] = df['message'].map(lambda v: re.sub(r"[^\w\s]", '', v))
df

  df['message2'] = df['message'].map(lambda v: re.sub("[^\w\s]", '', v))


Unnamed: 0,target,message,label,message2
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?,0,Will ü b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that Soany other suggest...
5570,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like id ...


In [7]:
# Case-fold the punctuation-free text so that tokens differing only in
# capitalisation become identical.
df['message3'] = df['message2'].str.lower()
df

Unnamed: 0,target,message,label,message2,message3
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?,0,Will ü b going to esplanade fr home,will ü b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that Soany other suggest...,pity was in mood for that soany other suggest...
5570,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like id ...,the guy did some bitching but i acted like id ...


In [8]:
import nltk
# Fetch the 'punkt' and 'punkt_tab' NLTK packages (already-present packages are
# reported as up-to-date rather than downloaded again).
# Presumably these back nltk.word_tokenize, which the next cell calls -- confirm
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\human\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\human\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [9]:
# Split each lower-cased message into a list of tokens with NLTK's word tokenizer.
df['message4'] = df['message3'].map(nltk.word_tokenize)
df

Unnamed: 0,target,message,label,message2,message3,message4
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,Will ü b going to esplanade fr home?,0,Will ü b going to esplanade fr home,will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that Soany other suggest...,pity was in mood for that soany other suggest...,"[pity, was, in, mood, for, that, soany, other,..."
5570,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like id ...,the guy did some bitching but i acted like id ...,"[the, guy, did, some, bitching, but, i, acted,..."


In [12]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Quick sanity check: the plural and the singular should reduce to one stem.
print(stemmer.stem('apples'), stemmer.stem('apple'))

# Replace every token list by the list of its stems, token order preserved.
df['message5'] = df['message4'].map(lambda tokens: list(map(stemmer.stem, tokens)))
df

appl appl


Unnamed: 0,target,message,label,message2,message3,message4,message5
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazi, avail, onli,..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, i, dont, think, he, goe, to, usf, he, li..."
...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,...","[thi, is, the, 2nd, time, we, have, tri, 2, co..."
5568,ham,Will ü b going to esplanade fr home?,0,Will ü b going to esplanade fr home,will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]","[will, ü, b, go, to, esplanad, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that Soany other suggest...,pity was in mood for that soany other suggest...,"[pity, was, in, mood, for, that, soany, other,...","[piti, wa, in, mood, for, that, soani, other, ..."
5570,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like id ...,the guy did some bitching but i acted like id ...,"[the, guy, did, some, bitching, but, i, acted,...","[the, guy, did, some, bitch, but, i, act, like..."


In [13]:
# Glue the stemmed tokens back into one space-separated string per message,
# which is the input form the text vectorizers below are given.
df['message6'] = df['message5'].map(' '.join)
df

Unnamed: 0,target,message,label,message2,message3,message4,message5,message6
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazi, avail, onli,...",go until jurong point crazi avail onli in bugi...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]",ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entri, in, 2, a, wkli, comp, to, win, f...",free entri in 2 a wkli comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, earli, hor, u, c, alreadi, t...",u dun say so earli hor u c alreadi then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, i, dont, think, he, goe, to, usf, he, li...",nah i dont think he goe to usf he live around ...
...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,...","[thi, is, the, 2nd, time, we, have, tri, 2, co...",thi is the 2nd time we have tri 2 contact u u ...
5568,ham,Will ü b going to esplanade fr home?,0,Will ü b going to esplanade fr home,will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]","[will, ü, b, go, to, esplanad, fr, home]",will ü b go to esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that Soany other suggest...,pity was in mood for that soany other suggest...,"[pity, was, in, mood, for, that, soany, other,...","[piti, wa, in, mood, for, that, soani, other, ...",piti wa in mood for that soani other suggest
5570,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like id ...,the guy did some bitching but i acted like id ...,"[the, guy, did, some, bitching, but, i, acted,...","[the, guy, did, some, bitch, but, i, act, like...",the guy did some bitch but i act like id be in...


In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# NOTE(review): both vectorizers are fitted on the whole corpus, i.e. before the
# train/test split performed in a later cell, so the vocabulary (and the IDF
# weights) are learned from rows that end up in the test set as well.
# NOTE(review): .toarray() materialises the full documents x vocabulary matrix;
# later cells index the dense arrays directly, so they are kept.

# Bag-of-words features: learn the vocabulary and count token occurrences in
# one step. (vectorizer.vocabulary_ holds the token -> column mapping.)
vectorizer = CountVectorizer()
sparse_word_counts = vectorizer.fit_transform(df['message6'])
dense_word_counts = sparse_word_counts.toarray()

# TF-IDF features built from the same text.
tvectorizer = TfidfVectorizer()
sparse_word_counts2 = tvectorizer.fit_transform(df['message6'])
dense_word_counts2 = sparse_word_counts2.toarray()

In [23]:
# Hold-out split of the raw-count features (no test_size given, so the
# library default applies).
X_train, X_test, y_train, y_test = train_test_split(dense_word_counts, df['label'], random_state=42)

# Same split call for the TF-IDF features. Both calls pass the same labels and
# the same random_state=42, so the two feature sets are presumably partitioned
# into the same rows -- confirm if the models are compared row-for-row.
X_train2, X_test2, y_train2, y_test2 = train_test_split(dense_word_counts2, df['label'], random_state=42)

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

# Multinomial naive Bayes trained on the raw word-count features.
mnb = MultinomialNB().fit(X_train, y_train)

# (train accuracy, test accuracy)
mnb.score(X_train, y_train), mnb.score(X_test, y_test)


(0.990667623833453, 0.9813352476669059)

In [31]:
# Bernoulli naive Bayes trained on the TF-IDF features.
# NOTE(review): BernoulliNB models binary features and, per the scikit-learn
# docs, binarizes its input by default -- so the TF-IDF weights presumably act
# only as present/absent flags here. Confirm this is the intended comparison.
bnb = BernoulliNB().fit(X_train2, y_train2)

# (train accuracy, test accuracy)
bnb.score(X_train2, y_train2), bnb.score(X_test2, y_test2)


(0.9837281646326872, 0.9784637473079684)

In [27]:
# Compare the two representations of the first message on five feature
# columns: raw counts first, TF-IDF weights second.
# NOTE(review): the column indices are hard-coded; presumably they are
# vocabulary positions of tokens occurring in message 0 -- verify against
# vectorizer.vocabulary_, since they shift whenever the vocabulary changes.
print(dense_word_counts[0][[3336, 7497, 4128, 5635, 2248]])
print(dense_word_counts2[0][[3336, 7497, 4128, 5635, 2248]])

[1 1 1 1 1]
[0.13226686 0.23201273 0.32930302 0.22485506 0.25502252]
