In [0]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
import os

In [0]:
os.chdir('E:\\Profond Ananlytics\\SPAM-HAM')

In [0]:
df=pd.read_csv('spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [0]:
df=df[['v1', 'v2']]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
#Text pre processing
#convert to lower case

df['v2']=df['v2'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df.head()

Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [0]:
#replace special characters
df['v2']=df['v2'].str.replace('[^\w\s]','')
df.head()

Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [0]:
#remove stop words
stop=stopwords.words('english')
df['v2']=df['v2'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [0]:
#stemming
st=PorterStemmer()
df['v2']=df['v2'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [0]:
#lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

df['v2']=df['v2'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
df.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [0]:
#tokenization
df['v2']=df.apply(lambda x: word_tokenize(x['v2']), axis=1)
df.head()

Unnamed: 0,v1,v2
0,ham,"[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,"[ok, lar, joke, wif, u, oni]"
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"[nah, dont, think, goe, usf, live, around, tho..."


In [0]:
df['v2']=[" ".join(x) for x in df['v2'].values]

In [0]:
df.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [0]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest= train_test_split(df['v2'], df['v1'], test_size=0.3, random_state=100)

print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(3900,)
(1672,)
(3900,)
(1672,)


In [0]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
ytrain=lb.fit_transform(ytrain)
ytest=lb.transform(ytest)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfvect=TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfvect=tfvect.fit(df['v2'])

xtrain_new=tfvect.transform(xtrain)
xtest_new=tfvect.transform(xtest)

In [0]:
xtrain_new

<3900x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 33035 stored elements in Compressed Sparse Row format>

In [0]:
def train_model(classifier, xtrain, ytrain, xtest, ytest):
    mod=classifier.fit(xtrain, ytrain)
    predictions=mod.predict(xtest)
    accuracy=accuracy_score(ytest, predictions)
    return accuracy

In [0]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

accuracy1 = train_model(naive_bayes.MultinomialNB(), xtrain_new, ytrain, xtest_new, ytest)

print(accuracy1)

0.965311004784689


In [0]:


accuracy = train_model(LogisticRegression(), xtrain_new, ytrain, xtest_new, ytest)

print(accuracy)



0.9581339712918661
