In [1]:
import pandas as pd

# 1) Loading Data Set

In [2]:
dt=pd.read_csv("spam.csv",encoding=('ISO-8859-1'),low_memory=False)
dt.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
dt['spam']=dt['type'].map({'spam':1,'ham':0}).astype(int)
dt.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
print("Columns in the given data:")
for col in dt.columns:
    print(col)

Columns in the given data:
type
text
spam


In [5]:
t=len(dt['type'])
print("No. of rows in review column:",t)
t=len(dt['text'])
print("No. of rows in liked column:",t)

No. of rows in review column: 116
No. of rows in liked column: 116


# 2) Tokenization

In [6]:
dt['text'][1]

'Ok lar... Joking wif u oni...'

In [7]:
def tokenizer(text):
    return text.split()

In [8]:
dt['text']=dt['text'].apply(tokenizer)

In [9]:
dt['text'][1]#after

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

# 3) Stemming

In [10]:
dt['text'][1]#before

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [11]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english",ignore_stopwords=False)

In [12]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [13]:
dt['text']=dt['text'].apply(stem_it)

In [14]:
dt['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

# 4) Lemmitization

In [15]:
dt['text'][115]

['wa,', 'ur', 'openin', 'sentenc', 'veri', 'for']

In [16]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [17]:
def lemmit_it(text):
    return [lemmatizer.lemmatize(word,pos ="a") for word in text]

In [18]:
dt['text']=dt['text'].apply(lemmit_it)

In [20]:
dt['text'][115]#after

['wa,', 'ur', 'openin', 'sentenc', 'veri', 'for']

# 5) Stopword Removal

In [44]:
dt['text'][115]#before

'wa,uropeninsentencveri'

In [22]:
#from nltk.corpus import stopwords
#print(stopwords.words('english'))
#set(['a','t','d','y'])

In [45]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [24]:
def stop_it(text):
    review = [word for word in text if not word in stop_words ]
    return review

In [46]:
dt['text']=dt['text'].apply(stop_it)

In [47]:
dt['text'][115]#after

['w',
 ',',
 'u',
 'r',
 'p',
 'e',
 'n',
 'n',
 'e',
 'n',
 'e',
 'n',
 'c',
 'v',
 'e',
 'r']

In [48]:
dt.head(10)

Unnamed: 0,type,text,spam
0,ham,"[g, j, u, r, n, g, p, n, ,, c, r, z, ., ., v, ...",0
1,ham,"[k, l, r, ., ., ., j, k, e, w, f, u, n, ., ., .]",0
2,spam,"[f, r, e, e, e, n, r, 2, w, k, l, c, p, w, n, ...",1
3,ham,"[u, u, n, e, r, l, h, r, ., ., ., u, c, l, r, ...",0
4,ham,"[n, h, h, n, k, g, e, u, f, ,, l, v, e, r, u, ...",0
5,spam,"[f, r, e, e, g, h, e, r, l, 3, w, e, e, k, w, ...",1
6,ham,"[e, v, e, n, b, r, h, e, r, l, k, e, p, e, k, ...",0
7,ham,"[p, e, r, r, e, q, u, e, e, l, l, e, l, l, (, ...",0
8,spam,"[w, n, n, e, r, !, !, v, l, u, n, e, w, r, k, ...",1
9,spam,"[b, l, 1, 1, n, h, r, e, ?, u, r, e, n, l, u, ...",1


In [49]:
dt['text']=dt['text'].apply(''.join)

In [50]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"gjurngpn,crz..vlnlbugngrewrllebuffe...cnegrw...",0
1,ham,klr...jkewfun...,0
2,spam,freeenr2wklcpwnfcupfnlk212005.exf87121recevenr...,1
3,ham,uunerlhr...uclre...,0
4,ham,"nhhnkgeuf,lverunhugh",0


# 6) Transformation Text Data into TDF/TF_IDF Vectors

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
y=dt.spam.values
x=tfidf.fit_transform(dt['text'])

In [52]:
from sklearn.model_selection import train_test_split
x_train,x_text,y_train,y_text=train_test_split(x,y,random_state=1,test_size=0.2,shuffle=False)

# 7) Classification using Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_text)
from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred, y_text)*100
print("accuracy:",acc_log)

accuracy: 87.5


# 8) Classification using Linear SVC Accuracy


In [54]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_text)
acc_linear_svc =accuracy_score(y_pred, y_text)*100
print("accuracy:",acc_linear_svc)

accuracy: 87.5
