In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
import spacy
filterwarnings("ignore")

# Reading Data

In [3]:
df = pd.read_csv("spam (or) ham.csv")
df

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [4]:
pd.DataFrame({"nulls":df.isna().sum(),
             "dublecate":df.duplicated().sum(),
             "uneuq":df.nunique().sum()}).T

Unnamed: 0,Class,sms
nulls,0,0
dublecate,403,403
uneuq,5172,5172


# Droping Duplicates

In [5]:
df.drop_duplicates(inplace=True )

In [6]:
pd.DataFrame({"nulls":df.isna().sum(),
             "dublecate":df.duplicated().sum(),
             "uneuq":df.nunique().sum()}).T

Unnamed: 0,Class,sms
nulls,0,0
dublecate,0,0
uneuq,5172,5172


# Loading spacy for NLP

In [6]:
nlp = spacy.load("en_core_web_sm")

# Text analysis

In [23]:
tokens = []
for doc in df["sms"]:
    print(f"{doc}\n")
    tokens.append(doc)

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

Ok lar... Joking wif u oni...

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

U dun say so early hor... U c already then say...

Nah I don't think he goes to usf, he lives around here though

FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv

Even my brother is not like to speak with me. They treat me like aids patent.

As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune

WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.

Had your mobile 11 months or more? U

In [8]:
tokens

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though",
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
 'Had you

In [None]:
for token in tokens:
    token = nlp(token)
    print(token.ents)
    for t in token:
        print(f"{t.pos_} \t {t.dep_} \t {t.lemma_} \t {t.tag_}")

(jurong point,)
VERB 	 ROOT 	 go 	 VB
ADP 	 prep 	 until 	 IN
PROPN 	 compound 	 jurong 	 NNP
NOUN 	 pobj 	 point 	 NN
PUNCT 	 punct 	 , 	 ,
ADJ 	 acomp 	 crazy 	 JJ
PUNCT 	 punct 	 .. 	 .
ADJ 	 ROOT 	 available 	 JJ
ADV 	 advmod 	 only 	 RB
ADP 	 prep 	 in 	 IN
ADJ 	 amod 	 bugis 	 JJ
CCONJ 	 cc 	 n 	 CC
ADJ 	 amod 	 great 	 JJ
NOUN 	 compound 	 world 	 NN
PROPN 	 compound 	 la 	 NNP
PROPN 	 compound 	 e 	 NNP
NOUN 	 pobj 	 buffet 	 NN
PUNCT 	 punct 	 ... 	 :
PROPN 	 nsubj 	 Cine 	 NNP
ADV 	 expl 	 there 	 RB
VERB 	 ROOT 	 get 	 VBD
PROPN 	 compound 	 amore 	 NNP
NOUN 	 dobj 	 wat 	 NN
PUNCT 	 punct 	 ... 	 :
()
INTJ 	 intj 	 ok 	 UH
ADJ 	 ROOT 	 lar 	 JJ
PUNCT 	 punct 	 ... 	 :
VERB 	 ROOT 	 joke 	 VBG
NOUN 	 dative 	 wif 	 NNS
NOUN 	 npadvmod 	 u 	 NN
NOUN 	 nsubj 	 oni 	 NN
PUNCT 	 punct 	 ... 	 :
(2, FA Cup, May 2005, 87121, rate)T&C, 08452810075over18)
ADJ 	 amod 	 free 	 JJ
NOUN 	 ROOT 	 entry 	 NN
ADP 	 prep 	 in 	 IN
NUM 	 nummod 	 2 	 CD
DET 	 det 	 a 	 DT
ADJ 	 amod 	 wkly 	

In [11]:
tokens[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

# Get-Dummies

In [7]:
df = pd.get_dummies(columns=["Class"] , data = df , drop_first=True)
df

Unnamed: 0,sms,Class_spam
0,"Go until jurong point, crazy.. Available only ...",False
1,Ok lar... Joking wif u oni...,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,U dun say so early hor... U c already then say...,False
4,"Nah I don't think he goes to usf, he lives aro...",False
...,...,...
5568,Will ü b going to esplanade fr home?,False
5569,"Pity, * was in mood for that. So...any other s...",False
5570,The guy did some bitching but I acted like i'd...,False
5571,Rofl. Its true to its name,False


In [8]:
df

Unnamed: 0,sms,Class_spam
0,"Go until jurong point, crazy.. Available only ...",False
1,Ok lar... Joking wif u oni...,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,U dun say so early hor... U c already then say...,False
4,"Nah I don't think he goes to usf, he lives aro...",False
...,...,...
5568,Will ü b going to esplanade fr home?,False
5569,"Pity, * was in mood for that. So...any other s...",False
5570,The guy did some bitching but I acted like i'd...,False
5571,Rofl. Its true to its name,False


In [9]:
def convert_bool(x):
    if x == False:
        return  0
    elif x == True:
        return  1
        

In [10]:
df["Class_spam"] = df["Class_spam"].apply(convert_bool)

In [11]:
print(df.Class_spam)

0       0
1       0
2       1
3       0
4       0
       ..
5568    0
5569    0
5570    0
5571    0
5572    1
Name: Class_spam, Length: 5170, dtype: int64


# Create a PipeLine for Training Model

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])


In [14]:
from sklearn.model_selection import train_test_split

X = df["sms"]
y = df["Class_spam"]

X_train , X_test , y_train , y_test = train_test_split(X , y , random_state=42 )

In [25]:
text_clf.fit(X_train,y_train)

# Predictions

In [26]:
predictions = text_clf.predict(X_test)

In [27]:
from sklearn.metrics import confusion_matrix , classification_report

print(confusion_matrix(predictions , y_test))

[[1119   17]
 [   5  152]]


In [28]:
print(classification_report(predictions , y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1136
           1       0.90      0.97      0.93       157

    accuracy                           0.98      1293
   macro avg       0.95      0.98      0.96      1293
weighted avg       0.98      0.98      0.98      1293



# Testing the Model

In [31]:
text_clf.predict(["congratulations you won a price 550 $ please contact us"])

array([1], dtype=int64)

In [32]:
text_clf.predict(["hi how are you doing today ?"])

array([0], dtype=int64)

# GREAT JOB