In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [2]:
# Read the tab-separated SMS corpus; the two column names are supplied here
# rather than taken from the file.
dset = pd.read_csv(
    "https://raw.githubusercontent.com/ShubhamPy/Spam-Classifier/master/spam.tsv",
    sep='\t',
    names=['Class', 'Message'],
)
# Preview the first eight rows.
dset.head(8)

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!
5,ham,As per your request 'Melle Melle (Oru Minnamin...
6,spam,WINNER!! As a valued network customer you have...
7,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# DataFrame.info() prints its report and returns None, so binding the result
# to a name and echoing it shows nothing; call it only for the printout.
dset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5567 non-null   object
 1   Message  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
# Summary of the text columns: row count, distinct values, most frequent
# value and how often it occurs.
dset.describe()

Unnamed: 0,Class,Message
count,5567,5567
unique,2,5164
top,ham,"Sorry, I'll call later"
freq,4821,30


In [5]:
# Store the character count of every message in a new 'Length' column.
dset['Length'] = dset['Message'].map(len)
# Preview the first eight rows with the new column.
dset.head(8)

Unnamed: 0,Class,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
5,ham,As per your request 'Melle Melle (Oru Minnamin...,160
6,spam,WINNER!! As a valued network customer you have...,157
7,spam,Had your mobile 11 months or more? U R entitle...,154


In [6]:
# Rows per class (non-null count of each remaining column); shows how
# unevenly 'ham' and 'spam' are represented.
dset.groupby('Class').count()

Unnamed: 0_level_0,Message,Length
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4821,4821
spam,746,746


In [7]:
# Distribution of message lengths in characters.
dset['Length'].describe()

count    5567.000000
mean       80.450153
std        59.891023
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64

In [8]:
# Class labels as a NumPy array (still the strings 'ham' / 'spam' at this point).
dObject = dset['Class'].values
dObject

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [9]:
# Encode 'ham' as 1, which makes ham the label-1 class for later metrics.
# The column keeps object dtype after this assignment.
dset.loc[dset['Class']=="ham","Class"] = 1

In [10]:
# Encode 'spam' as 0.
dset.loc[dset['Class']=="spam","Class"] = 0

In [11]:
# Class labels after encoding: values are 1 / 0 but the array dtype is still object.
dObject2=dset['Class'].values
dObject2

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [12]:
# Preview the frame with the numeric class labels.
dset.head(8)

Unnamed: 0,Class,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
5,1,As per your request 'Melle Melle (Oru Minnamin...,160
6,0,WINNER!! As a valued network customer you have...,157
7,0,Had your mobile 11 months or more? U R entitle...,154


In [13]:
# Remove punctuation characters from a message
def cleanMessage(message):
    """Return ``message`` with every ASCII punctuation character removed.

    Parameters
    ----------
    message : str
        Text to clean.

    Returns
    -------
    str
        ``message`` without any character listed in ``string.punctuation``.
        Letters, digits and whitespace are left untouched.
    """
    # A deletion table handed to str.translate strips the characters in one
    # pass, instead of testing each character against the punctuation string
    # in a Python-level loop.
    return message.translate(str.maketrans('', '', string.punctuation))

In [16]:
# Overwrite each message with its punctuation-free version. The 'Length'
# column was computed from the raw text and is not refreshed here.
dset['Message'] = dset['Message'].apply(cleanMessage)

In [15]:
# Preview the messages after punctuation removal.
dset.head(8)

Unnamed: 0,Class,Message,Length
0,1,Ive been searching for the right words to than...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,Nah I dont think he goes to usf he lives aroun...,61
3,1,Even my brother is not like to speak with me T...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL,36
5,1,As per your request Melle Melle Oru Minnaminun...,160
6,0,WINNER As a valued network customer you have b...,157
7,0,Had your mobile 11 months or more U R entitled...,154


In [19]:
from nltk.stem import PorterStemmer
# Single stemmer instance, looked up as a global by clean_sentences.
stemmer = PorterStemmer()

def clean_sentences(text):
    """Lower-case, sanitise and stem a message.

    Every character other than a-z, 0-9 and the symbols ``^ , ! . / '`` is
    replaced by a space, the result is split on whitespace, each token is
    passed through the module-level ``stemmer``, and the stems are joined
    back together with single spaces.
    """
    lowered = text.lower()
    sanitised = re.sub(r"[^a-z0-9^,!.\/']", " ", lowered)
    # split() with no argument already collapses runs of whitespace.
    tokens = sanitised.split()
    stems = [stemmer.stem(token) for token in tokens]
    return " ".join(stems)

In [71]:
# Features are the message texts; targets are the class labels.
x, y = dset['Message'], dset['Class']

In [72]:
# Stem from the source column instead of from `x` itself, so re-running this
# cell never stems already-stemmed text a second time. The function is passed
# directly; wrapping it in a lambda added nothing.
x = dset['Message'].map(clean_sentences)

In [73]:
# Inspect the stemmed, lower-cased messages.
x

0       ive been search for the right word to thank yo...
1       free entri in 2 a wkli comp to win fa cup fina...
2       nah i dont think he goe to usf he live around ...
3       even my brother is not like to speak with me t...
4                       i have a date on sunday with will
                              ...                        
5562    thi is the 2nd time we have tri 2 contact u u ...
5563                        will b go to esplanad fr home
5564         piti wa in mood for that soani other suggest
5565    the guy did some bitch but i act like id be in...
5566                              rofl it true to it name
Name: Message, Length: 5567, dtype: object

In [74]:
# Inspect the target labels (1 = ham, 0 = spam).
y

0       1
1       0
2       1
3       1
4       1
       ..
5562    0
5563    1
5564    1
5565    1
5566    1
Name: Class, Length: 5567, dtype: object

In [75]:
from sklearn.model_selection import train_test_split
# Split into train and test sets with a fixed seed so the split is repeatable.
# No test_size or stratify argument is passed, so the library defaults apply.
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=42)

In [76]:
# Training messages, still plain text at this point.
print(x_train)

4063    you are be contact by our date servic by someo...
585                         im in a meet call me later at
3412        uhhhhrmm isnt have tb test bad when your sick
5278                        yeah probabl here for a while
4898    free polyphon rington text super to 87131 to g...
                              ...                        
3772                              ok lor msg me b4 u call
5191    spook up your mob with a halloween collect of ...
5226    i realis you are a busi guy and im tri not to ...
5390       dunno lei shd b drive lor co i go sch 1 hr oni
860                dude ive been see a lotta corvett late
Name: Message, Length: 4175, dtype: object


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [78]:
# TF-IDF features with scikit-learn's built-in English stop-word list.
# NOTE(review): the text was stemmed beforehand, so stemmed forms such as
# 'thi' presumably no longer match the stop list - confirm this is intended.
vectorizer = TfidfVectorizer(stop_words='english')

In [79]:
# Learn the vocabulary and IDF weights from the training text only.
# x_train is rebound from text to the TF-IDF matrix, so this cell cannot be
# re-run without re-running the split first.
x_train = vectorizer.fit_transform(x_train)

In [80]:
# Project the test text onto the vocabulary learned from the training set
# (transform only, no refit). x_test is rebound from text to the matrix.
x_test = vectorizer.transform(x_test)

In [81]:
from sklearn.svm import LinearSVC

In [82]:
# Linear SVM with regularisation strength C=1.05 and stopping tolerance 0.5.
# NOTE(review): tol=0.5 is much looser than the library default and no
# random_state is set - confirm both are intentional.
model = LinearSVC(C=1.05, tol=0.5)

In [84]:
# x_train is now the sparse TF-IDF matrix; print its entries.
print(x_train)

  (0, 4680)	0.4624572218570745
  (0, 211)	0.4624572218570745
  (0, 3593)	0.31412203882364786
  (0, 4043)	0.2553456131622503
  (0, 3546)	0.22322182659184345
  (0, 5480)	0.30492794555222874
  (0, 5265)	0.2871279853582961
  (0, 1995)	0.3232117406312384
  (0, 1836)	0.2804910932009272
  (1, 3613)	0.6186481974282129
  (1, 3936)	0.6324229087820719
  (1, 3227)	0.4661712906930737
  (2, 5362)	0.4054445343173905
  (2, 1180)	0.35712205619604015
  (2, 5881)	0.3667223508565842
  (2, 5831)	0.44416671777819683
  (2, 3337)	0.3783219790482957
  (2, 6169)	0.48288890123900313
  (3, 4800)	0.7673070686491501
  (3, 6692)	0.6412798627752537
  (4, 549)	0.2934612709347639
  (4, 5706)	0.26235333173883024
  (4, 690)	0.2934612709347639
  (4, 4324)	0.2934612709347639
  (4, 4675)	0.2934612709347639
  :	:
  (4172, 4941)	0.3367657697386846
  (4172, 1389)	0.3464901883518873
  (4172, 1568)	0.26015971882600747
  (4172, 4482)	0.3367657697386846
  (4172, 2923)	0.2419918181951952
  (4172, 6416)	0.21760420693602653
  (4172, 

In [87]:
# The labels are still object dtype after the .loc encoding, so cast them to
# integers before fitting.
y_train=y_train.astype('int')
y_test=y_test.astype('int')
# Train the classifier on the TF-IDF training matrix.
model.fit(x_train,y_train)

LinearSVC(C=1.05, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.5,
          verbose=0)

In [89]:
# Held-out labels after the integer cast.
y_test

1168    1
765     1
465     1
1117    0
4930    1
       ..
668     0
218     1
4711    1
2970    1
3541    1
Name: Class, Length: 1392, dtype: int64

In [90]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
# Rows are true labels and columns are predicted labels, in the order
# 0 (spam) then 1 (ham).
confusion_matrix(y_test,model.predict(x_test))

array([[ 164,   17],
       [   5, 1206]])

In [91]:
# Fraction of test messages classified correctly.
accuracy_score(y_test,model.predict(x_test))

0.9841954022988506

In [92]:
# Recall for label 1, i.e. the ham class (no pos_label is passed).
# This is not the spam detection rate.
recall_score(y_test,model.predict(x_test))

0.9958711808422791

In [93]:
# Precision for label 1, i.e. the ham class (no pos_label is passed).
precision_score(y_test,model.predict(x_test))

0.9860997547015535

In [94]:
# F1 score for label 1, i.e. the ham class (no pos_label is passed).
f1_score(y_test,model.predict(x_test))

0.9909613804437141

In [107]:
# Classify custom input sentences

def do_vectorize(sentence):
    """Turn one raw sentence into a TF-IDF row suitable for ``model.predict``.

    The training messages were passed through ``cleanMessage`` and then
    ``clean_sentences`` before the vectorizer was fitted, so the fitted
    vocabulary holds lower-cased, punctuation-free, stemmed tokens. The same
    two steps are applied here; otherwise raw words such as "entry" would
    never match their fitted counterpart "entri".

    Parameters
    ----------
    sentence : str
        Raw, unprocessed input text.

    Returns
    -------
    The result of ``vectorizer.transform`` on the one preprocessed sentence.
    """
    prepared = clean_sentences(cleanMessage(sentence))
    return vectorizer.transform([prepared])

# Run the same vectorise -> predict -> print steps for each sample sentence.
for sample_text in ('I am going to work this night',
                    'free entry in 2 courses ruppes 12'):
    Sen = do_vectorize(sample_text)
    result = model.predict(Sen)
    print(result)

[1]
[0]
