In [28]:
import pandas as pd
import numpy as np

In [29]:
df=pd.read_table("smsspamcollection/SMSSpamCollection",header=None, encoding="utf-8")

  """Entry point for launching an IPython kernel.


In [30]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [32]:
df[0].value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

## Preprocessing

In [33]:
df["Label"]=df[0].replace(to_replace=["spam","ham"],value=[1,0])

In [34]:
df.head()

Unnamed: 0,0,1,Label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [35]:
text_message=df[1]
print(text_message[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


#### Regular expressions to replace email, addresses, urls, phone no, other numbers, symbols

In [36]:
#email replace
processed=text_message.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

# Replace URLs with 'webaddress'
processed=processed.str.replace(r'http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed=processed.str.replace(r'£|\$','moneysymb')

# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumr')

# Replace numbers with 'numbr'
processed=processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation
processed=processed.str.replace(r'[^\w\d\s]',' ')

# Replace whites pace between terms with a single space
processed=processed.str.replace(r'\s+',' ')

# Remove leading and trailing whitespace
processed=processed.str.replace(r'^\s+|\s+?$', '')

In [37]:
# change words to lower case
processed=processed.str.lower()

In [38]:
processed[:50]

0     go until jurong point crazy available only in ...
1                               ok lar joking wif u oni
2     free entry in numbr a wkly comp to win fa cup ...
3           u dun say so early hor u c already then say
4     nah i don t think he goes to usf he lives arou...
5     freemsg hey there darling it s been numbr week...
6     even my brother is not like to speak with me t...
7     as per your request melle melle oru minnaminun...
8     winner as a valued network customer you have b...
9     had your mobile numbr months or more u r entit...
10    i m gonna be home soon and i don t want to tal...
11    six chances to win cash from numbr to numbr nu...
12    urgent you have won a numbr week free membersh...
13    i ve been searching for the right words to tha...
14                    i have a date on sunday with will
15    xxxmobilemovieclub to use your credit click th...
16                               oh k i m watching here
17    eh u remember how numbr spell his name yes

In [39]:
from nltk.corpus import stopwords

In [40]:
import nltk

In [41]:
# nltk.download('popular')
# nltk.download("third-party")

In [42]:
stop_words=set(stopwords.words('english'))

processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [43]:
# removing word stems using a Porter stemmer

ps=nltk.PorterStemmer()

processed=processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [44]:
processed

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl numbr week word back like fun...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea mon...
9       mobil numbr month u r entitl updat latest colo...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash numbr numbr numbr pound txt...
12      urgent numbr week free membership moneysymbnum...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [45]:
from nltk.tokenize import word_tokenize

In [46]:
#creating bag of words

all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)

In [47]:
all_words=nltk.FreqDist(all_words)

In [48]:
#print the total number of words and the 15 most common words

print(f"Number of words: {len(all_words)}")
print(f"Most common words: {all_words.most_common(15)}")

Number of words: 6576
Most common words: [('numbr', 2647), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [49]:
# use the 1500 most common words as features
word_features=list(all_words.keys())[:1500]

In [50]:
# define a features function

def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
    return features

# lets see some results
features=find_features(processed[0])

for key,value in features.items():
    if value== True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [51]:
processed[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [52]:
word_tokenize(processed[0])

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat']

In [70]:
# find features for all messages

messages=list(zip(processed,df["Label"].values))

np.random.seed=123
np.random.shuffle(messages)

#call find_features function for each SMS message

featuresets=[(find_features(text),label) for (text,label) in messages]

In [106]:


# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=123)

In [107]:
print(len(training))
print(len(testing))

4179
1393


#### Classifiers with NLTK

In [85]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [89]:
names=["KNN","decision Tree","Random Forest","Logistic Regression","SGD Classifier","Naive Bayes","SVM Linear"]

classifiers=[
    KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),
    LogisticRegression(),SGDClassifier(max_iter=100),MultinomialNB(),
    SVC(kernel="linear")
]

models=list(zip(names, classifiers))
models

[('KNN',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=None, n_neighbors=5, p=2,
             weights='uniform')),
 ('decision Tree',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, presort=False, random_state=None,
              splitter='best')),
 ('Random Forest',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
              oob_score=False, random_state=None, verbose=0,
    

In [90]:
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    print(f"{name}: Acc: {nltk.classify.accuracy(nltk_model,testing)*100}")

KNN: Acc: 93.03661162957646
decision Tree: Acc: 96.62598707824839




Random Forest: Acc: 97.70279971284997




Logistic Regression: Acc: 98.42067480258436




SGD Classifier: Acc: 97.91816223977028
Naive Bayes: Acc: 97.91816223977028
SVM Linear: Acc: 98.1335247666906


#### Ensemble

In [91]:
from sklearn.ensemble import VotingClassifier

names=["KNN","decision Tree","Random Forest","Logistic Regression","SGD Classifier","Naive Bayes","SVM Linear"]

classifiers=[
    KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),
    LogisticRegression(),SGDClassifier(max_iter=100),MultinomialNB(),
    SVC(kernel="linear")
]

models=list(zip(names,classifiers))

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models, voting="hard", n_jobs=-1))
nltk_ensemble.train(training)
print(f"Acc: {nltk.classify.accuracy(nltk_ensemble,testing)*100}")

Acc: 98.27709978463747


In [123]:
txt_features, labels=[i for i in testing]

ValueError: too many values to unpack (expected 2)

In [158]:
i=0
txt_features=[]
labels=[]
while i != len(testing):
    txt_features.append(testing[i][0])
    labels.append(testing[i][1])
    i+=1

In [161]:
prediction=nltk_ensemble.classify_many(txt_features)

In [162]:
print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index=[["actual","actual"],["ham","spam"]],
    columns=[["predicted","predicted"],["ham","spam"]])

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1190
           1       0.99      0.89      0.94       203

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1188,2
actual,spam,22,181
