In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy
import random

print("Python: ",sys.version)
print("NLTK: ",nltk.__version__)
print("sklearn: ",sklearn.__version__)
print("pandas: ",pandas.__version__)
print("numpy: ",numpy.__version__)

Python:  3.7.2 (tags/v3.7.2:9a3ffc0492, Dec 23 2018, 22:20:52) [MSC v.1916 32 bit (Intel)]
NLTK:  3.4
sklearn:  0.20.3
pandas:  0.24.2
numpy:  1.15.4


# Loading Datase

In [2]:
import pandas as pd
import numpy as np

#loading from a csv file
ds = pd.read_csv('spam.csv',header = None, encoding = 'ISO-8859-1') #encoding not utf-8

In [3]:
#user info data
print(ds.info())
print(ds.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
0    5572 non-null object
1    5572 non-null object
2    50 non-null object
3    12 non-null object
4    6 non-null object
dtypes: object(5)
memory usage: 108.9+ KB
None
      0                                                  1    2    3    4
0   ham  Go until jurong point, crazy.. Available only ...  NaN  NaN  NaN
1   ham                      Ok lar... Joking wif u oni...  NaN  NaN  NaN
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...  NaN  NaN  NaN
3   ham  U dun say so early hor... U c already then say...  NaN  NaN  NaN
4   ham  Nah I don't think he goes to usf, he lives aro...  NaN  NaN  NaN


In [4]:
#check class distribution
classes = ds[0]
fds = nltk.FreqDist(classes)
print("Ham: ", fds['ham'])
print("Spam: ",fds['spam'])
print()
print(classes.value_counts())

Ham:  4825
Spam:  747

ham     4825
spam     747
Name: 0, dtype: int64


# Data Preprocessing

In [5]:
#convert class labels to binary values 0 = ham, 1 = spam

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)
print(classes[:15])
print(Y[:15])

0      ham
1      ham
2     spam
3      ham
4      ham
5     spam
6      ham
7      ham
8     spam
9     spam
10     ham
11    spam
12    spam
13     ham
14     ham
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1 0 1 1 0 0]


In [6]:
#store sms data
text_messages = ds[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


# Regular Expressions

In [7]:
#using regular expressions for email, mobile num, symbols, url addresses, other numbers

#replacing email
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailAddr')

#Replacing web address
processed = processed.str.replace(r'^(http(s?)\:\/\/)*[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$','webAddr')

#Replacing money symbol 
processed = processed.str.replace(r'£|\$','moneySymb')

#Replacing Phone number
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phoneNbr')

#Replacing Normal Number
processed = processed.str.replace(r'\d+(\.\d+)?','numbr')

In [8]:
#remove punctuations
processed = processed.str.replace(r'[^\w\d\s]',' ')

#replace multiple space with single space
processed = processed.str.replace(r'\s+',' ')

#replacing leading and trailing white spaces
processed = processed.str.replace(r'^s+|\s+?$','')

In [9]:
#changing all words to lowercase
processed = processed.str.lower()

In [10]:
processed

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [11]:
#Remove stopwords from text messages

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x :' '.join(term for term in x.split() if term not in stop_words))
processed[5060:5070]

5060                         sorry call later meeting sir
5061    havent stuck orchard dad car going numbr dinne...
5062              ok also wan numbr watch e numbr pm show
5063                               dunno lei like dun haf
5064                    brother transfered lt gt lt gt pa
5065    calls later afternoon onwords mtnl service get...
5066    numbr numbr åmoneysymbnumbr uk break accommoda...
5067                                             talk g x
5068    hai dear friends new amp present number rajith...
5069    numbrp numbr alfie moon children need song ur ...
Name: 1, dtype: object

In [12]:
#Remove word stems using Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

In [13]:
processed[5060:5070]


5060                            sorri call later meet sir
5061    havent stuck orchard dad car go numbr dinner u...
5062              ok also wan numbr watch e numbr pm show
5063                               dunno lei like dun haf
5064                      brother transfer lt gt lt gt pa
5065    call later afternoon onword mtnl servic get pr...
5066    numbr numbr åmoneysymbnumbr uk break accommoda...
5067                                             talk g x
5068    hai dear friend new amp present number rajitha...
5069    numbrp numbr alfi moon children need song ur m...
Name: 1, dtype: object

In [14]:
# Breaking the paragraph into words
from nltk.tokenize import word_tokenize

all_words = []
for msg in processed:
    words=word_tokenize(msg)
    for w in words:
        all_words.append(w)

all_word = nltk.FreqDist(all_words)

In [15]:
#Printing no of all and most common 15 words
print("No. of words: ",len(all_word))
print("Most Commom words: ",all_word.most_common(15))

No. of words:  6544
Most Commom words:  [('numbr', 2627), ('u', 1192), ('call', 672), ('go', 453), ('get', 451), ('ur', 385), ('gt', 318), ('lt', 316), ('come', 301), ('åmoneysymbnumbr', 288), ('free', 284), ('know', 274), ('ok', 273), ('day', 273), ('love', 260)]


In [16]:
# We'll use 2000 most common words as features

word_features = [w[0] for w in all_word.most_common(1500)]

In [17]:
def find_features(message):
    words = word_tokenize(message)
    features= {}
    for word in word_features:
        features[word] = (word in words)
        
    return features
    

In [18]:
#testing the function

feature = find_features(processed[9])
for key,value in feature.items():
    if (value == True):
        print(key)

numbr
u
call
free
r
mobil
co
month
latest
camera
colour
updat
entitl


In [19]:
feature

{'numbr': True,
 'u': True,
 'call': True,
 'go': False,
 'get': False,
 'ur': False,
 'gt': False,
 'lt': False,
 'come': False,
 'åmoneysymbnumbr': False,
 'free': True,
 'know': False,
 'ok': False,
 'day': False,
 'love': False,
 'like': False,
 'got': False,
 'time': False,
 'good': False,
 'want': False,
 'text': False,
 'send': False,
 'txt': False,
 'need': False,
 'one': False,
 'today': False,
 'take': False,
 'see': False,
 'home': False,
 'think': False,
 'stop': False,
 'repli': False,
 'lor': False,
 'r': True,
 'sorri': False,
 'still': False,
 'tell': False,
 'numbrp': False,
 'n': False,
 'back': False,
 'mobil': True,
 'da': False,
 'make': False,
 'k': False,
 'dont': False,
 'week': False,
 'phone': False,
 'pleas': False,
 'hi': False,
 'say': False,
 'new': False,
 'work': False,
 'pl': False,
 'later': False,
 'hope': False,
 'ask': False,
 'co': True,
 'miss': False,
 'meet': False,
 'msg': False,
 'messag': False,
 'dear': False,
 'wait': False,
 'night': False

In [20]:
messages = list(zip(processed, Y))
seed = 1
np.random.seed = seed
random.shuffle(messages)

messages[320]
#call find_features for each message
feature_sets = [(find_features(text),label) for (text,label) in messages]

In [21]:
from sklearn import model_selection

training, testing = model_selection.train_test_split(feature_sets, test_size = .20, random_state = seed)

In [22]:
print("Training : ",len(training))
print("Testing : ",len(testing))

Training :  4457
Testing :  1115


# Scikit-learn Classifier with NLTK

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [24]:
#defining models to train

names = ['K Nearest Neighbors', 'Decision Tree','Random Forest','Logistic Regression','SGD','Naive Bayes','SVM Linear']

classifiers = [KNeighborsClassifier(),
              DecisionTreeClassifier(),
              RandomForestClassifier(),
              LogisticRegression(),
              SGDClassifier(max_iter = 100),
              MultinomialNB(),
              SVC(kernel = 'linear')]

models = list(zip(names,classifiers))

In [25]:
# wrap models in NLTK

from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("Accuracy : ",name, accuracy)

Accuracy :  K Nearest Neighbors 93.99103139013452
Accuracy :  Decision Tree 97.9372197309417




Accuracy :  Random Forest 98.38565022421525




Accuracy :  Logistic Regression 98.7443946188341




Accuracy :  SGD 98.47533632286995
Accuracy :  Naive Bayes 98.11659192825111
Accuracy :  SVM Linear 98.56502242152466


In [26]:
from sklearn.ensemble import VotingClassifier

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing)*100
print("Ensemble Accuracy : ",accuracy)

Ensemble Accuracy :  98.7443946188341


In [29]:
#making class label prediction for testing set

txt_features, labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)


In [30]:
# confusion matrix and a classification report

print(classification_report(labels,prediction))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       969
           1       1.00      0.90      0.95       146

   micro avg       0.99      0.99      0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [31]:
pd.DataFrame(
    confusion_matrix(labels,prediction),
    index = [['actual','actual'],['ham','spam']],
    columns = [['predicted','predicted'],['ham','spam']])

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,969,0
actual,spam,14,132
