In [1]:
import nltk
import numpy as np
import pandas as pd

# Loading the dataset

In [2]:
#load the dataset of sms messages
df=pd.read_table(r"C:\Users\rukmani\Dropbox\My PC (jayanth)\Desktop\spam collection\SMSSpamCollection",header=None,encoding='utf-8')

In [4]:
#analyzing the dataset
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
#check class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


# Data preprocessing

In [9]:
#convert class labels to binary labels
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
Y=encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [10]:
#storing the sms message data
text_messages=df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


### regular expressions to replace email addresses, url, phone numbers,others numbers,symbols


In [11]:

#email addresses as 'email addr'
processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')
# Replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$', 'moneysymb')
    

# Replace 10 digit phone numbers with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

  """
  import sys
  # This is added back by InteractiveShellApp.init_path()
  


In [12]:
#removing punctuations
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

  
  """
  


In [14]:
#changing words to lower case
processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [15]:
#remove stop words
from nltk.corpus import stopwords

stop_words=set(stopwords.words('english'))
processed=processed.apply(lambda x:' '.join(term for term in x.split() if term not in stop_words))

In [16]:
#words stemming using porter stemmer
ps=nltk.PorterStemmer()

processed=processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

In [17]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [36]:
#feature engineering
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [37]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [38]:
# use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [40]:
# The find_features function will determine which of the 1500 word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Lets see an example!
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print (key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [42]:
#find features in messages
messages=list(zip(processed,Y))

#define a seed for reproducibility
seed=1
np.random.seed=seed
np.random.shuffle(messages)

#call find_features function for each sms
featuresets=[(find_features(text),label)for (text,label) in messages]

In [43]:
#split into training and test sets
from sklearn import model_selection
training,testing=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

In [44]:
print('Training:{}'.format(len(training)))
print('Test:{}'.format(len(testing)))

Training:4179
Test:1393


# Building models

In [47]:
#classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [50]:
#models to train
names=['Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']
classifier=[DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression(),SGDClassifier(max_iter=100),MultinomialNB(),SVC(kernel='linear')]
models=zip(names,classifier)


In [52]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

In [55]:
for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{}:Accuracy:{}'.format(name,accuracy))

Decision Tree:Accuracy:97.91816223977028
Random Forest:Accuracy:98.63603732950466
Logistic Regression:Accuracy:99.4256999282125
SGD Classifier:Accuracy:98.85139985642498
Naive Bayes:Accuracy:98.34888729361091
SVM Linear:Accuracy:98.92318736539842


In [57]:
#ensemble method-voting classifier
from sklearn.ensemble import VotingClassifier

names=['Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']
classifier=[DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression(),SGDClassifier(max_iter=100),MultinomialNB(),SVC(kernel='linear')]
models=list(zip(names,classifier))

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100
print('Ensebmle Accuracy:{}'.format(accuracy))

Ensebmle Accuracy:99.28212491026561


In [58]:
#make class_predictions
txt_features,labels=zip(*testing)
prediction=nltk_ensemble.classify_many(txt_features)

In [60]:
#classification report and confusion matrix
print(classification_report(labels,prediction))
pd.DataFrame(confusion_matrix(labels,prediction),index=[['actual','actual'],['ham','ham']],columns=[['predicted','predicted'],['ham','spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1212
           1       0.99      0.96      0.97       181

    accuracy                           0.99      1393
   macro avg       0.99      0.98      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1210,2
actual,ham,8,173
