In [124]:
import sys
import nltk
import sklearn
import numpy as np
import pandas as pd

## 1. Loading Dataset
Obtained from https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [125]:
dataset = pd.read_table('SMSSpamCollection', header = None, encoding = 'utf-8')

In [126]:
print(dataset.info())
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [127]:
# Class distribution
print(dataset[0].value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Pre-processing Data

In [128]:
from sklearn.preprocessing import LabelEncoder

# Convert ham and spam to 0 and 1 respectively
encoder = LabelEncoder()
Y = encoder.fit_transform(dataset[0])

print(Y[:10])
print(dataset[0][:10])

[0 0 1 0 0 1 0 0 1 1]
0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object


In [129]:
# Getting text messages
text_messages = dataset[1]
print(text_mes[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


### 2.1. Pre-processing using Regex
Regex from https://www.regexlib.com/

In [130]:
# Email addresses
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

# Web sites
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'unirloc')

# Money symbols
processed = processed.str.replace(r'£|\$', 'monysymb')

# Phone numbers
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phnnumb')

# Numbers
processed = processed.str.replace(r'\d+(\.\d+)?', 'nmbr')

# remove Punctuations
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# trim white spaces
processed = processed.str.replace(r'\s+', ' ')
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [131]:
# To lower case
processed = processed.str.lower()

In [132]:
# Removing stop words
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(w for w in x.split() if w not in stop_words))

# Stemming words
stemmer = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(stemmer.stem(w) for w in x.split()))

In [133]:
# Result
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri nmbr wkli comp win fa cup final tkt...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    nmbrnd time tri nmbr contact u u monysymbnmbr ...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [134]:
from nltk.tokenize import word_tokenize

# Creating a bag of words
word_tokens = [w for s in processed for w in word_tokenize(s)]

word_tokens = nltk.FreqDist(word_tokens)

In [135]:
print(word_tokens)
print('Most common: {}'.format(word_tokens.most_common(20)))

<FreqDist with 6584 samples and 53376 outcomes>
Most common: [('nmbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('monysymbnmbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261), ('got', 252), ('time', 252), ('good', 248), ('want', 247)]


In [136]:
# Using 1500 most common words as features
word_features = list(word_tokens.keys())[:3000]

In [137]:
# Function to find features in a message
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for w in words:
        features[w] = w in word_features
    
    return features

In [138]:
# Example: Extracting features
for m in processed[:5]:
    features = find_features(m)
    tokens = [key for key, val in features.items() if val is True]
    print(m, '\n', tokens)

go jurong point crazi avail bugi n great world la e buffet cine got amor wat 
 ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']
ok lar joke wif u oni 
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni']
free entri nmbr wkli comp win fa cup final tkt nmbrst may nmbr text fa nmbr receiv entri question std txt rate c appli nmbrovernmbr 
 ['free', 'entri', 'nmbr', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', 'nmbrst', 'may', 'text', 'receiv', 'question', 'std', 'txt', 'rate', 'c', 'appli', 'nmbrovernmbr']
u dun say earli hor u c alreadi say 
 ['u', 'dun', 'say', 'earli', 'hor', 'c', 'alreadi']
nah think goe usf live around though 
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though']


In [139]:
# Preparing dataset
messages = list(zip(processed, Y))

# seed for reproducibility
np.random.seed = 1
np.random.shuffle(messages)

# Finding features and making dataset
feature_set = [(find_features(msg), label) for (msg, label) in messages]

In [140]:
# Splitting into training and testing data
from sklearn.model_selection import train_test_split

train, test = train_test_split(feature_set, test_size = 0.25, random_state = 1)

In [141]:
print('Train size: {}'.format(len(train)))
print('Test size: {}'.format(len(test)))

Train size: 4179
Test size: 1393


## 3. Training Sci-kit Learn models with NLTK

In [142]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [143]:
# Models to train
names = ['Random Forest', 'Logistic Regression', 'SGD', 'Decision Trees', 'K Neighbors', 'Multinomial NB', 'SVC Linear']
classifiers = [RandomForestClassifier(),
              LogisticRegression(),
              SGDClassifier(max_iter = 100),
              DecisionTreeClassifier(),
              KNeighborsClassifier(),
              MultinomialNB(),
              SVC(kernel = 'linear')]

models = list(zip(names, classifiers))

In [144]:
# Wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import accuracy

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train)
    acc = accuracy(nltk_model, test) * 100
    print('{} model accuracy: {}'.format(name, acc))

Random Forest model accuracy: 98.49246231155779
Logistic Regression model accuracy: 98.7078248384781
SGD model accuracy: 98.49246231155779
Decision Trees model accuracy: 95.908111988514
K Neighbors model accuracy: 94.75951184493898
Multinomial NB model accuracy: 98.49246231155779
SVC Linear model accuracy: 98.34888729361091


In [145]:
# Ensemble Method: Training all models at once and voting on the result - Voting Classifier
from sklearn.ensemble import VotingClassifier

# voting = 'hard' is taking class labels as votes, 'soft' is taking probabilities as votes(like softmax)
# n_jobs = -1 is use all available cores in CPU
nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(train)

acc = accuracy(nltk_ensemble, test) * 100
print('Voting Classifier model accuracy: {}'.format(acc))

Voting Classifier model accuracy: 98.7078248384781


## 4. Results

In [146]:
# Class label predictions
txt_feat, labels = zip(*test)

prediction = nltk_ensemble.classify_many(txt_feat)

In [147]:
# Generating confusion matrix and classification reports
print(classification_report(labels, prediction))

pd.DataFrame(confusion_matrix(labels, prediction),
             index = [['actual', 'actual'], ['ham', 'spam']],
            columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1209
           1       1.00      0.90      0.95       184

    accuracy                           0.99      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1209,0
actual,spam,18,166
