In [None]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier

from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

# Report interpreter and library versions so results can be tied to an environment.
print("Python:", sys.version)
print("NLTK:", nltk.__version__)  # label previously misspelled as "Ntlk"
print("sklearn:", sklearn.__version__)
print("pandas:", pd.__version__)
print("numpy:", np.__version__)

# Show full cell contents when displaying text columns instead of truncating them.
pd.set_option("display.max_colwidth", None)


## 1. Load Dataset

In [2]:
# Tab-separated file without a header row, so columns are numbered 0 and 1.
# NOTE(review): the default parser treats '"' as a quote character; confirm the
# row count matches the raw file (quoting=csv.QUOTE_NONE may be needed).
DATA_PATH = 'SMSSpamCollection'
df = pd.read_table(DATA_PATH, header=None, encoding='utf-8')

In [3]:
# Column 0 is used as the class label of each message.
classes = df[0]

# Summarise the frame, then preview the first rows. `df.head()` has to be the
# cell's last expression, otherwise the notebook never renders it.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## 2. Preprocess Data

In [4]:
# Column 1 carries the raw message text that the later cells normalise.
text_messages = df[1]

# Turn the string class labels into integer codes.
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

# Peek at the first few encoded labels.
Y[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [5]:
# Normalise the raw messages by replacing variable tokens (amounts, addresses,
# numbers) with fixed placeholder words, then stripping punctuation, collapsing
# whitespace and lower-casing. The order matters: specific patterns (email, URL,
# phone) must run before the generic number and punctuation rules consume them.

# replace money symbols with 'moneysymb'
money_pattern = r'£|\$'
processed_text = text_messages.str.replace(money_pattern, 'moneysymb', regex=True)

# replace email with 'emailaddr'
# The separator class lists '.', '_' and '-' explicitly, with '-' last so it is
# a literal; '[.-_]' was an ASCII range from '.' to '_' that excluded '-'.
# '[A-Za-z]' replaces '[A-Z|a-z]', which also accepted a literal '|'.
email_pattern = r'(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(?:\.[A-Za-z]{2,})+'
processed_text = processed_text.str.replace(email_pattern, 'emailaddr', regex=True)

# replace urls with 'webaddress'
# Single backslashes are used because this is a raw string (doubled ones match a
# literal backslash), and there are no ^...$ anchors so that URLs embedded
# inside a longer message are matched too.
url_pattern = r'https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
processed_text = processed_text.str.replace(url_pattern, 'webaddress', regex=True)

# replace phone numbers with 'phonenumber'
phone_pattern = r'(?:\(\d{3}\)[- ]?\d{3}[- ]?\d{4}|0\d{10})'
processed_text = processed_text.str.replace(phone_pattern, 'phonenumber', regex=True)

# replace normal numbers with 'numberpat'
number_pattern = r'\d+(?:\.\d+)?'
processed_text = processed_text.str.replace(number_pattern, 'numberpat', regex=True)

# remove punctuation (\w already covers digits, so no separate \d is needed)
punctuations = r'[^\w\s]'
processed_text = processed_text.str.replace(punctuations, '', regex=True)

# strip leading/trailing whitespace, then collapse inner runs to a single space
# (the old single pattern turned leading/trailing whitespace into a space
# because its '\s+' branch always matched first)
edge_whitespace_pattern = r'^\s+|\s+$'
processed_text = processed_text.str.replace(edge_whitespace_pattern, '', regex=True)
extra_whitespace_pattern = r'\s+'
processed_text = processed_text.str.replace(extra_whitespace_pattern, ' ', regex=True)

# to lower case
processed_text = processed_text.str.lower()

In [6]:
# Show the first 20 normalised messages as a sanity check.
processed_text.head(20)

0                                                                                                              go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
1                                                                                                                                                                                             ok lar joking wif u oni
2                                free entry in numberpat a wkly comp to win fa cup final tkts numberpatst may numberpat text fa to numberpat to receive entry questionstd txt ratetcs apply phonenumberovernumberpats
3                                                                                                                                                                         u dun say so early hor u c already then say
4                                                                                                                                               

In [None]:
# Drop English stopwords from every message.
stop_words = set(stopwords.words('english'))


def _without_stopwords(message):
  """Return `message` with every whitespace-separated token found in `stop_words` removed."""
  kept = [token for token in message.split() if token not in stop_words]
  return ' '.join(kept)


processed_text = processed_text.apply(_without_stopwords)

In [8]:
# Reduce every token to its Porter stem.
ps = nltk.PorterStemmer()


def _stem_message(message):
  """Return `message` with each whitespace-separated token replaced by its stem."""
  stems = [ps.stem(token) for token in message.split()]
  return ' '.join(stems)


processed_text = processed_text.apply(_stem_message)
processed_text

0                                                                                                 go jurong point crazi avail bugi n great world la e buffet cine got amor wat
1                                                                                                                                                        ok lar joke wif u oni
2           free entri numberpat wkli comp win fa cup final tkt numberpatst may numberpat text fa numberpat receiv entri questionstd txt ratetc appli phonenumberovernumberpat
3                                                                                                                                          u dun say earli hor u c alreadi say
4                                                                                                                                    nah dont think goe usf live around though
                                                                                         ...                                 

## 3. Generate Features

In [None]:
# Every token of the processed corpus is a candidate feature: flatten all
# tokenised messages into one list and count how often each token occurs.
features = [
  token
  for message in processed_text
  for token in word_tokenize(message)
]
features_freq = nltk.FreqDist(features)

In [10]:
# Keep the 1000 most frequent tokens as the feature vocabulary.
top_counts = features_freq.most_common(1000)
most_common_features = [entry[0] for entry in top_counts]
print("most common words", most_common_features)

most common words ['numberpat', 'u', 'call', 'im', 'go', 'get', 'ur', 'phonenumb', 'come', 'dont', 'ok', 'ltgt', 'free', 'know', 'moneysymbnumberpat', 'like', 'got', 'love', 'want', 'ill', 'day', 'time', 'good', 'text', 'send', 'need', 'one', 'txt', 'see', 'today', 'ü', 'think', 'home', 'take', 'lor', 'stop', 'repli', 'tell', 'sorri', 'still', 'r', 'back', 'mobil', 'make', 'n', 'phone', 'say', 'new', 'work', 'pleas', 'well', 'week', 'later', 'hi', 'da', 'ask', 'miss', 'cant', 'hope', 'meet', 'happi', 'night', 'tri', 'give', 'claim', 'wait', 'thing', 'oh', 'much', 'great', 'hey', 'pl', 'dear', 'wat', 'messag', 'number', 'na', 'friend', 'thank', 'that', 'way', 'prize', 'right', 'feel', 'msg', 'wan', 'even', 'let', 'pick', 'alreadi', 'tomorrow', 'said', 'ye', 'realli', 'yeah', 'min', 'e', 'amp', 'leav', 'care', 'co', 'didnt', 'babe', 'morn', 'win', 'c', 'life', 'last', 'sure', 'servic', 'ive', 'anyth', 'would', 'keep', 'cash', 'find', 'year', 'contact', 'buy', 'sleep', 'lol', 'tone', 'loo

In [11]:
def find_features(message):
  """Build the boolean bag-of-words feature dict for one message.

  Args:
    message: the (already preprocessed) message text.

  Returns:
    A dict with one key per entry of `most_common_features`; the value is True
    when that word occurs among the tokens of `message`, otherwise False.
  """
  # A set gives constant-time membership tests; the original searched a list
  # once for every vocabulary word.
  words = set(word_tokenize(message))
  return {word: (word in words) for word in most_common_features}

In [12]:
# Pair every processed message with its encoded label, shuffle the pairs, and
# convert each message into its feature dict.
SEED = 42  # fixed so that Restart & Run All reproduces the same ordering
np.random.seed(SEED)

messages = list(zip(processed_text, Y))
np.random.shuffle(messages)
featuresets = [(find_features(text), label) for (text, label) in messages]


## 4. Train and Evaluate Models

In [13]:
# Hold out 25% of the feature sets for testing. A fixed random_state makes the
# split itself reproducible for a given ordering of `featuresets`.
training, testing = model_selection.train_test_split(
  featuresets, test_size = 0.25, random_state = 42)

In [15]:
# Candidate classifiers, each paired with the display name used when reporting
# results. K Nearest Neighbor is left out: it was too slow and its accuracy was
# very poor.
models = [
  ('Decision Tree', DecisionTreeClassifier()),
  (' Random Forest', RandomForestClassifier()),
  ('Logistic Regression', LogisticRegression()),
  (' SGD Classifier', SGDClassifier(max_iter = 100)),
  ('Naive Bayes', MultinomialNB()),
  ('SVM Linear', SVC(kernel = 'linear')),
]

# Keep the parallel lists available under their original names.
names = [label for label, _ in models]
classifier = [estimator for _, estimator in models]

In [21]:
# Fit each candidate on the training split and print its accuracy (in percent)
# on the held-out split.
for model_name, estimator in models:
  nltk_model = SklearnClassifier(estimator)
  nltk_model.train(training)
  accuracy = 100 * nltk.classify.accuracy(nltk_model, testing)
  print('{}: Accuracy: {}'.format(model_name, accuracy))

K Nearest Neighbor: Accuracy: 93.96984924623115
Decision Tree: Accuracy: 97.70279971284997
 Random Forest: Accuracy: 98.77961234745155
Logistic Regression: Accuracy: 98.85139985642498
 SGD Classifier: Accuracy: 98.77961234745155
Naive Bayes: Accuracy: 98.06173725771716
SVM Linear: Accuracy: 98.85139985642498


In [23]:
# Combine all candidate models in a hard (majority-vote) ensemble, train it on
# the training split and print its accuracy (in percent) on the held-out split.
voting_clf = VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1)
nltk_ensemble = SklearnClassifier(voting_clf)
nltk_ensemble.train(training)

accuracy = 100 * nltk.classify.accuracy(nltk_ensemble, testing)
print('Ensemble Method Accuracy: {}'.format(accuracy))

Ensemble Method Accuracy: 98.99497487437185


In [27]:
# Separate the held-out (features, label) pairs into two parallel tuples and
# let the ensemble classify every feature dict.
txt_features, labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)

In [34]:
# Per-class precision / recall / F1 for the ensemble predictions.
print(classification_report(labels, prediction))

# Confusion matrix as a labelled table with two-level row and column headers.
# NOTE(review): the ham/spam order assumes encoded label 0 is ham and 1 is spam.
class_names = ['ham', 'spam']
pd.DataFrame(
  confusion_matrix(labels, prediction),
  index = [['actual'] * len(class_names), class_names],
  columns = [['predicted'] * len(class_names), class_names]
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1209
           1       1.00      0.92      0.96       184

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1209,0
actual,spam,14,170
