In [None]:
import sys
import nltk
import sklearn
import pandas
import numpy

nltk.download('stopwords')
nltk.download('punkt')

print('Python:{}'.format(sys.version))
print('NLTK:{}'.format(nltk.__version__))
print('Scikit:{}'.format(sklearn.__version__))
print('Pandas:{}'.format(pandas.__version__))
print('Numpy:{}'.format(numpy.__version__))



## 1. Load the dataset


In [3]:
import pandas as pd
import numpy as np

#load the dataset
df = pd.read_table('SMSSpamCollection' , header = None , encoding = 'utf-8')

#print useful information

print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:

classes = df[0]

print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Preprocess

In [11]:
# convert to binary values 0 - hame , 1 - spam

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

Y = encoder.fit_transform(classes)



print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [12]:
#store the sms message data

text_messages = df[1]

print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [13]:

#use regex to replace emails,urls , phone numbers , money symbols

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')


In [14]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [16]:
#change words to lower case

processed = processed.str.lower()

print(processed[:20])

0     go until jurong point crazy available only in ...
1                               ok lar joking wif u oni
2     free entry in numbr a wkly comp to win fa cup ...
3           u dun say so early hor u c already then say
4     nah i don t think he goes to usf he lives arou...
5     freemsg hey there darling it s been numbr week...
6     even my brother is not like to speak with me t...
7     as per your request melle melle oru minnaminun...
8     winner as a valued network customer you have b...
9     had your mobile numbr months or more u r entit...
10    i m gonna be home soon and i don t want to tal...
11    six chances to win cash from numbr to numbr nu...
12    urgent you have won a numbr week free membersh...
13    i ve been searching for the right words to tha...
14                    i have a date on sunday with will
15    xxxmobilemovieclub to use your credit click th...
16                               oh k i m watching here
17    eh u remember how numbr spell his name yes

In [23]:
#remove stopwords

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

#extract the word stems

ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))


## 3. Generating Features

In [None]:

from nltk.tokenize import word_tokenize

#bag_of_words model

all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

print('Number of words: {}'.format(len(all_words)))
print('15 most common: {}'.format(all_words.most_common(15)))
    

In [31]:
#use 1500 most common as features

word_list = all_words.most_common(1500)

word_features = []

for el in word_list:
    word_features.append(el[0])

#define a function find_features

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

# Lets see an example

features = find_features(processed[0])

for key,value in features.items():
    if value == True:
        print(key)

go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine


In [34]:
#find features for all messages

messages = list(zip(processed , Y))
#seed

seed = 1
np.random.seed = seed
np.random.shuffle(messages)

#call for each

featuresets = [(find_features(text) , label) for (text , label) in messages]

In [36]:

from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets , test_size = 0.25 , random_state = seed)

print(len(training))
print(len(testing))

4179
1393


## 4. Scikit-Learn Classifiers with NLTK


In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [39]:
#define models to train

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names , classifiers))
print(models)


[('K Nearest Neighbors', KNeighborsClassifier()), ('Decision Tree', DecisionTreeClassifier()), ('Random Forest', RandomForestClassifier()), ('Logistic Regression', LogisticRegression()), ('SGD Classifier', SGDClassifier(max_iter=100)), ('Naive Bayes', MultinomialNB()), ('SVM Linear', SVC(kernel='linear'))]


In [40]:
#wrap models in nltk

from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model,testing)*100
    print("{} Accuracy {}".format(name , accuracy))

K Nearest Neighbors Accuracy 95.47738693467338
Decision Tree Accuracy 96.55419956927494
Random Forest Accuracy 98.49246231155779
Logistic Regression Accuracy 98.99497487437185
SGD Classifier Accuracy 98.63603732950466
Naive Bayes Accuracy 98.77961234745155
SVM Linear Accuracy 98.34888729361091


In [41]:
# ensemble method - Voting classifier

from sklearn.ensemble import VotingClassifier

#define models to train

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))


Voting Classifier: Accuracy: 98.34888729361091


In [42]:
# class label predictions

txt_features , labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1201
           1       0.99      0.93      0.96       192

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1200,1
actual,spam,14,178
