In [45]:
import sys
import nltk
import sklearn
import pandas
import numpy
nltk.download('stopwords')


print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 2.7.16 |Anaconda, Inc.| (default, Mar 14 2019, 15:42:17) [MSC v.1500 64 bit (AMD64)]
NLTK: 3.4
Scikit-learn: 0.20.3
Pandas: 0.24.2
Numpy: 1.16.2


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


# 1. Load the Dataset

In [16]:
import pandas as pd
import numpy as np

# load the dataset of sms messages
df = pd.read_csv('SMSSpamCollection', header = None, encoding='utf-8', sep='\t')

In [18]:
# print useful information about the data set
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [22]:
# check class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


# 2. Preprocess the Data

In [24]:
# convert class labels to binary values, 0 = ham, 1 = spam
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [33]:
# store the SMS message data
text_messages = df[1]
print(text_message[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [34]:
# use regular expressions to check email 
# replace email addresses with regex
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

# replace urls with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

# replace money symboles with 'moneysymb'
processed = processed.str.replace(r'£|\$','moneysymb')

# replace 10 digit phone numbers with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')
                                      
# replace normal numbers with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')                                     

In [36]:
# remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', '')

# replace whitespace
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [39]:
# change words to lower case
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
5       freemsg hey there darling its been number week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile number months or more u r enti...
10      im gonna be home soon and i dont want to talk ...
11      six chances to win cash from number to numbern...
12      urgent you have won a number week free members...
13      ive been searching for the right words to than...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                   oh kim watching here
17      eh u r

In [77]:
# remove stop words from text messages
#nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [51]:
# remove word stem using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [54]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4               nah dont think goe usf live around though
5       freemsg hey darl number week word back id like...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea num...
9       mobil number month u r entitl updat latest col...
10      im gonna home soon dont want talk stuff anymor...
11      six chanc win cash number numbernumb pound txt...
12      urgent number week free membership numbernumb ...
13      ive search right word thank breather promi won...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                           oh kim watch
17      eh u r

In [68]:
from nltk.tokenize import word_tokenize

# creating a bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [70]:
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 7345
Most common words: [(u'number', 2607), (u'u', 1132), (u'call', 655), (u'im', 474), (u'go', 452), (u'get', 446), (u'ur', 390), (u'come', 300), (u'dont', 298), (u'ok', 278), (u'ltgt', 276), (u'free', 275), (u'know', 270), (u'like', 257), (u'got', 251)]


In [73]:
word_features = list(all_words.keys())[:1500]

In [75]:
# define a find_features function
def find_features(message):
    words = word_tokenize(message)
    features =  {}
    for word in word_features:
        features[word] = (word in words)
    return features

features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print key

avail
buffet
world
great


In [78]:
processed[0]

u'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [83]:
# find features for all messages
messages = zip(processed, Y)

seed = 1
np.random.seed = seed
np.random.shuffle(messages)
featuresets = [(find_features(text), label) for (text, label) in messages]

In [88]:
featuresets[0]

({u'yellow': False,
  u'elvi': False,
  u'didntgiv': False,
  u'captain': False,
  u'lorw': False,
  u'scold': False,
  u'buddi': False,
  u'lord': False,
  u'numberpress': False,
  u'digit': False,
  u'callin': False,
  u'claimcod': False,
  u'gover': False,
  u'sleepsweet': False,
  u'appar': False,
  u'oceand': False,
  u'four': False,
  u'disturb': False,
  u'prize': False,
  u'oficegot': False,
  u'btnationalr': False,
  u'wednesday': False,
  u'somewheresomeon': False,
  u'numbertxtnumberp': False,
  u'oooh': False,
  u'cheeto': False,
  u'careswt': False,
  u'nigh': False,
  u'miller': False,
  u'second': False,
  u'titleso': False,
  u'txtno': False,
  u'dialogu': False,
  u'wreck': False,
  u'mondaynxt': False,
  u'numberrespect': False,
  u'maraikara': False,
  u'uworld': False,
  u'pooki': False,
  u'med': False,
  u'heri': False,
  u'hero': False,
  u'numbergot': False,
  u'never': False,
  u'dahe': False,
  u'chine': False,
  u'china': False,
  u'dogwood': False,
  u'dorm'

In [91]:
# split training and testing data sets using sklearn
from sklearn import model_selection
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [93]:
print('Training:{}'.format(len(training))
print('Testing:{}'.format(len(testing)))

Training:4179
Testing:1393


# 4. Scikit-learn classifier with NLTK

In [96]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [105]:
# Define models to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

In [109]:
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuray = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{} accuray: {}'.format(name, accuray))

K Nearest Neighbors accuray: 92.9648241206
Decision Tree accuray: 94.9030868629
Random Forest accuray: 95.6927494616
Logistic Regression accuray: 95.9798994975
SGD Classifier accuray: 95.7645369706
Naive Bayes accuray: 95.0466618808
SVM Linear accuray: 95.4055994257


In [113]:
# ensemble learning - voting classifier
from sklearn.ensemble import VotingClassifier

# Define models to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting='hard',  n_jobs = -1))
nltk_ensemble.train(training)
accuray = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('Ensemble classifier accuray: {}'.format(accuray))

Ensemble classifier accuray: 96.0516870065


In [116]:
# predict test data label
txt_features, labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)

In [119]:
print(classification_report(labels, prediction))
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']]
)

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1208
           1       0.94      0.75      0.83       185

   micro avg       0.96      0.96      0.96      1393
   macro avg       0.95      0.87      0.91      1393
weighted avg       0.96      0.96      0.96      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1199,9
actual,spam,46,139
