In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import string
import re 
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words

pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', None) 

In [2]:
df = pd.read_csv('SMSSpamCollection.tsv', sep='\t')
df.columns = ['label', 'body_text']

Let's take a look at the data, and understand the size, the shape, data type that it contains in the dataframe and what kind of data cleaning is needed for this dataset

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5567 non-null   object
 1   body_text  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB
None


In [4]:
print(df.isnull().sum())

label        0
body_text    0
dtype: int64


In [5]:
print(df['body_text'].duplicated().sum())

403


In [6]:
print(df[df.duplicated(keep=False)]) #Even though there are duplicates, for the purpose of this project, we will not remove them as spam emails can be sent multiple times and since

     label                                          body_text
0     spam  Free entry in 2 a wkly comp to win FA Cup fina...
4      ham  As per your request 'Melle Melle (Oru Minnamin...
5     spam  WINNER!! As a valued network customer you have...
6     spam  Had your mobile 11 months or more? U R entitle...
8     spam  SIX chances to win CASH! From 100 to 20,000 po...
9     spam  URGENT! You have won a 1 week FREE membership ...
40     ham                   No calls..messages..missed calls
57     ham                          Its a part of checking IQ
60    spam  As a valued customer, I am pleased to advise y...
61     ham  Today is "song dedicated day.." Which song wil...
62    spam  Urgent UR awarded a complimentary trip to Euro...
75     ham                             Sorry, I'll call later
78     ham               You will be in the place of that man
80     ham  I call you later, don't have network. If urgnt...
87     ham  Smile in Pleasure Smile in Pain Smile when tro...
88    sp

In [7]:
print(df['label'].value_counts())
print(df['label'].value_counts(normalize=True))

label
ham     4821
spam     746
Name: count, dtype: int64
label
ham     0.865996
spam    0.134004
Name: proportion, dtype: float64


In [8]:
df['label'] = df['label'].map({'ham':0, 'spam':1})
df['body_len'] = df['body_text'].apply(lambda x:len(x)- x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: round(sum([1 for char in x if char in string.punctuation]) / (len(x)- x.count(" ")),3))

df.head()

Unnamed: 0,label,body_text,body_len,punct%
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041
2,0,Even my brother is not like to speak with me. ...,62,0.032
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044


In [9]:
df['split_text'] = df['body_text'].apply(lambda x: x.split())

df['misspelled%'] = df['split_text'].apply(lambda x: round(sum([1 for word in x if word.lower() not in words.words()])/ (len(x)- x.count(" ")),3))
df['count_lower'] = df['split_text'].apply(lambda x: len([word for word in x if word.islower()]))
df['count_upper'] = df['split_text'].apply(lambda x: len([word for word in x if word.isupper()]))

df.head()

Unnamed: 0,label,body_text,body_len,punct%,split_text,misspelled%,count_lower,count_upper
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047,"[Free, entry, in, 2, a, wkly, comp, to, win, F...",0.393,18,2
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041,"[Nah, I, don't, think, he, goes, to, usf,, he,...",0.308,11,1
2,0,Even my brother is not like to speak with me. ...,62,0.032,"[Even, my, brother, is, not, like, to, speak, ...",0.188,14,0
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071,"[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL!!]",0.25,0,8
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044,"[As, per, your, request, 'Melle, Melle, (Oru, ...",0.462,15,0


In [10]:
df.drop('split_text', axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,label,body_text,body_len,punct%,misspelled%,count_lower,count_upper
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047,0.393,18,2
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041,0.308,11,1
2,0,Even my brother is not like to speak with me. ...,62,0.032,0.188,14,0
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071,0.25,0,8
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044,0.462,15,0


In [12]:
stop_words = stopwords.words('english')
ps = nltk.PorterStemmer()

def clean_text(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [ps.stem(word) for word in tokens if word not in stop_words]
    return text

# df['body_text_clean'] = df['body_text'].apply(lambda x: clean_text(x))
# df.drop('body_text_clean',axis=1, inplace=True) 
df.head()

Unnamed: 0,label,body_text,body_len,punct%,misspelled%,count_lower,count_upper
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047,0.393,18,2
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041,0.308,11,1
2,0,Even my brother is not like to speak with me. ...,62,0.032,0.188,14,0
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071,0.25,0,8
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044,0.462,15,0


In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df[['body_text','body_len','punct%','misspelled%','count_lower','count_upper']], df['label'],test_size=0.2)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
count_vect_fit = count_vect.fit(x_train['body_text'])

count_vect_train = count_vect_fit.transform(x_train['body_text'])
count_vect_test = count_vect_fit.transform(x_test['body_text'])

x_train_count_vect = pd.concat([x_train[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(count_vect_train.toarray())], axis=1)

x_test_count_vect = pd.concat([x_test[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(count_vect_test.toarray())], axis=1)

x_train_count_vect.columns =x_train_count_vect.columns.astype(str) 
x_test_count_vect.columns =x_test_count_vect.columns.astype(str)

x_train_count_vect.head()

Unnamed: 0,body_len,punct%,misspelled%,count_lower,count_upper,0,1,2,3,4,...,7289,7290,7291,7292,7293,7294,7295,7296,7297,7298
0,123,0.008,0.28,17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,0.125,0.571,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0.091,0.75,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20,0.0,0.25,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,0.0,0.4,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(x_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(x_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(x_test['body_text'])

x_train_tfidf_vect = pd.concat([x_train[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(tfidf_train.toarray())], axis=1)

x_test_tfidf_vect = pd.concat([x_test[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(tfidf_test.toarray())], axis=1)

x_train_tfidf_vect.columns =x_train_tfidf_vect.columns.astype(str) 
x_test_tfidf_vect.columns =x_test_tfidf_vect.columns.astype(str) 

x_train_tfidf_vect.head()

Unnamed: 0,body_len,punct%,misspelled%,count_lower,count_upper,0,1,2,3,4,...,7289,7290,7291,7292,7293,7294,7295,7296,7297,7298
0,123,0.008,0.28,17,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,0.125,0.571,5,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22,0.091,0.75,2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20,0.0,0.25,3,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20,0.0,0.4,0,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time 

In [17]:
rf = RandomForestClassifier()
param = {'n_estimators': [10,150,300],
        'max_depth': [30,60,90,None]}
gs = GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit = gs.fit(x_train_tfidf_vect, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,5.893821,0.207215,0.179389,0.144674,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.976431,0.978676,0.976431,0.969663,0.982022,0.976645,0.004047,1
3,1.187822,0.170807,0.066586,0.008803,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.977553,0.975309,0.971942,0.975281,0.982022,0.976421,0.003324,2
8,12.027765,0.306665,0.171601,0.02591,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.975309,0.977553,0.976431,0.967416,0.983146,0.975971,0.005056,3
10,5.954963,0.096335,0.091282,0.012121,,150,"{'max_depth': None, 'n_estimators': 150}",0.976431,0.978676,0.975309,0.968539,0.980899,0.975971,0.004183,4
11,9.571881,0.052481,0.148547,0.014323,,300,"{'max_depth': None, 'n_estimators': 300}",0.975309,0.977553,0.976431,0.966292,0.983146,0.975746,0.005442,5


In [18]:
rf = RandomForestClassifier()
param = {'n_estimators': [10,150,300],
        'max_depth': [30,60,90,None]}
gs = GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit = gs.fit(x_train_count_vect, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,5.866759,0.233885,0.27708,0.113428,,150,"{'max_depth': None, 'n_estimators': 150}",0.976431,0.976431,0.977553,0.97191,0.979775,0.97642,0.002564,1
8,12.339653,0.045627,0.187547,0.022811,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.975309,0.976431,0.976431,0.966292,0.982022,0.975297,0.005077,2
7,5.844348,0.102173,0.142058,0.078924,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.976431,0.978676,0.977553,0.964045,0.979775,0.975296,0.005735,3
11,9.849542,0.274159,0.136977,0.008996,,300,"{'max_depth': None, 'n_estimators': 300}",0.975309,0.977553,0.978676,0.965169,0.978652,0.975072,0.005101,4
5,10.809065,0.1475,0.376721,0.044274,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.976431,0.976431,0.976431,0.964045,0.978652,0.974398,0.005247,5


In [19]:
gb = GradientBoostingClassifier()
param = {'n_estimators': [100,150],
        'max_depth': [7,11,15],
        'learning_rate': [0.1]}
gs_tfidf_gb = GridSearchCV(gb,param,cv=5,n_jobs=-1)
gs_tfidf_gb_fit = gs_tfidf_gb.fit(x_train_tfidf_vect, y_train)
pd.DataFrame(gs_tfidf_gb_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,58.606187,0.590729,0.150552,0.042532,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.977553,0.975309,0.978676,0.967416,0.980899,0.97597,0.004642,1
3,86.58859,0.859306,0.151335,0.060867,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.977553,0.969697,0.979798,0.967416,0.98427,0.975747,0.006298,2
0,39.155661,0.353729,0.179858,0.062435,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.977553,0.971942,0.977553,0.967416,0.983146,0.975522,0.005383,3
5,84.170735,1.490483,0.107651,0.017735,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.974186,0.969697,0.978676,0.970787,0.983146,0.975298,0.005019,4
4,68.650592,0.961166,0.164221,0.088106,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.975309,0.968575,0.978676,0.968539,0.98427,0.975074,0.006042,5


In [20]:
gb = GradientBoostingClassifier()
param = {'n_estimators': [100,150],
        'max_depth': [7,11,15],
        'learning_rate': [0.1]}
gs_count_gb = GridSearchCV(gb,param,cv=5,n_jobs=-1)
gs_count_gb_fit = gs_count_gb.fit(x_train_count_vect, y_train)
pd.DataFrame(gs_count_gb_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,55.14032,0.561105,0.197989,0.047792,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.978676,0.976431,0.978676,0.97191,0.98427,0.977992,0.003994,1
0,35.976986,0.451867,0.189931,0.060845,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.976431,0.976431,0.977553,0.968539,0.985393,0.97687,0.005346,2
3,78.326739,0.892715,0.116524,0.017604,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.974186,0.970819,0.98092,0.97191,0.982022,0.975972,0.004633,3
5,78.951026,1.626138,0.121002,0.010289,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.973064,0.971942,0.978676,0.964045,0.983146,0.974174,0.006474,4
4,59.495049,0.882302,0.120115,0.004561,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.973064,0.969697,0.978676,0.964045,0.980899,0.973276,0.006088,5


 ### Base on the data it does seem that the differences are very small with notable things like the mean_fit_time being very long but other than that they probably have the same mean test score. Hence we will pick the one that is the fastest out of all of them which is the randomforestclassifier. But we will compare them against each other in their score function.

In [21]:
rf = RandomForestClassifier(n_estimators=150,max_depth=90,n_jobs=-1)

start_fit = time.time()
rf_model = rf.fit(x_train_tfidf_vect, y_train)
end_fit = time.time()
fit_time = end_fit - start_fit

start_pred = time.time()
rf_pred = rf_model.predict(x_test_tfidf_vect)
end_pred = time.time()
pred_time = end_pred - start_pred

precision, recall, fscore, train_support = score(y_test, rf_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),round((rf_pred==y_test).sum()/len(rf_pred),3)))

Fit time: 0.992 / Predict time: 0.091 ---- Precision: 1.0 / Recall: 0.785 / Accuracy: 0.974


In [22]:
rf = RandomForestClassifier(n_estimators=300,max_depth=None,n_jobs=-1)

start_fit = time.time()
rf_model = rf.fit(x_train_count_vect, y_train)
end_fit = time.time()
fit_time = end_fit - start_fit

start_pred = time.time()
rf_pred = rf_model.predict(x_test_count_vect)
end_pred = time.time()
pred_time = end_pred - start_pred

precision, recall, fscore, train_support = score(y_test, rf_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),round((rf_pred==y_test).sum()/len(rf_pred),3)))

# Precision is the ratio of correctly predicted positive observations to the total predicted positives. 
# Eg. of all the emails that were predicted as spam, how many were actually spam.

# Recall is the ratio of correctly predicted positive observations to the all observations in actual class.
# Eg. of all the emails that were actually spam, how many were predicted as spam.

# F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into account.

Fit time: 1.752 / Predict time: 0.095 ---- Precision: 1.0 / Recall: 0.778 / Accuracy: 0.973


In [23]:
gb = GradientBoostingClassifier(n_estimators=150,max_depth=11,learning_rate=0.1)

start_fit = time.time()
gb_fit = gb.fit(x_train_tfidf_vect, y_train)
end_fit = time.time()
fit_time = end_fit - start_fit

start_pred = time.time()
gb_pred = gb_fit.predict(x_test_tfidf_vect)
end_pred = time.time()
pred_time = end_pred - start_pred

precision, recall, fscore, train_support = score(y_test, gb_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),round((gb_pred==y_test).sum()/len(gb_pred),3)))

# Precision is the ratio of correctly predicted positive observations to the total predicted positives. 
# Eg. of all the emails that were predicted as spam, how many were actually spam.

# Recall is the ratio of correctly predicted positive observations to the all observations in actual class.
# Eg. of all the emails that were actually spam, how many were predicted as spam.

# F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into account.

Fit time: 84.774 / Predict time: 0.107 ---- Precision: 0.991 / Recall: 0.844 / Accuracy: 0.98
