In [32]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import string
import re 
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words

In [19]:
df = pd.read_csv('SMSSpamCollection.tsv', sep='\t')
df.columns = ['label', 'body_text']

df['label'] = df['label'].map({'ham':0, 'spam':1})
df['body_len'] = df['body_text'].apply(lambda x:len(x)- x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x:
round(
    sum(
        [1 for char in x if char in string.punctuation]) / (len(x)- x.count(" ")
    )
,3)
)

df.head()

Unnamed: 0,label,body_text,body_len,punct%
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041
2,0,Even my brother is not like to speak with me. ...,62,0.032
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044


In [20]:
df['split_text'] = df['body_text'].apply(lambda x: x.split())

df['misspelled%'] = df['split_text'].apply(lambda x: round(sum([1 for word in x if word.lower() not in words.words()])/ (len(x)- x.count(" ")),3))
df['count_lower'] = df['split_text'].apply(lambda x: len([word for word in x if word.islower()]))
df['count_upper'] = df['split_text'].apply(lambda x: len([word for word in x if word.isupper()]))

df.head()

Unnamed: 0,label,body_text,body_len,punct%,split_text,misspelled%,count_lower,count_upper
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047,"[Free, entry, in, 2, a, wkly, comp, to, win, F...",0.393,18,2
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041,"[Nah, I, don't, think, he, goes, to, usf,, he,...",0.308,11,1
2,0,Even my brother is not like to speak with me. ...,62,0.032,"[Even, my, brother, is, not, like, to, speak, ...",0.188,14,0
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071,"[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL!!]",0.25,0,8
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044,"[As, per, your, request, 'Melle, Melle, (Oru, ...",0.462,15,0


In [23]:
df.drop('split_text', axis=1, inplace=True)

KeyError: "['split_text'] not found in axis"

In [24]:
df.head()

Unnamed: 0,label,body_text,body_len,punct%,misspelled%,count_lower,count_upper
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047,0.393,18,2
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041,0.308,11,1
2,0,Even my brother is not like to speak with me. ...,62,0.032,0.188,14,0
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071,0.25,0,8
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044,0.462,15,0


In [37]:
stop_words = stopwords.words('english')
ps = nltk.PorterStemmer()

def clean_text(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [ps.stem(word) for word in tokens if word not in stop_words]
    return text

# df['body_text_clean'] = df['body_text'].apply(lambda x: clean_text(x))
# df.drop('body_text_clean',axis=1, inplace=True) 
df.head()

Unnamed: 0,label,body_text,body_len,punct%,misspelled%,count_lower,count_upper
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,0.047,0.393,18,2
1,0,"Nah I don't think he goes to usf, he lives aro...",49,0.041,0.308,11,1
2,0,Even my brother is not like to speak with me. ...,62,0.032,0.188,14,0
3,0,I HAVE A DATE ON SUNDAY WITH WILL!!,28,0.071,0.25,0,8
4,0,As per your request 'Melle Melle (Oru Minnamin...,135,0.044,0.462,15,0


In [41]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df[['body_text','body_len','punct%','misspelled%','count_lower','count_upper']], df['label'],test_size=0.2)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(x_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(x_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(x_test['body_text'])

x_train_tfidf_vect = pd.concat([x_train[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(tfidf_train.toarray())], axis=1)

x_test_tfidf_vect = pd.concat([x_test[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(tfidf_test.toarray())], axis=1)

x_train_tfidf_vect.columns =x_train_tfidf_vect.columns.astype(str) 
x_test_tfidf_vect.columns =x_test_tfidf_vect.columns.astype(str) 

x_train_tfidf_vect.head()

Unnamed: 0,body_len,punct%,misspelled%,count_lower,count_upper,0,1,2,3,4,...,7146,7147,7148,7149,7150,7151,7152,7153,7154,7155
0,19,0.316,0.667,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,54,0.056,0.125,14,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,41,0.073,0.273,10,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,89,0.09,0.375,15,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,123,0.057,0.333,19,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
count_vect_fit = count_vect.fit(x_train['body_text'])

count_vect_train = count_vect_fit.transform(x_train['body_text'])
count_vect_test = count_vect_fit.transform(x_test['body_text'])

x_train_count_vect = pd.concat([x_train[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(count_vect_train.toarray())], axis=1)

x_test_count_vect = pd.concat([x_test[['body_len','punct%','misspelled%','count_lower','count_upper']].reset_index(drop=True)
            ,pd.DataFrame(count_vect_test.toarray())], axis=1)

x_train_count_vect.columns =x_train_count_vect.columns.astype(str) 
x_test_count_vect.columns =x_test_count_vect.columns.astype(str)

x_train_count_vect.head()

Unnamed: 0,body_len,punct%,misspelled%,count_lower,count_upper,0,1,2,3,4,...,7146,7147,7148,7149,7150,7151,7152,7153,7154,7155
0,19,0.316,0.667,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,54,0.056,0.125,14,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,41,0.073,0.273,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,89,0.09,0.375,15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,123,0.057,0.333,19,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time 

In [45]:
rf = RandomForestClassifier()
param = {'n_estimators': [10,150,300],
        'max_depth': [30,60,90,None]}
gs = GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit = gs.fit(x_train_tfidf_vect, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,5.532954,0.12466,0.291926,0.357877,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.971942,0.98541,0.973064,0.979775,0.974157,0.97687,0.00505,1
11,10.040416,0.16215,0.159551,0.036358,,300,"{'max_depth': None, 'n_estimators': 300}",0.970819,0.98541,0.970819,0.976404,0.976404,0.975971,0.005339,2
5,9.890446,0.238805,0.283517,0.078698,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.973064,0.983165,0.970819,0.979775,0.97191,0.975747,0.004847,3
10,6.377703,0.10567,0.1121,0.016485,,150,"{'max_depth': None, 'n_estimators': 150}",0.970819,0.982043,0.971942,0.980899,0.970787,0.975298,0.00507,4
8,12.944284,0.366972,0.160782,0.014997,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.969697,0.983165,0.971942,0.979775,0.970787,0.975073,0.005379,5


In [46]:
rf = RandomForestClassifier()
param = {'n_estimators': [10,150,300],
        'max_depth': [30,60,90,None]}
gs = GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit = gs.fit(x_train_count_vect, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,9.695344,0.162681,0.172313,0.027221,,300,"{'max_depth': None, 'n_estimators': 300}",0.971942,0.984287,0.970819,0.979775,0.974157,0.976196,0.005088,1
8,12.500582,0.082253,0.244724,0.027192,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.969697,0.983165,0.971942,0.979775,0.97191,0.975298,0.005216,2
10,6.150525,0.159996,0.138761,0.03256,,150,"{'max_depth': None, 'n_estimators': 150}",0.969697,0.984287,0.970819,0.979775,0.97191,0.975298,0.005723,3
5,11.057484,0.126267,0.46688,0.0182,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.970819,0.982043,0.969697,0.979775,0.973034,0.975074,0.004936,4
7,5.749557,0.049706,0.165575,0.085135,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.969697,0.982043,0.970819,0.977528,0.974157,0.974849,0.004521,5


In [47]:
gb = GradientBoostingClassifier()
param = {'n_estimators': [100,150],
        'max_depth': [7,11,15],
        'learning_rate': [0.1]}
gs_tfidf_gb = GridSearchCV(gb,param,cv=5,n_jobs=-1)
gs_tfidf_gb_fit = gs_tfidf_gb.fit(x_train_tfidf_vect, y_train)
pd.DataFrame(gs_tfidf_gb_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,84.589173,0.900306,0.14444,0.024969,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.976431,0.984287,0.96633,0.982022,0.975281,0.97687,0.006251,1
2,58.334555,1.072242,0.139996,0.025437,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.977553,0.982043,0.969697,0.982022,0.970787,0.97642,0.005315,2
5,79.270134,1.181677,0.11161,0.021344,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.975309,0.98092,0.968575,0.98427,0.97191,0.976197,0.00574,3
0,37.876659,0.261181,0.298045,0.064644,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.975309,0.982043,0.967452,0.980899,0.973034,0.975747,0.005339,4
1,58.249934,0.680959,0.183177,0.04866,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.974186,0.982043,0.969697,0.979775,0.973034,0.975747,0.004522,5


In [48]:
gb = GradientBoostingClassifier()
param = {'n_estimators': [100,150],
        'max_depth': [7,11,15],
        'learning_rate': [0.1]}
gs_count_gb = GridSearchCV(gb,param,cv=5,n_jobs=-1)
gs_count_gb_fit = gs_count_gb.fit(x_train_count_vect, y_train)
pd.DataFrame(gs_count_gb_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,74.456795,0.684483,0.12062,0.018651,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.978676,0.983165,0.968575,0.983146,0.97191,0.977094,0.005923,1
5,73.105423,0.785539,0.10726,0.009142,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.978676,0.98092,0.965208,0.98427,0.968539,0.975523,0.007359,2
0,34.978914,0.223799,0.242415,0.065605,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.975309,0.982043,0.965208,0.980899,0.973034,0.975298,0.006063,3
1,52.524562,0.352642,0.163598,0.054052,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.973064,0.982043,0.967452,0.979775,0.973034,0.975074,0.005234,4
4,55.302371,1.188716,0.120369,0.012217,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.976431,0.978676,0.965208,0.983146,0.969663,0.974625,0.006412,5


 ### Base on the data it does seem that the differences are very small with notable things like the mean_fit_time being very long but other than that they probably have the same mean test score. Hence we will pick the one that is the fastest out of all of them which is the randomforestclassifier. But we will compare them against each other in their score function.

In [49]:
rf = RandomForestClassifier(n_estimators=150,max_depth=90,n_jobs=-1)

start_fit = time.time()
rf_model = rf.fit(x_train_tfidf_vect, y_train)
end_fit = time.time()
fit_time = end_fit - start_fit

start_pred = time.time()
rf_pred = rf_model.predict(x_test_tfidf_vect)
end_pred = time.time()
pred_time = end_pred - start_pred

precision, recall, fscore, train_support = score(y_test, rf_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),round((rf_pred==y_test).sum()/len(rf_pred),3)))

Fit time: 1.094 / Predict time: 0.082 ---- Precision: 1.0 / Recall: 0.813 / Accuracy: 0.975


In [52]:
rf = RandomForestClassifier(n_estimators=300,max_depth=None,n_jobs=-1)

start_fit = time.time()
rf_model = rf.fit(x_train_count_vect, y_train)
end_fit = time.time()
fit_time = end_fit - start_fit

start_pred = time.time()
rf_pred = rf_model.predict(x_test_count_vect)
end_pred = time.time()
pred_time = end_pred - start_pred

precision, recall, fscore, train_support = score(y_test, rf_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),round((rf_pred==y_test).sum()/len(rf_pred),3)))

# Precision is the ratio of correctly predicted positive observations to the total predicted positives. 
# Eg. of all the emails that were predicted as spam, how many were actually spam.

# Recall is the ratio of correctly predicted positive observations to the all observations in actual class.
# Eg. of all the emails that were actually spam, how many were predicted as spam.

# F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into account.

Fit time: 1.808 / Predict time: 0.103 ---- Precision: 1.0 / Recall: 0.807 / Accuracy: 0.974


In [51]:
gb = GradientBoostingClassifier(n_estimators=150,max_depth=11,learning_rate=0.1)

start_fit = time.time()
gb_fit = gb.fit(x_train_tfidf_vect, y_train)
end_fit = time.time()
fit_time = end_fit - start_fit

start_pred = time.time()
gb_pred = gb_fit.predict(x_test_tfidf_vect)
end_pred = time.time()
pred_time = end_pred - start_pred

precision, recall, fscore, train_support = score(y_test, gb_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),round((gb_pred==y_test).sum()/len(gb_pred),3)))

# Precision is the ratio of correctly predicted positive observations to the total predicted positives. 
# Eg. of all the emails that were predicted as spam, how many were actually spam.

# Recall is the ratio of correctly predicted positive observations to the all observations in actual class.
# Eg. of all the emails that were actually spam, how many were predicted as spam.

# F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into account.

Fit time: 82.43 / Predict time: 0.118 ---- Precision: 0.969 / Recall: 0.827 / Accuracy: 0.973
