# Model Selection

### Read and Clean the text

In [6]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import os
base_path = "datasets"

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv(os.path.join(base_path,"SMSSpamCollection.tsv"), sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

### Vectorize the Data

In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len','punct%']].reset_index(drop=True),pd.DataFrame(tfidf_train.toarray())],axis=1)
X_test_vect = pd.concat([X_test[['body_len','punct%']].reset_index(drop=True),pd.DataFrame(tfidf_test.toarray())],axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7204,7205,7206,7207,7208,7209,7210,7211,7212,7213
0,38,5.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,67,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,38,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Random Forest Results

In [9]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = round(end - start,3)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = round(end - start,3)

precision,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
precision_r = round(precision,3)
recall_r = round(recall,3)
accuracy = round((y_pred==y_test).sum()/len(y_pred),3)
print(f"Fit Time: {fit_time} / Predict Time: {pred_time} / Precision: {precision_r} / Recall: {recall_r} / Accuracy: {accuracy}")

Fit Time: 2.395 / Predict Time: 0.141 / Precision: 1.0 / Recall: 0.873 / Accuracy: 0.982


### Gradient Boosting Results

In [10]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
rf_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = round(end - start,3)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = round(end - start,3)

precision,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
precision_r = round(precision,3)
recall_r = round(recall,3)
accuracy = round((y_pred==y_test).sum()/len(y_pred),3)
print(f"Fit Time: {fit_time} / Predict Time: {pred_time} / Precision: {precision_r} / Recall: {recall_r} / Accuracy: {accuracy}")

Fit Time: 148.159 / Predict Time: 0.141 / Precision: 0.908 / Recall: 0.88 / Accuracy: 0.97


### Comparing the models:

* Fit Time - Since most time we will fit the model few times and then don't do that again, the fit time is not that critical for these models
* Predict Time - Predict time is more critical since it can create a bottlneck in out Machine Learning Pipeline. Both models have the same results so it doens't matter which one we'll choose
* Precision - Precision is the ability to classify not-spam correctly. We don't want our model to classify real emails as spam so we want high precision. Point for RF
* Recall - Recall is the ability to classify spam correctly. We want this value to be high, but it is ok to allow spam to enter our inbox from time to time. Both results are pretty close so it doesn't matter which one we'll choose
* Accuracy - Accuracy is the ability to classify mails correctly out of all mails we get. Point for RF

### Overall, RF has better prediction both in time and results, so this will be the best model for us.