# Random Forest Model for NLP

### We will use a basic Random Forest model to analyze our SMS Spam Collection dataset


### Read in Raw Text, and perform all preprocessing steps we did before (cleaning the text, feature engineering and text vectorizing steps)

In [3]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import os
base_path = 'datasets'
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv(os.path.join(base_path,"SMSSpamCollection.tsv"), sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Using K-Fold cross validation method

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# n_jobs = -1 - means we can run everything in parallel
rf = RandomForestClassifier(n_jobs=-1)
# Split the training set to 5 subsets
k_fold = KFold(n_splits=5)
cross_val_score(rf,X=X_features,y=data['label'],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.97486535, 0.98114901, 0.97484277, 0.96675651, 0.97304582])

### Using Holdout Test Set method

In [5]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# Split to train and test sets
x_train,x_test,y_train,y_test = train_test_split(X_features,data['label'],test_size=0.2)

In [6]:
rf2 = RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
rf2_model = rf2.fit(X=x_train,y=y_train)



In [9]:
# Print the most important features
# Body Length is the most important feature
# The other column names are numbers due to how the vecrotization process occurres (assign a number to each column)
sorted(zip(rf2_model.feature_importances_,x_train.columns),reverse=True)[0:10]

[(0.06759716769808131, 'body_len'),
 (0.05236945935131036, 7350),
 (0.0430298966460655, 1803),
 (0.03135372877876165, 4796),
 (0.02353401544537904, 3134),
 (0.02129444375633566, 2031),
 (0.02019702873181215, 6746),
 (0.019996640655553843, 7027),
 (0.019344240311034152, 5724),
 (0.01845058202116578, 690)]

In [14]:
# Predict on every x_test
y_pred = rf2_model.predict(x_test)
# Getting the evaluation metrics with "spam" as the important label
precision,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
precision_r = round(precision,3)
recall_r = round(recall,3)
accuracy = round((y_pred==y_test).sum()/len(y_pred),3)
print(f"Precision: {precision_r} / Recall: {recall_r} / Accuracy: {accuracy}")
# From the results:
# 100% of not-spam were correctly identified as not-span - great!
# 57% of spam were correctly identified as spam - not great since 43% of spam went into the inbox
# 94% of all emails went into our email, were correctly identified as spam/not-spam - great!

Precision: 1.0 / Recall: 0.577 / Accuracy: 0.946




### Grid Search
Defining a grid of hyperparameters and changing the hyperparameters of model by the grid to find the best combination 

In [16]:
# Perform the same training process as before
def train_rf(n_est,depth):
    rf = RandomForestClassifier(n_estimators=n_est,n_jobs=-1)
    rf_model = rf.fit(x_train,y_train)
    y_pred = rf_model.predict(x_test)
    precision,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
    precision_r = round(precision,3)
    recall_r = round(recall,3)
    accuracy = round((y_pred==y_test).sum()/len(y_pred),3)
    print(f"Est: {n_est}, Depth {depth} ----- Precision: {precision_r} / Recall: {recall_r} / Accuracy: {accuracy}")

In [19]:
# Supress sklearn warnings (something about the column names not string)
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# Testing the training process for different combinations
for n_est in [10,50,100]:
    for depth in [10,20,30,None]:
        train_rf(n_est,depth)
# It's hard to tell which model had the best results, but I would choose Est:100 and Depth: None (no depth limit)

Est: 10, Depth 10 ----- Precision: 0.981 / Recall: 0.746 / Accuracy: 0.966
Est: 10, Depth 20 ----- Precision: 1.0 / Recall: 0.803 / Accuracy: 0.975
Est: 10, Depth 30 ----- Precision: 0.991 / Recall: 0.761 / Accuracy: 0.969
Est: 10, Depth None ----- Precision: 1.0 / Recall: 0.768 / Accuracy: 0.97
Est: 50, Depth 10 ----- Precision: 1.0 / Recall: 0.831 / Accuracy: 0.978
Est: 50, Depth 20 ----- Precision: 1.0 / Recall: 0.817 / Accuracy: 0.977
Est: 50, Depth 30 ----- Precision: 0.992 / Recall: 0.824 / Accuracy: 0.977
Est: 50, Depth None ----- Precision: 0.992 / Recall: 0.831 / Accuracy: 0.978
Est: 100, Depth 10 ----- Precision: 1.0 / Recall: 0.817 / Accuracy: 0.977
Est: 100, Depth 20 ----- Precision: 0.991 / Recall: 0.81 / Accuracy: 0.975
Est: 100, Depth 30 ----- Precision: 1.0 / Recall: 0.803 / Accuracy: 0.975
Est: 100, Depth None ----- Precision: 1.0 / Recall: 0.81 / Accuracy: 0.976


### Evaluating Grid Search with GridSearchCV

In [22]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
params = {
    'n_estimators':[10,150,300],
    'max_depth':[30,60,90,None]
}

gs = GridSearchCV(rf,params,cv=5,n_jobs=-1)
gs_fit = gs.fit(X_tfidf,data['label'])
# Focus on the mean_test_score
results = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,17.517458,2.106193,0.101224,0.006583,,150,"{'max_depth': None, 'n_estimators': 150}",0.983842,0.977558,0.973944,0.964061,0.967655,0.973412,0.007026,1
8,32.199205,1.10982,0.186668,0.011874,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977558,0.974865,0.973944,0.967655,0.969452,0.972695,0.003629,2
7,15.365716,0.625012,0.118424,0.014606,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.975763,0.977558,0.975741,0.964061,0.965858,0.971796,0.00565,3
11,27.392953,0.435013,0.163121,0.023004,,300,"{'max_depth': None, 'n_estimators': 300}",0.975763,0.97307,0.973046,0.96496,0.969452,0.971258,0.003734,4
3,0.711262,0.084473,0.006241,0.000714,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.978456,0.970377,0.968553,0.966757,0.964061,0.969641,0.004876,5
