# Random Forest Classifer - Grid Searching and Stats

### Cleaning Data, Creating Feature Data and Vectorizing Data

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("spam.csv", encoding="latin-1")
data = data[['v1', 'v2']][:3000]
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5712,5713,5714,5715,5716,5717,5718,5719,5720,5721
0,92,9.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,25.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,39,15.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Grid Searching

In this section of the code, we are using a TFIDF verctorized data set and exploring the different combinations of some parameters for the Random Forest classifier to see the more effective values

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_feat, data['label'], test_size=0.2)

In [4]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators = n_est, max_depth = depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} ------ Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3), round((y_pred==y_test).sum() / len(y_pred),3)) )

In [5]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ------ Precision: 1.0 / Recall: 0.371 / Accuracy: 0.927
Est: 10 / Depth: 20 ------ Precision: 1.0 / Recall: 0.586 / Accuracy: 0.952
Est: 10 / Depth: 30 ------ Precision: 1.0 / Recall: 0.743 / Accuracy: 0.97
Est: 10 / Depth: None ------ Precision: 0.965 / Recall: 0.786 / Accuracy: 0.972
Est: 50 / Depth: 10 ------ Precision: 1.0 / Recall: 0.443 / Accuracy: 0.935
Est: 50 / Depth: 20 ------ Precision: 1.0 / Recall: 0.614 / Accuracy: 0.955
Est: 50 / Depth: 30 ------ Precision: 1.0 / Recall: 0.743 / Accuracy: 0.97
Est: 50 / Depth: None ------ Precision: 1.0 / Recall: 0.786 / Accuracy: 0.975
Est: 100 / Depth: 10 ------ Precision: 1.0 / Recall: 0.429 / Accuracy: 0.933
Est: 100 / Depth: 20 ------ Precision: 1.0 / Recall: 0.671 / Accuracy: 0.962
Est: 100 / Depth: 30 ------ Precision: 1.0 / Recall: 0.714 / Accuracy: 0.967
Est: 100 / Depth: None ------ Precision: 0.983 / Recall: 0.814 / Accuracy: 0.977


## GridSearchCV 

In this section of the code, we will test two different things. First we will compare Count vectorizig and TFIDF vectorizing and second we are testing the more effective parameter value combinations

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

##### Parameter Combinations

In [7]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)

##### GridSearchCV with TFIDF

In [8]:
gs_fit = gs.fit(X_tfidf_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,17.412857,0.978151,0.251568,0.03574,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.981667,0.971667,0.96,0.975,0.968333,0.971333,0.00718,1
10,20.90679,1.349503,0.234422,0.025334,,150,"{'max_depth': None, 'n_estimators': 150}",0.976667,0.97,0.963333,0.976667,0.968333,0.971,0.005121,2
8,39.182673,2.315365,0.385707,0.076753,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978333,0.966667,0.96,0.978333,0.97,0.970667,0.00704,3
11,36.936411,1.779641,0.321164,0.031418,,300,"{'max_depth': None, 'n_estimators': 300}",0.976667,0.968333,0.96,0.976667,0.971667,0.970667,0.0062,3
7,20.454104,1.841162,0.22547,0.007641,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.98,0.968333,0.956667,0.975,0.971667,0.970333,0.007846,5


##### GridSearchCV with Count Vectorizer

In [9]:
gs_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,41.423633,3.848613,0.352749,0.015342,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978333,0.97,0.961667,0.98,0.968333,0.971667,0.006749,1
10,21.275763,1.368233,0.249182,0.021009,,150,"{'max_depth': None, 'n_estimators': 150}",0.978333,0.97,0.955,0.981667,0.97,0.971,0.009226,2
5,37.871765,3.899631,0.362415,0.015303,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.98,0.971667,0.956667,0.978333,0.966667,0.970667,0.008472,3
7,21.309292,2.000412,0.273512,0.035558,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.978333,0.971667,0.956667,0.976667,0.97,0.970667,0.007645,3
11,37.857802,3.343498,0.314159,0.03441,,300,"{'max_depth': None, 'n_estimators': 300}",0.98,0.971667,0.953333,0.978333,0.97,0.970667,0.009463,3


## Best Random Forest Case

In this section of the code, we are using the best parameter combination determined from the sections above and getting the fit time and pred time which will be later used to compare with different classifers.

#### Splitting Training and Testing Data Set

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

#### Vectorizing Training and Testing Data Sets

In [11]:
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5019,5020,5021,5022,5023,5024,5025,5026,5027,5028
0,41,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,126,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,134,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Evaluating the model

In [12]:
import time
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

#Getting Fit time
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

#Getting Prediction time
start =time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 2.416 / Predict time: 0.35 ---- Precision: 0.986 / Recall: 0.85 / Accuracy: 0.978
