In [5]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("../../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split('\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text

 

vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data['Content'])


all_data = pd.concat([pd.DataFrame(vect_final.toarray()), data['Content_len'], data['punctuation_rate']], axis=1)

In [6]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [9]:
alg_RandomForest = RandomForestClassifier()
params = {'n_estimators': [5, 20, 50,100],
        'max_depth': [10, 20, 30]}

hyper_params_grid = GridSearchCV(alg_RandomForest, params, cv=4, n_jobs=-1)
hyper_params_models = hyper_params_grid.fit(all_data, data['label'])


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
10,2.744531,0.128249,0.160172,0.017032,30,50,"{'max_depth': 30, 'n_estimators': 50}",0.965542,0.96267,0.959081,0.956927,0.961055,0.003304,1
9,1.580662,0.036187,0.143929,0.005245,30,20,"{'max_depth': 30, 'n_estimators': 20}",0.958363,0.965542,0.959081,0.95621,0.959799,0.00348,2
11,4.702921,0.039317,0.175772,0.006771,30,100,"{'max_depth': 30, 'n_estimators': 100}",0.966978,0.959081,0.95621,0.95621,0.95962,0.004407,3
8,0.857629,0.033958,0.126257,0.005226,30,5,"{'max_depth': 30, 'n_estimators': 5}",0.950467,0.955492,0.953338,0.946877,0.951543,0.00323,4
7,3.71778,0.115462,0.162829,0.006797,20,100,"{'max_depth': 20, 'n_estimators': 100}",0.951902,0.944724,0.947595,0.946159,0.947595,0.002686,5


In [10]:
pd.DataFrame(hyper_params_models.cv_results_).sort_values('mean_test_score', ascending=False)[0:6]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
10,2.744531,0.128249,0.160172,0.017032,30,50,"{'max_depth': 30, 'n_estimators': 50}",0.965542,0.96267,0.959081,0.956927,0.961055,0.003304,1
9,1.580662,0.036187,0.143929,0.005245,30,20,"{'max_depth': 30, 'n_estimators': 20}",0.958363,0.965542,0.959081,0.95621,0.959799,0.00348,2
11,4.702921,0.039317,0.175772,0.006771,30,100,"{'max_depth': 30, 'n_estimators': 100}",0.966978,0.959081,0.95621,0.95621,0.95962,0.004407,3
8,0.857629,0.033958,0.126257,0.005226,30,5,"{'max_depth': 30, 'n_estimators': 5}",0.950467,0.955492,0.953338,0.946877,0.951543,0.00323,4
7,3.71778,0.115462,0.162829,0.006797,20,100,"{'max_depth': 20, 'n_estimators': 100}",0.951902,0.944724,0.947595,0.946159,0.947595,0.002686,5
6,2.491084,0.289365,0.19346,0.039023,20,50,"{'max_depth': 20, 'n_estimators': 50}",0.954056,0.948313,0.944006,0.94257,0.947236,0.004469,6
