In [1]:
import nltk
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import string


data = pd.read_csv("../Data/SMSSpamCollection.txt", sep="\t", header=None)
data.columns = ["label", "Content"]

en_stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()

data["Content_len"] = data["Content"].apply(lambda x: len(x) - x.count(" "))


def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation]
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation / (total), 4) * 100


data["punctuation_rate"] = data["Content"].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split(r"\W+", result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text


vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data["Content"])


all_data = pd.concat(
    [pd.DataFrame(vect_final.toarray()), data["Content_len"], data["punctuation_rate"]],
    axis=1,
)

In [2]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
alg_RandomForest = RandomForestClassifier()
params = {"n_estimators": [5, 20, 50, 100], "max_depth": [10, 20, 30]}

all_data.columns = all_data.columns.astype(str)

hyper_params_grid = GridSearchCV(alg_RandomForest, params, cv=4, n_jobs=-1)
hyper_params_models = hyper_params_grid.fit(all_data, data["label"])

In [4]:
pd.DataFrame(hyper_params_models.cv_results_).sort_values(
    "mean_test_score", ascending=False
)[0:6]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
11,7.364491,0.160779,0.164154,0.012585,30,100,"{'max_depth': 30, 'n_estimators': 100}",0.969849,0.961953,0.959081,0.95621,0.961773,0.005086,1
10,4.718936,0.074571,0.248333,0.02079,30,50,"{'max_depth': 30, 'n_estimators': 50}",0.96267,0.961953,0.957645,0.95621,0.95962,0.002751,2
9,2.627815,0.164234,0.252761,0.057799,30,20,"{'max_depth': 30, 'n_estimators': 20}",0.96267,0.956927,0.957645,0.959799,0.959261,0.002234,3
8,1.406085,0.052453,0.226107,0.020356,30,5,"{'max_depth': 30, 'n_estimators': 5}",0.963388,0.946159,0.949749,0.946877,0.951543,0.006969,4
7,6.20316,0.15761,0.256131,0.01906,20,100,"{'max_depth': 20, 'n_estimators': 100}",0.949031,0.945441,0.946877,0.946877,0.947057,0.001282,5
6,3.608375,0.025722,0.22183,0.008811,20,50,"{'max_depth': 20, 'n_estimators': 50}",0.947595,0.949749,0.945441,0.941852,0.946159,0.002916,6
