In [10]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("../../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split('\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text

 

vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data['Content'])


all_data = pd.concat([pd.DataFrame(vect_final.toarray()), data['Content_len'], data['punctuation_rate']], axis=1)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(all_data, data['label'], test_size=0.2)

In [13]:
def random_forest_hyper_params(n_decision_tree, depth):
    alg_RandomForest = RandomForestClassifier(n_estimators=n_decision_tree, max_depth=depth, n_jobs=-1)
    model = alg_RandomForest.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    precision, recall, fscore, support = score(Y_test, predictions, pos_label='spam', average='binary')
    print('nb decision tree: {} / Depth: {} | Precision: {} / Recall: {}'.format(
        n_decision_tree, depth, round(precision, 3), round(recall, 3)))

In [14]:
for ith_decision_tree in [5, 20, 50,100]:
    for depth in [10, 20, 30]:
        random_forest_hyper_params(ith_decision_tree, depth)

nb decision tree: 5 / Depth: 10 | Precision: 1.0 / Recall: 0.336
nb decision tree: 5 / Depth: 20 | Precision: 0.988 / Recall: 0.611
nb decision tree: 5 / Depth: 30 | Precision: 1.0 / Recall: 0.649
nb decision tree: 20 / Depth: 10 | Precision: 1.0 / Recall: 0.298
nb decision tree: 20 / Depth: 20 | Precision: 1.0 / Recall: 0.55
nb decision tree: 20 / Depth: 30 | Precision: 1.0 / Recall: 0.679
nb decision tree: 50 / Depth: 10 | Precision: 1.0 / Recall: 0.26
nb decision tree: 50 / Depth: 20 | Precision: 1.0 / Recall: 0.58
nb decision tree: 50 / Depth: 30 | Precision: 1.0 / Recall: 0.702
nb decision tree: 100 / Depth: 10 | Precision: 1.0 / Recall: 0.275
nb decision tree: 100 / Depth: 20 | Precision: 1.0 / Recall: 0.626
nb decision tree: 100 / Depth: 30 | Precision: 1.0 / Recall: 0.702
