In [2]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split(r'\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text

 

vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data['Content'])


all_data = pd.concat([pd.DataFrame(vect_final.toarray()), data['Content_len'], data['punctuation_rate']], axis=1)

In [6]:
from sklearn.model_selection import KFold, cross_val_score

all_data.columns = all_data.columns.astype(str)

In [7]:
from sklearn.ensemble import RandomForestClassifier

alg_RandomForest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)

K_fold = KFold(n_splits=4)
cross_val_score(alg_RandomForest, all_data, data['label'], cv=K_fold, scoring='accuracy', n_jobs=-1)

array([0.94113424, 0.94687724, 0.94544149, 0.94472362])