In [2]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split(r'\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text
 

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(data[['Content', 'Content_len', 'punctuation_rate']], data['label'], test_size=0.2)

In [5]:
vectorisation = TfidfVectorizer(analyzer=clean_email)
vectorisation_model = vectorisation.fit(X_train['Content'])

vect_train = vectorisation_model.transform(X_train['Content'])
vect_test = vectorisation_model.transform(X_test['Content'])

final_train_vect = pd.concat([pd.DataFrame(vect_train.toarray()), X_train[['Content_len', 'punctuation_rate']].reset_index(drop=True)], axis=1)
final_test_vect = pd.concat([pd.DataFrame(vect_test.toarray()), X_test[['Content_len', 'punctuation_rate']].reset_index(drop=True)], axis=1)

final_train_vect

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7303,7304,7305,7306,7307,7308,7309,7310,Content_len,punctuation_rate
0,0.113699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,90,1.11
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,51,5.88
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,21,0.00
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,23,4.35
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,116,9.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,48,6.25
4453,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,25,8.00
4454,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,46,6.52
4455,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.292912,0.0,0.0,36,16.67


In [6]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import precision_recall_fscore_support as score

In [8]:
alg_RandomForest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
final_train_vect.columns = final_train_vect.columns.astype(str)
model = alg_RandomForest.fit(final_train_vect, Y_train)

In [10]:
final_test_vect.columns = final_test_vect.columns.astype(str)
predictions = model.predict(final_test_vect)
precision, recall, fscore, support = score(Y_test, predictions, pos_label='spam', average='binary')

In [11]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((predictions==Y_test).sum() / len(predictions),3)))

Precision: 1.0 / Recall: 0.667 / Accuracy: 0.955
