In [5]:
import nltk
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import string


data = pd.read_csv("SMSSpamCollection.txt", sep="\t", header=None)
data.columns = ['label', 'content']

en_stopwords = nltk.corpus.stopwords.words('english')

data['content_len'] = data['content'].apply(lambda x:len(x)- x.count(" "))


def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['content'].apply(lambda x:count_punctuation(x))

def clean_mail(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split('\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(data[['content', 'content_len', 'punctuation_rate']], data['label'], test_size=0.2)

In [11]:
ps = nltk.PorterStemmer()
vectorisation = TfidfVectorizer(analyzer=clean_mail)
vectorisation_model = vectorisation.fit(X_train['content'])

vect_train = vectorisation_model.transform(X_train['content'])
vect_test = vectorisation_model.transform(X_test['content'])

final_train_vect = pd.concat([pd.DataFrame(vect_train.toarray()), X_train[['content_len', 'punctuation_rate']].reset_index(drop=True)], axis=1)
final_test_vect = pd.concat([pd.DataFrame(vect_test.toarray()), X_test[['content_len', 'punctuation_rate']].reset_index(drop=True)], axis=1)

final_train_vect

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7241,7242,7243,7244,7245,7246,7247,7248,content_len,punctuation_rate
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,21,9.52
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,6,0.00
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,36,2.78
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,19,21.05
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,28,10.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,25,4.00
4453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,68,1.47
4454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,60,8.33
4455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,37,2.70


In [12]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import precision_recall_fscore_support as score
 

In [13]:
alg_RandomForest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
model = alg_RandomForest.fit(final_train_vect, Y_train)

In [14]:
predictions = model.predict(final_test_vect)
precision, recall, fscore, support = score(Y_test, predictions, pos_label='spam', average='binary')

In [15]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((predictions==Y_test).sum() / len(predictions),3)))

Precision: 1.0 / Recall: 0.594 / Accuracy: 0.943
