In [2]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split(r'\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text

 

vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data['Content'])


all_data = pd.concat([pd.DataFrame(vect_final.toarray()), data['Content_len'], data['punctuation_rate']], axis=1)

In [3]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [4]:
all_data.columns = all_data.columns.astype(str)
X_train, X_test, Y_train, Y_test = train_test_split(all_data, data['label'], test_size=0.2)

In [5]:
from sklearn.ensemble import RandomForestClassifier

alg_RandomForest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
model = alg_RandomForest.fit(X_train, Y_train)

In [31]:
predictions = model.predict(X_test)

Evaluating Model Performance

In [33]:
precision, recall, fscore, _ = score(Y_test, predictions, pos_label='spam', average='binary')
print('Precision: {} \n Recall: {} \n Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((predictions==Y_test).sum() / len(predictions),3)))

Precision: 1.0 
 Recall: 0.626 
 Accuracy: 0.951


In [34]:
len(predictions[predictions=='spam'])

92

In [35]:
len(Y_test[Y_test=='spam'])

147

In [36]:
len(predictions)

1115

In [29]:
pd.DataFrame({"Label": data['label'][:1115], "Email Predictions": predictions})[0:51]

Unnamed: 0,Label,Email Predictions
0,ham,ham
1,ham,ham
2,spam,ham
3,ham,ham
4,ham,ham
5,spam,ham
6,ham,ham
7,ham,ham
8,spam,spam
9,spam,ham
