## EL-GY-9133 Machine Learning for Cyber-Security
### Lab 1: E-mail Spam Filtering
##### Release Date: 02/22/2018; Due Date: Midnight, 03/07/2018

Student Name: Manuel Serrano Rebuelta

Student Netid: msr542

In [263]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import glob
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import svm

In [254]:
### Load all emails and labels in a data frame ###

data_train = pd.DataFrame(columns=['X', 'y'])
data_test = pd.DataFrame(columns=['X', 'y'])
count_train = 0
count_test = 0

#Iterate over the 9 folds
for i in range(1, 11):
    #Load spam emails
    for spam_path in glob.glob('lingspam_public/lemm_stop/part'+str(i)+'/spm*.txt'):
        with open(spam_path, 'r') as spam_file:
            opened_spam_file = spam_file.read().replace('\n', '')
            if(i==10):
                data_test.loc[count_test] = [opened_spam_file, 1]
                count_test += 1
            else:
                data_train.loc[count_train] = [opened_spam_file, 1]
                count_train += 1
            
    #Load legitimate emails
    for legit_path in glob.glob('lingspam_public/lemm_stop/part'+str(i)+'/[!spm]*.txt'):
        with open(legit_path, 'r') as legit_file:
            opened_legit_file = legit_file.read().replace('\n', '')
            if(i==10):
                data_test.loc[count_test] = [opened_legit_file, 0]
                count_test += 1
            else:
                data_train.loc[count_train] = [opened_legit_file, 0]
                count_train += 1

In [255]:
### Manually create our document-term matrix (binary, since we are not counting terms)
### Then, we calculate information gain of all terms within the dataset
### Once we have calculated all, we could select the top-N ones, and discard the rest features (feature selection)

CV = CountVectorizer(binary=True)
X_vectorized = CV.fit_transform(data_train['X'])
info_gain_features = dict(zip(CV.get_feature_names(), 
                              feature_selection.mutual_info_classif(X_vectorized, data_train['y'].tolist())))
# Print terms with higher information gain
sorted(info_gain_features, key=info_gain_features.get)[-10:];

In [256]:
### It turns out there is a faster, more Pythonic way to do this. This is, creating a pipeline and using SelectKBest

N = [10, 100, 1000]
classifiers = {}

for i in N:
    classifiers['Bernoulli_{0}'.format(i)] = Pipeline([('vectorizer', CountVectorizer(binary=True)),
                                                       ('feature_selector', SelectKBest(mutual_info_classif, k=i)), 
                                                       ('classifier', BernoulliNB())])
    classifiers['Multinomial_{0}'.format(i)] = Pipeline([('vectorizer', CountVectorizer(binary=True)),
                                                       ('feature_selector', SelectKBest(mutual_info_classif, k=i)), 
                                                       ('classifier', MultinomialNB())])
    # Double check this one
    classifiers['MultinomialTF_{0}'.format(i)] = Pipeline([('vectorizer', CountVectorizer()),
                                                       ('feature_selector', SelectKBest(mutual_info_classif, k=i)), 
                                                       ('classifier', MultinomialNB())])

In [257]:
for i in classifiers:
    classifiers[i].fit(data_train['X'], data_train['y'].tolist())

In [261]:
results = []
for count, i in enumerate(classifiers):
    print(i)
    results.append([])
    y_test_pred = classifiers[i].predict(data_test['X'])
    results[count].append(accuracy_score(data_test['y'].tolist(), y_test_pred))
    results[count].append(precision_score(data_test['y'].tolist(), y_test_pred))
    results[count].append(recall_score(data_test['y'].tolist(), y_test_pred))

Bernoulli_10
Multinomial_10
MultinomialTF_10
Bernoulli_100
Multinomial_100
MultinomialTF_100
Bernoulli_1000
Multinomial_1000
MultinomialTF_1000


In [262]:
results

[[0.94501718213058417, 0.8666666666666667, 0.79591836734693877],
 [0.94845360824742264, 0.88636363636363635, 0.79591836734693877],
 [0.95876288659793818, 0.84905660377358494, 0.91836734693877553],
 [0.94501718213058417, 1.0, 0.67346938775510201],
 [0.97938144329896903, 0.97777777777777775, 0.89795918367346939],
 [0.98281786941580751, 0.95833333333333337, 0.93877551020408168],
 [0.93470790378006874, 1.0, 0.61224489795918369],
 [0.98969072164948457, 1.0, 0.93877551020408168],
 [0.98969072164948457, 1.0, 0.93877551020408168]]

In [264]:
### SVM ###

SVM_classifier = Pipeline([('vectorizer', CountVectorizer()), ('svm', svm.SVC(kernel='linear'))])
SVM_classifier.fit(data_train['X'], data_train['y'].tolist())

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [268]:
results_SVM = []
y_test_pred = SVM_classifier.predict(data_test['X'])
results_SVM.append(accuracy_score(data_test['y'].tolist(), y_test_pred))
results_SVM.append(precision_score(data_test['y'].tolist(), y_test_pred))
results_SVM.append(recall_score(data_test['y'].tolist(), y_test_pred))

In [269]:
results_SVM

[0.96907216494845361, 0.90000000000000002, 0.91836734693877553]