### Modeling with NB vs KNN vs PCA+GNB vs PCA+KNN
- Features used:
    - Content Type (One hot encoded)
    - Subject line (Expanded to TFIDF)
    - Is the Email a reply
    - Is the Email a forward
    - Is the subject line all upper case
    - Number of actual data lines (min-max normalized)


In [49]:
! pip3 install -r requirements.txt



In [111]:
import pandas as pd
import numpy as np
import glob
import socket
import spacy
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import sklearn.model_selection as ms
from sklearn import metrics as mt
from sklearn.metrics import (accuracy_score,brier_score_loss, precision_score, recall_score,f1_score)
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 
import time
# from spacy.lang.en import English
# from spacy.lang.en.stop_words import STOP_WORDS

In [51]:
# Pick the corpus root directory based on the host running the notebook.
# NOTE(review): the hard-coded absolute path is selected on every host *except*
# 'Rohits-MacBook-Pro.local' -- confirm the comparison is not inverted.
# NOTE(review): the loading cell keeps only paths that split into exactly three
# '/'-separated parts with no empty part, so an absolute root (leading '/')
# would match nothing -- verify before relying on the absolute path.
on_named_host = socket.gethostname() == 'Rohits-MacBook-Pro.local'
rootdir = (
    ''  ##Enter file path name here
    if on_named_host
    else '/Users/rohitchanne/Documents/QTW_case_study3/SpamAssassinMessages/'
)

In [52]:
# Read every file matching '**/*.*' under rootdir and record which folder it
# came from together with its full text.
df_email_data = pd.DataFrame()
email_content = []
email_type = []
for path in glob.iglob(rootdir+'**/*.*', recursive=True):
    parts = path.split('/')
    # Only <top>/<type>/<file> paths qualify; anything with a 'cmds' component
    # or an empty component (e.g. from a leading '/') is skipped.
    if len(parts) != 3 or 'cmds' in parts or '' in parts:
        continue
    email_type.append(parts[1])
    # latin1 maps every byte to a character, so reading never raises a decode error
    with open(path, encoding='latin1') as handle:
        email_content.append(handle.read())

df_email_data = pd.DataFrame({'type': email_type, 'content': email_content})
# del email_content, email_type

In [53]:
# Number of non-null values per column for each corpus folder.
df_email_data.groupby(by='type').count()

Unnamed: 0_level_0,content
type,Unnamed: 1_level_1
easy_ham,5051
easy_ham_2,1400
hard_ham,500
spam,1000
spam_2,1397


### Transformations
- New relevant columns:
    - Extracting Content Type
    - Extracting Subject line
    - Is the Email a reply
    - Is the Email a forward
    - Number of actual data lines (Normalized)

In [54]:
# Derive per-email features from the raw message text.

# One row per text line, indexed by the email it came from. Only the two raw
# string columns are exploded: using `.str` on every column fails as soon as this
# cell has added its boolean/numeric columns, i.e. on any re-run of the cell.
df_email_expanded = df_email_data[['type', 'content']].apply(lambda x: x.str.split('\n').explode())
content_lines = df_email_expanded['content']

# Label: any folder whose name contains 'spam'.
df_email_data['is_spam'] = df_email_data['type'].str.contains('spam')

# First Content-Type header line per email, truncated at the first ';',
# lower-cased and stripped (the 'content-type:' prefix is kept in the value).
# NOTE(review): the match is case-sensitive, so a header spelled e.g.
# 'Content-type:' is not picked up -- confirm that is intended.
df_content_type = (content_lines[content_lines.str.contains("^Content-Type.*$", regex=True)]
                   .str.split(';').str[0].str.lower().str.strip())
df_email_data['content_type'] = df_content_type.groupby(df_content_type.index).first()

# First line per email that starts with 'Subject:' once surrounding whitespace is ignored.
df_subject = content_lines[content_lines.str.strip().str.contains("^Subject:.*$", regex=True)]
# regex=False pins literal replacement; the default of `regex` differs between pandas versions.
df_email_data['subject'] = (df_subject.groupby(df_subject.index).first()
                            .str.replace("Subject: ", "", regex=False))

# Subject-derived flags (NaN where the email has no subject line).
df_email_data['is_reply'] = df_email_data['subject'].str.lower().str.startswith("re:")
df_email_data['is_forward'] = df_email_data['subject'].str.lower().str.startswith("fwd:")
df_email_data['is_upper'] = df_email_data['subject'].str.isupper()

# Line counts. The blank-line mask is computed once; emails with no blank line
# are absent from the grouped counts, so they are filled with 0 instead of
# becoming NaN (which would also turn data_lines into NaN).
df_email_data['num_lines'] = df_email_data['content'].str.split('\n').str.len()
is_blank = content_lines.str.strip() == ""
blank_counts = content_lines[is_blank].groupby(level=0).count()
df_email_data['blank_lines'] = blank_counts.reindex(df_email_data.index, fill_value=0)
df_email_data['data_lines'] = df_email_data['num_lines'] - df_email_data['blank_lines']

# Despite the column name, this is min-max scaling of data_lines to [0, 1].
data_lines_min = df_email_data['data_lines'].min()
data_lines_max = df_email_data['data_lines'].max()
df_email_data['data_lines_gaussian'] = (df_email_data['data_lines'] - data_lines_min)/(data_lines_max - data_lines_min)

In [55]:
# Display the email frame, now including the columns derived in the transformation cell.
df_email_data

Unnamed: 0,type,content,is_spam,content_type,subject,is_reply,is_forward,is_upper,num_lines,blank_lines,data_lines,data_lines_gaussian
0,spam,From pamela4701@eudoramail.com Mon Sep 9 10:...,True,content-type: text/plain,Let us find the right mortgage lender for you ...,False,False,False,59,18,41,0.005079
1,spam,From 102192086381143-17090200005-example.com?z...,True,content-type: multipart/alternative,"Friend, Copy ANY DVD or Playstation Game with ...",False,False,False,184,37,147,0.021901
2,spam,From sh@insiq.us Fri Sep 20 11:41:16 2002\nRe...,True,content-type: multipart/alternative,5% Guaranteed for Eight Years,False,False,False,273,17,256,0.039200
3,spam,From OWNER-NOLIST-SGODAILY*JM**NETNOTEINC*-COM...,True,content-type: text/html,Congratulations! You Get a Free Handheld Organ...,False,False,False,116,21,95,0.013649
4,spam,From apf@wu-wien.ac.at Thu Sep 19 13:01:55 20...,True,content-type: text/plain,PROTECT YOUR INFORMATION AND YOUR COMPUTER,False,False,True,66,16,50,0.006507
...,...,...,...,...,...,...,...,...,...,...,...,...
9343,easy_ham_2,From razor-users-admin@lists.sourceforge.net ...,False,content-type: text/plain,RE: [Razor-users] What's wrong with the Razor ...,True,False,False,88,13,75,0.010475
9344,easy_ham_2,From fork-admin@xent.com Tue Aug 6 11:58:11 ...,False,,Re: W3C approves HTML 4 'emotitags' [...],True,False,False,50,7,43,0.005396
9345,easy_ham_2,From rpm-list-admin@freshrpms.net Mon Jul 22 ...,False,content-type: text/plain,Re: Ximian apt repos?,True,False,False,77,8,69,0.009522
9346,easy_ham_2,From rpm-list-admin@freshrpms.net Tue Jul 30 ...,False,content-type: text/plain,Re: Installing RPM,True,False,False,84,10,74,0.010316


### Using spaCy to create a TF-IDF vectorized feature space

In [70]:
# Load spaCy's "en_core_web_lg" pipeline; spacy_tokenizer runs every document through it.
nlp = spacy.load("en_core_web_lg")
# spaCy's English stop-word collection.
# NOTE(review): spacy_tokenizer filters on token.is_stop and does not reference
# this variable -- confirm whether it is still needed.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def spacy_tokenizer(document):
    """Turn a document into a list of lemmas using the module-level `nlp` pipeline.

    Tokens flagged by spaCy as stop words or punctuation are dropped, as are
    tokens whose lemma is empty or whitespace-only. Digit tokens are not filtered.

    Args:
        document: text to tokenize; passed straight to `nlp`.

    Returns:
        list of lemma strings, in document order.
    """
    lemmas = []
    for tok in nlp(document):
        if tok.is_stop or tok.is_punct:
            continue
        lemma = tok.lemma_
        if lemma.strip() == '':
            continue
        lemmas.append(lemma)
    return lemmas

In [71]:
# Fit the TF-IDF vocabulary on the subject lines; missing subjects are
# replaced with empty strings first so the vectorizer only sees text.
subjects = df_email_data['subject'].replace(np.nan, "")
vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, input = 'content')
V = vectorizer.fit_transform(subjects)

In [72]:
# Dense TF-IDF matrix as a frame with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df_for_nb = pd.DataFrame(V.toarray(), columns=feature_names)

In [73]:
# (rows, columns) of the TF-IDF frame: one row per email, one column per vocabulary term.
df_for_nb.shape

(9348, 5824)

In [74]:
# Assemble the modelling frame column-wise: label and metadata flags first,
# then one-hot content-type columns, then the TF-IDF term columns.
metadata_columns = ['is_spam','is_reply','is_forward','is_upper','data_lines_gaussian']
content_type_dummies = pd.get_dummies(df_email_data['content_type'], columns = ['content_type'])
df_for_nb = pd.concat(
    [df_email_data[metadata_columns], content_type_dummies, df_for_nb],
    axis=1,
)

### Inspecting first few columns

In [75]:
# First 100 column names: label/metadata flags, content-type dummies, then TF-IDF terms.
df_for_nb.columns[:100]

Index(['is_spam', 'is_reply', 'is_forward', 'is_upper', 'data_lines_gaussian',
       'content-type:', 'content-type: multipart/alternative',
       'content-type: multipart/mixed', 'content-type: multipart/related',
       'content-type: multipart/report', 'content-type: multipart/signed',
       'content-type: text/html', 'content-type: text/plain',
       'content-type: text/plain charset=us-ascii', '"(was', '$', ''em', '+',
       '--allowe', '--breake', '--with', '-25', '-9', '-adv-', '-approved-',
       '-c', '-colonize', '-f', '-h', '-hgh', '-lm', '-lrmjq', '-pkqolhil',
       '-t', '-the-', '."announcing', '.+', '.biz', '.cf', '.com', '.doc',
       '.info', '.message', '.mp3', '//ytu855', '/d', '/etc', '/home', '/s',
       '0', '0.04', '0.1', '0.11.28', '0.11.31', '0.3', '0.5', '0.5.7', '0.91',
       '00324', '00326', '00328', '00330', '01', '0137cpnx4', '0179mfrp5',
       '02', '02/2002', '03', '03.07.02', '04', '040jxgt0341bxgg-24', '05',
       '05152', '05775748', '057

In [91]:
def clean_dataset(df):
    """Remove rows with missing or infinite values and return the rest as float64.

    Side effect: rows containing NaN are dropped from `df` itself (in place).
    This is kept because existing callers invoke the function without using its
    return value. Rows containing +/-inf are only excluded from the returned frame.

    Args:
        df: pd.DataFrame whose columns can all be cast to float64.

    Returns:
        A new pd.DataFrame holding the rows of `df` that contain no NaN and no
        +/-inf, with every column cast to np.float64.

    Raises:
        AssertionError: if `df` is not a pd.DataFrame.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # axis is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
    # keyword-only, so the positional form `.any(1)` raises TypeError there.
    has_non_finite = df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[~has_non_finite].astype(np.float64)

### Cross Validation with 5 iterations and test size as 25 % of the data set

In [100]:
def cv_train(name,model,x,y, num_cv_iterations = 5):
    """Cross-validate `model` on stratified shuffle splits and average the scores.

    Each split holds out 25% of the rows (random_state=0, so the splits are the
    same on every call). The same `model` object is re-fitted on every split.

    Args:
        name: label for the model; printed and stored in the "name" column.
        model: estimator exposing fit(X, y) and predict(X).
        x: feature table supporting positional `.iloc` row selection.
        y: labels supporting positional `.iloc` selection, aligned with `x`.
        num_cv_iterations: number of shuffle splits to run.

    Returns:
        One-row pd.DataFrame with columns name, accuracy, precision, recall, f1
        (precision/recall/f1 are macro-averaged) and average_seconds, each
        numeric value being the mean over the splits. Per-split time covers
        fit, predict and scoring, rounded to whole seconds before averaging.
    """
    print(f"====Performing Cross Validation for {name}")
    print(" Iteration ", end = '')

    cv_object = ms.StratifiedShuffleSplit(n_splits=num_cv_iterations,
                                          random_state=0,
                                          test_size=0.25)

    # Collect one record per split and build the frame once afterwards:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    fold_records = []
    for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(x, y)):
        t = time.time()
        print(f" {iter_num},", end = '')

        X_train, y_train = x.iloc[train_indices], y.iloc[train_indices]
        X_test, y_test = x.iloc[test_indices], y.iloc[test_indices]

        model.fit(X_train, y_train)      # train on this split
        y_hat = model.predict(X_test)    # predictions for the held-out rows

        fold_records.append({
            "Iteration": iter_num,
            "accuracy": accuracy_score(y_test, y_hat),
            "precision": precision_score(y_test, y_hat, average="macro"),
            "recall": recall_score(y_test, y_hat, average="macro"),
            "f1": f1_score(y_test, y_hat, average="macro"),
            "average_seconds": np.round(time.time() - t),
        })

    res = pd.DataFrame(fold_records)

    # Summarize CV results: mean of every metric across splits, as a single row
    summary = res.drop("Iteration", axis=1).agg("mean").to_frame().T
    summary.insert(0, "name", name)
    print("  Cross-validation complete")

    return summary

In [94]:
# clean_dataset returns the cleaned frame; use it instead of relying only on its
# in-place NaN drop, so rows holding +/-inf are excluded as well and the
# features reach the models as float64.
df_for_nb_clean = clean_dataset(df_for_nb)
X = df_for_nb_clean.drop('is_spam', axis=1)
# The cleaned frame is all float64; turn the label back into booleans.
y = df_for_nb_clean['is_spam'].astype(bool)
# rs = ShuffleSplit(n_splits=5, test_size=.3, random_state=0)

In [98]:
# Multinomial Naive Bayes constructed with no arguments, i.e. the library defaults.
mnb = MultinomialNB()

In [101]:
# Cross-validate the Naive Bayes model; this row starts the results table.
cv_results = cv_train(name='nb', model=mnb, x=X, y=y)

====Performing Cross Validation for nb
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


### Untuned Naive Bayes (default hyperparameters)

In [102]:
# Mean cross-validation scores for the Naive Bayes model.
cv_results

Unnamed: 0,name,accuracy,precision,recall,f1,average_seconds
0,nb,0.930651,0.917803,0.897368,0.906831,4.0


In [116]:
# Additional models to compare against the Multinomial Naive Bayes run.
base_knn = KNeighborsClassifier(n_neighbors=3)

# svd_solver='randomized' is stochastic; a fixed random_state (0, as used by the
# CV splitter) makes the projection and the resulting scores repeatable.
pca_gnb = Pipeline(
    [('PCA',PCA(n_components=1000,svd_solver='randomized',random_state=0)),
     ('CLF',GaussianNB())] # Using Gaussian because MultinomialNB cannot support negative values
)

pca_knn = Pipeline(
    [('PCA',PCA(n_components=1000,svd_solver='randomized',random_state=0)),
     ('CLF',KNeighborsClassifier(n_neighbors=3))]
)

In [118]:
# Display name -> estimator for the additional models to cross-validate.
other_random_models = dict(base_knn=base_knn, pca_gnb=pca_gnb, pca_knn=pca_knn)
# other_random_models = {"pca_gnb":pca_gnb,"pca_knn":pca_knn}

In [119]:
# Cross-validate each additional model and add its summary row to the table.
# DataFrame.append was removed in pandas 2.0; a single pd.concat gives the same
# rows (original row labels kept) without re-copying the table per model.
cv_results = pd.concat(
    [cv_results] + [cv_train(name, model, X, y) for name, model in other_random_models.items()]
)

====Performing Cross Validation for pca_gnb
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete
====Performing Cross Validation for pca_knn
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


In [120]:
# Mean cross-validation scores for every model evaluated so far, one row per model.
cv_results

Unnamed: 0,name,accuracy,precision,recall,f1,average_seconds
0,nb,0.930651,0.917803,0.897368,0.906831,4.0
0,base_knn,0.906079,0.871063,0.891892,0.88031,80.2
0,pca_gnb,0.671318,0.691995,0.749784,0.656936,17.2
0,pca_knn,0.926712,0.897137,0.917251,0.906339,24.0
