In [81]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hagen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 1. Read in files
    - Loop over ham directory and extract the email body (= Text after the first \n)
    - Save into a pandas data-frame with the label 0
    - Same for spam but with label 1

In [82]:
import os
root = "./"
def convert_files(directory):
    # build directory path
    directory_path = os.path.join(root, directory)
    
    for mail in os.listdir(directory_path):
        file_path = directory_path + "/" + mail
        with open(file_path, "r", encoding='latin-1') as m:
            mail_dict = parse_message(m)
            yield mail_dict

In [83]:
import re
from nltk.corpus import stopwords

def parse_message(msg):
    body = ''
    email = {}
    #email['subject'] = ''
    in_body = False
    exclude_terms = ['URL:', 'Date:', 'Return-Path:']
    sw = stopwords.words("english")
    
    for line in msg:
        if line == '\n':
            in_body = True
            continue
            
        if any(term in line for term in exclude_terms):
            continue
            
        
        #get rid of html markup
        line = re.sub('<[^>]*>', '', line)
        
        #get rid of stopwords
        line = ' '.join([word for word in line.split() if word.lower() not in sw])
        
        
        if in_body:
            body += line.strip()
            email['body'] = body
#         elif line.startswith('From:'):
#             sender = line.strip()
#             sender = sender.replace('"', '')
#             sender = line[5:]
#             email['sender'] = sender
#         elif line.startswith('Subject:'):
#             subject = line.strip()
#             subject = line[8:]
#             email['subject'] = subject
            
        # Optionally an else branch could extract more features
        
    return email

In [84]:
def make_df(path, label):
    rows = []
    index = []
    for i, text in enumerate(convert_files(path)):
        rows.append({'body': text['body'], 'label': label}) #'subject': text['subject'], 'sender' : text['sender'],
        index.append(i)
    
    df = pd.DataFrame(rows, index=index)
    return df

In [85]:
df_ham = make_df('corpus/train-ham', 0)
df_spam = make_df('corpus/train-spam', 1)

### 2. Create Pipeline for e-mail classification
    - Vectorization
    - Classifier (NaiveBayes / SVM / LogReg)
    - (optional später: feature selection, z.B. IG)
   

In [86]:
# Create test and training data
df_final = pd.concat([df_ham, df_spam])
df_final = df_final.sample(frac=1).reset_index(drop=True)
X = df_final.body.values
y = df_final.label.values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [87]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None) #use defaults for smoothing and ngrams

# LogisticRegression
lr_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=0)) # default C to 1.0, mb tune later
])

# Bayes
bayes_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', MultinomialNB()) # default smoothing, and settings
])

# SVM
svm_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', SVC(C=1.0, gamma=1e-5)) # default C to 1.0, mb tune later
])

# RandomForest
rf_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', RandomForestClassifier()) # default n-trees to 10, mb tune later
])



### 3. K-fold Cross-Validation on the data, compare different models
    - create k-fold test for each classifier and train data + make predictions on validation-sets
    - figure out best model

In [88]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

k_fold = KFold(n_splits=5)
confusion_lr = np.array([[0,0], [0,0]])
confusion_bayes = np.array([[0,0], [0,0]])
confusion_svm = np.array([[0,0], [0,0]])
confusion_rf = np.array([[0,0], [0,0]])

for i, (train, validate) in enumerate(k_fold.split(X_train)):
    X_tr, X_val = X_train[train], X_train[validate]
    y_tr, y_val = y_train[train], y_train[validate]
    
    lr_tfidf.fit(X_tr, y_tr)
    bayes_tfidf.fit(X_tr, y_tr)
    svm_tfidf.fit(X_tr, y_tr)
    rf_tfidf.fit(X_tr, y_tr)

    print("Test Accuracy: {} (Fold: {})".format(lr_tfidf.score(X_val, y_val), i))
    print("Test Accuracy: {} (Fold: {})".format(bayes_tfidf.score(X_val, y_val), i))
    print("Test Accuracy: {} (Fold: {})".format(svm_tfidf.score(X_val, y_val), i))
    print("Test Accuracy: {} (Fold: {})".format(rf_tfidf.score(X_val, y_val), i))
    
    confusion_lr += confusion_matrix(y_val, lr_tfidf.predict(X_val))
    confusion_bayes += confusion_matrix(y_val, bayes_tfidf.predict(X_val))
    confusion_svm += confusion_matrix(y_val, svm_tfidf.predict(X_val))
    confusion_rf += confusion_matrix(y_val, rf_tfidf.predict(X_val))
    
    
print('Confusion - LR')
print(confusion_lr)
print('Confusion - Bayes')
print(confusion_bayes)
print('Confusion - SVM')
print(confusion_svm)
print('Confusion - Random Forest')
print(confusion_rf)

Test Accuracy: 0.9648854961832061 (Fold: 0)
Test Accuracy: 0.9740458015267176 (Fold: 0)
Test Accuracy: 0.49923664122137407 (Fold: 0)
Test Accuracy: 0.9221374045801527 (Fold: 0)
Test Accuracy: 0.9694656488549618 (Fold: 1)
Test Accuracy: 0.9648854961832061 (Fold: 1)
Test Accuracy: 0.4900763358778626 (Fold: 1)
Test Accuracy: 0.9358778625954198 (Fold: 1)
Test Accuracy: 0.966412213740458 (Fold: 2)
Test Accuracy: 0.966412213740458 (Fold: 2)
Test Accuracy: 0.5557251908396946 (Fold: 2)
Test Accuracy: 0.9129770992366413 (Fold: 2)
Test Accuracy: 0.9755725190839695 (Fold: 3)
Test Accuracy: 0.9694656488549618 (Fold: 3)
Test Accuracy: 0.5740458015267176 (Fold: 3)
Test Accuracy: 0.9236641221374046 (Fold: 3)
Test Accuracy: 0.9709480122324159 (Fold: 4)
Test Accuracy: 0.9709480122324159 (Fold: 4)
Test Accuracy: 0.5489296636085627 (Fold: 4)
Test Accuracy: 0.9235474006116208 (Fold: 4)
Confusion - LR
[[1703   44]
 [  56 1471]]
Confusion - Bayes
[[1680   67]
 [  34 1493]]
Confusion - SVM
[[1747    0]
 [152

In [44]:
# svm looks odd, kinda as if it just classifies naive --> need to check later

### 4. Tune LR model via GridSearch

In [89]:
# Tune LR 
from sklearn.grid_search import GridSearchCV
param_grid = [
    {
        'count__ngram_range': [(1,2), (1,3)],
        'count__lowercase': [True, False],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.1, 1.0, 10.0, 100.0]
    }
]

gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

gs_lr_tfidf.fit(X_train, y_train)
print('Parameter: {}'.format(gs_lr_tfidf.best_params_))
print('Accuracy (CV): {}'.format(gs_lr_tfidf.best_score_))
best_classifier = gs_lr_tfidf.best_estimator_
print('Accuracy (Test): {}'.format(best_classifier.score(X_test, y_test)))



Parameter: {'clf__C': 100.0, 'clf__penalty': 'l2', 'count__lowercase': False, 'count__ngram_range': (1, 3)}
Accuracy (CV): 0.9795357361026268
Accuracy (Test): 0.9793447293447294


#### Result:
    The best estimator is a logistic regression in a pipeline with a CountVectorizer & tfidf-transformer
    Optimal parameters for Count-Vec: lowercase = False, ngram_range = (1,3)
    Optimal parameters for LR: C = 100, penalty = l2
    Accuracy on the test-set: 0.979

### 5. Refactor code into train  / predict script