In [63]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hagen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 1. Read in files
    - Loop over ham directory and extract the email body (= Text after the first \n)
    - Save into a pandas data-frame with the label 0
    - Same for spam but with label 1

In [64]:
import os
root = "./"
def convert_files(directory):
    # build directory path
    directory_path = os.path.join(root, directory)
    
    for mail in os.listdir(directory_path):
        file_path = directory_path + "/" + mail
        with open(file_path, "r", encoding='latin-1') as m:
            mail_dict = parse_message(m)
            yield mail_dict

In [65]:
import re
from nltk.corpus import stopwords

def parse_message(msg):
    body = ''
    email = {}
    email['subject'] = ''
    in_body = False
    exclude_terms = ['URL:', 'Date:', 'Return-Path:']
    sw = stopwords.words("english")
    
    for line in msg:
        if line == '\n':
            in_body = True
            continue
            
        if any(term in line for term in exclude_terms):
            continue
            
        
        #get rid of html markup
        line = re.sub('<[^>]*>', '', line)
        
        #get rid of stopwords
        line = ' '.join([word for word in line.split() if word.lower() not in sw])
        
        
        if in_body:
            body += line.strip()
            email['body'] = body
        elif line.startswith('From:'):
            sender = line.strip()
            sender = sender.replace('"', '')
            sender = line[5:]
            email['sender'] = sender
        elif line.startswith('Subject:'):
            subject = line.strip()
            subject = line[8:]
            email['subject'] = subject
            
        # Optionally an else branch could extract more features
        
    return email

In [66]:
def make_df(path, label):
    rows = []
    index = []
    for i, text in enumerate(convert_files(path)):
        rows.append({'body': text['body'],'subject': text['subject'], 'sender' : text['sender'], 'label': label}) 
        index.append(i)
    
    df = pd.DataFrame(rows, index=index)
    return df

In [67]:
df_ham = make_df('corpus/train-ham', 0)
df_spam = make_df('corpus/train-spam', 1)

### 2. Create Pipeline for e-mail classification
    - Vectorization
    - Classifier (NaiveBayes / SVM / LogReg)
    - (optional später: feature selection, z.B. IG)
   

In [72]:
# Create test and training data
from sklearn.feature_extraction.text import CountVectorizer
df_final = pd.concat([df_ham, df_spam])
df_final = df_final.sample(frac=1).reset_index(drop=True)

X = df_final[['body', 'subject', 'sender']]
y = df_final.label.values

#### 2.1 Create ItemSelector class to be used in FeatureUnion later. (To handle multiple text-columns in X)

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key]

In [69]:
# create Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

lr_tfidf = Pipeline([
    # Create a feature union to combine the columns
    ('union', FeatureUnion(
        transformer = [
            # body pipeline
            ('body', Pipeline([
                ('selector', ItemSelector(key='body')),
                ('tfidf', TfidfVectorizer()),
            ])),
            
            # subject pipeline
            ('subject', Pipeline([
                ('selector', ItemSelector(key='subject')),
                ('tfidf', TfidfVectorizer()),
            ])),
            
            # sender pipeline
            ('sender', Pipeline([
                ('selector', ItemSelector(key='sender')),
                ('tfidf', TfidfVectorizer()),
            ])),
        ],
        transformer_weights={
            'body' : 1.0,
            'subject' : 1.0,
            'sender' : 1.0,
        },
    )),
    ('clf', LogisticRegression(random_state=0)),
])

TypeError: __init__() got an unexpected keyword argument 'transformer'

In [73]:
import scipy.sparse as sp
vectorizer = TfidfVectorizer(ngram_range=(1,3), lowercase=False)
X = sp.hstack(X.apply(lambda col: vectorizer.fit_transform(col)))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None) #use defaults for smoothing and ngrams

# LogisticRegression
lr_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=0)) # default C to 1.0, mb tune later
])

# Bayes
bayes_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', MultinomialNB()) # default smoothing, and settings
])

# SVM
svm_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', SVC(C=1.0, gamma=1e-5)) # default C to 1.0, mb tune later
])

# RandomForest
rf_tfidf = Pipeline([
    ('count', CountVectorizer(ngram_range=(1,2))),
    ('vect', TfidfTransformer()),
    ('clf', RandomForestClassifier()) # default n-trees to 10, mb tune later
])



### 3. K-fold Cross-Validation on the data, compare different models
    - create k-fold test for each classifier and train data + make predictions on validation-sets
    - figure out best model

In [44]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

k_fold = KFold(n_splits=5)
confusion_lr = np.array([[0,0], [0,0]])
confusion_bayes = np.array([[0,0], [0,0]])
confusion_svm = np.array([[0,0], [0,0]])
confusion_rf = np.array([[0,0], [0,0]])

lr = LogisticRegression(random_state = 0)
bayes = MultinomialNB()
svm = SVC(C=1.0, gamma=1e-5)
rf = RandomForestClassifier()

for i, (train, validate) in enumerate(k_fold.split(X_train)):
    X_tr, X_val = X_train[train], X_train[validate]
    y_tr, y_val = y_train[train], y_train[validate]
    
    lr.fit(X_tr, y_tr)
    bayes.fit(X_tr, y_tr)
    svm.fit(X_tr, y_tr)
    rf.fit(X_tr, y_tr)

    print("Test Accuracy: {} (Fold: {})".format(lr.score(X_val, y_val), i))
    print("Test Accuracy: {} (Fold: {})".format(bayes.score(X_val, y_val), i))
    print("Test Accuracy: {} (Fold: {})".format(svm.score(X_val, y_val), i))
    print("Test Accuracy: {} (Fold: {})".format(rf.score(X_val, y_val), i))
    
    confusion_lr += confusion_matrix(y_val, lr.predict(X_val))
    confusion_bayes += confusion_matrix(y_val, bayes.predict(X_val))
    confusion_svm += confusion_matrix(y_val, svm.predict(X_val))
    confusion_rf += confusion_matrix(y_val, rf.predict(X_val))
    
    
print('Confusion - LR')
print(confusion_lr)
print('Confusion - Bayes')
print(confusion_bayes)
print('Confusion - SVM')
print(confusion_svm)
print('Confusion - Random Forest')
print(confusion_rf)

Test Accuracy: 0.9801526717557252 (Fold: 0)
Test Accuracy: 0.983206106870229 (Fold: 0)
Test Accuracy: 0.5221374045801527 (Fold: 0)
Test Accuracy: 0.9358778625954198 (Fold: 0)
Test Accuracy: 0.9770992366412213 (Fold: 1)
Test Accuracy: 0.9893129770992366 (Fold: 1)
Test Accuracy: 0.5450381679389313 (Fold: 1)
Test Accuracy: 0.934351145038168 (Fold: 1)
Test Accuracy: 0.9770992366412213 (Fold: 2)
Test Accuracy: 0.9786259541984733 (Fold: 2)
Test Accuracy: 0.5312977099236641 (Fold: 2)
Test Accuracy: 0.9282442748091603 (Fold: 2)
Test Accuracy: 0.9679389312977099 (Fold: 3)
Test Accuracy: 0.9740458015267176 (Fold: 3)
Test Accuracy: 0.5587786259541985 (Fold: 3)
Test Accuracy: 0.9297709923664123 (Fold: 3)
Test Accuracy: 0.9770642201834863 (Fold: 4)
Test Accuracy: 0.9709480122324159 (Fold: 4)
Test Accuracy: 0.5076452599388379 (Fold: 4)
Test Accuracy: 0.9617737003058104 (Fold: 4)
Confusion - LR
[[1673   72]
 [   7 1522]]
Confusion - Bayes
[[1712   33]
 [  35 1494]]
Confusion - SVM
[[1745    0]
 [1529

In [44]:
# svm looks odd, kinda as if it just classifies naive --> need to check later

### 4. Tune LR model via GridSearch

In [45]:
# Tune LR 
from sklearn.grid_search import GridSearchCV
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1.0, 10.0, 100.0]
    }
]

gs_lr = GridSearchCV(
    lr,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

gs_lr.fit(X_train, y_train)
print('Parameter: {}'.format(gs_lr.best_params_))
print('Accuracy (CV): {}'.format(gs_lr.best_score_))
best_classifier = gs_lr.best_estimator_
print('Accuracy (Test): {}'.format(best_classifier.score(X_test, y_test)))





Parameter: {'C': 100.0, 'penalty': 'l2'}
Accuracy (CV): 0.9792302993280391
Accuracy (Test): 0.9821937321937322


In [75]:
df_test = make_df('corpus/test', 0)


In [76]:
X = df_test[['body', 'subject', 'sender']]

In [None]:
X = df_test[['body', 'subject', 'sender']]
X = sp.hstack(X.apply(lambda col: vectorizer.transform(col)))
clf = best_classifier
y = clf.predict(X)
with open(result_file, 'w') as r:
        r.write(y)

In [47]:
best_classifier.fit(X, y)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Result:
    The best estimator is a logistic regression in a pipeline with a CountVectorizer 
    Optimal parameters for Count-Vec: lowercase = False, ngram_range = (1,3)
    Optimal parameters for LR: C = 100, penalty = l2
    Accuracy on the test-set: 0.982

### 5. Refactor code into train  / predict script