In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hagen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 1. Read in files
    - Loop over ham directory and extract the email body (= Text after the first \n)
    - Save into a pandas data-frame with the label 0
    - Same for spam but with label 1

In [2]:
import os
root = "./"
def convert_files(directory):
    # build directory path
    directory_path = os.path.join(root, directory)
    
    for mail in os.listdir(directory_path):
        file_path = directory_path + "/" + mail
        with open(file_path, "r", encoding='latin-1') as m:
            mail_dict = parse_message(m)
            yield mail_dict

In [3]:
import re
from nltk.corpus import stopwords

def parse_message(msg):
    body = ''
    email = {}
    email['subject'] = ''
    in_body = False
    exclude_terms = ['URL:', 'Date:', 'Return-Path:']
    sw = stopwords.words("english")
    
    for line in msg:
        if line == '\n':
            in_body = True
            continue
            
        if any(term in line for term in exclude_terms):
            continue
            
        
        #get rid of html markup
        line = re.sub('<[^>]*>', '', line)
        
        #get rid of stopwords
        line = ' '.join([word for word in line.split() if word.lower() not in sw])
        
        
        if in_body:
            body += line.strip()
            email['body'] = body
        elif line.startswith('From:'):
            sender = line.strip()
            sender = sender.replace('"', '')
            sender = line[5:]
            email['sender'] = sender
        elif line.startswith('Subject:'):
            subject = line.strip()
            subject = line[8:]
            email['subject'] = subject
            
        # Optionally an else branch could extract more features
        
    return email

In [4]:
def make_df(path, label):
    rows = []
    index = []
    for i, text in enumerate(convert_files(path)):
        rows.append({'body': text['body'],'subject': text['subject'], 'sender' : text['sender'], 'label': label}) 
        index.append(i)
    
    df = pd.DataFrame(rows, index=index)
    return df

In [6]:
df_ham = make_df('corpus/train-ham', 0)
df_spam = make_df('corpus/train-spam', 1)

### 2. Create Pipeline for e-mail classification
    - Vectorization
    - Classifier (NaiveBayes / SVM / LogReg)
    - (optional später: feature selection, z.B. IG)
   

In [7]:
# Create test and training data
from sklearn.feature_extraction.text import CountVectorizer
df_final = pd.concat([df_ham, df_spam])
df_final = df_final.sample(frac=1).reset_index(drop=True)

X = df_final[['body', 'subject', 'sender']]
y = df_final.label.values

#### 2.1 Create ItemSelector class to be used in FeatureUnion later. (To handle multiple text-columns in X)

In [8]:
from sklearn.base import TransformerMixin
class ItemSelector(TransformerMixin):
    def __init__(self, column):
        self.column = column
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.column]

In [9]:
# create Pipeline
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# body pipe
body_pipe = make_pipeline(
    ItemSelector('body'),
    TfidfVectorizer(encoding='latin1', lowercase=False, ngram_range=(1,3))
)

# subject pipe
subject_pipe = make_pipeline(
    ItemSelector('subject'),
    TfidfVectorizer(encoding='latin1', lowercase=False, ngram_range=(1,3))
)

# sender pipe
sender_pipe = make_pipeline(
    ItemSelector('sender'),
    TfidfVectorizer(encoding='latin1', lowercase=False, ngram_range=(1,3))
)

feature_union = make_union(body_pipe, subject_pipe, sender_pipe)

lr_tfidf = Pipeline([
    ('union', feature_union),
    ('clf', LogisticRegression(random_state=0))
])

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 3. K-fold Cross-Validation on the data
    - create k-fold test for each classifier and train data + make predictions on validation-sets
    - figure out best model

In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

k_fold = KFold(n_splits=5)
confusion_lr = np.array([[0,0], [0,0]])
acc_lr = []

for i, (train, validate) in enumerate(k_fold.split(X_train)):
    X_tr, X_val = X_train.values[train], X_train.values[validate]
    y_tr, y_val = y_train[train], y_train[validate]
    
    X_tr = pd.DataFrame(X_tr, columns=['body', 'subject', 'sender'])
    X_val = pd.DataFrame(X_val, columns=['body', 'subject', 'sender'])
    
    lr_tfidf.fit(X_tr, y_tr)
    
    acc_lr.append(lr_tfidf.score(X_val, y_val))
    confusion_lr += confusion_matrix(y_val, lr_tfidf.predict(X_val))
    
    
print('Confusion - LR')
print(confusion_lr)
acc_lr = np.asarray(acc_lr)
print('Mean-Accuracy LR: {}'.format(np.mean(acc_lr)))
print('Standard Deviation LR: {}'.format(np.std(acc_lr)))
print('Variance LR: {}'.format(np.var(acc_lr)))

Confusion - LR
[[1684   57]
 [  14 1519]]
Mean-Accuracy LR: 0.9783145411676821
Standard Deviation LR: 0.005052148526313248
Variance LR: 2.5524204731929125e-05


### 4. Tune LR pipe  via GridSearch

In [None]:
# Tune LR 
from sklearn.model_selection import GridSearchCV
param_grid = [
    {
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.1, 1.0, 10.0, 100.0]
    }
]

gs_lr = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

gs_lr.fit(X_train, y_train)
print('Parameter: {}'.format(gs_lr.best_params_))
print('Accuracy (CV): {}'.format(gs_lr.best_score_))
best_classifier = gs_lr.best_estimator_
print('Accuracy (Test): {}'.format(best_classifier.score(X_test, y_test)))



In [75]:
df_test = make_df('corpus/test', 0)
X = df_test[['body', 'subject', 'sender']]
y = best_classifier.predict(X)
my_list = y.tolist()
my_list = ['SPAM' if elem == 1  else 'HAM' for elem in my_list]
filenames = os.listdir('corpus/test')
file = open('result', 'w')

for i, item in enumerate(my_list):
    file.write("{} \t {}\n".format(filenames[i],item))
file.close()


In [47]:
best_classifier.fit(X, y)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Result:
    The best estimator is a logistic regression in a pipeline with a CountVectorizer 
    Optimal parameters for Count-Vec: lowercase = False, ngram_range = (1,3)
    Optimal parameters for LR: C = 100, penalty = l2
    Accuracy on the test-set: 0.982

### 5. Refactor code into train  / predict script