## Import Libraries

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import string
import nltk
import re
from collections import Counter

## 1. Import Data

In [75]:
# Load the raw Enron e-mail dataset. Later cells in this notebook reference its
# 'Message ID', 'Date', 'Subject', 'Message' and 'Spam/Ham' columns.
enron_emails = pd.read_csv('../data/enron_spam_data.csv')

In [76]:
# Load the TREC 2007 e-mail dataset. Later cells in this notebook reference its
# 'filepath', 'contents' and 'class' columns.
trec_emails = pd.read_csv("../data/trec2007_spam_data_cached.csv")

## 2. Raw Data Processing

### 2.1 Remove Unnecessary Columns

In [77]:
# Discard the message identifier and date columns from the Enron frame.
enron_emails.drop(columns=['Message ID', 'Date'], inplace=True)

In [78]:
# Discard the source file path column from the TREC frame.
trec_emails.drop(columns=['filepath'], inplace=True)

### 2.2 Drop Missing Values

In [79]:
# Drop rows that have no message body.
enron_emails.dropna(subset=['Message'], inplace=True)
# Fill missing subjects with the literal string 'None'. The result is assigned
# back to the column: calling fillna(inplace=True) on `enron_emails.Subject`
# is a chained in-place operation that newer pandas versions warn about and
# that does not update the parent frame when Copy-on-Write is enabled.
enron_emails['Subject'] = enron_emails['Subject'].fillna('None')

In [80]:
# Drop every TREC row that has a missing value in any column
# (axis=0 / how='any' are the pandas defaults, spelled out for clarity).
trec_emails.dropna(axis=0, how='any', inplace=True)

### 2.3 Create a binary label encoding on Spam/Ham

In [81]:
# Binary label: ham -> 0, spam -> 1. Series.map leaves any label that is not a
# key of the dict as NaN.
enron_emails['target'] = enron_emails['Spam/Ham'].map(
    {'ham': 0, 'spam': 1}
)

In [82]:
# Binary label: ham -> 0, spam -> 1. Series.map leaves any label that is not a
# key of the dict as NaN.
trec_emails['target'] = trec_emails['class'].map(
    {'ham': 0, 'spam': 1}
)

## 3. Cleaning Message

- #### Turn words into lowercase letters
- #### Remove numerical digits
- #### Remove punctuation
- #### Tokenization - split a sentence into a list of words 
- #### Remove stopwords - to remove tokens not contributing to the overall meaning of a sentence
- #### Lemmatization - condense variations of the same word to its root form

In [83]:
# Shared lemmatizer instance used by the cleaning functions below.
wn = nltk.WordNetLemmatizer()
# English stopword list from NLTK; the cleaning functions below test token
# membership against it.
stopwords = nltk.corpus.stopwords.words('english')

### 3.1 Clean Message into Tokenized Words

In [84]:
def clean_msg_tokenize(text):
    """Normalise a raw message and return its lemmatised, stopword-free tokens.

    Steps, in order: remove ASCII punctuation and lowercase character by
    character, delete runs of digits, split on whitespace, drop tokens found
    in the module-level ``stopwords`` list, and lemmatise the remaining tokens
    with the module-level ``wn`` lemmatizer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Cleaned tokens, in their original order.
    """
    # Iterating a string yields characters, so punctuation is filtered one
    # character at a time (string.punctuation covers ASCII punctuation only).
    no_punct = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    no_digits = re.sub(r'[0-9]+', '', no_punct)
    # Raw string literal: '\S' in a plain literal is an invalid escape sequence.
    tokens = re.findall(r'\S+', no_digits)
    # Stopwords are filtered before lemmatisation, i.e. on the surface form.
    return [wn.lemmatize(tok) for tok in tokens if tok not in stopwords]

In [85]:
# Tokenise every Enron message body.
enron_emails['clean_msg_tokens'] = enron_emails['Message'].apply(clean_msg_tokenize)

In [86]:
# Tokenise every TREC message body.
trec_emails['clean_msg_tokens'] = trec_emails['contents'].apply(clean_msg_tokenize)

### 3.2 Clean Message for N-Grams Vectorization

In [87]:
def clean_msg(text):
    """Normalise a raw message and return it as one cleaned string.

    Steps, in order: remove ASCII punctuation and lowercase character by
    character, split on whitespace, drop tokens found in the module-level
    ``stopwords`` list, lemmatise the remaining tokens with the module-level
    ``wn`` lemmatizer, and join them with single spaces. Unlike
    ``clean_msg_tokenize``, digits are kept.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        Space-separated cleaned tokens.
    """
    # Iterating a string yields characters, so punctuation is filtered one
    # character at a time (string.punctuation covers ASCII punctuation only).
    no_punct = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string literal: '\S' in a plain literal is an invalid escape sequence.
    tokens = re.findall(r'\S+', no_punct)
    return " ".join(wn.lemmatize(tok) for tok in tokens if tok not in stopwords)

In [88]:
# Build the cleaned-text column for every Enron message body.
enron_emails['cleaned_msg'] = enron_emails['Message'].apply(clean_msg)

In [89]:
# Build the cleaned-text column for every TREC message body.
trec_emails['cleaned_msg'] = trec_emails['contents'].apply(clean_msg)

### 3.3 Remove Extra Stopwords

In [90]:
# Define extra stopwords
# https://github.com/kavgan/stop-words/blob/master/terrier-stop.txt
# Each entry appears once; line continuations are implicit inside the brackets.
extra_stopwords = [
    'c', 'r', 'u', 'let', 'get', 'would', 'please', 'may', 'also',
    'like', 'thanks', 'within', 'go', 'inc', 'make', 'could', 'want',
    'need', 'new', 'know', 'best', 'e', 'j', 'p', 'b', 'de', 'see',
    'take', 'made', 'ect', 'hou', 'com', 'recipient', 'to', 'cc', 'subject',
    'http', 'from', 'sent', 'fwd', 'www', 'sara', 'shackleton', 'germani',
    'sshacklensf', 'cgermannsf', 'x', 'px', 'utc', 'rev', 'char',
    'listhttpsstatethzchmailmanlistinforhelpplease', 'much', 'dont',
    'available', 'said',
]

In [91]:
# Remove the extra stopwords from each Enron token list.
enron_emails['clean_msg_tokens'] = enron_emails['clean_msg_tokens'].apply(
    lambda tokens: [tok for tok in tokens if tok not in extra_stopwords]
)

In [92]:
# Remove the extra stopwords from each TREC token list.
trec_emails['clean_msg_tokens'] = trec_emails['clean_msg_tokens'].apply(
    lambda tokens: [tok for tok in tokens if tok not in extra_stopwords]
)

### 3.4 Merge Tokenized Words into Cleaned Text

In [93]:
def clean_msg_rm_lst(msg_tokens):
    """Join an iterable of token strings into one space-separated string."""
    return ' '.join(msg_tokens)

In [94]:
# Collapse each Enron token list back into a single string.
enron_emails['clean_msg_no_lst'] = enron_emails['clean_msg_tokens'].apply(clean_msg_rm_lst)

In [95]:
# Collapse each TREC token list back into a single string.
trec_emails['clean_msg_no_lst'] = trec_emails['clean_msg_tokens'].apply(clean_msg_rm_lst)

### 3.5 Save Processed Data

In [96]:
# https://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
# https://stackoverflow.com/questions/16923281/writing-a-pandas-dataframe-to-csv-file
# Write both processed frames without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
# NOTE: the list-valued 'clean_msg_tokens' column is written as its string
# representation, so it comes back as text (not lists) when the CSV is re-read.
enron_emails.to_csv("../data/enron_emails_processed1.csv", encoding='utf-8', index=False)
trec_emails.to_csv("../data/trec_emails_processed.csv", encoding='utf-8', index=False)