In [11]:
import os.path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import email
from string import punctuation
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn import tree
stop_words = set(stopwords.words('english'))
from sklearn.utils import resample

In [36]:
# Relative paths to the ham and spam email folders (named "Unprocessed" on disk).
hamdir = r'../data/Compressed/Ham_Unprocessed'
spamdir = r'../data/Compressed/Spam_Unprocessed'
# Both roots collected in one list so they can be handled in a single loop.
dirs=[hamdir,spamdir]

def count_dirs(dirs):
    """Print the number of sub-folders found under each directory in ``dirs``.

    Every directory is walked recursively with ``os.walk`` and the folders
    seen at each level are summed, so nested folders are counted too.

    Parameters
    ----------
    dirs : iterable of str
        Directory paths to inspect.

    Returns
    -------
    None
        The counts are only printed, one line per directory.
    """
    for d in dirs:
        folders = sum(len(directories) for _, directories, _ in os.walk(d))
        # Report the directory that was actually counted; the label used to be
        # hard-coded to "Ham Directory" for every entry.
        print('There are', folders, 'folders in', d)

count_dirs(dirs)

There are 249 folders in the Ham Directory
There are 34 folders in the Ham Directory


In [37]:
def getlist(directory):
    """Collect the body payload of every email file found under ``directory``.

    The tree is walked recursively with ``os.walk``; each file is read as
    latin-1 text and parsed with ``email.message_from_string``.  For multipart
    messages every non-multipart (leaf) part contributes one entry, at any
    nesting depth, in document order.  Non-multipart messages contribute a
    single entry.

    Parameters
    ----------
    directory : str
        Root folder containing the email files.

    Returns
    -------
    list
        One ``get_payload()`` value per leaf part.
    """
    mylist = []
    # Distinct loop names: the walk variable used to shadow the parameter.
    for dirpath, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            with open(os.path.join(dirpath, filename), "r", encoding="latin-1") as f:
                message = email.message_from_string(f.read())

            # walk() yields the message itself and then every sub-part depth
            # first, so deeply nested multiparts are flattened instead of
            # being appended as a list of Message objects.
            for part in message.walk():
                if not part.is_multipart():
                    mylist.append(part.get_payload())

    return mylist

In [38]:
# sending the ham and spam directories to the getlist function
# this will return a list of emails 

hamlist=getlist(hamdir)
spamlist=getlist(spamdir)

In [44]:
def create_df(hamlist, spamlist, random_state=None):
    """Build one shuffled, labeled DataFrame from the ham and spam lists.

    Parameters
    ----------
    hamlist : list
        Ham email bodies; labeled with ``target == 0``.
    spamlist : list
        Spam email bodies; labeled with ``target == 1``.
    random_state : int or numpy RandomState/Generator, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced.  The default ``None`` keeps the previous unseeded shuffle.

    Returns
    -------
    pandas.DataFrame
        Columns ``email`` and ``target``, rows shuffled, index reset to 0..n-1.
    """
    # one frame per class, each tagged with its label
    ham = pd.DataFrame(hamlist, columns=['email'])
    ham['target'] = 0

    spam = pd.DataFrame(spamlist, columns=['email'])
    spam['target'] = 1

    # stack both classes, then shuffle every row (frac=1) and renumber
    all_emails = pd.concat([ham, spam], ignore_index=True)
    all_emails = all_emails.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return all_emails

In [45]:
# send the newly created ham list and spam list to the create_df function
# this will return a new dataframe which will be saved as 'all_emails'

all_emails = create_df(hamlist,spamlist)

In [47]:
all_emails.head(5)

Unnamed: 0,email,target
0,Return-Path: <0@aol.com>\nReceived: from rly-...,0
1,<HTML>\n<HEAD>\n<TITLE>mailer1</TITLE>\n<META ...,1
2,amoebae caustic emeriti extracellular \nbeauti...,1
3,In response to a concern Stan brought up yeste...,0
4,"Good day,\n<br><br>\n\nA Well Wisher showed me...",1


In [48]:
def clean_regex(m):
    """Normalise one email body to lower-case alphabetic words.

    The value is converted with ``str()`` and then, in order: e-mail
    addresses and http/https/ftp URLs are replaced by a space; digits and
    HTML tags are deleted; every remaining character that is neither an
    ASCII letter nor whitespace is deleted; the literal ``nbsp`` left over
    from ``&nbsp;`` entities is deleted; punctuation is stripped; the text
    is lower-cased.

    Parameters
    ----------
    m : object
        Message taken from the email column.

    Returns
    -------
    str
        The cleaned text.  Whitespace is not collapsed.
    """
    m = str(m)  # convert once instead of on every step

    # e-mail addresses: no ^...$ anchors, otherwise the pattern only matches
    # when the whole message consists of a single address
    m = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+', ' ', m)
    m = re.sub(r'(http|https|ftp)://[a-zA-Z0-9\\./]+', ' ', m) # http/url regex
    m = re.sub(r'\d+', '', m) # numbers
    m = re.sub(r'<[^<]+?>', '', m) # html <tags>
    # non-letters: must be re.sub (str.replace treats the pattern literally
    # and never matched); whitespace is kept so words stay separated
    m = re.sub(r'[^a-zA-Z\s]', '', m)
    m = m.replace('nbsp', '') # common in unprocessed spam files, new html
    m = m.translate(str.maketrans('', '', punctuation)) # remove punctuation
    m = m.lower() # lower case

    return m
    
def clean_column(df,col_name):
    """Clean the text column ``col_name`` of ``df`` and return ``df``.

    Each value is passed through ``clean_regex``; then every whitespace
    separated token is dropped if it is a stop word or is not between 3 and
    15 characters long (some emails contain very long gibberish strings).
    Surviving tokens are lemmatized as a verb and then as a noun.

    Note: the column is overwritten on ``df`` itself, so the caller's frame
    is modified; the returned object is that same frame.
    """
    lemmatizer = WordNetLemmatizer()

    def _normalise(text):
        # per-token pipeline: stop-word filter -> length filter -> verb lemma -> noun lemma
        kept = []
        for token in text.split():
            if token in stop_words:
                continue
            if not (3 <= len(token) <= 15):
                continue
            as_verb = lemmatizer.lemmatize(token, pos='v')
            kept.append(lemmatizer.lemmatize(as_verb, pos='n'))
        return ' '.join(kept)

    df[col_name] = df[col_name].apply(clean_regex).apply(_normalise)

    return df

In [49]:
# send all_emails dataset to clean_column function
# arguments are:  dataset, column to clean

all_emails_clean = clean_column(all_emails,'email')
all_emails_clean.head(5)

Unnamed: 0,email,target
0,returnpath receive rlyyamxaolcom rlyyamailaolc...,0
1,mailer twoday market plan product plan financi...,1
2,amoeba caustic emeriti extracellular beautify ...,1
3,response concern stan bring yesterday revise v...,0
4,good day well wisher show way get ppv pay anyt...,1


In [50]:
def downsamp_maj(df,target):
    """Downsample the majority class so both classes have the same row count.

    Works on a pandas DataFrame whose column ``target`` holds the class
    labels 0 and 1.

    Arguments:
        df: pandas DataFrame to balance.
        target: name of the label column, as a string.

    Returns a new DataFrame with the downsampled majority rows first, the
    minority rows after them, and a fresh 0..n-1 index.
    """
    ones = df[df[target] == 1]
    zeros = df[df[target] == 0]

    # class 1 counts as the majority only when strictly larger;
    # on a tie class 0 is the one passed to resample
    if len(ones) > len(zeros):
        majority, minority = ones, zeros
    else:
        majority, minority = zeros, ones

    majority_downsampled = resample(
        majority,
        replace=False,              # sample without replacement
        n_samples=len(minority),    # to match minority class
        random_state=123,           # reproducible results
    )

    # join downsampled majority and minority into one dataframe
    return pd.concat([majority_downsampled, minority], ignore_index=True)

In [51]:
# send the cleaned email dataset to the function downsamp_maj which takes the dataframe, and the name
# of the column containing the class labels, in this case the name is 'target'

balanced_emails = downsamp_maj(all_emails_clean,'target')

# get value counts for printout; look them up by class label (1 = spam, 0 = ham)
# because value_counts() is ordered by frequency, not by label
s=balanced_emails.target.value_counts()
print('There are',s.get(1, 0),'spam emails and',s.get(0, 0),'ham emails in the dataset')

There are 19088 spam emails and 19088 ham emails in the dataset


In [55]:
# Rename the text column 'email' to 'clean_msg_no_lst'; inplace=True modifies balanced_emails itself.
balanced_emails.rename(columns={'email': 'clean_msg_no_lst'}, inplace=True)

In [56]:
balanced_emails.head()

Unnamed: 0,clean_msg_no_lst,target
0,hello send email ago qualify new mortgage coul...,1
1,bamako telegraph review expose new pan epdemic...,1
2,message ultra portable business notebook pfaa ...,1
3,investment opportunity speculative investor fu...,1
4,hello dress black become call yet discover als...,1


In [57]:
# Write the balanced dataset to CSV as UTF-8.
# NOTE(review): index = None is presumably meant as index=False (no index column) - confirm.
balanced_emails.to_csv("../data/enron_emails_processed2.csv", encoding='utf-8', index = None)