In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250
pd.options.display.max_colwidth = 160

import features as util
from raw_utils import save_to_csv
from preprocessing import dataset_add_columns

from ast import literal_eval

### Read Data

In [2]:
# Path
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

train_tokens = ['train_balanced_tokens.csv', 'train_imbalanced_tokens.csv']
test_tokens = ['test_balanced_tokens.csv', 'test_imbalanced_tokens.csv']

#### Tokenized emails

In [3]:
train_balanced_tokens = pd.read_csv(os.path.join(csv_path, train_tokens[0]), index_col=0, converters={'body': literal_eval})
test_balanced_tokens = pd.read_csv(os.path.join(csv_path, test_tokens[0]), index_col=0, converters={'body': literal_eval})

In [4]:
train_imbalanced_tokens = pd.read_csv(os.path.join(csv_path, train_tokens[1]), index_col=0, converters={'body': literal_eval})
test_imbalanced_tokens = pd.read_csv(os.path.join(csv_path, test_tokens[1]), index_col=0, converters={'body': literal_eval})

After the preprocessing, the data look like this:

In [5]:
train_balanced_tokens.head()

Unnamed: 0,id,body,class
0,3423,"[navy, urladdress, navy, federal, security, zone, email, member, online, mobile, bank, routine, maintenance, account, update, preference, due, concern, safe...",True
1,682,"[dear, well, fargo, user, new, message, well, fargo, online, system, verify, account, please, download, attach, file, fill, quest, information, complete, ve...",True
2,491,"[assume, e-mail, relates, discuss, transfer, 2, ldc, swap, raptor, 2, raptor, frankly, know, mean, assign, swap, 1, vehicle, another, vehicle, think, genera...",False
3,2735,"[dear, usaa, customer, due, recent, upgrade, server, currently, review, member, account, request, please, take, second, time, comply, upgrade, effect, accou...",True
4,142,"[interruption, service, notification, take, note, important, update, new, web, mail, improve, new, messaging, system, outlook, also, include, extra, 10gb, u...",True


# Feature Extraction

Before inputing the emails to the machine learning algorithms, they have to be converted to numberical matrices.<br>
This process is called **feature extraction**. Different methods of achieving this will be tried, in order to compare their results.

## Text Vectorization

The baseline feature set will simply consist of numerical representations of the text data. This process is also called **vectorization**. 

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality and since not all words from the corpus will be of importance, only the top 500 most frequent terms are used.

In [6]:
tfidf_balanced = util.tfidf_features(train_balanced_tokens['body'], test_balanced_tokens['body'], min_df=5, max_features=500)

In [7]:
tfidf_train_balanced = tfidf_balanced['tfidf_train']
tfidf_test_balanced = tfidf_balanced['tfidf_test']
tfidf_model_balanced = tfidf_balanced['vectorizer']

In [8]:
tfidf_imbalanced = util.tfidf_features(train_imbalanced_tokens['body'], test_imbalanced_tokens['body'], min_df=5, max_features=500)

In [9]:
tfidf_train_imbalanced = tfidf_imbalanced['tfidf_train']
tfidf_test_imbalanced = tfidf_imbalanced['tfidf_test']
tfidf_model_imbalanced = tfidf_imbalanced['vectorizer']

As an example, here is a part of the calcuated matrix for the balanced train set:

In [10]:
tfidf_train_balanced.head()

Unnamed: 0,0px,10,100,10px,11,12,14px,15,16px,20,2000,2001,2002,2016,24,30,713,able,accept,access,account,action,activity,add,additional,address,adjust,administrator,agreement,alert,align,also,america,american,another,answer,app,apple,approval,approve,arial,ask,attach,attachment,auto,available,avoid,back,background,bank,banking,base,believe,best,bill,block,book,border,bottom,box,br,break,browser,business,button,buy,ca,california,call,cancel,capital,card,case,cc,center,change,charge,check,choose,chris,city,click,close,code,collapse,color,come,comment,committee,communication,company,complete,confidential,confirm,confirmation,contact,contain,content,continue,contract,copy,copyright,corporation,cost,could,create,credit,current,currently,customer,data,database,date,david,day,de,deal,dear,december,decoration,delete,deliver,delivery,department,desk,...,result,return,review,rgb,richard,right,risk,run,safe,sale,san,sans,say,schedule,secure,security,see,select,sell,send,sender,sent,september,serif,serve,server,service,set,share,show,sign,since,sincerely,site,size,software,solid,solution,soon,space,spacing,spam,span,special,staff,standard,start,state,statement,status,step,still,stock,storage,street,style,subject,support,sure,system,table,take,tax,tbody,td,team,technology,term,texas,text,thank,thanks,think,thursday,time,today,top,total,tr,trade,trading,transaction,transfer,try,tuesday,two,tx,type,underline,united,unsubscribe,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,vertical,via,view,visit,volume,want,way,web,wednesday,week,weight,well,width,within,without,word,work,world,would,write,year,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079701,0.0,0.069711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070989,0.0,0.0,0.0,0.136848,0.076496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104256,0.0,0.0,0.063446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.206348,0.0,0.0,0.0,0.067984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.054842,0.0,0.0,0.077906,0.0,0.0,0.0,0.0,0.0,0.0,0.052204,0.0,0.0,0.0,0.040271,0.0,0.0,0.0,0.0,0.0,0.0,0.046712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152765,0.0,0.410093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060358,0.0,0.0,0.0,0.0,0.08161
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.228401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148838,0.0,0.0,0.0,0.0,0.0,0.0,0.099735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129063,0.0,0.0,0.0,0.144371,0.119929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064854,0.0,0.0,0.0,0.0,0.326591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066939,0.178348,0.0,0.093254,0.0,0.0,0.0,0.0,0.094807,0.0,0.0,0.070103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090263,0.124679,0.0,0.067639,0.0,0.075899,0.0,0.0,0.0,0.0,0.08545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081898,0.0,0.0,0.0,0.0,0.0,0.045302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090263,0.0,0.0,0.0,0.09102,0.0,0.102917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077581,0.168027,0.0,0.0,0.0,0.154752,0.0,0.061841,0.0,0.0,0.0,0.0,0.271349,0.0,0.083195,0.184828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057283,0.0,0.0,0.0,0.053197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067002,0.0,0.0,0.0,0.0,0.0,0.085155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18567,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142311,0.337904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192428,0.132443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.155742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129214,0.0,0.0,0.0,0.13318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.123366,0.339633,0.035483,0.379631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178593,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.163454,0.216947,0.0,0.180572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149616,0.0,0.142378,0.0,0.135914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354745,0.0,0.068022,0.0,0.109812,0.0,0.0,0.0,0.0,0.0,0.0,0.187101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec

A more advanced technique is **Word Embedding**, which calculates a high-dimensional vector for each word based on the probability distribution of this word appearing before or after another. In other words, words belonging to the same context usually appear close to each other in the corpus, so they will be closer in the vector space as well.<br>
The chosen implementation is **Word2Vec**.

After the word vectors are calculated, the vectors of each word in an email are being averaged, thus resulting in a single vector for each email.

In [11]:
word2vec_balanced = util.word2vec_features(train_balanced_tokens['body'], test_balanced_tokens['body'], vector_size=100, min_count=5)

In [12]:
word2vec_train_balanced = word2vec_balanced['word2vec_train']
word2vec_test_balanced = word2vec_balanced['word2vec_test']
word2vec_model_balanced = word2vec_balanced['vectorizer']

In [13]:
word2vec_imbalanced = util.word2vec_features(train_imbalanced_tokens['body'], test_imbalanced_tokens['body'], vector_size=100, min_count=5)

In [14]:
word2vec_train_imbalanced = word2vec_imbalanced['word2vec_train']
word2vec_test_imbalanced = word2vec_imbalanced['word2vec_test']
word2vec_model_imbalanced = word2vec_imbalanced['vectorizer']

The resulting feature sets are like the following:

In [15]:
word2vec_train_balanced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0.289927,0.023323,-0.033025,-0.05224,-0.087432,0.085228,-0.135331,0.135703,-0.203205,-0.166281,0.471792,-0.072469,-0.0568,0.107817,-0.261119,0.264992,-0.465535,-0.045343,0.084077,-0.241176,0.210686,-0.018065,-0.060581,-0.174072,-0.084484,0.340751,-0.062518,0.029549,0.006666,-0.012719,0.116415,-0.233394,-0.355152,-0.118888,-0.083707,-0.28286,-0.179822,0.337604,-0.277851,-0.110221,-0.210723,0.163184,0.095068,-0.035413,-0.205244,0.059296,-0.344106,0.22283,0.063546,-0.034768,0.113453,0.189124,0.475867,0.026153,0.251813,0.078436,0.438226,0.0182,0.380544,0.688316,0.044833,-0.581285,0.074491,-0.083344,0.399773,-0.247264,-0.116656,-0.064603,-0.040265,-0.058135,0.063925,0.376548,-0.068843,-0.022001,-0.021945,0.099061,-0.160571,-0.270918,0.037585,-0.127789,0.037425,0.144204,0.163213,0.034335,0.236993,0.062792,0.257908,0.164825,0.220416,0.052248,-0.010558,0.19964,-0.086433,0.018575,0.147764,-0.098477,0.19319,-0.425493,0.284093,0.042912
1,0.264308,-0.147653,-0.069056,-0.12461,-0.112475,0.118836,-0.08931,0.108197,-0.209279,-0.352083,0.373935,-0.066474,-0.259959,0.165344,-0.258764,0.131957,-0.3791,0.009627,0.16811,-0.174027,0.266073,0.026363,-0.049107,-0.318314,-0.023787,0.346341,-0.146584,-0.181751,0.042907,0.228917,0.274304,-0.208073,-0.553218,-0.020099,-0.238196,-0.287991,-0.057582,0.29641,-0.105931,-0.281946,-0.182891,-0.023935,0.042436,0.014628,0.008477,0.043013,-0.458626,0.175524,-0.077043,-0.142023,0.058167,0.06805,0.273047,0.059471,0.058114,0.176285,0.436254,0.000738,0.302897,0.809529,0.132993,-0.584076,0.142072,-0.099821,0.483816,-0.19581,-0.049756,-0.22855,0.011909,-0.172373,0.07314,0.33382,-0.113164,0.047752,-0.014939,0.163898,-0.111511,-0.204291,-0.043195,0.053651,0.012439,-0.180232,0.193077,0.096243,0.43908,0.164556,0.187517,0.139447,0.331749,-0.089862,-0.118559,0.150749,-0.069062,0.071282,0.289252,0.085711,0.321513,-0.43617,0.295039,0.001421
2,0.257001,-0.17236,0.196148,0.026815,0.057188,-0.014209,-0.174962,-0.10755,-0.075742,-0.299471,0.358547,0.084786,-0.090662,0.113803,-0.043732,0.130803,-0.20122,-0.117016,-0.00146,-0.132798,0.216201,0.084477,-0.125812,-0.023278,-0.018728,0.241532,0.049224,-0.06101,0.149119,-0.206277,0.347064,-0.12906,-0.284124,-0.164825,-0.25293,-0.178651,-0.079237,0.36523,0.013546,-0.459573,-0.031016,0.004477,0.091297,-0.156606,-0.046665,0.067479,0.008263,0.171408,-0.019844,-0.008779,-0.076407,-0.066014,0.299304,0.095154,0.162214,0.07097,0.150942,0.120194,0.244226,0.435738,-0.218839,-0.293879,0.128615,-0.225178,0.32383,-0.326511,0.129618,-0.070965,0.069078,-0.151123,0.075714,0.240503,-0.076337,0.083655,-0.150823,0.088636,-0.244509,-0.134922,-0.0161,0.045439,0.073257,0.06524,0.176596,-0.243079,0.384737,-0.044765,-0.0269,0.090852,0.194613,0.109,-0.128408,0.002286,-0.003914,-0.018771,0.158914,0.059541,0.157634,-0.017668,0.064457,0.097142
3,0.352575,-0.026519,-0.026996,-0.14753,-0.140908,0.058073,-0.105103,-0.075193,-0.189889,-0.34065,0.280965,-0.084939,-0.028592,0.189815,-0.165243,0.332969,-0.386733,-0.087735,0.090496,-0.271191,0.094077,0.047163,0.10391,-0.27976,-0.062969,0.225994,-0.073904,-0.112101,-0.097705,0.332614,0.34365,-0.100923,-0.408862,-0.182439,-0.270602,-0.294815,0.008681,0.230169,-0.094656,-0.346125,-0.26128,0.101951,-0.015122,-0.067435,0.054629,0.114775,-0.583011,0.33527,-0.196424,-0.121787,-0.009603,-0.024201,0.329064,-0.04081,0.223667,0.090817,0.589189,0.000817,0.305648,0.826823,0.029985,-0.656368,0.022678,-0.089221,0.401408,-0.30919,0.151394,-0.228074,-0.199166,0.112935,0.150776,0.265024,-0.113424,-0.076242,-0.132452,0.174305,-0.109848,-0.123134,-0.173925,0.115247,0.028274,-0.024941,0.365877,0.01021,0.342349,0.141712,0.168632,0.147259,0.394304,-0.051684,-0.069974,0.185402,-0.106277,0.144509,0.177147,0.063002,0.491704,-0.449946,0.234211,0.123519
4,0.269328,0.156901,-0.002265,-0.094969,-0.073197,0.10496,-0.07932,-0.003786,-0.29497,-0.31062,0.264928,-0.080663,0.073346,0.190574,-0.075681,0.155906,-0.534698,0.034648,0.024971,-0.212579,0.238946,-0.011236,0.011779,-0.134316,0.03366,0.302213,-0.064471,-0.188609,-0.027957,0.234444,0.320695,-0.138069,-0.336584,-0.063843,-0.152887,-0.197854,-0.008764,0.298143,-0.079576,-0.324393,-0.233405,-0.0365,0.171159,-0.022942,-0.034242,0.058185,-0.37987,0.226108,-0.155447,-0.183653,-0.103043,-0.020494,0.271295,0.029746,0.02723,-0.01256,0.522972,0.062487,0.309329,0.683899,-0.090837,-0.38174,-0.149831,0.050808,0.411463,-0.281402,-0.063577,-0.232916,-0.043124,-0.082952,0.070046,0.23736,-0.117269,0.025288,-0.039727,0.010093,0.031452,-0.106626,-0.044678,-0.012975,0.154103,0.034983,0.312596,-0.052373,0.283526,0.06845,0.106016,0.065614,0.301179,0.078806,-0.142254,0.159077,-0.019183,0.002067,0.233449,0.208557,0.232732,-0.469751,0.192083,-0.017369


It should be noted that in this case, the columns do not provide information similar to how a tf-idf column corresponds to one word. This representation is purely for convenience and consistency, it won't matter during the prediction step.

# Feature Selection

In order to further reduce the dimensions of the feature matrix, the number of selected features will be halved using the top features according to the **chi-squared** feature selection method.

## Vectorization Features

### TF-IDF

In [16]:
selected_tfidf_balanced = util.chi2_feature_selection(tfidf_train_balanced, train_balanced_tokens['class'], tfidf_test_balanced, percentile=50)

In [17]:
tfidf_sel_train_balanced = selected_tfidf_balanced['features_train']
tfidf_sel_test_balanced = selected_tfidf_balanced['features_test']
tfidf_sel_model_balanced = selected_tfidf_balanced['selector']

In [18]:
selected_tfidf_imbalanced = util.chi2_feature_selection(tfidf_train_imbalanced, train_imbalanced_tokens['class'], tfidf_test_imbalanced, percentile=50)

In [19]:
tfidf_sel_train_imbalanced = selected_tfidf_imbalanced['features_train']
tfidf_sel_test_imbalanced = selected_tfidf_imbalanced['features_test']
tfidf_sel_model_imbalanced = selected_tfidf_imbalanced['selector']

The now-reduced train set:

In [20]:
tfidf_sel_train_balanced.head()

Unnamed: 0,0px,11,2000,2001,2002,2016,24,713,access,account,activity,address,administrator,agreement,alert,also,america,app,apple,attach,auto,available,avoid,back,bank,banking,bill,block,border,browser,button,buy,california,call,cancel,card,cc,chris,click,collapse,come,comment,committee,company,confirm,contract,copyright,could,customer,date,david,de,deal,dear,december,delete,deliver,delivery,detail,development,device,director,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,error,even,event,expire,express,failure,fargo,fax,ferc,final,firm,first,font,form,forward,friday,full,future,fw,gas,get,give,global,go,good,great,group,hello,help,houston,id,image,immediately,important,inbox,include,incoming,information,instruction,interest,james,january,jeff,john,jose,july,kindly,know,last,legal,let,letter,like,limit,link,list,log,login,long,look,mail,mailbox,many,mark,market,meet,meeting,michael,microsoft,mike,million,monday,month,much,name,nbsp,news,north,notice,notification,november,october,offer,one,online,padding,party,password,payment,paypal,pending,people,per,permanently,personal,phone,pipeline,plan,pm,point,position,power,presentation,price,privacy,program,project,promise,protect,question,quota,rate,re,receive,reply,report,request,reserve,richard,right,risk,safe,sale,say,schedule,secure,security,see,sell,sender,september,server,service,sign,since,sincerely,size,space,spam,start,statement,still,stock,storage,street,subject,support,team,text,thank,thanks,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,volume,want,wednesday,week,well,within,work,would,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079701,0.069711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070989,0.0,0.0,0.136848,0.076496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062382,0.0,0.0,0.156756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06697,0.0,0.0,0.0,0.071297,0.057803,0.0,0.157918,0.0,0.0,0.057104,0.0,0.0,0.059955,0.0,0.054842,0.0,0.077906,0.0,0.0,0.0,0.0,0.052204,0.0,0.0,0.0,0.0,0.0,0.046712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152765,0.0,0.410093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060358,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.228401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148838,0.0,0.0,0.0,0.0,0.099735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129063,0.0,0.0,0.144371,0.119929,0.0,0.0,0.0,0.0,0.0,0.383362,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.064854,0.0,0.0,0.0,0.326591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066939,0.178348,0.0,0.0,0.070103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.124679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.078852,0.0,0.0,0.081677,0.0,0.0,0.117966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162311,0.0,0.0,0.0,0.079405,0.0,0.087475,0.0,0.0,0.0,0.0,0.0,0.0,0.117708,0.075899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186416,0.0,0.083737,0.068767,0.0,0.138638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162801,0.0,0.082408,0.0,0.089531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09102,0.102917,0.0,0.0,0.0,0.0,0.0,0.154752,0.0,0.061841,0.0,0.271349,0.0,0.0,0.0,0.057283,0.0,0.0,0.0,0.053197,0.0,0.0,0.0,0.0,0.0,0.067002,0.0,0.0,0.085155,0.0,0.0,0.0,0.0,0.18567,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142311,0.337904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213512,0.0,0.0,0.0,0.0,0.0,0.150973,0.0,0.0,0.0,0.132443,0.0,0.0,0.0,0.0,0.108031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129214,0.0,0.0,0.0,0.13318,0.0,0.0,0.0,0.0,0.0,0.123366,0.339633,0.035483,0.379631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159936,0.0,0.149449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134823,0.0,0.0,0.0,0.0,0.0,0.098065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.163454,0.216947,0.0,0.0,0.0,0.0,0.0,0.186326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354745,0.0,0.068022,0.0,0.109812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Final Dataset Creation

Before using the features for classification with the machine learning algorithms, it is best to tidy up the datasets and keep them consistent by concatenating the features, the id and the class columns in the same DataFrame.

In [21]:
column_names = ['email_class', 'email_id'] # column names changed in case the word class or id appear in the token list

### TF-IDF

In [22]:
final_tfidf_train_balanced = dataset_add_columns(tfidf_sel_train_balanced, [train_balanced_tokens['class'], train_balanced_tokens['id']], column_names)
final_tfidf_test_balanced = dataset_add_columns(tfidf_sel_test_balanced, [test_balanced_tokens['class'], test_balanced_tokens['id']], column_names)

In [23]:
final_tfidf_train_imbalanced = dataset_add_columns(tfidf_sel_train_imbalanced, [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']], column_names)
final_tfidf_test_imbalanced = dataset_add_columns(tfidf_sel_test_imbalanced, [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']], column_names)

Looking into one of the previously explored examples:

In [24]:
final_tfidf_train_balanced[final_tfidf_train_balanced['email_id'] == 6]

Unnamed: 0,email_id,email_class,0px,11,2000,2001,2002,2016,24,713,access,account,activity,address,administrator,agreement,alert,also,america,app,apple,attach,auto,available,avoid,back,bank,banking,bill,block,border,browser,button,buy,california,call,cancel,card,cc,chris,click,collapse,come,comment,committee,company,confirm,contract,copyright,could,customer,date,david,de,deal,dear,december,delete,deliver,delivery,detail,development,device,director,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,error,even,event,expire,express,failure,fargo,fax,ferc,final,firm,first,font,form,forward,friday,full,future,fw,gas,get,give,global,go,good,great,group,hello,help,houston,id,image,immediately,important,inbox,include,incoming,information,instruction,interest,james,january,jeff,john,jose,july,kindly,know,last,legal,let,letter,...,link,list,log,login,long,look,mail,mailbox,many,mark,market,meet,meeting,michael,microsoft,mike,million,monday,month,much,name,nbsp,news,north,notice,notification,november,october,offer,one,online,padding,party,password,payment,paypal,pending,people,per,permanently,personal,phone,pipeline,plan,pm,point,position,power,presentation,price,privacy,program,project,promise,protect,question,quota,rate,re,receive,reply,report,request,reserve,richard,right,risk,safe,sale,say,schedule,secure,security,see,sell,sender,september,server,service,sign,since,sincerely,size,space,spam,start,statement,still,stock,storage,street,subject,support,team,text,thank,thanks,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,volume,want,wednesday,week,well,within,work,would,year
2228,6,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057317,0.0,0.079004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.396242,0.090574,0.0,0.0,0.0,0.0,0.0,0.0,0.112731,0.104429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114471,0.0,0.0,0.103148,0.069657,0.105036,0.0,0.0,0.0,0.112691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08388,0.0,0.0,0.0,0.0,0.067186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092669,0.0,0.0,0.076713,0.0,0.0,0.107239,0.158134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The words that appear more in the email have a bigger score, while the words that don't appear at all have a score of zero.

### Word2Vec

In [25]:
final_word2vec_train_balanced = dataset_add_columns(word2vec_train_balanced, [train_balanced_tokens['class'], train_balanced_tokens['id']], column_names)
final_word2vec_test_balanced = dataset_add_columns(word2vec_test_balanced, [test_balanced_tokens['class'], test_balanced_tokens['id']], column_names)

In [26]:
final_word2vec_train_imbalanced = dataset_add_columns(word2vec_train_imbalanced, [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']], column_names)
final_word2vec_test_imbalanced = dataset_add_columns(word2vec_test_imbalanced, [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']], column_names)

In [27]:
final_tfidf_train_balanced.head()

Unnamed: 0,email_id,email_class,0px,11,2000,2001,2002,2016,24,713,access,account,activity,address,administrator,agreement,alert,also,america,app,apple,attach,auto,available,avoid,back,bank,banking,bill,block,border,browser,button,buy,california,call,cancel,card,cc,chris,click,collapse,come,comment,committee,company,confirm,contract,copyright,could,customer,date,david,de,deal,dear,december,delete,deliver,delivery,detail,development,device,director,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,error,even,event,expire,express,failure,fargo,fax,ferc,final,firm,first,font,form,forward,friday,full,future,fw,gas,get,give,global,go,good,great,group,hello,help,houston,id,image,immediately,important,inbox,include,incoming,information,instruction,interest,james,january,jeff,john,jose,july,kindly,know,last,legal,let,letter,...,link,list,log,login,long,look,mail,mailbox,many,mark,market,meet,meeting,michael,microsoft,mike,million,monday,month,much,name,nbsp,news,north,notice,notification,november,october,offer,one,online,padding,party,password,payment,paypal,pending,people,per,permanently,personal,phone,pipeline,plan,pm,point,position,power,presentation,price,privacy,program,project,promise,protect,question,quota,rate,re,receive,reply,report,request,reserve,richard,right,risk,safe,sale,say,schedule,secure,security,see,sell,sender,september,server,service,sign,since,sincerely,size,space,spam,start,statement,still,stock,storage,street,subject,support,team,text,thank,thanks,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,volume,want,wednesday,week,well,within,work,would,year
0,3423,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079701,0.069711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070989,0.0,0.0,0.136848,0.076496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062382,0.0,0.0,0.156756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068424,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.072691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06697,0.0,0.0,0.0,0.071297,0.057803,0.0,0.157918,0.0,0.0,0.057104,0.0,0.0,0.059955,0.0,0.054842,0.0,0.077906,0.0,0.0,0.0,0.0,0.052204,0.0,0.0,0.0,0.0,0.0,0.046712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152765,0.0,0.410093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060358,0.0,0.0
1,682,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.228401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148838,0.0,0.0,0.0,0.0,0.099735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129063,0.0,0.0,0.144371,0.119929,0.0,0.0,0.0,0.0,0.0,0.383362,0.0,0.0,0.0,0.0
2,491,False,0.0,0.0,0.0,0.064854,0.0,0.0,0.0,0.326591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066939,0.178348,0.0,0.0,0.070103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.124679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.078852,0.0,0.0,0.081677,0.0,0.0,0.117966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162311,0.0,0.0,0.0,0.079405,0.0,0.087475,0.0,0.0,0.0,0.0,0.0,0.0,0.117708,0.075899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186416,0.0,0.083737,0.068767,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.047506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162801,0.0,0.082408,0.0,0.089531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09102,0.102917,0.0,0.0,0.0,0.0,0.0,0.154752,0.0,0.061841,0.0,0.271349,0.0,0.0,0.0,0.057283,0.0,0.0,0.0,0.053197,0.0,0.0,0.0,0.0,0.0,0.067002,0.0,0.0,0.085155,0.0,0.0,0.0,0.0,0.18567,0.0
3,2735,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142311,0.337904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213512,0.0,0.0,0.0,0.0,0.0,0.150973,0.0,0.0,0.0,0.132443,0.0,0.0,0.0,0.0,0.108031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.281316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129214,0.0,0.0,0.0,0.13318,0.0,0.0,0.0,0.0,0.0,0.123366,0.339633,0.035483,0.379631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,142,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159936,0.0,0.149449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.134823,0.0,0.0,0.0,0.0,0.0,0.098065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.163454,0.216947,0.0,0.0,0.0,0.0,0.0,0.186326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354745,0.0,0.068022,0.0,0.109812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Saving the Results

In [28]:
save_to_csv(final_tfidf_train_balanced, csv_path, 'tfidf_chi2_train_balanced.csv')
save_to_csv(final_tfidf_test_balanced, csv_path, 'tfidf_chi2_test_balanced.csv')

save_to_csv(final_tfidf_train_imbalanced, csv_path, 'tfidf_chi2_train_imbalanced.csv')
save_to_csv(final_tfidf_test_imbalanced, csv_path, 'tfidf_chi2_test_imbalanced.csv')

Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/tfidf_chi2_train_balanced.csv
Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/tfidf_chi2_test_balanced.csv
Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/tfidf_chi2_train_imbalanced.csv
Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/tfidf_chi2_test_imbalanced.csv


In [29]:
save_to_csv(final_word2vec_train_balanced, csv_path, 'word2vec_train_balanced.csv')
save_to_csv(final_word2vec_test_balanced, csv_path, 'word2vec_test_balanced.csv')

save_to_csv(final_word2vec_train_imbalanced, csv_path, 'word2vec_train_imbalanced.csv')
save_to_csv(final_word2vec_test_imbalanced, csv_path, 'word2vec_test_imbalanced.csv')

Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/word2vec_train_balanced.csv
Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/word2vec_test_balanced.csv
Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/word2vec_train_imbalanced.csv
Saving to C:\Users\13636\OneDrive\01WorkingDirectory\02PycharmProjects\FraudulentEmailAttack\data/csv/word2vec_test_imbalanced.csv
