## Importing the libraries

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import tensorflow as tf
tf.__version__
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

## Importing the dataset

In [2]:
dataset_train = pd.read_csv('emails.csv') 
dataset_train.duplicated().sum()

0

In [3]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email

def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'), 
        'to': map_to_list(emails, 'to'), 
        'from_': map_to_list(emails, 'from')
    }

def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [4]:
email_df = pd.DataFrame(parse_into_emails(dataset_train.message))
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

In [5]:
email_df.head()

Unnamed: 0,body,to,from_
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com
5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com


## Getting the shape of the dataset


In [6]:
email_df.shape

(491067, 3)

## Preprocessing the dataset

In [7]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    '''
    Preprocess a string.
    :parameter
        :param text: string - name of column containing text
        :param lst_stopwords: list - list of stopwords to remove
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmitisation is to be applied
    :return
        cleaned text
    '''

    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\revoquant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
lst_stopwords = nltk.corpus.stopwords.words("english")


In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\revoquant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
email_df["body_clean"] = email_df["body"].apply(lambda x: preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))
email_df.head()

Unnamed: 0,body,to,from_,body_clean
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,test successful way go
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,randycan send schedule salary level everyone t...
5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com,greghow either next tuesday thursdayphillip


In [15]:
email_df.groupby('to').describe()

Unnamed: 0_level_0,body,body,body,body,from_,from_,from_,from_,body_clean,body_clean,body_clean,body_clean
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
to,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
""" - *Chris.Germany@enron.com"" <Chris.Germany@enron.com>",4,2,"DoneChris,Please fax the support sent to you o...",2,4,1,chris.germany@enron.com,4,4,2,donechrisplease fax support sent contact cindy...,2
""" - *Destephanis, Kara"" <kdestep@columbiaenergygroup.com>,",3,1,For 1/19th we have sold the extra capacity to ...,3,3,1,joann.collins@enron.com,3,3,1,119th sold extra capacity amoco5000 bgecitygat...,3
""" - *Kinney, Doug"" <dkinney@columbiaenergygroup.com>,",6,2,They want firm primary.---------------------- ...,3,6,2,clarissa.garcia@enron.com,3,6,2,want firm primary forwarded clarissa garciahou...,3
""" - *Koch, Kent"" <kkoch@nisource.com>, "" -",4,1,"AM ---------------------------*Millar, Debra"" ...",4,4,1,judy.townsend@enron.com,4,4,1,millar debra dmillarnisourcecom burke lynnlbur...,4
""" - *Scott.Goodell@enron.com"" <Scott.Goodell@enron.com>,",7,2,I will assume you de' man on this one.Chris Ge...,4,7,2,chris.germany@enron.com,4,7,2,assume de man onechris germanyhouectectagl sen...,4
...,...,...,...,...,...,...,...,...,...,...,...,...
"zuroff.adele@enron.com, gold.amy@enron.com, wells.becky@enron.com,",4,3,"dahncke.beth@enron.com, lyons.bill@enron.com, ...",2,4,1,ava.garcia@enron.com,4,4,3,dahnckebethenroncom lyonsbillenroncom matticeb...,2
zwharton@dawray.com,43,28,Credit cards that is! - I just requested anoth...,4,43,5,martin.cuilla@enron.com,19,43,28,credit card requested another 50 macys gc isst...,4
"zwharton@dawray.com, w.dangerfield@deaus.com, carldfrank@yahoo.com,",1,1,"chuckrutt@hotmail.com, dschlich@csc.com, dgree...",1,1,1,lfrank@vignette.com,1,1,1,chuckrutthotmailcom dschlichcsccom dgreenclaus...,1
"~*Everyone At San Ramon/Western Region/The Bentley Company@Exchange,",2,1,"jeff.dasovich@enron.com, ryan.deane@enron.com,...",2,2,1,Jeannine Huda/Western Region/The Bentley Compa...,2,2,1,jeffdasovichenroncom ryandeaneenroncom lairddy...,2


In [16]:
email_df.groupby('from_').describe()

Unnamed: 0_level_0,body,body,body,body,to,to,to,to,body_clean,body_clean,body_clean,body_clean
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
from_,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
""" Walter McDougle"" <wmcdougle@spinexp.com>",3,1,"AM ---------------------------today volBill,Th...",3,3,1,Billyoung3@aol.com @ ENRON,3,3,1,today volbillthe vr 375 a1d gas well currently...,3
""""" <subs@btu.net>@ENRON",4,4,-----Original Message-----Attached is the late...,1,4,2,Btus Weekly Power Report,2,4,4,original messageattached latest issue btu week...,1
"""""m y-"" <enronsato@hotmail.com>@ENRON",6,1,Jeff wants to know what you think about this. ...,6,6,1,"Skilling, Jeff",6,6,1,jeff want know think srsoriginal messageonenro...,6
"""3Toyota"" <3toyota@cadvision.com>@ENRON [mailto",1,1,How about 2002's? I was just thinking there wo...,1,1,1,"Dorland, Chris",1,1,1,2002s thinking would incentive 2001schrisorigi...,1
"""=?ISO-8859-1?Q?APG=20-=20Ana=20Paula=20Gon=E7alves?=""",6,3,and EPErqb@tozzini.com.brrqb@tozzini.com.brAna...,2,6,1,"<john.schwartzenburg@enron.com>, <randy.pais@e...",6,6,3,eperqbtozzinicombrrqbtozzinicombrana paula tha...,2
...,...,...,...,...,...,...,...,...,...,...,...,...
zufferli@enron.com,2,2,"I think I've already responded, but, we are OK...",1,2,1,john.lavorato@enron.com,2,2,2,think ive already responded ok power trading,1
zulie.flores@enron.com,6,3,"kevin.howard@enron.com, harold.inman@enron.com...",3,6,2,"marla.barnard@enron.com, cynthia.barrow@enron....",3,6,3,kevinhowardenroncom haroldinmanenroncomvinceka...,3
zvo2z17d0@untappedmarkets.com,1,1,Would YOU like to learn more about a company ...,1,1,1,undisclosed-recipients@enron.com,1,1,1,would like learn company product virtually unt...,1
zwharton@dawray.com,2,2,"Hey honey, will you please follow up with Tony...",1,2,1,mcuilla@enron.com,2,2,2,hey honey please follow tony 7132995034 aboutw...,1


In [17]:
email_df.groupby('body_clean').describe()

Unnamed: 0_level_0,body,body,body,body,to,to,to,to,from_,from_,from_,from_
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
body_clean,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
,450,105,*~*~*~*~*~*~*~*~*~*,121,450,210,w..white@enron.com,18,450,147,matthew.lenhart@enron.com,51
0 011038original messagecommissioners attached letter counsel enron energy service addressing thepoints information raised oral argument friday may 11 2001in 0011038 et alx24460docjeanne bennett x24460doc,1,1,0\t0-11-038-----Original Message-----Commissio...,1,1,1,'LYN@cpuc.ca.gov'; 'RB1@cpuc.ca.gov'; 'HMD@cpu...,1,1,1,JBennett <JBennett@GMSSR.com>@ENRON [mailto,1
0 011038original messagecxwcpuccagov gfbcpuccagov011038commissioners attached letter counsel enron energy service addressing thepoints information raised oral argument friday may 11 2001in 0011038 et alx24460docjeanne bennett x24460doc,2,1,0\t0-11-038-----Original Message-----'CXW@cpuc...,2,2,1,'LYN@cpuc.ca.gov'; 'RB1@cpuc.ca.gov'; 'HMD@cpu...,2,2,1,JBennett <JBennett@GMSSR.com>@ENRON,2
0 5 percent would similar bod enron turning project frevert kitchen duran support signed dash unless bod planning new ceo unlikely happenas indication confidence deal team spending 3 day vail week family reccuperate spsent blackberry wireless handheld wwwblackberrynet,1,1,0 to 5 percent. It would be similar to having...,1,1,1,louise.kitchen@enron.com,1,1,1,charles.ward@enron.com,1
0 bid christie g anderson akers2 bid kevin johnson0 bid torrance smallthanks,4,1,"$0 bids for S Christie, G Anderson, and D Aker...",4,4,1,patrick.ryder@enron.com,4,4,1,eric.bass@enron.com,4
...,...,...,...,...,...,...,...,...,...,...,...,...
zuliefloresenroncomhere frank prepared sent lay whalley frevert agreed please forward individual group feel appropriate bethattached final analysis enron focus group paul braganasked send apology getting lastminute appreciate flexibilitysincerelymichael rossiluntz researchfocus group analysisdoc focus group analysisdoc,1,1,zulie.flores@enron.comHere is what Frank prepa...,1,1,1,"""'etilney@enron.com'"" <etilney@enron.com>",1,1,1,elizabeth.tilney@enron.com,1
zuliei shall take bus tuesday morning adet breakfastvince kaminski,1,1,"Zulie,I shall take the bus Tuesday morning to ...",1,1,1,zulie.flores@enron.com,1,1,1,j.kaminski@enron.com,1
zulievince kaminski,1,1,"Zulie,Vince Kaminski",1,1,1,zulie.flores@enron.com,1,1,1,j.kaminski@enron.com,1
zwe making list directory need moved netco directory moved estate cant use copy made group could use file make copy change list put together alreadythankskam,1,1,"Z,We have been making lists of the directories...",1,1,1,zhiyong.wei@enron.com,1,1,1,kam.keiser@enron.com,1


## Creating the features which indicates that the email is fraud or there is fraud related message in that particular email1

In [18]:
fraud_related_words = ['fraud', 'scam', 'phishing', 'malware', 'deception', 'hoax']  # Add more fraud-related words if needed

email_df['fraud'] = email_df['body_clean'].apply(lambda x: 1 if any(word in x.lower() for word in fraud_related_words) else 0)
email_df.head()

Unnamed: 0,body,to,from_,body_clean,fraud
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast,0
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...,0
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,test successful way go,0
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,randycan send schedule salary level everyone t...,0
5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com,greghow either next tuesday thursdayphillip,0


In [19]:
email_df.groupby('fraud').describe()

Unnamed: 0_level_0,body,body,body,body,to,to,to,to,from_,from_,from_,from_,body_clean,body_clean,body_clean,body_clean
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
fraud,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,488345,227154,"geir.solberg@enron.com, john.anderson@enron.co...",2135,488345,65404,pete.davis@enron.com,9097,488345,36978,kay.mann@enron.com,12312,488345,226007,geirsolbergenroncom johnandersonenroncommarkgu...,2135
1,2722,1336,BUSINESS HIGHLIGHTSTRANSACTION DEVELOPMENTTran...,35,2722,699,"David_Aamodt@pgn.com, dapnucc@teleport.com, di...",59,2722,616,"Lebrocq, Wendi",58,2722,1336,business highlightstransaction developmenttran...,35


In [20]:
observations_class_0 = email_df[email_df['fraud'] == 1]

In [21]:
observations_class_0

Unnamed: 0,body,to,from_,body_clean,fraud
1332,TheStreet.com and Privista are pleased to pres...,members@realmoney.com,members@realmoney.com,thestreetcom privista pleased present withcred...,1
3054,Premonition?=20=09=09=09British Trader Sentenc...,John Arnold/HOU/ECT@ECT,john.arnold@enron.com,premonition20090909british trader sentenced pr...,1
3326,The problem if we limit the size on options to...,andy.zipper@enron.com,john.arnold@enron.com,problem limit size option size offered onthe s...,1
3947,"paul.mead@enron.com, david.gallagher@enron.com...","richard.lewis@enron.com, john.lavorato@enron.c...",bob.shults@enron.com,paulmeadenroncom davidgallagherenroncomgregorb...,1
3949,Premonition?=20=09=09=09British Trader Sentenc...,John Arnold/HOU/ECT@ECT,john.arnold@enron.com,premonition20090909british trader sentenced pr...,1
...,...,...,...,...,...
515584,"d..gros@enron.com, kerry.roper@enron.combrenda...","rex.shelby@enron.com, andy.zipper@enron.com, s...",tina.spiller@enron.com,dgrosenroncom kerryroperenroncombrendaflorescu...,1
515586,"d..gros@enron.com, steve.hotte@enron.com, exec...","jeff.bartlett@enron.com, sally.beck@enron.com,...",tina.spiller@enron.com,dgrosenroncom stevehotteenroncom execjonesenro...,1
515587,"d..gros@enron.com, steve.hotte@enron.com, exec...","jeff.bartlett@enron.com, sally.beck@enron.com,...",tina.spiller@enron.com,dgrosenroncom stevehotteenroncom execjonesenro...,1
515826,"w..brown@enron.com, john.cummings@enron.com, h...","jeff.bartlett@enron.com, sally.beck@enron.com,...",peggy.mccurley@enron.com,wbrownenroncom johncummingsenroncom hdouglasen...,1


## Taking care of missing values if present over here

In [22]:
email_df.isnull().sum()

body          0
to            0
from_         0
body_clean    0
fraud         0
dtype: int64

No missing values are there inside the dataset

In [23]:
X=email_df.drop('fraud',axis=1)
y=email_df['fraud']

In [25]:
X.head(2)

Unnamed: 0,body,to,from_,body_clean
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...


In [27]:
X.shape

(491067, 4)

In [26]:
y.head(2)

0    0
1    0
Name: fraud, dtype: int64

In [28]:
y.shape

(491067,)

## Splitting the dataset into training set and testing set over here

In [37]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0,stratify=email_df['fraud'])


## Dealing with imbalanced dataset

In [38]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler 

# Specify the sampling_strategy as a float or dictionary
# For example, if you want to under-sample the majority class to have 80% of the minority class:
ns = RandomUnderSampler(sampling_strategy=0.8)

# Assuming you have already defined X_train and y_train
X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)

print("The number of classes before fit: {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit: Counter({0: 366259, 1: 2041})
The number of classes after fit Counter({0: 2551, 1: 2041})


## Cleaning the text

In [41]:
messages=X.copy()

In [43]:
messages['body'][1]

"Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time."

In [44]:
messages.reset_index(inplace=True)

In [45]:
messages

Unnamed: 0,index,body,to,from_,body_clean
0,0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast
1,1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...
2,2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,test successful way go
3,3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,randycan send schedule salary level everyone t...
4,5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com,greghow either next tuesday thursdayphillip
...,...,...,...,...,...
491062,517396,This is a trade with OIL-SPEC-HEDGE-NG (John L...,kori.loibl@enron.com,john.zufferli@enron.com,trade oilspechedgeng john lavoratos book john ...
491063,517397,Some of my position is with the Alberta Term b...,john.lavorato@enron.com,john.zufferli@enron.com,position alberta term book send position direc...
491064,517398,"2-----Original Message-----Morning John,I'm st...","Zufferli, John","Doucet, Dawn",2original messagemorning johnim still working ...
491065,517399,Analyst\t\t\t\t\tRankStephane Brodeur\t\t\t1Ch...,jeanie.slone@enron.com,john.zufferli@enron.com,analyst rankstephane brodeur 1chad clark 1ian ...
