## Mounting the google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing the essential libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import tensorflow as tf
tf.__version__
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

## Importing the dataset

In [3]:
dataset_train = pd.read_csv('drive/MyDrive/emails.csv',nrows=10000)
dataset_train.duplicated().sum()

0

In [4]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email

def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'),
        'to': map_to_list(emails, 'to'),
        'from_': map_to_list(emails, 'from')
    }

def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [5]:
email_df = pd.DataFrame(parse_into_emails(dataset_train.message))
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

In [6]:
email_df.head()

Unnamed: 0,body,to,from_
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com
5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com


## Getting the shape of the dataset

In [7]:
email_df.shape

(9464, 3)

## Preprocessing the dataset

In [8]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    '''
    Preprocess a string.
    :parameter
        :param text: string - name of column containing text
        :param lst_stopwords: list - list of stopwords to remove
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmitisation is to be applied
    :return
        cleaned text
    '''

    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    text = " ".join(lst_text)
    return text

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
lst_stopwords = nltk.corpus.stopwords.words("english")

In [11]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
email_df["body_clean"] = email_df["body"].apply(lambda x: preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))
email_df.head()

Unnamed: 0,body,to,from_,body_clean
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,test successful way go
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,randycan send schedule salary level everyone t...
5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com,greghow either next tuesday thursdayphillip


In [13]:
email_df.groupby('to').describe()

Unnamed: 0_level_0,body,body,body,body,from_,from_,from_,from_,body_clean,body_clean,body_clean,body_clean
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
to,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
"""'Arnold, John'"" <john.arnold@enron.com>",9,3,personal business.PMJohn -I completely underst...,3,9,2,John.Arnold@enron.com [SMTP,6,9,3,personal businesspmjohn completely understand ...,3
"""'Eric Bass'"" <Eric.Bass@enron.com>",1,1,I hope you were only thinking good things!I ha...,1,1,1,eric.bass@enron.com,1,1,1,hope thinking good thingsi pretty good weekend...,1
"""'Eric_Bass@enron.com'"" <Eric_Bass@enron.com>",2,2,It only takes about 10 minutes to get to highl...,1,2,1,eric.bass@enron.com,2,2,2,take 10 minute get highland village cango anyt...,1
"""'John.Arnold@enron.com'"" <John.Arnold@enron.com>",3,1,I will not be able to attend the meeting but d...,3,3,1,john.arnold@enron.com,3,3,1,able attend meeting interest atutor math subje...,3
"""'K. Bass'"" <daphneco64@bigplanet.com>",1,1,I think we are going to stay in town and meet ...,1,1,1,"""Bass, Jason"" <Jason.Bass2@COMPAQ.com>",1,1,1,think going stay town meet airport thesurgery ...,1
...,...,...,...,...,...,...,...,...,...,...,...,...
ywang@enron.com,6,2,please add mike grigsby to distribution,3,6,1,phillip.allen@enron.com,6,6,2,please add mike grigsby distribution,3
"ywang@enron.com, patti.sullivan@enron.com, phillip.k.allen@enron.com,",18,2,"jane.m.tholt@enron.com, mike.grigsby@enron.com...",15,18,1,critical.notice@enron.com,18,18,2,janemtholtenroncom mikegrigsbyenroncomtranswes...,15
zalaywan@caiso.com,3,1,put this into the congestion redesign file if ...,3,3,1,tim.belden@enron.com,3,3,1,put congestion redesign file havent alreadsusa...,3
"zalaywan@caiso.com, eschmid@caiso.com, crobinson@caiso.com,",3,1,mary.hain@enron.comBelow is a summary of my SW...,3,3,1,tom.delaney@enron.com,3,3,1,maryhainenroncombelow summary swptf caiso sw i...,3


In [14]:
email_df.groupby('from_').describe()

Unnamed: 0_level_0,body,body,body,body,to,to,to,to,body_clean,body_clean,body_clean,body_clean
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
from_,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
"""ALLYSON FELLER, BLOOMBERG/ NEW YORK"" <AFELLER@bloomberg.net>@ENRON",3,1,"Please send it to me.johnJohn,We are trying to...",3,3,1,JPECHER@ENRON.COM,3,3,1,please send mejohnjohnwe trying transfer bloom...,3
"""ALLYSON FELLER, BLOOMBERG/ NEW YORK"" <AFELLER@bloomberg.net>@ENRON [mailto",1,1,"Please send it to me.johnJohn,We are trying to...",1,1,1,JPECHER@ENRON.COM,1,1,1,please send mejohnjohnwe trying transfer bloom...,1
"""Adrian Clark"" <AClark@firstcallassociates.com>@ENRON [mailto",1,1,I know a set up when I see one-----Original Me...,1,1,1,"Maggi, Mike",1,1,1,know set see oneoriginal messagemikehello name...,1
"""Allen, Phillip K."" <Phillip.K.Allen@ENRON.com>",1,1,"Ceci,I would like to stay current on listings ...",1,1,1,<ceci@gorge.net>,1,1,1,cecii would like stay current listing 225 alte...,1
"""Andy Colman"" <andy@spectronenergy.com>@ENRON",1,1,"not mad at you-----Original Message-----Jon, c...",1,1,1,"Arnold, John",1,1,1,mad youoriginal messagejon cant say desk front...,1
...,...,...,...,...,...,...,...,...,...,...,...,...
wtashnek@aol.com,1,1,I'm just checking to see if you got my e-mail ...,1,1,1,john.arnold@enron.com,1,1,1,im checking see got email monday,1
yahoo-delivers@yahoo-inc.com,8,4,[IMAGE]Yahoo! sent this email to you because y...,3,8,2,pallen@ect.enron.com,5,8,4,imageyahoo sent email yahoo account informatio...,3
yevgeny.frolov@enron.com,1,1,kirk.mcdaniel@enron.comkirk.mcdaniel@enron.com...,1,1,1,"w.kent.baker@accenture.com, kenny.w.baldwin@ac...",1,1,1,kirkmcdanielenroncomkirkmcdanielenroncomas dis...,1
yild@zdemail.zdlists.com,3,1,"as links, visit the Yahoo! Internet Life home ...",3,3,1,pallen@enron.com,3,3,1,link visit yahoo internet life home page daily...,3


In [15]:
email_df.groupby('body_clean').describe()

Unnamed: 0_level_0,body,body,body,body,to,to,to,to,from_,from_,from_,from_
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
body_clean,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
,9,4,AM ---------------------------,4,9,4,John Arnold/HOU/ECT@ECT,4,9,2,john.arnold@enron.com,6
001nevereditthisline002this advertisement sent thestreetcombecause currently within last year beena subscriber either freetrial paid one web siteswwwthestreetcom wwwrealmoneycom nota current former subscriber believe receivedthis message error please forward message tomembersthestreetcom call customer servicedepartment 18005629571 please assured werespect privacy subscriber havenot disclosed name information youto advertiser third partyfree reportavailable online nowdear investordespite tragic event september 11th america remainsstrong resolute week ahead challengesfor individual investor dauntingthis time go alone thats want tohave free online access forbes latest investment reportthis url browserforbes financial editor laszlo birinyi jr david dremankenneth l fisher richard lehman marc robin haveidentified special profit opportunity well somehigh yielding safe haven demand missile guidance night visionmarc robin weighs redhot company destined helpus win war make carbon dioxide laser system usinginfrared light essential military missile guidanceand night vision system access free report surprising safety highyield preferredsour income security advisor richard lehman explains todaysmore sophisticated preferred stock offering higher yield andlower risk past counterpart discover best yieldingfrom 85 111 free report timely value quality stocksvalue pro david dreman uncovered great bargainsin one ofthe best managed oil giant reserve 4 barrel per sharein tobacco leaf processor benefiting strong overseas demandand overlooked pharmaceutical great shape 2002 tech favorite poised fast reboundlaszlo birinyi jr see substantial growth 2002 qualitytech bluechip attractive currentoversold price get detail free reportthats small sampling insight opportunitiesyoull find free report detail includingcompany name stock symbol available online right nowfor immediate access forbes free report opportunity andsafe haven today market money nowyours trulysteve forbeschairmanthis advertisement supplied third party hasbeen sent thestreetcom informational purposesonly responsible independentlyauthenticated whole part accuracy informationprovided advertisement information berelied upon without consulting advertiser advertisementdoes imply endorsement usthestreetcom inc registered security brokerdealeror investment adviser either u security andexchange commission state security regulatoryauthority information site dissemination ofadvertising material intended security brokerageinvestment tax accounting legal advice u offer orsolicitation u offer sell buy endorsementrecommendation sponsorship service newslettercompany security fund cannot ass verifyor guarantee adequacy accuracy completeness anyinformation suitability profitability particular investmentor potential value investment informational sourceyou bear sole responsibility investment research anddecisions seek advice qualified securitiesprofessional making investment purchase ofinvestment advice sale purchase product servicesor security ownership interest result informationpresented site disseminated advertising material willbe negotiated basis party without anyadditional participation remuneration thestreetcom incif would prefer receive type offer u inthe future please reply thestreetoffers2mailthestreetcomwith remove subject line001nevereditthisline002hrthis advertisement sent thestreetcombecause currently within last year beena subscriber either freetrial paid one web siteswwwthestreetcom wwwrealmoneycom nota current former subscriber believe receivedthis message error please forward message tomembersthestreetcom call customer servicedepartment 18005629571 please assured werespect privacy subscriber havenot disclosed name information youto advertiser third partyhrbrheadstyle typetextcss hyperlink state font type smallhead smallheadred mediumhead mediumheadlight mediumheadreditalic mediumheadblue mediumheadblueitalic mediumheadsection mediumheaditalic largehead bar styleheadbody bgcolorfffffftable width480 cellpadding0 cellspacing0 border0trtrtrtd bgcolorffff00 width319 height60 aligncenter valignmiddle classlargeheadfree reportbravailable online nowtdtrtrtrtrtd colspan2 bgcolorffffff width478 aligncenter valignmiddletable width470 cellpadding4 cellspacing0 border0trtd width470 bgcolorffffffbrspan classmediumheaddear investorbrbrspanspan classmediumheadlightdespite tragic event september 11th america remains strong resolute week ahead challenge individual investor dauntingbrbrbrtable width470 cellpadding0 cellspacing0 border0trtd width15nbsptd classmediumheadbluehereatdtd width15nbsptdtrtablebrbrforbes financial editor span classmediumheadlaszlo birinyi jrspan span classmediumheaddavid dremanspan span classmediumheadkenneth l fisherspan span classmediumheadrichard lehmanspan span classmediumheadmarc robinsspan identified special profit opportunity well highyielding safe havensbrbrbrspancenterspan classmediumheadsectiondemand missile guidance night visionbrbrspancenter classmediumheadbluefree reportabrbrbrspancenterspan classmediumheadsectionsurprising safety highyield preferredsbrbrspancenter classmediumheadbluefree reportabrbrbrspancenterspan classmediumheadsectiontimely value quality stocksbrbrspancenter classmediumheadbluefree reportabrbrbrspancenterspan classmediumheadsectiontech favorite poised fast reboundbrbrspancenter classmediumheadbluefree reportabrbrbrbrspan classmediumheadblueavailable online right nowabrbrtable width470 cellpadding0 cellspacing0 border0tr classmediumheadbluefree reporta span classmediumheaditalicopportunities safe haven today marketspan money nowtdtrtablebrbrbrbrnbspnbspnbspspan classmediumheadyours trulybrbrnbspnbspnbspsteve forbesbrnbspnbspnbspchairmanspantdtrtabletdtrtrtd colspan2 width478 height10 bgcolorffffffnbsptdtrtrtrtablebrhrthis advertisement supplied third party hasbeen sent thestreetcom informational purposesonly responsible independentlyauthenticated whole part accuracy informationprovided advertisement information berelied upon without consulting advertiser advertisementdoes imply endorsement usthestreetcom inc registered security brokerdealeror investment adviser either u security andexchange commission state security regulatoryauthority information site dissemination ofadvertising material intended security brokerageinvestment tax accounting legal advice u offer orsolicitation u offer sell buy endorsementrecommendation sponsorship service newslettercompany security fund cannot ass verifyor guarantee adequacy accuracy completeness anyinformation suitability profitability particular investmentor potential value investment informational sourceyou bear sole responsibility investment research anddecisions seek advice qualified securitiesprofessional making investment purchase ofinvestment advice sale purchase product servicesor security ownership interest result informationpresented site disseminated advertising material willbe negotiated basis party without anyadditional participation remuneration thestreetcom incbrbrif would prefer receive type offer u inthe future please reply thestreetoffers2mailthestreetcom remove thesubject linebrbrhrbrbr001nevereditthisline002,1,1,--001nevereditthisline002---------------------...,1,1,1,jarnold@ect.enron.com,1,1,1,thestreet@offers2.mail-thestreet.com,1
01 attachment free virus scan mail forwarded phillip k allenhouect 0305200101 attachment free virus scan mailsorry deadline passed enrons deal yesterdaywillbe included survey01attachment free virus scan mailwe send evening calc book probably aroundanne01 attachment free virus scan mailanneare planning send today bidweek deal soon need knowwhetherto transfer everything data basethanksliane kucher2023832147,4,1,/01 Attachment is free from viruses. Scan Mail...,4,4,1,Anne.Bike@enron.com,4,4,1,phillip.allen@enron.com,4
010101 herald actual millenium unique date repeated onlyafter 1000 year here hoping millenium shower health wealth hapiness allextremes family make world great place livebest regardsmukuljayshree,3,1,01.01.01 heralds the actual millenium. A uniqu...,3,3,1,harora@ect.enron.com,3,3,1,advapl@vsnl.com,3
072701francishere file need excel thank helpphilliporiginal messageoriginal messagei apologize sending yesterday information experiencing technical difficulty please let know question comment thank youanne bike,1,1,"of 07/27/01Francis,Here are the files we need ...",1,1,1,"Allen, Phillip K.; Distribution Prices - L Kuc...",1,1,1,"Bike, Anne",1
...,...,...,...,...,...,...,...,...,...,...,...,...
zf trading 4,4,1,z/f trading 4,4,4,1,per.sekse@enron.com,4,4,1,john.arnold@enron.com,4
zf wow would thunk prompt gas 6 zf aswide last year hard think better scenario fliprather hard think scenario zf contango itcouldnt yeara lot boy max withdrawing storage thats curvetold last bid week obviously gas trying come thanis burned incentivize economic player like anenron inject problem stick ground pullingit g zg 35 back cash getting priced gcashz look awfully weak thus putting lot pressure zf storageeconomics always dictate market except maybe latter half ofwinter buyer hj 70 certainly hope anywaysagree back half winter strong storage boysare withdrawing today buying bottom fishing fg yet itgoing zeroagree janfebi cudnt resist sold little yest 39will prob end upbuyingem back 45 piss away rest monthfyiif ever chance speak phone feel free callmy timeisbusy managed paper flow eol dont bother withcalls occasional email give time respond havethetime im also susprised decjanmad buck trading way butone cant help wonder given loadsgass crazy westthe calledimpactof husbanding end user yet frontn curve still cant backwardateatallindeed make sprds look rich agaonalso curious loan deal pipe work ouut man theseguys cud really getting troubleanother big reason dec cashhasnt able go janbut think make strong longs intheback mkt ie taking gas back march april shud keepthat part curve strong agreebe cool mantalk later,3,1,Z/F !!!! wow. Who would have thunk it. P...,3,3,1,John.Arnold@enron.com,3,3,1,john.arnold@enron.com,3
zip250mb usb drive take anywhere___________________________________________________________computers___________________________________________________________smartpad pocketpc save 12lets instantly capture everything write draw using thesmartpad pen ordinary paper___________________________________________________________kds valiant 6480iptdp3 save 27___________________________________________________________viewsonic ve150 15 lcd monitor save 24this lightweight monitor conserve power fit perfectly inareas limited work space___________________________________________________________sipix stylecam exclusive low pricea digital camera streaming video camera usb video camera andvideo conferencing camera allinone___________________________________________________________avertv box external tv tuner module 30 mailin rebate10895 rebatewatch tv video dvd movie pc play video gamesdirectly computer too___________________________________________________________altec lansing 4100 5piece system 50 mailin rebate 9095after rebateexperience optimum 4channel sound performance___________________________________________________________software___________________________________________________________windows xp home upgrade free shipping dec 31 2001an excellent choice home user come exciting newfeatures___________________________________________________________microsoft train simulator save 27this program place role engineer passenger withunprecedented realism___________________________________________________________symantec norton systemworks 2002 get 50 mailinrebateprotect pc virus threat optimize performance andclean internet clutter___________________________________________________________intuit turbotax deluxe 2001 10 rebate offer 2795 afterrebatethis program packed money saving advice also helpsyou take advantage new tax laws___________________________________________________________dvdvideo___________________________________________________________evolution dvd save 22david duchovny orlando jones seann william scott juliannemoore save world___________________________________________________________books___________________________________________________________30zukav coauthor linda francis show reader apply crucialconcepts daily lives___________________________________________________________music___________________________________________________________this twodisc set magical sound behind movieincluding song enya___________________________________________________________electronics___________________________________________________________philips expanium portable mp3cd player save 47take mp3 file wherever go play regular audio cd aswell cdrs cdrws___________________________________________________________go video dual deck 4 head hifi vcr save 50commercial movie advance feature automatically skip throughcommercials previews___________________________________________________________as always thank choosing buycomrobert r pricepresident buycomdlink offer complete line wireless networking solutionsin addition electronics buycom also offer topofthelinecomputers bestselling book video wireless software muchcomputerssoftwareelectronicswirelessbooksmusicgamesvideodvdclearancesupport question please reply buycom emailall price product availability subject change withoutnotice unless noted price include shipping andapplicable sale tax product quantity limited list pricerefers manufacturer suggested retail price may bedifferent actual selling price area please visitus buycom link informationincluding latest pricing availability restriction eachoffer buycom internet superstore trademarksof buycom inc buycom inc 2001 right reservedwe respect privacy would rather receive emailalerting buycom special offer product announcement,1,1,==============================================...,1,1,1,jarnold@enron.com,1,1,1,buy.com@enews.buy.com,1
zip250mb usb drive take anywhere___________________________________________________________videodvd___________________________________________________________evolution dvd save 23david duchovny orlando jones seann william scott juliannemoore save world___________________________________________________________two play game widescreen dvd save 25a beautiful advertising exec try save relationship afterfinding man another woman___________________________________________________________dancing blue iguana dvd save 22the life five labased strip club dancer converge thecourse one week___________________________________________________________animal house widescreen anniversary edition vhs save 25this raunchy screwball comedy john belushi spoof the1960s college life delta fraternity___________________________________________________________for love game special edition vhs new releasea legendary baseball pitcher kevin costner find career andlove life air___________________________________________________________somewhere time 20th anniversary edition vhs save 25a young writer christopher reeve sacrifice present life tofind happiness past___________________________________________________________music___________________________________________________________two disc magical sound movie feature musicfrom enya___________________________________________________________marys latest album feature guest like missy elliot lennykravitz eve hakeem___________________________________________________________books___________________________________________________________selfmatters phillip c mcgraw save 30dr phil brings straighttalking style help reader seizethe power lie within___________________________________________________________30zukav coauthor linda francis show reader applycrucial concept everyday life___________________________________________________________games___________________________________________________________ssx tricky ps2 save 14features insane sick uber trick surreal mindblowing worldsand cast funky characters___________________________________________________________shrek xbox save 15in effort live happily ogre shrek embarks 36missions four neverbeforeseen worlds___________________________________________________________electronics___________________________________________________________sampo dvdmp3 player cf card readerthe world first dvd player digital photo playbackcapabilities___________________________________________________________as always thank choosing buycomrobert r pricepresident buycomdlink offer complete line wireless networking solutionsin addition electronics buycom also offer topofthelinecomputers bestselling book video wireless software muchcomputerssoftwareelectronicswirelessbooksmusicgamesvideodvdclearancesupport question please reply buycom emailall price product availability subject change withoutnotice unless noted price include shipping andapplicable sale tax product quantity limited list pricerefers manufacturer suggested retail price may bedifferent actual selling price area please visitus buycom link informationincluding latest pricing availability restriction eachoffer buycom internet superstore trademarksof buycom inc buycom inc 2001 right reservedwe respect privacy would rather receive emailalerting buycom special offer product announcement,1,1,==============================================...,1,1,1,jarnold@enron.com,1,1,1,buy.com@enews.buy.com,1


In [16]:
fraud_related_words = ['fraud', 'scam', 'phishing', 'malware', 'deception', 'hoax']  # Add more fraud-related words if needed

email_df['fraud'] = email_df['body_clean'].apply(lambda x: 1 if any(word in x.lower() for word in fraud_related_words) else 0)
email_df.head()

Unnamed: 0,body,to,from_,body_clean,fraud
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast,0
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...,0
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,test successful way go,0
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,randycan send schedule salary level everyone t...,0
5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com,greghow either next tuesday thursdayphillip,0


In [17]:
email_df.groupby('fraud').describe()

Unnamed: 0_level_0,body,body,body,body,to,to,to,to,from_,from_,from_,from_,body_clean,body_clean,body_clean,body_clean
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
fraud,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,9428,4717,"george.huan@enron.com, mike.maggi@enron.com, l...",46,9428,1585,John Arnold/HOU/ECT@ECT,571,9428,1218,john.arnold@enron.com,2486,9428,4700,georgehuanenroncom mikemaggienroncom larrymaye...,46
1,36,21,"jeffrey.hodge@enron.com, melissa.murphy@enron....",3,36,16,"aimee.shek@enron.com, albino.lopez@enron.com, ...",9,36,14,outlook.team@enron.com,9,36,21,jeffreyhodgeenroncom melissamurphyenroncomkyle...,3


In [18]:
observations_class_0 = email_df[email_df['fraud'] == 1]

In [19]:
observations_class_0

Unnamed: 0,body,to,from_,body_clean,fraud
1332,TheStreet.com and Privista are pleased to pres...,members@realmoney.com,members@realmoney.com,thestreetcom privista pleased present withcred...,1
3054,Premonition?=20=09=09=09British Trader Sentenc...,John Arnold/HOU/ECT@ECT,john.arnold@enron.com,premonition20090909british trader sentenced pr...,1
3326,The problem if we limit the size on options to...,andy.zipper@enron.com,john.arnold@enron.com,problem limit size option size offered onthe s...,1
3947,"paul.mead@enron.com, david.gallagher@enron.com...","richard.lewis@enron.com, john.lavorato@enron.c...",bob.shults@enron.com,paulmeadenroncom davidgallagherenroncomgregorb...,1
3949,Premonition?=20=09=09=09British Trader Sentenc...,John Arnold/HOU/ECT@ECT,john.arnold@enron.com,premonition20090909british trader sentenced pr...,1
4405,The problem if we limit the size on options to...,andy.zipper@enron.com,john.arnold@enron.com,problem limit size option size offered onthe s...,1
4505,"anitha.mathis@enron.com, antonette.concepcion@...","aimee.shek@enron.com, albino.lopez@enron.com, ...",outlook.team@enron.com,anithamathisenroncom antonetteconcepcionenronc...,1
4507,"anitha.mathis@enron.com, antonette.concepcion@...","aimee.shek@enron.com, albino.lopez@enron.com, ...",outlook.team@enron.com,anithamathisenroncom antonetteconcepcionenronc...,1
4508,"anitha.mathis@enron.com, antonette.concepcion@...","aimee.shek@enron.com, albino.lopez@enron.com, ...",outlook.team@enron.com,anithamathisenroncom antonetteconcepcionenronc...,1
5073,only you would know of a website that contradi...,=09Angie Conner (E-mail); Ann Sutton (E-mail);...,"=09Ward, Kim S (Houston) =20",would know website contradicts much time hand ...,1


## Taking care of missing values if present over here

In [20]:
email_df.isnull().sum()

body          0
to            0
from_         0
body_clean    0
fraud         0
dtype: int64

No missing values are there inside the dataset

In [21]:
X=email_df.drop('fraud',axis=1)
y=email_df['fraud']

In [22]:
X.head(2)

Unnamed: 0,body,to,from_,body_clean
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...


In [23]:
X.shape

(9464, 4)

In [24]:
y.head(2)

0    0
1    0
Name: fraud, dtype: int64

In [25]:
y.shape

(9464,)

## Cleaning the text

In [26]:
messages=X.copy()

In [27]:
messages['body'][1]

"Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time."

In [28]:
messages.reset_index(inplace=True)

In [29]:
messages

Unnamed: 0,index,body,to,from_,body_clean
0,0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast
1,1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting take fun trip espec...
2,2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,test successful way go
3,3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,randycan send schedule salary level everyone t...
4,5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com,greghow either next tuesday thursdayphillip
...,...,...,...,...,...
9459,9995,"shes pretty sexy, huh? are we getting togethe...",Brian Hoskins/HOU/ECT@ECT,eric.bass@enron.com,shes pretty sexy huh getting together talk eur...
9460,9996,i copied your idea - and it screwed up your name!,danielles@jonesgranger.com,eric.bass@enron.com,copied idea screwed name
9461,9997,"---------------------------Eric,Just a reminde...",Eric Bass/HOU/ECT@ECT,eric.bass@enron.com,ericjust reminder still need 9912 flash detail...
9462,9998,did you buy any enron in the 60s?,lwbthemarine@bigplanet.com,eric.bass@enron.com,buy enron 60


In [30]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0,len(messages)):
  review = re.sub('[^a-zA-Z]', ' ', messages['body_clean'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
print(corpus[1])

travel busi meet take fun trip especi prepar present would suggest hold busi plan meet take trip without formal busi meet would even tri get honest opinion whether trip even desir necessarya far busi meet think would product tri stimul discuss across differ group work often present speak other quiet wait turn meet might better held round tabl discuss formatmi suggest go austin play golf rent ski boat jet ski fli somewher take much time


In [32]:
voc_size=2500

In [33]:
from keras.preprocessing.text import one_hot
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr[0]

[1614]

In [34]:
onehot_repr[1]

[2317,
 1742,
 1663,
 673,
 298,
 1471,
 1229,
 2281,
 164,
 1354,
 1035,
 634,
 1742,
 1991,
 1663,
 673,
 1471,
 1540,
 664,
 1742,
 1663,
 1354,
 1151,
 2058,
 1197,
 2288,
 661,
 1708,
 1471,
 1151,
 462,
 1094,
 1699,
 1742,
 1663,
 1704,
 1354,
 1569,
 2058,
 2188,
 2207,
 661,
 1003,
 1915,
 485,
 879,
 164,
 2320,
 2006,
 1902,
 1417,
 2220,
 1663,
 1507,
 123,
 577,
 2234,
 2209,
 2207,
 50,
 1035,
 1102,
 34,
 479,
 2270,
 872,
 1931,
 597,
 349,
 1931,
 949,
 2227,
 673,
 70,
 50]

## Embedding Representation

In [35]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...    0    0 1614]
 [ 577 2234 2209 ...  673   70   50]
 [   0    0    0 ... 2081 2390 1102]
 ...
 [   0    0    0 ...  233  213 2093]
 [   0    0    0 ...    0 1030  574]
 [2078  222  170 ...  163 1197 2232]]


In [36]:
embedded_docs[1]

array([ 577, 2234, 2209, 2207,   50, 1035, 1102,   34,  479, 2270,  872,
       1931,  597,  349, 1931,  949, 2227,  673,   70,   50], dtype=int32)

## Part 2 - Building and Training the RNN

In [37]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

## Initialising the RNN

In [38]:
regressor = Sequential()

## Adding Embedding layers over here

In [39]:
embedding_vector_features=40
regressor.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))

## Adding the first LSTM layer and some Dropout regularisation

In [40]:
regressor.add(LSTM(units = 300, return_sequences = True))
regressor.add(Dropout(0.2))

## Adding the second LSTM layer and some Dropout regularisation

In [41]:
regressor.add(LSTM(units = 300, return_sequences = True))
regressor.add(Dropout(0.2))

## Adding the third LSTM layer and some Dropout regularisation

In [42]:
regressor.add(LSTM(units = 300, return_sequences = True))
regressor.add(Dropout(0.2))

## Adding the fourth LSTM layer and some Dropout regularisation

In [43]:
regressor.add(LSTM(units = 300, return_sequences = True))
regressor.add(Dropout(0.2))

## Adding the fifth LSTM layer and some Dropout regularisation

In [44]:
regressor.add(LSTM(units = 300))
regressor.add(Dropout(0.2))

## Adding the output layer over here

In [45]:
regressor.add(Dense(units = 1,activation="sigmoid"))

## Compiling the model

In [46]:
regressor.compile(optimizer = 'adam', loss = 'binary_crossentropy',metrics=['accuracy'])


In [47]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

## Splitting the dataset into training set and testing set over here

In [58]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_final,y_final,test_size=0.2,random_state=42,stratify=y_final)

## Dealing with imbalanced dataset over here





In [60]:
import imblearn
print(imblearn.__version__)

0.10.1


In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Sample dataset - Replace this with your actual dataset
# Assume X contains the feature matrix and y contains the target labels




# Initialize SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE only to the training set
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print the number of samples before and after oversampling
print("Before oversampling - Class 1 samples:", sum(y_train == 1))
print("After oversampling - Class 1 samples:", sum(y_train_resampled == 1))

# Now you can proceed with your training using the oversampled training data


Before oversampling - Class 1 samples: 29
After oversampling - Class 1 samples: 7542


In [65]:
X_train_resampled.shape

(15084, 20)

In [66]:
y_train_resampled.shape

(15084,)

## Training the model

In [73]:
regressor.fit(X_train_resampled, y_train_resampled, epochs = 50, batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x79d9c129a890>

## Predicting the test set results

In [68]:
y_pred=regressor.predict(X_test)



## Setting the threshold value of 60%bold text

In [69]:
y_pred=np.where(y_pred > 0.6, 1,0)

## Evaluating the performance of the model

In [70]:
from sklearn.metrics import confusion_matrix

In [71]:
confusion_matrix(y_test,y_pred)

array([[1886,    0],
       [   7,    0]])

In [72]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9963021658742737

In [55]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1884
           1       0.00      0.00      0.00         9

    accuracy                           1.00      1893
   macro avg       0.50      0.50      0.50      1893
weighted avg       0.99      1.00      0.99      1893



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
