Data Exploration

In [1]:
import numpy as np
import pandas as pd
import email

In [2]:
emails = pd.read_csv("emails.csv")
emails.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [3]:
print(emails.loc[0]["message"])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


Extract Data

In [4]:
#Extract Headers
def get_field(field, messages):
    column = []
    for message in messages:
        e = email.message_from_string(message)
        column.append(e.get(field))
    return column

In [5]:
emails['date'] = get_field("Date", emails['message'])
emails['subject'] = get_field("Subject", emails['message'])
emails['XFolder'] = get_field("X-Folder", emails['message'])
emails['X-From'] = get_field("X-From", emails['message'])
emails['X-To'] = get_field("X-To", emails['message'])
emails.head(3)

Unnamed: 0,file,message,date,subject,XFolder,X-From,X-To
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Re:,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",Re: test,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Phillip K Allen,Leah Van Arsdall


In [6]:
#Extract Message Body
def body(messages):
    column = []
    for message in messages:
        e = email.message_from_string(message)
        column.append(e.get_payload())
    return column

emails['body'] = body(emails['message'])
emails['body'].head(3)

0                            Here is our forecast\n\n 
1    Traveling to have a business meeting takes the...
2                       test successful.  way to go!!!
Name: body, dtype: object

In [7]:
#Extract Employee Name
def employee(file):
    column = []
    for string in file:
        column.append(string.split("/")[0])
    return column

emails['employee'] = employee(emails['file'])
emails['employee'].head(3)

0    allen-p
1    allen-p
2    allen-p
Name: employee, dtype: object

Data Cleaning and Transformation

In [8]:
#Convert date column to datetime
import datetime
from dateutil import parser
x = parser.parse("Fri, 4 May 2001 13:51:00 -0700 (PDT)")
print(x.strftime("%d-%m-%Y %H:%M:%S"))

04-05-2001 13:51:00


In [9]:
def change_type(dates):
    column = []
    for date in dates:
        column.append(parser.parse(date).strftime("%d-%m-%Y %H:%M:%S"))
    return column
emails['date'] = change_type(emails['date'])
emails['date'].head(2)

0    14-05-2001 16:39:00
1    04-05-2001 13:51:00
Name: date, dtype: object

In [10]:
print(emails['XFolder'][0])
emails['XFolder'][0].split("\\")[-1]

\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail


"'Sent Mail"

In [11]:
#Extract the last folder name
def preprocess_folder(folders):
    column = []
    for folder in folders:
        if (folder is None or folder == ""):
            column.append(np.nan)
        else:
            column.append(folder.split("\\")[-1].lower())
    return column

emails['XFolder'] = preprocess_folder(emails['XFolder'])
emails['XFolder'].head(3)

0    'sent mail
1    'sent mail
2    'sent mail
Name: XFolder, dtype: object

In [12]:
#Remove folders containing too few e-mails (less than 2)
unwanted_folders = ["all documents", "deleted items", "discussion threads", "sent", "deleted Items", "inbox",
                   "sent items", "'sent mail", "untitled", "notes inbox", "junk file", "calendar"]

emails = emails.loc[~emails['XFolder'].isin(unwanted_folders)]

In [13]:
#Replace empty missing values in subject with np.nan
def replace_empty_with_nan(subject):
    column = []
    for val in subject:
        if (val == ""):
            column.append(np.nan) 
        else:
            column.append(val)
    return column

In [14]:
emails['subject'] = replace_empty_with_nan(emails['subject'])
emails['X-To'] = replace_empty_with_nan(emails['X-To'])

In [15]:
#Drop missing value rows
emails.dropna(axis=0, inplace=True)

In [16]:
emails.isnull().sum()

file        0
message     0
date        0
subject     0
XFolder     0
X-From      0
X-To        0
body        0
employee    0
dtype: int64

In [17]:
emails.head()

Unnamed: 0,file,message,date,subject,XFolder,X-From,X-To,body,employee
3026,allen-p/straw/1.,Message-ID: <12644875.1075855692817.JavaMail.e...,14-03-2000 17:07:00,Central Texas Bale Resource,straw,bobregon@bga.com,list <strawbale@crest.org>,Hi All\n\nWe are looking for a wheat farmer ne...,allen-p
3027,allen-p/straw/2.,Message-ID: <22208447.1075855692838.JavaMail.e...,17-02-2000 07:37:00,Re: History of Lime and Cement,straw,rob_tom@freenet.carleton.ca (Robert W. Tom),CALXA@aol.com,"'Arry (calxa@aol.com), Lime Ex-splurt Extraord...",allen-p
3028,allen-p/straw/3.,Message-ID: <31438311.1075855692860.JavaMail.e...,17-02-2000 07:01:00,History of Lime and Cement,straw,CALXA@aol.com,"strawbale@crest.org, absteen@dakotacom.net","Folks,\n\nI just found this interesting site a...",allen-p
3029,allen-p/straw/4.,Message-ID: <2055670.1075855692881.JavaMail.ev...,10-02-2000 01:46:00,Re: Newsgroups,straw,billc@greenbuilder.com,strawbale@crest.org,>What other cool newsgroups are available for ...,allen-p
3030,allen-p/straw/5.,Message-ID: <22141218.1075855692903.JavaMail.e...,07-01-2000 16:29:00,RE: concrete stain,straw,"""Matt"" <matt@fastpacket.net>",strawbale@crest.org,"> Hi,\n> We recently faced the same questions ...",allen-p


In [18]:
cols_to_drop = ['file','message','date','X-From','X-To']

In [19]:
emails.drop(cols_to_drop, axis=1, inplace=True)

In [20]:
emails.head()

Unnamed: 0,subject,XFolder,body,employee
3026,Central Texas Bale Resource,straw,Hi All\n\nWe are looking for a wheat farmer ne...,allen-p
3027,Re: History of Lime and Cement,straw,"'Arry (calxa@aol.com), Lime Ex-splurt Extraord...",allen-p
3028,History of Lime and Cement,straw,"Folks,\n\nI just found this interesting site a...",allen-p
3029,Re: Newsgroups,straw,>What other cool newsgroups are available for ...,allen-p
3030,RE: concrete stain,straw,"> Hi,\n> We recently faced the same questions ...",allen-p


In [21]:
#Extract e-mails for employees who had over 2000 e-mails
email_count = dict(emails["employee"].value_counts())
reduced_emails = [key for key, val in email_count.items() if val >= 2000]
emails = emails.loc[emails['employee'].isin(reduced_emails)]
print(emails["employee"].value_counts())

kean-s         5344
kaminski-v     4388
kitchen-l      3775
farmer-d       3727
williams-w3    2740
lokay-m        2422
taylor-m       2374
Name: employee, dtype: int64


In [22]:
#Choose an employee
employee = emails[emails["employee"] == "kaminski-v"]

In [23]:
# Returns the folders containing more than 'n' number of e-mails
def remove_folders(emails, n):
    email_count = dict(emails["XFolder"].value_counts())
    small_folders = [key for key, val in email_count.items() if val <= n]
    emails = emails.loc[~emails['XFolder'].isin(small_folders)]
    return emails

In [24]:
n = 200
employee = remove_folders(employee, n)

In [25]:
#Encoding class labels
from sklearn.preprocessing import LabelEncoder
def label_encoder(df):
    class_le = LabelEncoder()
    # apply label encoder on the `XFolder` column
    y = class_le.fit_transform(df['XFolder'])
    df.loc[:,'label'] = y
    return df

In [26]:
label_encoder(employee)    
unique_folders = employee["label"].unique()
print (unique_folders)

[1 2 4 0 3 5 6]


In [27]:
employee['XFolder'].unique()

array(['ene_ect', 'management', 'projects', 'conferences', 'personal',
       'resumes', 'universities'], dtype=object)

In [28]:
employee.to_csv("cleaned_data.csv", index=False)

In [29]:
new_data=pd.read_csv("cleaned_data.csv")


In [30]:
new_data.head()

Unnamed: 0,subject,XFolder,body,employee,label
0,Re: 2001 PRELIMINARY I/C billing,ene_ect,"Becky,\n\nThe charges to Corp. go to RAC and a...",kaminski-v,1
1,Re: Jeff Skilling's presentation,ene_ect,I want to thank all of you for all your help!\...,kaminski-v,1
2,Improved Process for Engaging Temporary Workers,ene_ect,"As you are aware, Enron utilizes temporary sta...",kaminski-v,1
3,December 15th Super Saturday FRIDAY Interview ...,ene_ect,Please see the attached regarding interview co...,kaminski-v,1
4,eSource presents eSearch,ene_ect,eSource Launches eSearch Site Bringing Researc...,kaminski-v,1


In [31]:
%pip install snorkel

You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [32]:
from snorkel.labeling import labeling_function

In [33]:
CONFERENCE=0
ENE_ECT=1
MANAGEMENT=2
PERSONAL=3
PROJECT=4
RESUMES=5
UNIVERSITIES=6
ABSTAIN=-1

In [34]:
#Transforms text to feature vectors that can be used as input to estimator
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(employee.body).toarray()
labels = employee.label

In [35]:
#Find the terms that are the most correlated with each of the category
from sklearn.feature_selection import chi2
import numpy as np
def keyword(features,name):
    features_chi2 = chi2(features, labels == name)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    unigrams=unigrams[-25:]
    bigrams=bigrams[-25:]
    return unigrams+bigrams

In [36]:
keyword(features,0)

['nicosia',
 '892261',
 'icbi',
 'convention',
 'conferences',
 'waterloo',
 '519',
 'simou',
 'event',
 'infocast',
 'cy',
 '7484',
 'oliver',
 '7626',
 'speakers',
 'speaker',
 'registration',
 'phelim',
 'andreas',
 'panel',
 'cyprus',
 'trigeorgis',
 'lenos',
 'garp',
 'conference',
 'trigeorgis professor',
 'cy 1678',
 '75 kallipoleos',
 '892261 fax',
 'dept business',
 '357 892261',
 'box 20537',
 'business 75',
 'university cyprus',
 'fax 339063',
 '20537 cy',
 'nicosia cyprus',
 'cyprus tel',
 'cyprus dept',
 '2000 conference',
 '2001 convention',
 'finance university',
 'andreas simou',
 'bullet points',
 'risk uk',
 '20 7626',
 '44 20',
 'real options',
 'lenos trigeorgis',
 'garp 2001']

In [37]:
@labeling_function()
def lf_keyword_conference(x):
  keywords=keyword(features,0)
  return CONFERENCE if any(word in x.body.lower() for word in keywords) else ABSTAIN

In [38]:
@labeling_function()
def lf_keyword_ene_ect(x):
  keywords=keyword(features,1)
  return ENE_ECT if any(word in x.body.lower() for word in keywords) else ABSTAIN

In [39]:
@labeling_function()
def lf_keyword_management(x):
  keywords=keyword(features,2)
  return MANAGEMENT if any(word in x.body.lower() for word in keywords) else ABSTAIN

In [40]:
@labeling_function()
def lf_keyword_personal(x):
  keywords=keyword(features,3)
  return PERSONAL if any(word in x.body.lower() for word in keywords) else ABSTAIN

In [41]:
@labeling_function()
def lf_keyword_project(x):
  keywords=keyword(features,4)
  return PROJECT if any(word in x.body.lower() for word in keywords) else ABSTAIN

In [42]:
@labeling_function()
def lf_keyword_resumes(x):
  keywords=keyword(features,5)
  return RESUMES if any(word in x.body.lower() for word in keywords) else ABSTAIN

In [43]:
@labeling_function()
def lf_keyword_universities(x):
  keywords=keyword(features,6)
  return UNIVERSITIES if any(word in x.body.lower() for word in keywords) else ABSTAIN

In [44]:
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling.model import MajorityLabelVoter

In [45]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(employee,test_size=0.2)


In [46]:
Y_test = test.label.values

In [47]:
# Define the set of labeling functions (LFs)
lfs=[lf_keyword_conference,lf_keyword_ene_ect,lf_keyword_management,lf_keyword_personal,lf_keyword_project,lf_keyword_resumes,lf_keyword_universities]
# Apply the LFs to the unlabeled training data
applier=PandasLFApplier(lfs=lfs)
L_train=applier.apply(df=train)
L_test = applier.apply(df=test)
# Train the label model and compute the training labels
#train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")
#train=train[train.label !=ABSTAIN]

100%|██████████| 2259/2259 [24:45<00:00,  1.52it/s]
100%|██████████| 565/565 [06:20<00:00,  1.48it/s]


In [48]:
majority_model=MajorityLabelVoter(cardinality=7)
preds_train=majority_model.predict(L=L_train)

In [49]:
label_model=LabelModel(cardinality=7,verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100,seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=6.545]
INFO:root:[100 epochs]: TRAIN:[loss=0.017]
 29%|██▉       | 147/500 [00:00<00:00, 1466.48epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.011]
INFO:root:[300 epochs]: TRAIN:[loss=0.008]
 71%|███████   | 355/500 [00:00<00:00, 1823.11epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.006]
100%|██████████| 500/500 [00:00<00:00, 1888.89epoch/s]
INFO:root:Finished Training


In [50]:
probs_train = label_model.predict_proba(L=L_train)

In [51]:
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=train, y=probs_train, L=L_train)

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.XFolder.tolist())
X_test = vectorizer.transform(test.XFolder.tolist())

In [53]:
from snorkel.utils import probs_to_preds
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

In [56]:
from sklearn.linear_model import LogisticRegression
sklearn_model = LogisticRegression(C=1e3, solver="liblinear")

In [57]:
sklearn_model.fit(X=X_train, y=preds_train_filtered)

LogisticRegression(C=1000.0, solver='liblinear')

In [58]:
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

Test Accuracy: 52.0%
