In [2]:
import email
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
from datetime import datetime, timedelta
from dateutil.parser import parse
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec

plt.rcParams['figure.dpi'] = 100
plt.rcParams["figure.autolayout"] = True

In [23]:
data_dir = Path(Path.cwd().parent, 'data/interim')
models_dir = Path(Path.cwd().parent, 'models')
evals_dir = Path(data_dir, 'evals_3')
labels_dir = Path(data_dir, 'labels_0')
models_dir = Path(Path.cwd().parent, 'models')
datasets_order = {
    'chains_eq_2': 0,
    'chains_eq_3': 1,
    'chains_ge_4_lt_10': 2,
    'chains_ge_10': 3}

datasets_names = {
    'chains_eq_2': r'CL $= 2$',
    'chains_eq_3': r'CL $= 3$',
    'chains_ge_4_lt_10': r'10 $>$ CL $\geq$ 4',
    'chains_ge_10': r'CL $\geq$ 10'}

sizes = {}
for path in Path(models_dir).glob('d2v*.model'):
    size = len(Doc2Vec.load(str(path)).dv.vectors)
    name = '_'.join(path.stem.split('_')[1:])
    sizes[name] = size


In [24]:
df_kmeans = pd.concat([pd.read_csv(path) for path in evals_dir.glob('eval_km*.csv')], ignore_index=True)
df_kmeans['dimensions'] = df_kmeans['dataset'].apply(lambda s: s.split('_')[-1])
df_kmeans['dataset'] = df_kmeans['dataset'].apply(lambda s: '_'.join(s.split('_')[:-1]))
df_kmeans['order'] = df_kmeans['dataset'].map(datasets_order)
df_kmeans['name'] = df_kmeans['dataset'].map(datasets_names)
df_kmeans = df_kmeans.sort_values('order')
df_kmeans['k_noise'] = df_kmeans['n_clusters'].astype(int)
df_kmeans['method'] = 'KMeans'

df_dbscan = pd.concat([pd.read_csv(path) for path in evals_dir.glob('eval_dbscan*.csv')], ignore_index=True)
df_dbscan['size'] = df_dbscan['dataset'].map(sizes)
df_dbscan['prop_noise'] = df_dbscan['n_noise']/df_dbscan['size']
df_dbscan['dimensions'] = df_dbscan['dataset'].apply(lambda s: s.split('_')[-1])
df_dbscan['dataset'] = df_dbscan['dataset'].apply(lambda s: '_'.join(s.split('_')[:-1]))
df_dbscan['order'] = df_dbscan['dataset'].map(datasets_order)
df_dbscan['name'] = df_dbscan['dataset'].map(datasets_names)
df_dbscan = df_dbscan.sort_values('order')
df_dbscan['k_noise'] = df_dbscan.apply(lambda x: f"{x['n_clusters']} ({x['prop_noise']:.3f})", axis=1)
df_dbscan['method'] = 'DBSCAN'

df_hdbscan = pd.concat([pd.read_csv(path) for path in evals_dir.glob('*_hdbscan*.csv')], ignore_index=True)
df_hdbscan['size'] = df_hdbscan['dataset'].map(sizes)
df_hdbscan['prop_noise'] = df_hdbscan['n_noise']/df_hdbscan['size']
df_hdbscan['dimensions'] = df_hdbscan['dataset'].apply(lambda s: s.split('_')[-1])
df_hdbscan['dataset'] = df_hdbscan['dataset'].apply(lambda s: '_'.join(s.split('_')[:-1]))
df_hdbscan['order'] = df_hdbscan['dataset'].map(datasets_order)
df_hdbscan['name'] = df_hdbscan['dataset'].map(datasets_names)
df_hdbscan = df_hdbscan.sort_values('order')
df_hdbscan['k_noise'] = df_hdbscan.apply(lambda x: f"{x['n_clusters']} ({x['prop_noise']:.3f})", axis=1)
df_hdbscan['method'] = 'HDBSCAN'
datasets = df_dbscan.sort_values(by='order')['dataset'].unique()

eval_scores = ['sl_score', 'ch_score', 'db_score', 'entropy']

In [25]:
eval_scores

['sl_score', 'ch_score', 'db_score', 'entropy']

In [26]:
def labels_km(df):
    for idx, data in df.iterrows():
        return f'labels_km_{data["dataset"]}_{data["dimensions"]}_{data["n_clusters"]:02d}_{data["distance"]}.pkl'

def labels_db(df):
    for idx, data in df.iterrows():
        return f'labels_dbscan_{data["dataset"]}_{data["dimensions"]}_{data["epsilon"]}_{int(data["min_pts"]):02d}_{data["distance"]}.pkl'

def labels_hd(df):
    for idx, data in df.iterrows():
        return f'labels_hdbscan_{data["dataset"]}_{data["dimensions"]}_{data["min_clt_size"]:02d}_{data["min_samples"]:02d}_{data["distance"]}.pkl'

In [27]:
# best_labels_ = []
best_labels_ = {}
for score in eval_scores:
    for dat in datasets:
        if score in ['db_score', 'entropy']:
            asc = True
        else:
            asc= False
        tmp1 = pd.concat([
            df_kmeans.loc[df_kmeans['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
            df_dbscan.loc[df_dbscan['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
            df_hdbscan.loc[df_hdbscan['dataset']==dat,:].sort_values(score,ascending=asc).head(1)
        ])
        tmp2 = tmp1.sort_values(score,ascending=False).head(1)
        if tmp2.iloc[0,11] == 'KMeans':
            label = labels_km(tmp2)
        elif tmp2.iloc[0,11] == 'DBSCAN':
            label = labels_db(tmp2)
        elif tmp2.iloc[0,11] == 'HDBSCAN':
            label = labels_hd(tmp2)

        if dat in best_labels_:
            best_labels_[dat].append(label)
        else:
            best_labels_[dat] = [label]

best_labels_ = {k:list(set(v)) for k, v in best_labels_.items()}
best_labels = {}
for k, v in best_labels_.items():
    for i in range(len(v)):
        if k not in best_labels:
            best_labels[k] = {i: v[i]}
        else:
            best_labels[k].update({i: v[i]})
best_labels

{'chains_eq_2': {0: 'labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl',
  1: 'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
  2: 'labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl'},
 'chains_eq_3': {0: 'labels_km_chains_eq_3_300_02_cosine.pkl',
  1: 'labels_dbscan_chains_eq_3_50_0.83_04_euclidean.pkl',
  2: 'labels_dbscan_chains_eq_3_50_0.25_03_wmd.pkl'},
 'chains_ge_4_lt_10': {0: 'labels_km_chains_ge_4_lt_10_50_02_wmd.pkl',
  1: 'labels_dbscan_chains_ge_4_lt_10_300_0.2_13_wmd.pkl',
  2: 'labels_dbscan_chains_ge_4_lt_10_50_0.66_04_euclidean.pkl'},
 'chains_ge_10': {0: 'labels_dbscan_chains_ge_10_50_0.56_02_euclidean.pkl',
  1: 'labels_dbscan_chains_ge_10_300_0.63_02_euclidean.pkl',
  2: 'labels_km_chains_ge_10_50_02_cosine.pkl'}}

In [15]:
tmp2.iloc[0,11]

'DBSCAN'

In [28]:
{k:list(set(v)) for k, v in best_labels.items()}
# for k, v in best_labels.items():
#     for lab in v:
#         print(Path(labels_dir, lab).is_file())

{'chains_eq_2': [0, 1, 2],
 'chains_eq_3': [0, 1, 2],
 'chains_ge_4_lt_10': [0, 1, 2],
 'chains_ge_10': [0, 1, 2]}

In [29]:
clusters = {}
for dat in datasets:
    for i, lab in best_labels[dat].items():
        if dat not in clusters:
            clusters[dat] = {i:pd.read_pickle(Path(labels_dir, lab))}
        else:
            clusters[dat].update({i:pd.read_pickle(Path(labels_dir, lab))})
        # if dat in clusters:
        #     clusters[dat].append(pd.read_pickle(Path(labels_dir, lab)))
        # else:
        #     clusters[dat] = [pd.read_pickle(Path(labels_dir, lab))]


In [30]:
for d in clusters.values():
    for i, c in d.items():
        print(len(c))

23841
23841
23841
8769
8769
8769
9676
9676
9676
5324
5324
5324


In [31]:
parsed_emails = {dat:pd.read_pickle(Path(data_dir,f'parsed_emails_{dat}.pkl')) for dat in datasets}

In [18]:
# results = {}
# for dat in datasets:
#     for i, c in enumerate(clusters[dat]):
#         print(i, len(c))

In [32]:
results = {}
for dat in datasets:
    for i, c in enumerate(clusters[dat].values()):
        tmp = parsed_emails[dat].copy()
        tmp['label'] = c
        if dat not in results:
            results[dat] = {i:tmp}
        else:
            results[dat].update({i:tmp})

In [33]:
for dat in datasets:
    for i, v in results[dat].items():
        print(i, best_labels[dat][i])
        print(v.value_counts('label'))

0 labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl
label
 0    23834
-1        5
 1        2
dtype: int64
1 labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl
label
-1    23522
 1      314
 0        5
dtype: int64
2 labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl
label
-1    23525
 0      314
 1        2
dtype: int64
0 labels_km_chains_eq_3_300_02_cosine.pkl
label
0    8642
1     127
dtype: int64
1 labels_dbscan_chains_eq_3_50_0.83_04_euclidean.pkl
label
-1    8639
 0     126
 1       4
dtype: int64
2 labels_dbscan_chains_eq_3_50_0.25_03_wmd.pkl
label
0    8766
1       3
dtype: int64
0 labels_km_chains_ge_4_lt_10_50_02_wmd.pkl
label
0    9468
1     208
dtype: int64
1 labels_dbscan_chains_ge_4_lt_10_300_0.2_13_wmd.pkl
label
 0    9674
-1       2
dtype: int64
2 labels_dbscan_chains_ge_4_lt_10_50_0.66_04_euclidean.pkl
label
-1    9529
 0     139
 1       8
dtype: int64
0 labels_dbscan_chains_ge_10_50_0.56_02_euclidean.pkl
label
-1    5230
 0      92
 1       2
dtype: int64
1 labels_dbsca

In [57]:
for dat in datasets:
    for i, v in results[dat].items():
        # print(Path(labels_dir, best_labels[dat][i]))
        print(best_labels[dat][i])

labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl
labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl
labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl
labels_km_chains_eq_3_300_02_cosine.pkl
labels_dbscan_chains_eq_3_50_0.83_04_euclidean.pkl
labels_dbscan_chains_eq_3_50_0.25_03_wmd.pkl
labels_km_chains_ge_4_lt_10_50_02_wmd.pkl
labels_dbscan_chains_ge_4_lt_10_300_0.2_13_wmd.pkl
labels_dbscan_chains_ge_4_lt_10_50_0.66_04_euclidean.pkl
labels_dbscan_chains_ge_10_50_0.56_02_euclidean.pkl
labels_dbscan_chains_ge_10_300_0.63_02_euclidean.pkl
labels_km_chains_ge_10_50_02_cosine.pkl


In [58]:
Path(labels_dir, best_labels[dat][i])

PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/labels_0/labels_km_chains_ge_10_50_02_cosine.pkl')

In [34]:
tmp = results['chains_eq_2'][0]

In [35]:
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
144,Wish we could go - but we're off to Ft. Lauder...,True,75963,2,Re: Friday,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,909762180,0
145,Hey Marc - any chance you guys might like to j...,True,75963,2,Re: Friday,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,909762960,0
157,This message was returned to me - it looks lik...,False,192107,2,Undeliverable message,mark.taylor@enron.com,per.sekse@enron.com,910973340,0
158,I think you can go straight to performance rev...,True,192107,2,Re: Undeliverable message,mark.taylor@enron.com,per.sekse@enron.com,911468460,0
165,Anita:\n\nI seem to remember that our traders ...,False,98742,2,LNG hedging for China,mark.taylor@enron.com,anita.fam@enron.com,911501880,0
...,...,...,...,...,...,...,...,...,...
250917,"Jeff,\n\nThe files are in DesertSkyCurtail in ...",False,107371,2,May curtailment spread,mark.fisher@enron.com,jeff.duff@enron.com,1025701860,0
250920,"Mark,\n\nI checked the sums with what was sent...",True,107371,2,Re: May curtailment spread,jeff.duff@enron.com,mark.fisher@enron.com,1025704560,0
250970,"Tim,\n\nIn Oct 2001 I produced the attached re...",False,74063,2,Fluvanna and Trew Ranch reports,mark.fisher@enron.com,"tim.derrick@enron.com, jeff.duff@enron.com, je...",1026400320,0
250971,"Thanks. I will use this report, and we should...",True,74063,2,Re: Fluvanna and Trew Ranch reports,tim.derrick@enron.com,"mark.fisher@enron.com, jeff.duff@enron.com, je...",1026403800,0


In [36]:
tmp = results['chains_eq_2'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
784,Thank you for your help. I look forward to he...,True,209486,2,RE: derivatives documentation software,tana.jones@enron.com,ian.howells@documentum.com,928482540,0
1336,Of all the weekends---Doug's and our family ar...,True,93957,2,Re: Is anyone using the Perd the weekend of 8/...,richard.sanders@enron.com,"mrmslane@aol.com, namuathome@aol.com, namuatho...",933001800,0
2179,Do you want to set up a time to meet tomorrow?...,False,190057,2,Tuesday Meeting,mark.taylor@enron.com,mark.dilworth@enron.com,937246140,0
3599,I'll be happy to schedule him. What do you think?,True,117527,2,Re: Neil Mayer,richard.sanders@enron.com,julia.murray@enron.com,943281420,0
4089,$575k,True,84439,2,Re: Havamann Arbitration PRIVILEGED AND CONFID...,richard.sanders@enron.com,john.nowlan@enron.com,945155460,1
...,...,...,...,...,...,...,...,...,...
246817,FYI...\n\n,False,15992,2,FW: Assignments for March 23,john.watson@pdq.net,kimberly.watson@enron.com,1015950664,1
247669,Cool.\n\n,True,49462,2,RE: Dominion Transmission Notices,chris.germany@enron.com,kathryn.bussell@enron.com,1016652403,1
249374,IN? WHEN?\n\n,True,213777,2,RE: man night again?,joe.parks@enron.com,"brianc@saltgrass.com, erwollam@hotmail.com, bc...",1020173281,1
249960,http://hometown.aol.com/trogg522/myhomepage/in...,False,43396,2,Daddy's little Angel,chris.germany@enron.com,jfoard@coral-energy.com,1022245763,1


In [37]:
tmp = results['chains_eq_3'][0]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
142,"\nHey Paul, how is it going?? Attached you'll...",False,87415,3,How are you?,educanto@msn.com,d..thomas@enron.com,883935960,0
421,"Maria,\n\nThe Clearing docs we got in from the...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925474740,0
424,"Mark,\n\nDoes this mean that you would prefer ...",True,49280,3,Re: Documentation from OM,maria.nartey@enron.com,"mark.elliott@enron.com, richard.sage@enron.com...",925482120,0
425,"Maria,\n\nNot necessarily - it is just that th...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925485840,0
503,Wow - that is one nasty looking storm out ther...,False,112512,3,Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926502600,0
...,...,...,...,...,...,...,...,...,...
250676,She is going to print all the Appalachian Prod...,True,16029,3,RE: Assistant to print contracts,chris.germany@enron.com,"ed.mcmichael@enron.com, ruth.concannon@enron.com",1024576950,0
250686,OK to both. Let's use Heather Choate too if i...,True,16029,3,RE: Assistant to print contracts,ed.mcmichael@enron.com,"chris.germany@enron.com, ruth.concannon@enron.com",1024588182,0
250700,does that mean i need to cover\n \n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602537,0
250703,9369 TOMORROW\n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602851,0


In [38]:
tmp = results['chains_eq_3'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
508,713-853-7459,True,112512,3,Re: Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926513940,0
7314,Who's Dana?,True,56107,3,Re: EOL Credit Responses 2/2,tana.jones@enron.com,leslie.hansen@enron.com,949680720,0
12044,O.K. Don't forget!!\n\n,True,164027,3,Re: Saturday Breakfast,pyoung@pdq.net,tana.jones@enron.com,954509640,0
37844,503-464-3740,True,45105,3,Re: Deals #417310 & #417311,mark.guzman@enron.com,kimberly.hundl@enron.com,969522000,0
39459,vkamins@enron.com\n\nvkaminski@aol.com,False,131437,3,,vince.kaminski@enron.com,phil.sisneros@enron.com,970066800,0
...,...,...,...,...,...,...,...,...,...
245587,"Metamucil, baby!\n\n",True,6759,3,RE: A PREVIEW OF COMING ATTRACTIONS,chet_fenner@bmc.com,joe.parks@enron.com,1014996627,0
245991,>,False,30867,3,Centana Letter Agreement.DOC,sproctor@akllp.com,joe.parks@enron.com,1015350736,0
246930,WHat?\n\n,True,94117,3,RE: It's Happening!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1016028995,0
248301,"Doug, money! yes?",False,51146,3,Duke Field Services,joe.parks@enron.com,doug.sewell@enron.com,1017243513,0


In [39]:
tmp = results['chains_eq_3'][2]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
142,"\nHey Paul, how is it going?? Attached you'll...",False,87415,3,How are you?,educanto@msn.com,d..thomas@enron.com,883935960,0
421,"Maria,\n\nThe Clearing docs we got in from the...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925474740,0
424,"Mark,\n\nDoes this mean that you would prefer ...",True,49280,3,Re: Documentation from OM,maria.nartey@enron.com,"mark.elliott@enron.com, richard.sage@enron.com...",925482120,0
425,"Maria,\n\nNot necessarily - it is just that th...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925485840,0
503,Wow - that is one nasty looking storm out ther...,False,112512,3,Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926502600,0
...,...,...,...,...,...,...,...,...,...
250676,She is going to print all the Appalachian Prod...,True,16029,3,RE: Assistant to print contracts,chris.germany@enron.com,"ed.mcmichael@enron.com, ruth.concannon@enron.com",1024576950,0
250686,OK to both. Let's use Heather Choate too if i...,True,16029,3,RE: Assistant to print contracts,ed.mcmichael@enron.com,"chris.germany@enron.com, ruth.concannon@enron.com",1024588182,0
250700,does that mean i need to cover\n \n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602537,0
250703,9369 TOMORROW\n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602851,0


In [41]:
tmp = results['chains_ge_4_lt_10'][0]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
302,Are you guys around this weekend? Any particu...,False,199022,4,Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918222960,0
303,I'm flying solo this weekend. No particular p...,True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918225720,0
305,"Happy hour with staff, not family :-(",True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918234000,0
462,not a thing yet,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,926337960,0
604,sorry to say but I haven't heard a thing,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,927200280,0
...,...,...,...,...,...,...,...,...,...
250874,Good point. That will be good time to come in...,True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035400,0
250875,"Wooo, what a day! Blood-red screen, except fo...",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035638,0
250876,its called liquidation\n\n,True,203187,4,RE: YOU CAN THANK ME LATER,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1025035715,0
250879,"Si, Se?or Paras!\n\n \n\n",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025036494,1


In [42]:
tmp = results['chains_ge_4_lt_10'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
302,Are you guys around this weekend? Any particu...,False,199022,4,Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918222960,0
303,I'm flying solo this weekend. No particular p...,True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918225720,0
305,"Happy hour with staff, not family :-(",True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918234000,0
462,not a thing yet,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,926337960,0
604,sorry to say but I haven't heard a thing,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,927200280,0
...,...,...,...,...,...,...,...,...,...
250874,Good point. That will be good time to come in...,True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035400,0
250875,"Wooo, what a day! Blood-red screen, except fo...",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035638,0
250876,its called liquidation\n\n,True,203187,4,RE: YOU CAN THANK ME LATER,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1025035715,0
250879,"Si, Se?or Paras!\n\n \n\n",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025036494,0


In [43]:
tmp = results['chains_ge_4_lt_10'][2]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
11343,michaelpshannon@yahoo.com,True,123037,5,Re:,benjamin.rogers@enron.com,brandon.neff@enron.com,953893500,0
14882,http://www.lonestarford.com/newcars/expedition...,False,131016,4,,mike.carson@enron.com,mcarson@gtemail.net,956919300,0
21849,Weasel!!,True,121380,4,Re:,benjamin.rogers@enron.com,7028587@skytel.com,962005560,0
28491,Thanks!,True,123326,4,Re:,benjamin.rogers@enron.com,jonathan.hoff@enron.com,965724480,0
33998,37176,True,130927,6,Re:,matthew.lenhart@enron.com,paul.lucci@enron.com,968165640,0
...,...,...,...,...,...,...,...,...,...
247810,:-)\n\n,True,32686,6,RE: Citrix application?,jimmy.manguba@enron.com,chris.germany@enron.com,1016728973,0
248442,I'm LOOOOOKING!!!!\n\n,True,150570,4,RE: Questions We Need Dominion To Answer,chris.germany@enron.com,sproctor@akllp.com,1017430736,0
250606,http://bible.gospelcom.net/,False,124830,4,,chris.germany@enron.com,trogg522@aol.com,1024425791,0
250704,TOMARROW.\n\n \n\n,True,80625,4,RE: Go Baby!,chet_fenner@bmc.com,joe.parks@enron.com,1024602900,0


In [45]:
tmp = results['chains_ge_10'][0]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
24123,Perhaps...,True,124647,30,Re:,chris.dorland@enron.com,mmolloy@oebi.com,963321240,0
31926,sure. maybe.,True,130941,28,Re:,matthew.lenhart@enron.com,shelliott@dttus.com,967212240,0
49710,So.........,True,130003,48,RE:,mark.guzman@enron.com,katie.trullinger@wfsg.com,973506000,0
50356,Cool.\n\n,True,200957,12,RE: FW: What's up?,katie.trullinger@wfsg.com,mark.guzman@enron.com,973600260,0
50804,thanks.,True,130986,13,Re:,matthew.lenhart@enron.com,val.generes@ac.com,973686600,0
...,...,...,...,...,...,...,...,...,...
237701,http://breeders.dogbreedinfo.com/index.php?a_i...,False,126887,41,,eric.bass@enron.com,shanna.husser@enron.com,1012422754,0
239211,when?\n\n,True,122336,233,RE:,amanda.rybarski@enron.com,mike.maggi@enron.com,1012585956,0
241746,sorry!\n\n,True,131203,798,RE:,michelle.nelson@enron.com,mike.maggi@enron.com,1013011133,0
244551,Permanently?\n\n,True,127175,36,RE:,frank.hayden@enron.com,joe.parks@enron.com,1014416162,0


In [46]:
tmp = results['chains_ge_10'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
24123,Perhaps...,True,124647,30,Re:,chris.dorland@enron.com,mmolloy@oebi.com,963321240,0
31926,sure. maybe.,True,130941,28,Re:,matthew.lenhart@enron.com,shelliott@dttus.com,967212240,0
49710,So.........,True,130003,48,RE:,mark.guzman@enron.com,katie.trullinger@wfsg.com,973506000,0
50356,Cool.\n\n,True,200957,12,RE: FW: What's up?,katie.trullinger@wfsg.com,mark.guzman@enron.com,973600260,0
50804,thanks.,True,130986,13,Re:,matthew.lenhart@enron.com,val.generes@ac.com,973686600,0
...,...,...,...,...,...,...,...,...,...
237701,http://breeders.dogbreedinfo.com/index.php?a_i...,False,126887,41,,eric.bass@enron.com,shanna.husser@enron.com,1012422754,0
239211,when?\n\n,True,122336,233,RE:,amanda.rybarski@enron.com,mike.maggi@enron.com,1012585956,0
241746,sorry!\n\n,True,131203,798,RE:,michelle.nelson@enron.com,mike.maggi@enron.com,1013011133,0
244551,Permanently?\n\n,True,127175,36,RE:,frank.hayden@enron.com,joe.parks@enron.com,1014416162,0


In [47]:
tmp = results['chains_ge_10'][2]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
955,Hi Gerald: We have executed agreements with...,True,23316,21,Re: CA Data Sheet,kay.young@enron.com,gerald.nemec@enron.com,930038100,0
1580,"G, How is it going? Been a while since we sp...",False,127447,27,,gerald.nemec@enron.com,gtownsend@manorisd.net,934198740,0
1608,"GT, The theme of the party sounds excellent. ...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934288740,0
1668,"GT, I will be taking Friday off. Probably dr...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934794240,0
1867,"G, I will be there about 9 pm tonight. My ce...",True,127447,27,Re:,gerald.nemec@enron.com,gtownsend@manorisd.net,935769600,0
...,...,...,...,...,...,...,...,...,...
250216,nothing more than what the rags say..............,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023218609,0
250217,i'm going to las vegas in august for a couple ...,True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219055,0
250218,My last fun trip was to portland. Going to se...,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023219293,0
250219,"dude, serious? Portland...portland is soooooo...",True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219865,0


In [48]:
email_dir = Path(Path.cwd().parent, Path('data/raw/maildir'))

In [50]:
email_dir = Path(Path.cwd().parent, Path('data/raw/maildir'))
data_dir = Path(Path.cwd().parent, Path('data/interim'))


def parse_emails(path):
    with open(path, 'r', encoding='windows-1252') as f:
        parsed_email = email.message_from_file(f)
    return parsed_email


def get_parsed_emails(paths):
    emails = []
    for i, path in enumerate(paths):
        eml = parse_emails(path)
        tms = int(parse(eml['Date']).timestamp())
        emails.append((i, eml, tms))
    return emails


def get_parsed_emails(paths, dic=None):
    for path in paths:
        eml = parse_emails(path)
        temp = {k:v for k, v in eml.items() + [('Message', eml.get_payload()), ('Timestamp', int(parse(eml['Date']).timestamp()))]}
        for k,v in dic.items():
            dic[k].append(temp.get(k))
    return dic


def remove_spaces(string):
    if string is not None:        
        string = re.sub('\s+', ' ', string)
        string = string.split(', ')
    return string


def get_chain(data):
    df_ = data.loc[:, ['key', 'Timestamp', 'Reply']].sort_values(by=['key', 'Reply', 'Timestamp'])
    chains = {}
    counter = 0
    for idx, row in df_.iterrows():
        key = f"{row['key']}_{counter:03d}"
        if key not in chains:
            counter = 0
            key = f"{row['key']}_{counter:03d}"
            chains[key] = {'length': 1, 'email_ids': [idx]}
        else:
            if row['Reply']:
                chains[key]['length'] += 1
                chains[key]['email_ids'].append(idx)
            else:
                counter += 1
                key = f"{row['key']}_{counter:03d}"
                chains[key] = {'length': 1, 'email_ids': [idx]}

    chains_new = {}
    for i, row in enumerate(chains):
        chains_new[i] = {'chain_id': row} | chains[row]

    return chains_new


def get_longest_chain(chain):
    MAX = 0
    longest_chain = []
    for k, v in chain.items():
        if v['length'] > MAX:
            MAX = v['length']
            longest_chain = (k, v['length'], v['email_ids'])
        elif v['length'] == MAX:
            if isinstance(longest_chain, tuple):
                longest_chain = [longest_chain, (k, v['length'] ,v['email_ids'])]
            else:
                longest_chain.append((k, v['length'] ,v['email_ids']))
        
    return longest_chain


def assign_chain_id(data, chain):
    data.loc[:,'Chain'] = None
    data.loc[:,'Chain_len'] = None
    for k,v in chain.items():
        data.loc[v['email_ids'], 'Chain'] = k
        data.loc[v['email_ids'], 'Chain_len'] = int(v['length'])

In [55]:
clean_emails = []
for path in email_dir.rglob('*.'):
    if 'all_documents' not in str(path.parent) and 'discussion_threads' not in str(path.parent):
        clean_emails.append(path)
email_dict = {
    'Message-ID': [],
    'Date': [],
    'From': [],
    'To': [],
    'Subject': [],
    'Cc': [],
    'Mime-Version': [],
    'Content-Type': [],
    'Content-Transfer-Encoding': [],
    'Bcc': [],
    'X-From': [],
    'X-To': [],
    'X-cc': [],
    'X-bcc': [],
    'X-Folder': [],
    'X-Origin': [],
    'X-FileName': [],
    'Message': [],
    'Timestamp': []
}
df = pd.DataFrame(get_parsed_emails(clean_emails, email_dict))

In [56]:
df

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Message,Timestamp
0,<32259334.1075852468311.JavaMail.evans@thyme>,"Thu, 4 Oct 2001 15:05:17 -0700 (PDT)",john.shelk@enron.com,"richard.shapiro@enron.com, linda.robertson@enr...",Summary of Administration Comments on Bingaman...,,1.0,text/plain; charset=us-ascii,7bit,,"Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,\nI have read through the 19 pages of Administ...,1002233117
1,<16152007.1075852468365.JavaMail.evans@thyme>,"Tue, 25 Sep 2001 09:25:07 -0700 (PDT)",john.shelk@enron.com,"richard.shapiro@enron.com, d..steffes@enron.co...",EPSA/EEI on Reliability,"linda.robertson@enron.com, carin.nersesian@enr...",1.0,text/plain; charset=us-ascii,7bit,"linda.robertson@enron.com, carin.nersesian@enr...","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS...","Robertson, Linda </O=ENRON/OU=NA/CN=RECIPIENTS...",,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,\nThis follows up on Rick's inquiry late last ...,1001435107
2,<26474922.1075852468285.JavaMail.evans@thyme>,"Fri, 5 Oct 2001 08:21:31 -0700 (PDT)",john.shelk@enron.com,charles.yeung@enron.com,Reliability and Security Arguments (RTOs),"janel.guerrero@enron.com, d..steffes@enron.com...",1.0,text/plain; charset=us-ascii,7bit,"janel.guerrero@enron.com, d..steffes@enron.com...","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Yeung, Charles </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Guerrero, Janel </O=ENRON/OU=NA/CN=RECIPIENTS/...",,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,\nThis responds to Charles's voice mail and th...,1002295291
3,<10118998.1075852468340.JavaMail.evans@thyme>,"Fri, 28 Sep 2001 12:11:10 -0700 (PDT)",john.shelk@enron.com,"joe.connor@enron.com, richard.ingersoll@enron....",RE: NERC Statements on Impact of Security Thre...,,1.0,text/plain; charset=us-ascii,7bit,,"Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Connor, Joe </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...",,,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,I agree with Joe. The IOUs will point to NERC...,1001704270
4,<24576280.1075861591387.JavaMail.evans@thyme>,"Fri, 2 Nov 2001 05:33:16 -0800 (PST)",john.shelk@enron.com,"d..steffes@enron.com, linda.robertson@enron.co...",Barton Staff Meeting,"john.shelk@enron.com, richard.shapiro@enron.com",1.0,text/plain; charset=us-ascii,quoted-printable,"john.shelk@enron.com, richard.shapiro@enron.com","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Steffes, James D. </O=ENRON/OU=NA/CN=RECIPIENT...","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...",,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,Yesterday I spent about 45 minutes with the th...,1004707996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330684,<1689472.1075857281689.JavaMail.evans@thyme>,"Wed, 22 Nov 2000 03:18:00 -0800 (PST)",fool@motleyfool.com,benjamin.rogers@enron.com,Investing Basics: Gathering Company Information,,1.0,text/plain; charset=ANSI_X3.4-1968,quoted-printable,,The Motley Fool <Fool@MotleyFool.com>,benjamin.rogers@enron.com,,,\Benjamin_Rogers_Dec2000_4\Notes Folders\Motle...,Rogers-B,brogers.nsf,______________________________________________...,974891880
330685,<31985452.1075857282898.JavaMail.evans@thyme>,"Tue, 31 Oct 2000 01:12:00 -0800 (PST)",fool@motleyfool.com,benjamin.rogers@enron.com,Breakfast With The Fool: Liftoff at Expedia,,1.0,text/plain; charset=us-ascii,7bit,,The Motley Fool <Fool@MotleyFool.com>,benjamin.rogers@enron.com,,,\Benjamin_Rogers_Dec2000_4\Notes Folders\Motle...,Rogers-B,brogers.nsf,______________________________________________...,972983520
330686,<4043728.1075852093007.JavaMail.evans@thyme>,"Tue, 29 May 2001 15:21:21 -0700 (PDT)",fool@motleyfool.com,benjamin.rogers@enron.com,"FoolWatch: Tom Gardner, Microsoft and College ...",,1.0,text/plain; charset=us-ascii,7bit,,The Motley Fool <Fool@MotleyFool.com>@ENRON <I...,"Rogers, Benjamin </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\BROGERS (Non-Privileged)\Rogers, Benjamin\Mot...",ROGERS-B,BROGERS (Non-Privileged).pst,\n=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3...,991174881
330687,<24053764.1075857283068.JavaMail.evans@thyme>,"Thu, 26 Oct 2000 02:29:00 -0700 (PDT)",fool@motleyfool.com,benjamin.rogers@enron.com,Breakfast With The Fool: InfoSpace Makes Money...,,1.0,text/plain; charset=us-ascii,7bit,,The Motley Fool <Fool@MotleyFool.com>,benjamin.rogers@enron.com,,,\Benjamin_Rogers_Dec2000_4\Notes Folders\Motle...,Rogers-B,brogers.nsf,______________________________________________...,972552540


In [62]:
df.loc[tmp[tmp['label'] > -1].index,'Message-ID']

58487     <21931006.1075843765071.JavaMail.evans@thyme>
67266     <20343021.1075852783231.JavaMail.evans@thyme>
71590     <11703650.1075840283161.JavaMail.evans@thyme>
74646     <32721671.1075857427471.JavaMail.evans@thyme>
77957     <22797037.1075861578908.JavaMail.evans@thyme>
112864    <14579785.1075855357313.JavaMail.evans@thyme>
120152    <15211290.1075854177642.JavaMail.evans@thyme>
139558     <7508741.1075840715401.JavaMail.evans@thyme>
139594     <2632422.1075840703129.JavaMail.evans@thyme>
167723    <17416328.1075846724449.JavaMail.evans@thyme>
176322    <28504577.1075852177724.JavaMail.evans@thyme>
178335    <24770644.1075862329775.JavaMail.evans@thyme>
200599     <8671127.1075854943082.JavaMail.evans@thyme>
209198     <5654181.1075840041726.JavaMail.evans@thyme>
213331     <8138183.1075856157586.JavaMail.evans@thyme>
223166     <7885015.1075855113677.JavaMail.evans@thyme>
229021     <4947823.1075847137140.JavaMail.evans@thyme>
241232     <9979189.1075847982114.JavaMail.evans

In [70]:
for idx, row in df.loc[tmp[tmp['label'] > -1].index,'Message'].iteritems():
    print(row)

thank you so much.  so the news aint great for utilities?

-----Original Message-----
From: Jeff.Dasovich@enron.com [mailto:Jeff.Dasovich@enron.com]
Sent: Wednesday, January 03, 2001 2:14 PM
To: Kari Dohn
Subject: Additional Materials



Greetings Kari:

Forgive the delay.  Much going on today, PUC draft decision in particular.
The draft does not look promising for the utilities' financial position.

Attached are our comments on the Governor's Proposals and some more detail
on the demand-reduction proposal.  We continue to work on the Nord Pool
research for you and will turn that around as quickly as we can.

Again, don't hesitate to contact me if there's anything else I can help
with, or if there's anything else that you need. (415.782.7822)

Best,
Jeff

(See attached file: Comments on Governor's Proposals 010301 .doc)(See
attached file: Demand buy-down proposal.doc)


Let me think about this.
I love you.
Mom

I did receive your e-mail.

Ken Lay





"Michael Milken" <mmilken@knowledg

In [67]:
df.loc[tmp[tmp['label'] > -1].index,'Message-ID']

58487     <21931006.1075843765071.JavaMail.evans@thyme>
67266     <20343021.1075852783231.JavaMail.evans@thyme>
71590     <11703650.1075840283161.JavaMail.evans@thyme>
74646     <32721671.1075857427471.JavaMail.evans@thyme>
77957     <22797037.1075861578908.JavaMail.evans@thyme>
112864    <14579785.1075855357313.JavaMail.evans@thyme>
120152    <15211290.1075854177642.JavaMail.evans@thyme>
139558     <7508741.1075840715401.JavaMail.evans@thyme>
139594     <2632422.1075840703129.JavaMail.evans@thyme>
167723    <17416328.1075846724449.JavaMail.evans@thyme>
176322    <28504577.1075852177724.JavaMail.evans@thyme>
178335    <24770644.1075862329775.JavaMail.evans@thyme>
200599     <8671127.1075854943082.JavaMail.evans@thyme>
209198     <5654181.1075840041726.JavaMail.evans@thyme>
213331     <8138183.1075856157586.JavaMail.evans@thyme>
223166     <7885015.1075855113677.JavaMail.evans@thyme>
229021     <4947823.1075847137140.JavaMail.evans@thyme>
241232     <9979189.1075847982114.JavaMail.evans

In [73]:
df.loc[58487,'From']

'kari.dohn@gov.ca.gov'

In [72]:
tmp.loc[58487,:]

Message                            d-i-r-t-y
Reply                                   True
Chain                                 130942
Chain_len                                122
Subject                                  RE:
Sender             matthew.lenhart@enron.com
Recipients    shirley.s.elliott@citicorp.com
Timestamp                          975587580
label                                      0
Name: 58487, dtype: object

In [74]:
df_ = pd.read_pickle(Path(data_dir, 'parsed_emails_chains_all.pkl'))

FileNotFoundError: [Errno 2] No such file or directory: '/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_all.pkl'