In [1]:
import email
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
from datetime import datetime, timedelta
from dateutil.parser import parse
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec

plt.rcParams['figure.dpi'] = 100
plt.rcParams["figure.autolayout"] = True

In [2]:
# Project directories, resolved relative to the parent of the current working directory.
data_dir = Path(Path.cwd().parent, 'data/interim')
models_dir = Path(Path.cwd().parent, 'models')
evals_dir = Path(data_dir, 'evals_6') # evals_4
labels_dir = Path(data_dir, 'labels_4') # Path(data_dir, 'labels_2')

# Sort position of each dataset (used to order rows of the evaluation frames).
datasets_order = {
    'chains_eq_2': 0,
    'chains_eq_3': 1,
    'chains_ge_4_lt_10': 2,
    'chains_ge_10': 3}

# Display name of each dataset (LaTeX math markup; CL presumably stands for chain length).
datasets_names = {
    'chains_eq_2': r'CL $= 2$',
    'chains_eq_3': r'CL $= 3$',
    'chains_ge_4_lt_10': r'10 $>$ CL $\geq$ 4',
    'chains_ge_10': r'CL $\geq$ 10'}

# Number of document vectors held by each saved Doc2Vec model, keyed by the
# model file stem with its first '_'-separated token removed.
sizes = {}
for path in models_dir.glob('d2v*.model'):
    size = len(Doc2Vec.load(str(path)).dv.vectors)
    name = '_'.join(path.stem.split('_')[1:])
    sizes[name] = size


In [3]:
def _load_evals(pattern, method, has_noise):
    """Read all evaluation CSVs in ``evals_dir`` matching ``pattern`` into one frame.

    Parameters
    ----------
    pattern : str
        Glob pattern applied to ``evals_dir``.
    method : str
        Value written to the ``method`` column.
    has_noise : bool
        When True the CSVs are expected to carry an ``n_noise`` column; the
        frame then gets ``size`` and ``prop_noise`` columns and ``k_noise`` is
        a "<n_clusters> (<prop_noise>)" string. When False ``k_noise`` is just
        ``n_clusters`` cast to int.

    Returns
    -------
    pandas.DataFrame
        The concatenated evaluations, sorted by the dataset ``order`` column.
    """
    evals = pd.concat([pd.read_csv(path) for path in evals_dir.glob(pattern)], ignore_index=True)
    if has_noise:
        # Looked up before 'dataset' loses its trailing token below, i.e. with
        # the full name as stored in the CSV.
        evals['size'] = evals['dataset'].map(sizes)
        evals['prop_noise'] = evals['n_noise']/evals['size']
    # The last '_'-separated token of the CSV's dataset name becomes 'dimensions';
    # the rest stays as the dataset name.
    evals['dimensions'] = evals['dataset'].apply(lambda s: s.split('_')[-1])
    evals['dataset'] = evals['dataset'].apply(lambda s: '_'.join(s.split('_')[:-1]))
    evals['order'] = evals['dataset'].map(datasets_order)
    evals['name'] = evals['dataset'].map(datasets_names)
    evals = evals.sort_values('order')
    if has_noise:
        evals['k_noise'] = evals.apply(lambda x: f"{x['n_clusters']} ({x['prop_noise']:.3f})", axis=1)
    else:
        evals['k_noise'] = evals['n_clusters'].astype(int)
    evals['method'] = method
    return evals


df_kmeans = _load_evals('eval_km*.csv', 'KMeans', has_noise=False)

df_dbscan = _load_evals('eval_dbscan*.csv', 'DBSCAN', has_noise=True)
# Runs with epsilon exactly 0.2 are excluded from the DBSCAN results.
df_dbscan = df_dbscan.loc[df_dbscan['epsilon'] != 0.2,:]

df_hdbscan = _load_evals('*_hdbscan*.csv', 'HDBSCAN', has_noise=True)

# Dataset names in display order, taken from the DBSCAN evaluations.
datasets = df_dbscan.sort_values(by='order')['dataset'].unique()

eval_scores = ['sl_score', 'ch_score', 'db_score', 'entropy']

# Tables of best scores

In [4]:
# def labels_km(df):
#     return f'labels_km_{df.loc[0,"dataset"]}_{df.loc[0,"dimensions"]}_{df.loc[0,"n_clusters"]:02d}_{df.loc[0,"distance"]}.pkl'

# def labels_db(df):
#     return f'labels_dbscan_{df.loc[0,"dataset"]}_{df.loc[0,"dimensions"]}_{df.loc[0,"epsilon"].astype(str)[:7]}*_{df.loc[0,"min_pts"].astype(int):02d}_{df.loc[0,"distance"]}.pkl'

# def labels_hd(df):
#     return f'labels_hdbscan_{df.loc[0,"dataset"]}_{df.loc[0,"dimensions"]}_{df.loc[0,"min_clt_size"].astype(int):02d}_{df.loc[0,"min_samples"].astype(int):02d}_{df.loc[0,"distance"]}.pkl'

def eps_check(eps):
    """Render an epsilon value as the text used inside a label file name.

    The value is converted with ``str``. Text longer than 8 characters is cut
    to its first 7 characters with a ``*`` appended; anything shorter is
    returned unchanged.
    """
    text = str(eps)
    return text if len(text) <= 8 else f'{text[:7]}*'


def labels_km(df, dataset):
    """Build the label file name for a KMeans run.

    Parameters
    ----------
    df : mapping (e.g. a DataFrame row)
        Must provide 'dimensions', 'n_clusters' and 'distance'.
    dataset : str
        Dataset name inserted into the file name.

    Returns
    -------
    str
        'labels_km_<dataset>_<dimensions>_<n_clusters, 2 digits>_<distance>.pkl'
    """
    # Cast first, as labels_db/labels_hd do for their numeric fields: the 'd'
    # format code rejects floats, which is what a count becomes once its column
    # has been upcast (e.g. by concatenation with rows lacking the value).
    n_clusters = int(df["n_clusters"])
    return f'labels_km_{dataset}_{df["dimensions"]}_{n_clusters:02d}_{df["distance"]}.pkl'


def labels_db(df, dataset):
    """Build the label file name for a DBSCAN run.

    ``df`` must provide 'dimensions', 'epsilon', 'min_pts' and 'distance'.
    Epsilon is rendered through ``eps_check``; ``min_pts`` is cast to int and
    zero-padded to two digits.
    """
    dims = df["dimensions"]
    eps_text = eps_check(df["epsilon"])
    min_pts = int(df["min_pts"])
    metric = df["distance"]
    return f'labels_dbscan_{dataset}_{dims}_{eps_text}_{min_pts:02d}_{metric}.pkl'


def labels_hd(df, dataset):
    """Build the label file name for an HDBSCAN run.

    ``df`` must provide 'dimensions', 'min_clt_size', 'min_samples' and
    'distance'. Both numeric fields are cast to int and zero-padded to two
    digits.
    """
    dims = df["dimensions"]
    min_cluster_size = int(df["min_clt_size"])
    min_samples = int(df["min_samples"])
    metric = df["distance"]
    return f'labels_hdbscan_{dataset}_{dims}_{min_cluster_size:02d}_{min_samples:02d}_{metric}.pkl'

In [5]:
# Notebook-level accumulator: get_best_labs() writes one entry per dataset into
# this dict and returns it.
best_labs_all = {}
# Columns (and their order) kept when the best rows per score are collected.
ord_cols = ['method', 'distance', 'dimensions', 'k_noise', 'sl_score', 'ch_score', 'db_score',
            'entropy', 'epsilon', 'min_pts', 'nn', 'min_clt_size', 'min_samples', 'n_clusters']
# Display names for the columns above; get_best_labs() also uses this mapping to
# turn a score column name into the value of its 'focus' column.
# NOTE(review): 'k_noise' is labelled "% Noise", but the DBSCAN/HDBSCAN frames
# format prop_noise as a proportion with three decimals -- confirm the label.
ord_name = {
    'method':'Method',
    'distance':'Distance',
    'dimensions':'Dim',
    'k_noise': r'$k$ (% Noise)',
    'sl_score':'SL',
    'ch_score':'CH',
    'db_score':'DB',
    'entropy':'Entropy',
    'epsilon': 'Eps',
    'min_pts': 'MinPts',
    'nn': 'NN',
    'min_clt_size': 'Min Clt Size',
    'min_samples': 'Min Samples',
    'n_clusters': 'N Clusters',
    'focus': 'Score'
}
# Display names for distance metrics (the only reference in get_best_labs is commented out).
dis_name = {'euclidean':'Euclidean', 'cosine': 'Cosine', 'wmd':'WMD', 'l2':'L2 Norm'}
# One-shot iterators over datasets and scores.
# NOTE(review): no use of these two names is visible in this part of the notebook.
iterator_datasets = iter(datasets)
iterator_scores = iter(eval_scores)


In [6]:
def get_best_labs(datasets, df1, df2, df3, eval_scores, ord_name, ord_cols):
    """Collect, per dataset and score, the label file of each method's best run.

    For every dataset and every score, the best-scoring row of each of the
    three evaluation frames is selected, and the label file name of that run
    is built with ``labels_km`` / ``labels_db`` / ``labels_hd`` according to
    the row's 'method' value.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names to process (matched against the 'dataset' column).
    df1, df2, df3 : pandas.DataFrame
        Evaluation frames; each needs 'dataset', 'method', the score columns
        and the fields required by the matching label builder.
    eval_scores : iterable of str
        Score column names. 'db_score' and 'entropy' are minimised, every
        other score is maximised.
    ord_name : dict
        Maps a score column name to the display name used as result key.
    ord_cols : list of str
        Columns kept from the selected rows.

    Returns
    -------
    dict
        ``{dataset: {score display name: {method token: label file name}}}``
        where the method token is the second '_'-separated part of the file
        name (e.g. 'km', 'dbscan', 'hdbscan').

    Raises
    ------
    ValueError
        If a row carries a 'method' other than KMeans, DBSCAN or HDBSCAN.

    Notes
    -----
    Results are written into the notebook-level ``best_labs_all`` dict (it is
    not a parameter) and that same dict is returned; later cells read it by
    that name.
    """
    lower_is_better = ('db_score', 'entropy')
    builders = {'KMeans': labels_km, 'DBSCAN': labels_db, 'HDBSCAN': labels_hd}

    for dat in datasets:
        # Best row of each method for every score, gathered and concatenated once.
        per_score = []
        for score in eval_scores:
            asc = score in lower_is_better
            tmp1 = pd.concat([
                frame.loc[frame['dataset'] == dat, :].sort_values(score, ascending=asc).head(1)
                for frame in (df1, df2, df3)
            ])
            tmp1['focus'] = score
            tmp1['focus'] = tmp1['focus'].map(ord_name)
            # tmp1['distance'] = tmp1['distance'].map(dis_name)
            tmp1 = tmp1.sort_values(score, ascending=asc)
            per_score.append(tmp1.loc[:, ord_cols + ['focus']].dropna(how='all', axis=1))
        tab_best = pd.concat(per_score) if per_score else pd.DataFrame()

        # A single pass over the rows is enough: each row already carries its
        # score in 'focus'. Insertion order is kept, so the result is deterministic.
        best_labels = {}
        for _, row in tab_best.iterrows():
            builder = builders.get(row['method'])
            if builder is None:
                # Fail loudly rather than reuse a label left over from a previous row.
                raise ValueError(f"Unknown clustering method: {row['method']!r}")
            label = builder(row, dat)
            best_labels.setdefault(row['focus'], {})[label.split('_')[1]] = label

        best_labs_all[dat] = best_labels
    return best_labs_all

In [7]:
# get_best_labs() fills and returns the notebook-level `best_labs_all` dict, so
# `best_labels_all` is a second name for that same object.
best_labels_all = get_best_labs(datasets, df_kmeans, df_dbscan, df_hdbscan, eval_scores, ord_name, ord_cols)
    

In [8]:
# One pickled object per dataset, read from data_dir and keyed by dataset name.
# Later cells index these as DataFrames and add a 'cluster' column to them.
parsed_emails = {dat:pd.read_pickle(Path(data_dir,f'parsed_emails_{dat}.pkl')) for dat in datasets}

In [9]:
def get_cluster_tables(df):
    """Pivot per-cluster counts into a table with one column per (Score, Method).

    The 'index' and 'label' columns are renamed to 'k' and 'Counts'; the frame
    is pivoted with 'k' as the index and 'Score'/'Method' as column levels,
    using -1 as fill value. The result is cast to str and every '-1' string is
    replaced by NaN.
    """
    renamed = df.rename(columns={'index':'k', 'label':'Counts'})
    pivoted = renamed.pivot_table(index='k', columns=['Score', 'Method'], fill_value=-1)
    as_text = pivoted.astype(str)
    return as_text.replace('-1', np.nan)

In [10]:
def print_emails(df):
    """Print every email in ``df`` as a short plain-text block.

    Each row needs 'cluster', 'Chain', 'Timestamp', 'Subject', 'Sender',
    'Recipients' and 'Message'. 'Timestamp' is passed to
    ``datetime.fromtimestamp``, so the printed date is in the machine's local
    time zone. A blank separator follows each email.
    """
    for row_label, email in df.iterrows():
        print(f"<Cluster: {email['cluster']}>, Chain ID: {email['Chain']:06d}, Index: {row_label}")
        print(f"Date: {datetime.fromtimestamp(email['Timestamp'])}")
        print(f"Subject: {email['Subject']}")
        print(f"Sender: {email['Sender']}, Recipient: {email['Recipients']}")
        print(f"Message: {email['Message']}")
        print('\n')

In [11]:
def get_labeled(label, dat, parsed_emails):
    """Return the emails of dataset ``dat`` with a 'cluster' column attached.

    Parameters
    ----------
    label : str
        File name of the pickled cluster labels inside ``labels_dir``.
    dat : str
        Key of the dataset in ``parsed_emails``.
    parsed_emails : dict
        Maps dataset name to its emails DataFrame.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``parsed_emails[dat]`` plus a 'cluster' column holding
        the loaded labels.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load label
    # files produced by this project.
    labs_array = pd.read_pickle(Path(labels_dir, label))
    # assign() builds a new frame instead of writing into the shared
    # parsed_emails entry, so frames returned by earlier calls keep their own
    # labels rather than being silently overwritten by the next call.
    return parsed_emails[dat].assign(cluster=labs_array)

In [12]:
# Score display names, in the order used as the second index of `ids`.
scr = ['SL','CH','DB','Entropy']
# Hand-picked method token for each (dataset, score) pair: one inner list per
# dataset, one entry per score in `scr`. The tokens are the keys of the
# innermost dicts of best_labs_all ('km', 'dbscan', 'hdbscan').
ids = [['dbscan', 'dbscan', 'dbscan', 'dbscan'], ['dbscan', 'dbscan', 'dbscan', 'dbscan'], ['dbscan', 'dbscan', 'dbscan', 'dbscan'], ['dbscan', 'dbscan', 'dbscan', 'km']]
# ids = [[2, 2, 0, 1], [1, 1, 1, 1], [2, 1, 2, 2], [1, 2, 2, 1]]
# ids = [[1, 2, 0, 1], [1, 2, 2, 0], [2, 0, 1, 2], [2, 2, 2, 0]]
# ids = [[1, 2, 0, 1], [1, 2, 2, 0], [2, 0, 1, 2], [2, 2, 2, 0]]

In [13]:
best_labs_all

{'chains_eq_2': {'SL': {'hdbscan': 'labels_hdbscan_chains_eq_2_50_05_34_euclidean.pkl',
   'dbscan': 'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
   'km': 'labels_km_chains_eq_2_300_02_euclidean.pkl'},
  'CH': {'dbscan': 'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
   'km': 'labels_km_chains_eq_2_50_02_euclidean.pkl',
   'hdbscan': 'labels_hdbscan_chains_eq_2_50_02_27_euclidean.pkl'},
  'DB': {'dbscan': 'labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl',
   'hdbscan': 'labels_hdbscan_chains_eq_2_50_05_32_euclidean.pkl',
   'km': 'labels_km_chains_eq_2_300_20_euclidean.pkl'},
  'Entropy': {'dbscan': 'labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl',
   'km': 'labels_km_chains_eq_2_50_02_euclidean.pkl',
   'hdbscan': 'labels_hdbscan_chains_eq_2_300_02_33_euclidean.pkl'}},
 'chains_eq_3': {'SL': {'hdbscan': 'labels_hdbscan_chains_eq_3_300_02_06_l2.pkl',
   'dbscan': 'labels_dbscan_chains_eq_3_50_0.27_02_wmd.pkl',
   'km': 'labels_km_chains_eq_3_300_02_euclidean.pkl'},


In [14]:
datasets

array(['chains_eq_2', 'chains_eq_3', 'chains_ge_4_lt_10', 'chains_ge_10'],
      dtype=object)

In [15]:
# Flatten the hand-picked selection into a list of label file names: four
# consecutive entries (one per score in `scr`) for each of the four datasets.
# NOTE: despite its name, `dfs` holds file-name strings, not DataFrames.
dfs = []
for i in range(4):
    dat = datasets[i]
    for j in range(4):
        # Look up dataset -> score display name -> chosen method token.
        L = best_labs_all[dat][scr[j]][ids[i][j]]
        dfs.append(L)

In [16]:
dfs

['labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
 'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
 'labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl',
 'labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl',
 'labels_dbscan_chains_eq_3_50_0.27_02_wmd.pkl',
 'labels_dbscan_chains_eq_3_50_0.78_04_euclidean.pkl',
 'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl',
 'labels_dbscan_chains_eq_3_50_0.43_15_cosine.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.69_08_euclidean.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.43_07_cosine.pkl',
 'labels_dbscan_chains_ge_10_300_0.63_02_euclidean.pkl',
 'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl',
 'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl',
 'labels_km_chains_ge_10_300_02_euclidean.pkl']

## Chains of length 2

In [17]:
# Stateful cursors over the datasets and over the positions in `dfs`.
# Each later `next(...)` cell advances them, so re-running such a cell (or
# running cells out of order) changes which dataset / label file is current.
iter_dat = iter(datasets)
iter_lab = iter(range(len(dfs)))
dat = next(iter_dat)
lab = next(iter_lab)

In [18]:
dfs[lab]

'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl'

In [19]:
# Attach the selected run's cluster labels through the notebook's get_labeled()
# helper instead of repeating its load-and-assign steps inline.
df = get_labeled(dfs[lab], dat, parsed_emails)

In [20]:
df['cluster'].value_counts()

-1    23522
 1      314
 0        5
Name: cluster, dtype: int64

In [21]:
df[df['cluster'] == 1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
4089,$575k,True,84439,2,Re: Havamann Arbitration PRIVILEGED AND CONFID...,richard.sanders@enron.com,john.nowlan@enron.com,945155460,1
11591,Ughh...,True,67274,2,Re: EnronOnline-Entergy,tana.jones@enron.com,leslie.hansen@enron.com,954233040,1
12984,thanks!!!,True,41373,2,Re: Crosstex / Proliance,debra.perlingiere@enron.com,russell.diamond@enron.com,955466700,1
13207,Thanks!!!,True,126092,2,Re:,debra.perlingiere@enron.com,monica.richards@enron.com,955624740,1
14398,2000-1969=31,True,381,2,Re: #30,phillip.allen@enron.com,hargr@webtv.net,956665320,1
...,...,...,...,...,...,...,...,...,...
246817,FYI...\n\n,False,15992,2,FW: Assignments for March 23,john.watson@pdq.net,kimberly.watson@enron.com,1015950664,1
247669,Cool.\n\n,True,49462,2,RE: Dominion Transmission Notices,chris.germany@enron.com,kathryn.bussell@enron.com,1016652403,1
249374,IN? WHEN?\n\n,True,213777,2,RE: man night again?,joe.parks@enron.com,"brianc@saltgrass.com, erwollam@hotmail.com, bc...",1020173281,1
249960,http://hometown.aol.com/trogg522/myhomepage/in...,False,43396,2,Daddy's little Angel,chris.germany@enron.com,jfoard@coral-energy.com,1022245763,1


In [38]:
s1 = df.loc[df['cluster']==1,'Chain'].value_counts()
s1

41882     2
8141      2
28526     2
150121    2
213502    2
         ..
145632    1
180907    1
117525    1
51905     1
104155    1
Name: Chain, Length: 301, dtype: int64

In [40]:
s1[s1>1].index

Int64Index([ 41882,   8141,  28526, 150121, 213502,  17740,  44392, 201141,
            102871, 124020, 182377, 101968,  90830],
           dtype='int64')

In [43]:
print_emails(df.loc[df['Chain'].isin(s1[s1>1].index),:])

<Cluster: 1>, Chain ID: 101968, Index: 82591
Date: 2001-02-22 12:30:00
Subject: Re: Location
Sender: jane.tholt@enron.com, Recipient: sara.solorio@enron.com
Message: eb3209c


<Cluster: 1>, Chain ID: 101968, Index: 95713
Date: 2001-04-03 08:57:00
Subject: Re: Location
Sender: jane.tholt@enron.com, Recipient: sara.solorio@enron.com
Message: EB3209C


<Cluster: 1>, Chain ID: 182377, Index: 96398
Date: 2001-04-04 13:37:00
Subject: Re: Test
Sender: kevin.ruscitti@enron.com, Recipient: elisabeth_a_ruscitti@reliantenergy.com
Message: http://www.fortune.com/


<Cluster: 1>, Chain ID: 182377, Index: 96696
Date: 2001-04-04 23:37:00
Subject: Re: Test
Sender: kevin.ruscitti@enron.com, Recipient: elisabeth_a_ruscitti@reliantenergy.com
Message: http://www.fortune.com/ 


<Cluster: 1>, Chain ID: 201141, Index: 102102
Date: 2001-04-18 11:43:00
Subject: Re: Whispering Pines Golf Outing
Sender: hunter.shively@enron.com, Recipient: becky.young@enron.com
Message: 28


<Cluster: 1>, Chain ID: 201141, Inde

In [34]:
df.loc[df['Chain'].isin(df.loc[df['cluster'] == 1, 'Chain']),:].sort_values(by=['cluster', 'Timestamp', 'Chain'])

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
4116,Yes. We have approx. $110 k sitting with our a...,True,84439,2,Re: Havamann Arbitration PRIVILEGED AND CONFID...,richard.sanders@enron.com,john.nowlan@enron.com,945167520,-1
11524,"As usual, you are the best!",True,67274,2,Re: EnronOnline-Entergy,tana.jones@enron.com,leslie.hansen@enron.com,954160080,-1
12969,What is the status of these to entities? \n\n...,False,41373,2,Crosstex / Proliance,debra.perlingiere@enron.com,russell.diamond@enron.com,955463460,-1
13189,Do you still have the draft for Citizens Utili...,False,126092,2,,debra.perlingiere@enron.com,monica.richards@enron.com,955621800,-1
14765,"Kay & Neal,\n\nThanks for remembering my birth...",True,381,2,Re: #30,phillip.allen@enron.com,hargr@webtv.net,956842800,-1
...,...,...,...,...,...,...,...,...,...
246817,FYI...\n\n,False,15992,2,FW: Assignments for March 23,john.watson@pdq.net,kimberly.watson@enron.com,1015950664,1
247669,Cool.\n\n,True,49462,2,RE: Dominion Transmission Notices,chris.germany@enron.com,kathryn.bussell@enron.com,1016652403,1
249374,IN? WHEN?\n\n,True,213777,2,RE: man night again?,joe.parks@enron.com,"brianc@saltgrass.com, erwollam@hotmail.com, bc...",1020173281,1
249960,http://hometown.aol.com/trogg522/myhomepage/in...,False,43396,2,Daddy's little Angel,chris.germany@enron.com,jfoard@coral-energy.com,1022245763,1


In [23]:
print_emails(df.loc[df['Chain'].isin(df.loc[df['cluster'] == 1, 'Chain']),:].sort_values(by=['Timestamp','Chain']))

<Cluster: 1>, Chain ID: 084439, Index: 4089
Date: 1999-12-14 08:11:00
Subject: Re: Havamann Arbitration PRIVILEGED AND CONFIDENTIAL
Sender: richard.sanders@enron.com, Recipient: john.nowlan@enron.com
Message: $575k


<Cluster: -1>, Chain ID: 084439, Index: 4116
Date: 1999-12-14 11:32:00
Subject: Re: Havamann Arbitration PRIVILEGED AND CONFIDENTIAL
Sender: richard.sanders@enron.com, Recipient: john.nowlan@enron.com
Message: Yes. We have approx. $110 k sitting with our attorneys in London and we have 
a claim under our charterers insurance policy----which we are pursuing 
aggressively. You  may be able to reserve some amount for this potential 
recovery. 


<Cluster: -1>, Chain ID: 067274, Index: 11524
Date: 2000-03-27 14:28:00
Subject: Re: EnronOnline-Entergy
Sender: tana.jones@enron.com, Recipient: leslie.hansen@enron.com
Message: As usual, you are the best!


<Cluster: 1>, Chain ID: 067274, Index: 11591
Date: 2000-03-28 10:44:00
Subject: Re: EnronOnline-Entergy
Sender: tana.jones@enro

In [197]:
print_emails(df.loc[df['Chain'].isin(df.loc[df['cluster'] == 0, 'Chain']),:].sort_values(by=['Timestamp','Chain']))

<Cluster: -1>, Chain ID: 209486, Index: 782
Date: 1999-06-03 17:41:00
Subject: derivatives documentation software
Sender: tana.jones@enron.com, Recipient: ian.howells@documentum.com
Message: I have been given the name of your company by a consultant we have hired to 
advise us on sofware systems to manage our physical and financial 
confirmation process and ISDA Master Agreements.  I have also seen 
information on your company in the January 1999 issue of Risk Magazine.  I am 
a Senior Legal Specialist in the Legal Dept. and am looking at software 
systems that can help us manage our documentation needs.

If you are unfamiliar with our company,  Enron is the largest integrated 
marketer of energy in the United States.  We have a Web Page located at 
www.enron.com.  Enron Capital & Trade Resources Corp. is the marketing 
affiliate of our parent, Enron Corp.

I would like to obtain marketing material about the services you provide, and 
after I have a chance to look at the information, w

In [None]:
print_emails(df.loc[df['Chain'].isin(df.loc[df['cluster'] == 0, 'Chain']),:].sort_values(by=['Timestamp','Chain']))

In [30]:
s1 = df[df['cluster']==1].sort_values(by=['Chain','Timestamp'])['Chain'].value_counts()

In [31]:
s1

17740     2
213502    2
41882     2
102871    2
201141    2
         ..
91659     1
90623     1
90481     1
89215     1
219871    1
Name: Chain, Length: 301, dtype: int64

In [28]:
# NOTE(review): this indexes s1 by its own values rather than by a boolean
# mask, and the recorded result is empty. The `s1 > 1` mask used in the
# neighbouring cells looks like what was intended.
s1[s1].index

Int64Index([], dtype='int64')

In [32]:
len(s1[s1 > 1])

13

In [215]:
print_emails(df.loc[df['Chain'].isin(s1[s1 >1].index),:].sort_values(by=['Timestamp','Chain']))

<Cluster: 1>, Chain ID: 101968, Index: 82591
Date: 2001-02-22 12:30:00
Subject: Re: Location
Sender: jane.tholt@enron.com, Recipient: sara.solorio@enron.com
Message: eb3209c


<Cluster: 1>, Chain ID: 101968, Index: 95713
Date: 2001-04-03 08:57:00
Subject: Re: Location
Sender: jane.tholt@enron.com, Recipient: sara.solorio@enron.com
Message: EB3209C


<Cluster: 1>, Chain ID: 182377, Index: 96398
Date: 2001-04-04 13:37:00
Subject: Re: Test
Sender: kevin.ruscitti@enron.com, Recipient: elisabeth_a_ruscitti@reliantenergy.com
Message: http://www.fortune.com/


<Cluster: 1>, Chain ID: 182377, Index: 96696
Date: 2001-04-04 23:37:00
Subject: Re: Test
Sender: kevin.ruscitti@enron.com, Recipient: elisabeth_a_ruscitti@reliantenergy.com
Message: http://www.fortune.com/ 


<Cluster: 1>, Chain ID: 201141, Index: 102102
Date: 2001-04-18 11:43:00
Subject: Re: Whispering Pines Golf Outing
Sender: hunter.shively@enron.com, Recipient: becky.young@enron.com
Message: 28


<Cluster: 1>, Chain ID: 201141, Inde

In [216]:
print_emails(df.loc[df['Chain'].isin(s1[s1 ==1].index),:].sort_values(by=['Timestamp','Chain']))

<Cluster: 1>, Chain ID: 084439, Index: 4089
Date: 1999-12-14 08:11:00
Subject: Re: Havamann Arbitration PRIVILEGED AND CONFIDENTIAL
Sender: richard.sanders@enron.com, Recipient: john.nowlan@enron.com
Message: $575k


<Cluster: -1>, Chain ID: 084439, Index: 4116
Date: 1999-12-14 11:32:00
Subject: Re: Havamann Arbitration PRIVILEGED AND CONFIDENTIAL
Sender: richard.sanders@enron.com, Recipient: john.nowlan@enron.com
Message: Yes. We have approx. $110 k sitting with our attorneys in London and we have 
a claim under our charterers insurance policy----which we are pursuing 
aggressively. You  may be able to reserve some amount for this potential 
recovery. 


<Cluster: -1>, Chain ID: 067274, Index: 11524
Date: 2000-03-27 14:28:00
Subject: Re: EnronOnline-Entergy
Sender: tana.jones@enron.com, Recipient: leslie.hansen@enron.com
Message: As usual, you are the best!


<Cluster: 1>, Chain ID: 067274, Index: 11591
Date: 2000-03-28 10:44:00
Subject: Re: EnronOnline-Entergy
Sender: tana.jones@enro

In [221]:
lab = next(iter_lab)

In [222]:
lab = next(iter_lab)

2

In [223]:
# Attach the selected run's cluster labels through the notebook's get_labeled()
# helper instead of repeating its load-and-assign steps inline.
df = get_labeled(dfs[lab], dat, parsed_emails)

In [224]:
df['cluster'].value_counts()

-1    23525
 0      314
 1        2
Name: cluster, dtype: int64

In [226]:
df[df['cluster'] == 1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
5471,"Laura,\n\nCongratulations. Well deserved.\n\nV...",False,37831,2,Congrats,vince.kaminski@enron.com,laura.luce@enron.com,947585580,1
11999,Per your request...\n,False,104809,2,Mac Definitions,tana.jones@enron.com,brent.hendry@enron.com,954493380,1


In [231]:
print_emails(df[df['Chain'].isin(df[df['cluster'] == 1]['Chain'])])

<Cluster: 1>, Chain ID: 037831, Index: 5471
Date: 2000-01-11 11:13:00
Subject: Congrats
Sender: vince.kaminski@enron.com, Recipient: laura.luce@enron.com
Message: Laura,

Congratulations. Well deserved.

Vince


<Cluster: -1>, Chain ID: 037831, Index: 5480
Date: 2000-01-11 11:50:00
Subject: Re: Congrats
Sender: laura.luce@enron.com, Recipient: vince.kaminski@enron.com
Message: Vince,

You beat me to the congrats.  The surprise was that I already believed you 
were a Managing Director, so a long overdue congratulations to you.

LauraVince J Kaminski@ECT
01/11/2000 10:13 AM
To: Laura Luce/HOU/ECT@ECT
cc:  
Subject: Congrats

Laura,

Congratulations. Well deserved.

Vince




<Cluster: 1>, Chain ID: 104809, Index: 11999
Date: 2000-03-31 11:03:00
Subject: Mac Definitions
Sender: tana.jones@enron.com, Recipient: brent.hendry@enron.com
Message: Per your request...



<Cluster: -1>, Chain ID: 104809, Index: 12035
Date: 2000-03-31 14:25:00
Subject: Re: Mac Definitions
Sender: brent.hendry@enro

## Chains of length 3

In [25]:
dat = next(iter_dat)
dat

'chains_eq_3'

In [33]:
lab = next(iter_lab)
print(lab)
dfs[lab]

6


'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl'

In [237]:
# Attach the selected run's cluster labels through the notebook's get_labeled()
# helper instead of repeating its load-and-assign steps inline.
df = get_labeled(dfs[lab], dat, parsed_emails)

In [238]:
df['cluster'].value_counts()

0    8766
1       3
Name: cluster, dtype: int64

In [239]:
df[df['cluster'] == 1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
173501,This news item involves a project to export Bo...,False,71164,3,FYI - LNG Terminal in California - using Boliv...,jose.bestard@enron.com,richard.shapiro@enron.com,1003427561,1
195269,"nu znachit delo bilo tak:\n\npriexal, potomkaz...",True,131320,3,RE:,vladi.pimenov@enron.com,nshand@condenast.co.uk,1005584282,1
195312,"V obshem ,vse po poriadku.\n\nKogda priexal, u...",True,131320,3,RE:,vladi.pimenov@enron.com,nshand@condenast.co.uk,1005586003,1


In [240]:
print_emails(df[df['Chain'].isin(df[df['cluster'] == 1]['Chain'])])

<Cluster: 0>, Chain ID: 071164, Index: 173224
Date: 2001-10-18 17:35:57
Subject: RE: FYI - LNG Terminal in California - using Bolivina gas
Sender: richard.shapiro@enron.com, Recipient: jose.bestard@enron.com
Message: What's our potential involvement?

 


<Cluster: 1>, Chain ID: 071164, Index: 173501
Date: 2001-10-18 19:52:41
Subject: FYI - LNG Terminal in California - using Bolivina gas
Sender: jose.bestard@enron.com, Recipient: richard.shapiro@enron.com
Message: This news item involves a project to export Bolivian gas through Chile or Peru to California. Proyecto Pacific LNG ser? presentado en California

http://energypress.com/cgi-bin/npublisher/extras/viewnews.cgi?category=1&id=1003264703 

Este lunes 15 de octubre, una delegaci?n boliviana estar? en California para realizar el mismo trabajo que hizo hace semanas atr?s, con M?xico: presentar el proyecto oficialmente a las autoridades californianas y a las empresas petroleras privadas para su consideraci?n.

Los representantes de Ca

In [244]:
dfs

['labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
 'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
 'labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl',
 'labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl',
 'labels_dbscan_chains_eq_3_50_0.27_02_wmd.pkl',
 'labels_dbscan_chains_eq_3_50_0.78_04_euclidean.pkl',
 'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl',
 'labels_dbscan_chains_eq_3_50_0.43_15_cosine.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.69_08_euclidean.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl',
 'labels_dbscan_chains_ge_4_lt_10_50_0.43_07_cosine.pkl',
 'labels_dbscan_chains_ge_10_300_0.63_02_euclidean.pkl',
 'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl',
 'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl',
 'labels_km_chains_ge_10_300_02_euclidean.pkl']

In [245]:
lab = next(iter_lab)

In [246]:
dfs[lab]

'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl'

In [34]:
# Attach the selected run's cluster labels through the notebook's get_labeled()
# helper instead of repeating its load-and-assign steps inline.
df = get_labeled(dfs[lab], dat, parsed_emails)

In [35]:
df['cluster'].value_counts()

0    8766
1       3
Name: cluster, dtype: int64

In [36]:
df[df['cluster'] == 0]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
142,"\nHey Paul, how is it going?? Attached you'll...",False,87415,3,How are you?,educanto@msn.com,d..thomas@enron.com,883935960,0
421,"Maria,\n\nThe Clearing docs we got in from the...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925474740,0
424,"Mark,\n\nDoes this mean that you would prefer ...",True,49280,3,Re: Documentation from OM,maria.nartey@enron.com,"mark.elliott@enron.com, richard.sage@enron.com...",925482120,0
425,"Maria,\n\nNot necessarily - it is just that th...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925485840,0
503,Wow - that is one nasty looking storm out ther...,False,112512,3,Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926502600,0
...,...,...,...,...,...,...,...,...,...
250676,She is going to print all the Appalachian Prod...,True,16029,3,RE: Assistant to print contracts,chris.germany@enron.com,"ed.mcmichael@enron.com, ruth.concannon@enron.com",1024576950,0
250686,OK to both. Let's use Heather Choate too if i...,True,16029,3,RE: Assistant to print contracts,ed.mcmichael@enron.com,"chris.germany@enron.com, ruth.concannon@enron.com",1024588182,0
250700,does that mean i need to cover\n \n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602537,0
250703,9369 TOMORROW\n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602851,0


In [38]:
s1 = df[df['cluster']==0].sort_values(by=['Timestamp','Chain'])['Chain'].value_counts()

In [39]:
s1

87415     3
189979    3
168135    3
23239     3
113212    3
         ..
95209     2
29346     2
131320    1
220192    1
200159    1
Name: Chain, Length: 2946, dtype: int64

In [43]:
df.loc[df['Chain'].isin(s1[s1==3].index),:].sort_values(by=['Chain','Timestamp']).iloc[:12,:]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
105284,Yair:\n\nTracy and I would like to discuss the...,False,321,3,"""Transfer"" provision",sara.shackleton@enron.com,"yyaish@exchange.ml.com, tracy.ngo@enron.com, t...",988137780,0
105730,Yair:\n\nI am unavailable Wed - Fri next week....,True,321,3,"RE: ""Transfer"" provision",sara.shackleton@enron.com,"yyaish@exchange.ml.com, tracy.ngo@enron.com, t...",988198440,0
106662,Yair: Please let us know if we can conference...,True,321,3,"RE: ""Transfer"" provision",sara.shackleton@enron.com,"yyaish@exchange.ml.com, tracy.ngo@enron.com, t...",988308780,0
78341,"Diana is again not in today, but Sean Crandall...",True,367,3,Re: # 509638,kate.symes@enron.com,sharen.cason@enron.com,981534000,0
78355,This was entered before we had the option in E...,True,367,3,Re: # 509638,kate.symes@enron.com,sharen.cason@enron.com,981535260,0
78367,Did you find out anything about this deal? We...,False,367,3,# 509638,sharen.cason@enron.com,kate.symes@enron.com,981536220,0
67858,"This deal is coded not to be confirmed, but it...",False,392,3,#484043,sharen.cason@enron.com,kate.symes@enron.com,977481840,0
69154,"This was Stan Cocke's deal, and since he just ...",True,392,3,Re: #484043,kate.symes@enron.com,sharen.cason@enron.com,978515520,0
69229,"Yes, please change to be confirmed. The only ...",True,392,3,Re: #484043,sharen.cason@enron.com,kate.symes@enron.com,978524640,0
69681,Matt Motley and Mike Swerzbin both said this t...,True,395,3,Re: #486435,kate.symes@enron.com,kimberly.allen@enron.com,978622740,0


In [44]:
print_emails(df.loc[df['Chain'].isin(s1[s1==3].index),:].sort_values(by=['Chain','Timestamp']).iloc[:12,:])

<Cluster: 0>, Chain ID: 000321, Index: 105284
Date: 2001-04-24 20:43:00
Subject: "Transfer" provision
Sender: sara.shackleton@enron.com, Recipient: yyaish@exchange.ml.com, tracy.ngo@enron.com, tracy.ngo@enron.com
Message: Yair:

Tracy and I would like to discuss the "Transfer" issue with you tomorrow, if 
possible.  We think this should be a short discussion (maybe 10 minutes).  
How is 1 pm NY time, tomorrow Wednesday April 25?  This seems to be the most 
convenient for both of us, but we will entertain times before 3 pm NY time. 

Would you please email your response?

Thanks.

Sara Shackleton
Enron North America Corp.
1400 Smith Street, EB 3801a
Houston, Texas  77002
713-853-5620 (phone)
713-646-3490 (fax)
sara.shackleton@enron.com


<Cluster: 0>, Chain ID: 000321, Index: 105730
Date: 2001-04-25 13:34:00
Subject: RE: "Transfer" provision
Sender: sara.shackleton@enron.com, Recipient: yyaish@exchange.ml.com, tracy.ngo@enron.com, tracy.ngo@enron.com
Message: Yair:

I am unavailable Wed

# Chains of length greater than or equal to 4 and less than 10

## SL, DBSCAN

In [45]:
dat = next(iter_dat)
dat

'chains_ge_4_lt_10'

In [47]:
lab = next(iter_lab)
print(lab)
dfs[lab]

8


'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl'

In [48]:
dfs[lab]

'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl'

In [49]:
# Attach the selected run's cluster labels through the notebook's get_labeled()
# helper instead of repeating its load-and-assign steps inline.
df = get_labeled(dfs[lab], dat, parsed_emails)

In [50]:
df['cluster'].value_counts()

-1    9535
 0     139
 1       2
Name: cluster, dtype: int64

In [51]:
df[df['cluster'] == 1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
139056,sshhhhhh......let's keep it between us. (and ...,True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,jeff.dasovich@enron.com,drothrock@cmta.net,994864800,1
139285,i think you just lucked out for now.....when t...,True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,drothrock@cmta.net,jeff.dasovich@enron.com,994900260,1


In [52]:
print_emails(df[df['Chain'].isin(df[df['cluster'] == 1]['Chain'])])

<Cluster: -1>, Chain ID: 167107, Index: 138964
Date: 2001-07-11 12:45:00
Subject: Re: Sher Shops Alternative Edison Bailout Plan
Sender: drothrock@cmta.net, Recipient: jeff.dasovich@enron.com
Message: worse for SCE and generators, who have to eat the small guy share of the 
undercollection
between them. No transmission sale.

D

Jeff.Dasovich@enron.com wrote:

> better or worse than ours?
>
>
>                     Dorothy
>                     Rothrock             To:     Jeff.Dasovich@enron.com
>                                          ta.net>              Subject:     Re: Sher Shops 
Alternative Edison
>                                          Bailout Plan
>                     07/11/2001
>                     12:20 PM
>
>
>
> let me know if delaney doesn't send to you...
>
> d
>
> Jeff.Dasovich@enron.com wrote:
>
> > Thanks.  415.782.7854.  Better or worse than ours?
> >
> >
> >                     Dorothy
> >                     Rothrock             To:     Jeff.Dasovich@enron.co

In [53]:
# Per-chain count of the emails that were assigned to cluster 0.
s1 = (
    df.loc[df['cluster'].eq(0)]
    .sort_values(by=['Timestamp', 'Chain'])
    .loc[:, 'Chain']
    .value_counts()
)

In [54]:
# Display the per-chain counts of cluster-0 emails computed in the cell above.
s1

66002     4
181778    3
30087     2
72642     2
66001     2
         ..
88666     1
165384    1
150376    1
6768      1
203188    1
Name: Chain, Length: 122, dtype: int64

In [57]:
# All emails of the chains that have more than 3 messages in cluster 0,
# ordered by chain and then time; at most the first 24 rows are shown.
(
    df.loc[df['Chain'].isin(s1[s1.gt(3)].index)]
    .sort_values(by=['Chain', 'Timestamp'])
    .iloc[:24]
)

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
210610,"Dutch,\n\nPlease give me a call @ 212.589.6260...",False,66002,7,Enron delta breakdown by month,shood@manfinancial.com,"dutch.quigley@enron.com, mtimmins@manfinancial...",1006961546,-1
211534,>\n\n>,True,66002,7,RE: Enron delta breakdown by month,shood@manfinancial.com,"dutch.quigley@enron.com, mtimmins@manfinancial...",1007128036,0
212116,- enron delta breakdown 12-3.xls,True,66002,7,RE: Enron delta breakdown by month,shood@manfinancial.com,"dutch.quigley@enron.com, mtimmins@manfinancial...",1007383382,-1
212540,\n >\n>,True,66002,7,RE: Enron delta breakdown by month,shood@manfinancial.com,"dutch.quigley@enron.com, mtimmins@manfinancial...",1007469745,0
212547,\n >\n>,True,66002,7,RE: Enron delta breakdown by month,shood@manfinancial.com,"dutch.quigley@enron.com, mtimmins@manfinancial...",1007471605,0
213310,- enron delta breakdown 12-6 bod.xls,True,66002,7,RE: Enron delta breakdown by month,shood@manfinancial.com,"dutch.quigley@enron.com, mtimmins@manfinancial...",1007642696,-1
213312,>\n\n>,True,66002,7,RE: Enron delta breakdown by month,shood@manfinancial.com,"dutch.quigley@enron.com, mtimmins@manfinancial...",1007643751,0


In [58]:
# Print the emails of the chains with more than 3 messages in cluster 0
# (ordered by chain and then time, at most the first 24 rows).
print_emails(
    df.loc[df['Chain'].isin(s1[s1.gt(3)].index)]
    .sort_values(by=['Chain', 'Timestamp'])
    .iloc[:24]
)

<Cluster: -1>, Chain ID: 066002, Index: 210610
Date: 2001-11-28 16:32:26
Subject: Enron delta breakdown by month
Sender: shood@manfinancial.com, Recipient: dutch.quigley@enron.com, mtimmins@manfinancial.com, mtimmins@manfinancial.com
Message: Dutch,

Please give me a call @ 212.589.6260, so we can discuss the attached
spreadsheet.  It includes your futures position, option delta, and overall
delta broken down by month (NYMEX-NG). >

Thank you,

Stephen

 - enron delta breakdown.xls 


<Cluster: 0>, Chain ID: 066002, Index: 211534
Date: 2001-11-30 14:47:16
Subject: RE: Enron delta breakdown by month
Sender: shood@manfinancial.com, Recipient: dutch.quigley@enron.com, mtimmins@manfinancial.com, mtimmins@manfinancial.com
Message:  >

> 


<Cluster: -1>, Chain ID: 066002, Index: 212116
Date: 2001-12-03 13:43:02
Subject: RE: Enron delta breakdown by month
Sender: shood@manfinancial.com, Recipient: dutch.quigley@enron.com, mtimmins@manfinancial.com, mtimmins@manfinancial.com
Message:  - enron

In [59]:
# All emails of the chains that have exactly 3 messages in cluster 0,
# ordered by chain and then time; at most the first 24 rows are shown.
(
    df.loc[df['Chain'].isin(s1[s1.eq(3)].index)]
    .sort_values(by=['Chain', 'Timestamp'])
    .iloc[:24]
)

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
168535,done.\n\n,True,181778,4,RE: Tenaska IV,j..farmer@enron.com,megan.parker@enron.com,1002912007,0
192153,Done.\n\n,True,181778,4,RE: Tenaska IV,j..farmer@enron.com,megan.parker@enron.com,1005061244,0
219686,"Darren:\nIf you're in the office today, I need...",False,181778,4,Tenaska IV,megan.parker@enron.com,j..farmer@enron.com,1009392283,-1
221323,done.\n\n,True,181778,4,RE: Tenaska IV,j..farmer@enron.com,megan.parker@enron.com,1009807367,0


In [60]:
# Print the emails of the chains with exactly 3 messages in cluster 0
# (ordered by chain and then time, at most the first 24 rows).
print_emails(
    df.loc[df['Chain'].isin(s1[s1.eq(3)].index)]
    .sort_values(by=['Chain', 'Timestamp'])
    .iloc[:24]
)

<Cluster: 0>, Chain ID: 181778, Index: 168535
Date: 2001-10-12 20:40:07
Subject: RE: Tenaska IV
Sender: j..farmer@enron.com, Recipient: megan.parker@enron.com
Message: done.

 


<Cluster: 0>, Chain ID: 181778, Index: 192153
Date: 2001-11-06 16:40:44
Subject: RE: Tenaska IV
Sender: j..farmer@enron.com, Recipient: megan.parker@enron.com
Message: Done.

 


<Cluster: -1>, Chain ID: 181778, Index: 219686
Date: 2001-12-26 19:44:43
Subject: Tenaska IV
Sender: megan.parker@enron.com, Recipient: j..farmer@enron.com
Message: Darren:
If you're in the office today, I need some changes to Tenaska IV.

Deal 384258	Aug 2001 - change demand fee from 4,117,198.00 to 4,110,035.82
		Sep 2001 - change demand fee from 2,846,818.24 to 2,833,539.74

Thanks,
Megan


<Cluster: 0>, Chain ID: 181778, Index: 221323
Date: 2001-12-31 15:02:47
Subject: RE: Tenaska IV
Sender: j..farmer@enron.com, Recipient: megan.parker@enron.com
Message: done.

 




## CH, DBSCAN

In [61]:
# Advance the label iterator, print the key it yields and display the entry
# of `dfs` stored under that key (the saved output is a label file name).
# NOTE(review): re-running this cell advances `iter_lab` again.
lab = next(iter_lab)
print(lab)
dfs[lab]

9


'labels_dbscan_chains_ge_4_lt_10_50_0.69_08_euclidean.pkl'

In [63]:
# Load the cluster labels stored in the selected label file and attach them
# to the emails of the current dataset as a `cluster` column.
# `assign` returns a new frame, so the shared `parsed_emails[dat]` is not
# mutated: the labels loaded here cannot overwrite a `cluster` column that a
# different label file put on the same shared frame.
# NOTE(review): assumes the pickled labels are in the same row order as
# `parsed_emails[dat]` -- confirm against the code that wrote the label files.
labs_array = pd.read_pickle(Path(labels_dir, dfs[lab]))
df = parsed_emails[dat].assign(cluster=labs_array)

In [64]:
# Number of emails per cluster label.
# NOTE(review): -1 is presumably the DBSCAN noise label (the label file name
# contains "dbscan") -- confirm.
df['cluster'].value_counts()

-1    9529
 0     139
 1       8
Name: cluster, dtype: int64

In [65]:
# Emails that were assigned to cluster 1.
df.loc[df['cluster'].eq(1)]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
138964,"worse for SCE and generators, who have to eat ...",True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,drothrock@cmta.net,jeff.dasovich@enron.com,994848300,1
138981,Thanks. 415.782.7854. Better or worse than o...,True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,jeff.dasovich@enron.com,drothrock@cmta.net,994852800,1
138992,better or worse than ours?\tDorothy Rothrock \...,True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,jeff.dasovich@enron.com,drothrock@cmta.net,994854660,1
139001,i think you just lucked out for now.....when t...,True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,drothrock@cmta.net,jeff.dasovich@enron.com,994857060,1
139027,"glad we're not a generator.\n\nbest,\njeff\tDo...",True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,jeff.dasovich@enron.com,drothrock@cmta.net,994860600,1
139056,sshhhhhh......let's keep it between us. (and ...,True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,jeff.dasovich@enron.com,drothrock@cmta.net,994864800,1
139265,"worse for SCE and generators, who have to eat ...",True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,drothrock@cmta.net,jeff.dasovich@enron.com,994891500,1
139285,i think you just lucked out for now.....when t...,True,167107,8,Re: Sher Shops Alternative Edison Bailout Plan,drothrock@cmta.net,jeff.dasovich@enron.com,994900260,1


In [66]:
# Print every email of each chain that has at least one message in cluster 1,
# so the clustered messages can be read in the context of their whole chain.
print_emails(
    df.loc[df['Chain'].isin(df.loc[df['cluster'].eq(1), 'Chain'])]
)

<Cluster: 1>, Chain ID: 167107, Index: 138964
Date: 2001-07-11 12:45:00
Subject: Re: Sher Shops Alternative Edison Bailout Plan
Sender: drothrock@cmta.net, Recipient: jeff.dasovich@enron.com
Message: worse for SCE and generators, who have to eat the small guy share of the 
undercollection
between them. No transmission sale.

D

Jeff.Dasovich@enron.com wrote:

> better or worse than ours?
>
>
>                     Dorothy
>                     Rothrock             To:     Jeff.Dasovich@enron.com
>                                          ta.net>              Subject:     Re: Sher Shops 
Alternative Edison
>                                          Bailout Plan
>                     07/11/2001
>                     12:20 PM
>
>
>
> let me know if delaney doesn't send to you...
>
> d
>
> Jeff.Dasovich@enron.com wrote:
>
> > Thanks.  415.782.7854.  Better or worse than ours?
> >
> >
> >                     Dorothy
> >                     Rothrock             To:     Jeff.Dasovich@enron.com

In [53]:
# Per-chain count of the emails that were assigned to cluster 0.
# NOTE(review): this cell carries execution count 53 although it sits after
# cell 66, so its saved output may predate the labels loaded in this section.
s1 = (
    df.loc[df['cluster'].eq(0)]
    .sort_values(by=['Timestamp', 'Chain'])
    .loc[:, 'Chain']
    .value_counts()
)

In [54]:
# Display the per-chain counts of cluster-0 emails held in `s1`.
s1

66002     4
181778    3
30087     2
72642     2
66001     2
         ..
88666     1
165384    1
150376    1
6768      1
203188    1
Name: Chain, Length: 122, dtype: int64

In [None]:
# NOTE(review): unexecuted cell (no execution count, no saved output) that
# repeats the `s1` display of the cell above -- candidate for removal.
s1

In [67]:
# Emails that were assigned to cluster 0.
df.loc[df['cluster'].eq(0)]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
11343,michaelpshannon@yahoo.com,True,123037,5,Re:,benjamin.rogers@enron.com,brandon.neff@enron.com,953893500,0
14882,http://www.lonestarford.com/newcars/expedition...,False,131016,4,,mike.carson@enron.com,mcarson@gtemail.net,956919300,0
21849,Weasel!!,True,121380,4,Re:,benjamin.rogers@enron.com,7028587@skytel.com,962005560,0
28491,Thanks!,True,123326,4,Re:,benjamin.rogers@enron.com,jonathan.hoff@enron.com,965724480,0
33998,37176,True,130927,6,Re:,matthew.lenhart@enron.com,paul.lucci@enron.com,968165640,0
...,...,...,...,...,...,...,...,...,...
247810,:-)\n\n,True,32686,6,RE: Citrix application?,jimmy.manguba@enron.com,chris.germany@enron.com,1016728973,0
248442,I'm LOOOOOKING!!!!\n\n,True,150570,4,RE: Questions We Need Dominion To Answer,chris.germany@enron.com,sproctor@akllp.com,1017430736,0
250606,http://bible.gospelcom.net/,False,124830,4,,chris.germany@enron.com,trogg522@aol.com,1024425791,0
250704,TOMARROW.\n\n \n\n,True,80625,4,RE: Go Baby!,chet_fenner@bmc.com,joe.parks@enron.com,1024602900,0


In [68]:
# Print every email of each chain that has at least one message in cluster 0,
# so the clustered messages can be read in the context of their whole chain.
print_emails(
    df.loc[df['Chain'].isin(df.loc[df['cluster'].eq(0), 'Chain'])]
)

<Cluster: -1>, Chain ID: 123037, Index: 7197
Date: 2000-02-03 11:24:00
Subject: Re:
Sender: benjamin.rogers@enron.com, Recipient: brandon.neff@enron.com
Message: I'll gladly take a % of your profits for getting you into the stock.  Or I'll 
just take that money from you this Saturday.
Ben


<Cluster: -1>, Chain ID: 123037, Index: 7299
Date: 2000-02-04 15:10:00
Subject: Re:
Sender: benjamin.rogers@enron.com, Recipient: brandon.neff@enron.com
Message: That's fine - more people I can take money from.


<Cluster: -1>, Chain ID: 124830, Index: 11291
Date: 2000-03-23 17:59:00
Subject: RE: FW:
Sender: chris.germany@enron.com, Recipient: trogg522@aol.com
Message: ---------------------- Forwarded by Chris Germany/HOU/ECT on 03/23/2000 04:59 
PM ---------------------------Chris Germany
12/15/99 12:32 PM
To: "Germany Jr, Don (WD)" 
cc:  
Subject: RE: FW:  

I think so.  I believe several people will be here on the 1st.



<Cluster: 0>, Chain ID: 123037, Index: 11343
Date: 2000-03-24 11:25:00
Subj

In [69]:
# Per-chain count of the emails that were assigned to cluster 0.
s1 = (
    df.loc[df['cluster'].eq(0)]
    .sort_values(by=['Timestamp', 'Chain'])
    .loc[:, 'Chain']
    .value_counts()
)

In [70]:
# Display the per-chain counts of cluster-0 emails computed in the cell above.
s1

66002     4
181778    3
30087     2
72642     2
66001     2
         ..
88666     1
165384    1
150376    1
6768      1
203188    1
Name: Chain, Length: 122, dtype: int64

In [75]:
# Total number of emails (regardless of cluster) in each chain that has at
# least one message in cluster 0, i.e. each chain listed in `s1`.
s2 = df['Chain'][df['Chain'].isin(s1.index)].value_counts()
s2

41647     9
214561    9
218660    9
30087     9
145062    9
         ..
15632     4
13819     4
174669    4
155798    4
213798    3
Name: Chain, Length: 122, dtype: int64

In [81]:
# Scratch arithmetic: 6 chains x 9 emails = 54, which matches the upper bound
# of the `iloc[36:54]` window used in the next cell.
9*6

54

In [84]:
# Print rows 36-53 of the emails that belong to chains with exactly 9
# messages, ordered by chain and then time. With 9 emails per chain this
# window spans the fifth and sixth chain in that ordering.
print_emails(
    df.loc[df['Chain'].isin(s2[s2.eq(9)].index)]
    .sort_values(by=['Chain', 'Timestamp'])
    .iloc[36:54]
)

<Cluster: -1>, Chain ID: 218660, Index: 193402
Date: 2001-11-07 22:47:58
Subject: RE: thinking of you
Sender: jason.wolfe@enron.com, Recipient: eellwanger@triumphboats.com
Message: Around North Carolina? no

 


<Cluster: -1>, Chain ID: 218660, Index: 193403
Date: 2001-11-07 22:48:15
Subject: RE: thinking of you
Sender: eellwanger@triumphboats.com, Recipient: jason.wolfe@enron.com
Message: Around Houston, ya dill wacker.




<Cluster: 0>, Chain ID: 218660, Index: 193405
Date: 2001-11-07 22:49:43
Subject: RE: thinking of you
Sender: jason.wolfe@enron.com, Recipient: eellwanger@triumphboats.com
Message: really? why?

 


<Cluster: -1>, Chain ID: 218660, Index: 193411
Date: 2001-11-07 22:54:40
Subject: RE: thinking of you
Sender: jason.wolfe@enron.com, Recipient: eellwanger@triumphboats.com
Message: whatever. just let me know when you aren't dicking around

 


<Cluster: -1>, Chain ID: 218660, Index: 194055
Date: 2001-11-08 21:47:42
Subject: RE: thinking of you
Sender: eellwanger@triumphb

## DB, DBSCAN

In [33]:
# Advance the label iterator, print the key it yields and display the entry
# of `dfs` stored under that key.
# NOTE(review): the saved output of this cell names a `chains_eq_3` label
# file and carries execution count 33, although the cell sits in the
# chains_ge_4_lt_10 section -- the output looks stale; re-run to refresh.
lab = next(iter_lab)
print(lab)
dfs[lab]

6


'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl'

In [246]:
# Re-display the `dfs` entry selected by `lab` (same lookup as the cell above).
dfs[lab]

'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl'

In [34]:
# Load the cluster labels stored in the selected label file and attach them
# to the emails of the current dataset as a `cluster` column.
# `assign` returns a new frame, so the shared `parsed_emails[dat]` is not
# mutated: the labels loaded here cannot overwrite a `cluster` column that a
# different label file put on the same shared frame.
# NOTE(review): assumes the pickled labels are in the same row order as
# `parsed_emails[dat]` -- confirm against the code that wrote the label files.
labs_array = pd.read_pickle(Path(labels_dir, dfs[lab]))
df = parsed_emails[dat].assign(cluster=labs_array)

In [35]:
# Number of emails per cluster label.
df['cluster'].value_counts()

0    8766
1       3
Name: cluster, dtype: int64

## Entropy, DBSCAN

In [33]:
# Advance the label iterator, print the key it yields and display the entry
# of `dfs` stored under that key.
# NOTE(review): the saved output of this cell names a `chains_eq_3` label
# file and carries execution count 33, although the cell sits in the
# chains_ge_4_lt_10 section -- the output looks stale; re-run to refresh.
lab = next(iter_lab)
print(lab)
dfs[lab]

6


'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl'

In [246]:
# Re-display the `dfs` entry selected by `lab` (same lookup as the cell above).
dfs[lab]

'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl'

In [34]:
# Load the cluster labels stored in the selected label file and attach them
# to the emails of the current dataset as a `cluster` column.
# `assign` returns a new frame, so the shared `parsed_emails[dat]` is not
# mutated: the labels loaded here cannot overwrite a `cluster` column that a
# different label file put on the same shared frame.
# NOTE(review): assumes the pickled labels are in the same row order as
# `parsed_emails[dat]` -- confirm against the code that wrote the label files.
labs_array = pd.read_pickle(Path(labels_dir, dfs[lab]))
df = parsed_emails[dat].assign(cluster=labs_array)

In [35]:
# Number of emails per cluster label.
df['cluster'].value_counts()

0    8766
1       3
Name: cluster, dtype: int64

## Chains greater than or equal to 10

In [72]:
# Display the dataset key currently held in `dat`.
dat

'chains_ge_10'

In [74]:
# Use the first element of the `dfs` entry for the current dataset as the
# working frame.
# NOTE(review): here `dfs` is indexed by the dataset name, whereas in the
# sections above `dfs[lab]` returned a label file name -- `dfs` appears to
# have been rebound between runs; confirm which definition a fresh
# Restart & Run All produces.
df = dfs[dat][0]

In [75]:
# Per-chain count of the emails whose cluster label is not -1.
df.loc[df['cluster'].ne(-1), 'Chain'].value_counts()

131203    798
126965    290
122336    233
122191    123
130942    122
         ... 
219868     10
129576     10
105298     10
124251      9
104153      9
Name: Chain, Length: 192, dtype: int64

In [76]:
# Largest value in the `Chain` column of the working frame.
df['Chain'].max()

219868

In [77]:
# Preview the frame ordered by timestamp and then chain id; the sorted copy
# is only displayed, `df` itself keeps its original order.
df.sort_values(by=['Timestamp','Chain'])

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
955,Hi Gerald: We have executed agreements with...,True,23316,21,Re: CA Data Sheet,kay.young@enron.com,gerald.nemec@enron.com,930038100,1
1580,"G, How is it going? Been a while since we sp...",False,127447,27,,gerald.nemec@enron.com,gtownsend@manorisd.net,934198740,1
1608,"GT, The theme of the party sounds excellent. ...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934288740,1
1668,"GT, I will be taking Friday off. Probably dr...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934794240,1
1867,"G, I will be there about 9 pm tonight. My ce...",True,127447,27,Re:,gerald.nemec@enron.com,gtownsend@manorisd.net,935769600,1
...,...,...,...,...,...,...,...,...,...
250216,nothing more than what the rags say..............,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023218609,1
250217,i'm going to las vegas in august for a couple ...,True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219055,1
250218,My last fun trip was to portland. Going to se...,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023219293,1
250219,"dude, serious? Portland...portland is soooooo...",True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219865,1


In [80]:
# Print, in chronological order, every email whose cluster label is not -1.
# The mask is evaluated on the sorted frame itself (callable passed to .loc).
# Indexing the sorted frame with a mask built from the unsorted `df` makes
# pandas realign the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
print_emails(
    df.sort_values(by=['Timestamp', 'Chain'])
    .loc[lambda sorted_df: sorted_df['cluster'] != -1]
)

  print_emails(df.sort_values(by=['Timestamp','Chain'])[df['cluster'] != -1])


Chain ID: 023316 Index: 955
Date: 1999-06-22 09:55:00
Subject: Re: CA Data Sheet
Sender: kay.young@enron.com , Recipient:  gerald.nemec@enron.com
Message: Hi Gerald:  We have  executed agreements  with Amoco Corp. and several 
affiliates relating to everything from possible purchase of Amoco Gas Company 
to producer financing of purchase of some of their assets.  One between HPL/ 
Amoco Production Company relates to evaluation of Tyler Field.  None have 
restrictive provisions that would preclude another agreement to the best of 
my knowledge.  Let me know if you would like to see any of them or you are 
welcome to come and browse.

Kay


Chain ID: 127447 Index: 1580
Date: 1999-08-09 13:39:00
Subject: 
Sender: gerald.nemec@enron.com , Recipient:  gtownsend@manorisd.net
Message: G,  How is it going?  Been a while since we spoke.  What have you been up to?

As for me, just a lot of work and little vacation worked in there.  Nothing 
real exciting.  Finally finished the redo of that damn 

In [71]:
# Print, in chronological order, every email of `df_best` whose cluster label
# is not -1. The mask is evaluated on the sorted frame itself (callable passed
# to .loc); indexing the sorted frame with a mask built from the unsorted
# `df_best` makes pandas realign the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
for idx, row in (
    df_best.sort_values(by=['Timestamp', 'Chain'])
    .loc[lambda sorted_df: sorted_df['cluster'] != -1]
    .iterrows()
):
    # Chain ids are zero-padded to six digits.
    print('Chain ID:', f'{row["Chain"]:06d}', 'Index:', idx)
    print('Subject:', row['Subject'])
    print('Sender:', row['Sender'], 'Recipient: ', row['Recipients'])
    print('Message:', row['Message'])
    print('\n')
    print('\n')

Chain ID: 124647 Index: 24123
Subject: Re:
Sender: chris.dorland@enron.com Recipient:  mmolloy@oebi.com
Message: Perhaps...


Chain ID: 130941 Index: 31926
Subject: Re:
Sender: matthew.lenhart@enron.com Recipient:  shelliott@dttus.com
Message: sure.  maybe.


Chain ID: 130003 Index: 49710
Subject: RE:
Sender: mark.guzman@enron.com Recipient:  katie.trullinger@wfsg.com
Message: So.........


Chain ID: 200957 Index: 50356
Subject: RE: FW: What's up?
Sender: katie.trullinger@wfsg.com Recipient:  mark.guzman@enron.com
Message: Cool.




Chain ID: 130986 Index: 50804
Subject: Re:
Sender: matthew.lenhart@enron.com Recipient:  val.generes@ac.com
Message: thanks.


Chain ID: 130942 Index: 58487
Subject: RE:
Sender: matthew.lenhart@enron.com Recipient:  shirley.s.elliott@citicorp.com
Message: d-i-r-t-y


Chain ID: 130003 Index: 59640
Subject: RE:
Sender: mark.guzman@enron.com Recipient:  katie.trullinger@wfsg.com
Message: No.  


Chain ID: 128697 Index: 63984
Subject: Re:
Sender: jeff.dasovich@

  for idx, row in df_best.sort_values(by=['Timestamp','Chain'])[df_best['cluster'] != -1].iterrows():


In [78]:
# Number of non-null `Chain` entries per cluster label in `df_best`.
df_best.groupby('cluster')['Chain'].count()

cluster
-1    5229
 0      93
 1       2
Name: Chain, dtype: int64

In [79]:
# `Chain_len` values recorded on the rows of chain 131203.
df_best['Chain_len'][df_best['Chain'].eq(131203)]

173523    798
173526    798
173531    798
173535    798
173538    798
         ... 
241753    798
241755    798
241759    798
241761    798
241941    798
Name: Chain_len, Length: 798, dtype: object

In [82]:
# Per-chain count of the emails assigned to cluster 1.
df['Chain'][df['cluster'].eq(1)].value_counts()

131203    797
126965    290
122336    233
122191    123
130942    122
         ... 
219868     10
129576     10
105298     10
124251      9
104153      9
Name: Chain, Length: 192, dtype: int64

In [92]:
# Print the emails assigned to cluster 1.
print_emails(df[df['cluster'].eq(1)])

Chain ID: 131203 Index: 224024
Date: 2002-01-07 21:54:47
Subject: RE:
Sender: mike.maggi@enron.com , Recipient:  michelle.nelson@enron.com
Message: ok

 


Chain ID: 122336 Index: 229806
Date: 2002-01-17 17:36:36
Subject: RE:
Sender: mike.maggi@enron.com , Recipient:  amanda.rybarski@enron.com
Message: ok

 




In [96]:
# Per-chain count of the `df_best` emails assigned to cluster 0.
df_best['Chain'][df_best['cluster'].eq(0)].value_counts()

131203    25
122336     6
125390     5
127857     4
122191     4
129255     3
130920     2
212224     2
130987     2
219868     2
129004     2
125120     2
122292     2
130003     2
124001     2
130453     1
121444     1
124647     1
126965     1
129545     1
130276     1
129318     1
124662     1
126887     1
127175     1
108478     1
128612     1
128525     1
96466      1
130941     1
202347     1
127808     1
111694     1
123544     1
126810     1
218433     1
122161     1
130620     1
128697     1
130942     1
130986     1
200957     1
126676     1
Name: Chain, dtype: int64

In [93]:
# Print the `df_best` emails assigned to cluster 0.
print_emails(df_best[df_best['cluster'].eq(0)])

Chain ID: 124647 Index: 24123
Date: 2000-07-11 15:14:00
Subject: Re:
Sender: chris.dorland@enron.com , Recipient:  mmolloy@oebi.com
Message: Perhaps...


Chain ID: 130941 Index: 31926
Date: 2000-08-25 16:04:00
Subject: Re:
Sender: matthew.lenhart@enron.com , Recipient:  shelliott@dttus.com
Message: sure.  maybe.


Chain ID: 130003 Index: 49710
Date: 2000-11-06 11:20:00
Subject: RE:
Sender: mark.guzman@enron.com , Recipient:  katie.trullinger@wfsg.com
Message: So.........


Chain ID: 200957 Index: 50356
Date: 2000-11-07 13:31:00
Subject: RE: FW: What's up?
Sender: katie.trullinger@wfsg.com , Recipient:  mark.guzman@enron.com
Message: Cool.




Chain ID: 130986 Index: 50804
Date: 2000-11-08 13:30:00
Subject: Re:
Sender: matthew.lenhart@enron.com , Recipient:  val.generes@ac.com
Message: thanks.


Chain ID: 130942 Index: 58487
Date: 2000-11-30 13:33:00
Subject: RE:
Sender: matthew.lenhart@enron.com , Recipient:  shirley.s.elliott@citicorp.com
Message: d-i-r-t-y


Chain ID: 130003 Index: 5

In [23]:
# Element [1] of the 'SL' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['SL'][1]

'labels_dbscan_chains_eq_3_50_0.27_02_wmd.pkl'

In [24]:
# Element [1] of the 'CH' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['CH'][1]

'labels_dbscan_chains_eq_3_50_0.78_04_euclidean.pkl'

In [25]:
# Element [1] of the 'DB' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['DB'][1]

'labels_dbscan_chains_eq_3_50_0.25_02_wmd.pkl'

In [26]:
# Element [1] of the 'Entropy' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['Entropy'][1]

'labels_dbscan_chains_eq_3_50_0.43_15_cosine.pkl'

In [None]:
# Element [2] of the 'SL' entry for the current dataset.
# NOTE(review): this cell was never executed (no execution count, no saved output).
best_labs_all[dat]['SL'][2]

In [52]:
# Element [2] of the 'CH' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['CH'][2]

'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl'

In [54]:
# Element [0] of the 'DB' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['DB'][0]

'labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl'

In [55]:
# Element [1] of the 'Entropy' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['Entropy'][1]

'labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl'

In [28]:
# Element [2] of the 'SL' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['SL'][2]

'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl'

In [29]:
# Element [1] of the 'CH' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['CH'][1]

'labels_dbscan_chains_ge_4_lt_10_50_0.69_08_euclidean.pkl'

In [30]:
# Element [2] of the 'DB' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['DB'][2]

'labels_dbscan_chains_ge_4_lt_10_50_0.5_02_euclidean.pkl'

In [31]:
# Element [2] of the 'Entropy' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['Entropy'][2]

'labels_dbscan_chains_ge_4_lt_10_50_0.43_07_cosine.pkl'

In [33]:
# Element [1] of the 'SL' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['SL'][1]

'labels_dbscan_chains_ge_10_300_0.63_02_euclidean.pkl'

In [34]:
# Element [2] of the 'CH' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['CH'][2]

'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl'

In [35]:
# Element [2] of the 'DB' entry for the current dataset; the saved output is a label file name.
best_labs_all[dat]['DB'][2]

'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl'

In [37]:
# Element [1] of the 'Entropy' entry for the current dataset; the saved output
# is a label file name (here a "km" file rather than a "dbscan" one).
best_labs_all[dat]['Entropy'][1]

'labels_km_chains_ge_10_300_02_euclidean.pkl'

In [76]:
# Walk through the emails of chain 131203 whose cluster label is not -1, in
# chronological order, and print their headers and message bodies.
in_chain = df_best['Chain'] == 131203
is_clustered = df_best['cluster'] != -1
chronological = df_best.sort_values(by=['Timestamp', 'Chain'])
for idx, row in chronological.loc[in_chain & is_clustered, :].iterrows():
    chain_id = f'{row["Chain"]:06d}'  # zero-padded to six digits
    # fromtimestamp() converts to the local timezone of the machine running the notebook.
    sent_at = datetime.fromtimestamp(row['Timestamp'])
    print('Chain ID:', chain_id, 'Index:', idx)
    print('Date:', sent_at)
    print('Subject:', row['Subject'])
    print('Sender:', row['Sender'], ', Recipient: ', row['Recipients'])
    print('Message:', row['Message'])

Chain ID: 131203 Index: 200565
Date: 2001-11-19 15:41:01
Subject: RE:
Sender: mike.maggi@enron.com , Recipient:  michelle.nelson@enron.com
Message: terrible, yours?

 
Chain ID: 131203 Index: 200584
Date: 2001-11-19 15:49:12
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: good.

 
Chain ID: 131203 Index: 200607
Date: 2001-11-19 15:55:52
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: whatever.

 
Chain ID: 131203 Index: 200658
Date: 2001-11-19 16:16:43
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: why?  

 
Chain ID: 131203 Index: 200791
Date: 2001-11-19 16:56:32
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: you're cooking?

 
Chain ID: 131203 Index: 200799
Date: 2001-11-19 16:59:32
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: that's cute.

 
Chain ID: 131203 Index: 2