In [152]:
import email
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
from datetime import datetime, timedelta
from dateutil.parser import parse
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec

plt.rcParams['figure.dpi'] = 100
plt.rcParams["figure.autolayout"] = True

In [153]:
data_dir = Path(Path.cwd().parent, 'data/interim')
models_dir = Path(Path.cwd().parent, 'models')
evals_dir = Path(data_dir, 'evals_6') # evals_4
labels_dir = Path(data_dir, 'labels_4') # Path(data_dir, 'labels_2')
models_dir = Path(Path.cwd().parent, 'models')
datasets_order = {
    'chains_eq_2': 0,
    'chains_eq_3': 1,
    'chains_ge_4_lt_10': 2,
    'chains_ge_10': 3}

datasets_names = {
    'chains_eq_2': r'CL $= 2$',
    'chains_eq_3': r'CL $= 3$',
    'chains_ge_4_lt_10': r'10 $>$ CL $\geq$ 4',
    'chains_ge_10': r'CL $\geq$ 10'}

sizes = {}
for path in Path(models_dir).glob('d2v*.model'):
    size = len(Doc2Vec.load(str(path)).dv.vectors)
    name = '_'.join(path.stem.split('_')[1:])
    sizes[name] = size


In [154]:
df_kmeans = pd.concat([pd.read_csv(path) for path in evals_dir.glob('eval_km*.csv')], ignore_index=True)
df_kmeans['dimensions'] = df_kmeans['dataset'].apply(lambda s: s.split('_')[-1])
df_kmeans['dataset'] = df_kmeans['dataset'].apply(lambda s: '_'.join(s.split('_')[:-1]))
df_kmeans['order'] = df_kmeans['dataset'].map(datasets_order)
df_kmeans['name'] = df_kmeans['dataset'].map(datasets_names)
df_kmeans = df_kmeans.sort_values('order')
df_kmeans['k_noise'] = df_kmeans['n_clusters'].astype(int)
df_kmeans['method'] = 'KMeans'

df_dbscan = pd.concat([pd.read_csv(path) for path in evals_dir.glob('eval_dbscan*.csv')], ignore_index=True)
df_dbscan['size'] = df_dbscan['dataset'].map(sizes)
df_dbscan['prop_noise'] = df_dbscan['n_noise']/df_dbscan['size']
df_dbscan['dimensions'] = df_dbscan['dataset'].apply(lambda s: s.split('_')[-1])
df_dbscan['dataset'] = df_dbscan['dataset'].apply(lambda s: '_'.join(s.split('_')[:-1]))
df_dbscan['order'] = df_dbscan['dataset'].map(datasets_order)
df_dbscan['name'] = df_dbscan['dataset'].map(datasets_names)
df_dbscan = df_dbscan.sort_values('order')
df_dbscan['k_noise'] = df_dbscan.apply(lambda x: f"{x['n_clusters']} ({x['prop_noise']:.3f})", axis=1)
df_dbscan['method'] = 'DBSCAN'
df_dbscan = df_dbscan.loc[df_dbscan['epsilon'] != 0.2,:]

df_hdbscan = pd.concat([pd.read_csv(path) for path in evals_dir.glob('*_hdbscan*.csv')], ignore_index=True)
df_hdbscan['size'] = df_hdbscan['dataset'].map(sizes)
df_hdbscan['prop_noise'] = df_hdbscan['n_noise']/df_hdbscan['size']
df_hdbscan['dimensions'] = df_hdbscan['dataset'].apply(lambda s: s.split('_')[-1])
df_hdbscan['dataset'] = df_hdbscan['dataset'].apply(lambda s: '_'.join(s.split('_')[:-1]))
df_hdbscan['order'] = df_hdbscan['dataset'].map(datasets_order)
df_hdbscan['name'] = df_hdbscan['dataset'].map(datasets_names)
df_hdbscan = df_hdbscan.sort_values('order')
df_hdbscan['k_noise'] = df_hdbscan.apply(lambda x: f"{x['n_clusters']} ({x['prop_noise']:.3f})", axis=1)
df_hdbscan['method'] = 'HDBSCAN'
datasets = df_dbscan.sort_values(by='order')['dataset'].unique()

eval_scores = ['sl_score', 'ch_score', 'db_score', 'entropy']

# Tables of best scores

In [155]:
# def labels_km(df):
#     return f'labels_km_{df.loc[0,"dataset"]}_{df.loc[0,"dimensions"]}_{df.loc[0,"n_clusters"]:02d}_{df.loc[0,"distance"]}.pkl'

# def labels_db(df):
#     return f'labels_dbscan_{df.loc[0,"dataset"]}_{df.loc[0,"dimensions"]}_{df.loc[0,"epsilon"].astype(str)[:7]}*_{df.loc[0,"min_pts"].astype(int):02d}_{df.loc[0,"distance"]}.pkl'

# def labels_hd(df):
#     return f'labels_hdbscan_{df.loc[0,"dataset"]}_{df.loc[0,"dimensions"]}_{df.loc[0,"min_clt_size"].astype(int):02d}_{df.loc[0,"min_samples"].astype(int):02d}_{df.loc[0,"distance"]}.pkl'

def eps_check(eps):
    val = str(eps)
    if len(val) > 8:
        val = val[:7] + '*'
    return val


def labels_km(df, dataset):
    return f'labels_km_{dataset}_{df["dimensions"]}_{df["n_clusters"]:02d}_{df["distance"]}.pkl'


def labels_db(df, dataset):
    return f'labels_dbscan_{dataset}_{df["dimensions"]}_{eps_check(df["epsilon"])}_{int(df["min_pts"]):02d}_{df["distance"]}.pkl'


def labels_hd(df, dataset):
    return f'labels_hdbscan_{dataset}_{df["dimensions"]}_{int(df["min_clt_size"]):02d}_{int(df["min_samples"]):02d}_{df["distance"]}.pkl'

In [156]:
best_labs_all = {}
ord_cols = ['method', 'distance', 'dimensions', 'n_clusters', 'k_noise', 'sl_score', 'ch_score',
            'db_score', 'entropy', 'epsilon', 'min_pts', 'nn', 'min_clt_size', 'min_samples']
ord_name = {
    'method':'Method',
    'distance':'Distance',
    'dimensions':'Dim',
    'n_clusters': r'$k$',
    'k_noise': r'$k$ (% Noise)',
    'sl_score':'SL',
    'ch_score':'CH',
    'db_score':'DB',
    'entropy':'Entropy',
    'epsilon': 'Eps',
    'min_pts': 'MinPts',
    'nn': 'NN',
    'min_clt_size': 'Min Clt Size',
    'min_samples': 'Min Samples',
    'focus': 'Score'
}
dis_name = {'euclidean':'Euclidean', 'cosine': 'Cosine', 'wmd':'WMD', 'l2':'L2 Norm'}
iterator_datasets = iter(datasets)
iterator_scores = iter(eval_scores)


In [158]:
def prepare_best_table(tab_best, dis_name, ord_name):
    tab_best['distance'] = tab_best['distance'].map(dis_name)
    tab_best[['min_pts', 'min_clt_size', 'min_samples']] = tab_best[['min_pts', 'min_clt_size', 'min_samples']].fillna(-1).astype(int).astype(str).replace('-1','-')
    tab_best.columns = tab_best.columns.map(ord_name)
    return tab_best.drop(columns=[r'$k$ (% Noise)']).set_index(['Score', 'Method'])
    

In [159]:
dat = next(iterator_datasets)
tab_best = pd.DataFrame()
print(dat)
for score in eval_scores:
    if score in ['db_score', 'entropy']:
        asc = True
    else:
        asc= False
    tmp1 = pd.concat([
        df_kmeans.loc[df_kmeans['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_dbscan.loc[df_dbscan['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_hdbscan.loc[df_hdbscan['dataset']==dat,:].sort_values(score,ascending=asc).head(1)
    ])
    tmp1['focus'] = score
    tmp1['focus'] = tmp1['focus'].map(ord_name)
    # tmp1['distance'] = tmp1['distance'].map(dis_name)
    tmp1 = tmp1.sort_values(score, ascending=asc)
    tab_best = pd.concat([tab_best, tmp1.loc[:,ord_cols+['focus']].dropna(how='all',axis=1)])

best_labels_ = {}
for score in eval_scores:
    for idx, row in tab_best.iterrows():
        if row['method'] == 'KMeans':
            label = labels_km(row, dat)
        elif row['method'] == 'DBSCAN':
            label = labels_db(row, dat)
        elif row['method'] == 'HDBSCAN':
            label = labels_hd(row, dat)
        # print(row['focus'])
        # print(label)
        if row['focus'] in best_labels_:
            best_labels_[row['focus']].append(label)
        else:
            best_labels_[row['focus']] = [label]

best_labels_ = {k:list(set(v)) for k, v in best_labels_.items()}
best_labels = {}
for k, v in best_labels_.items():
    for i in range(len(v)):
        if k not in best_labels:
            best_labels[k] = {i: v[i]}
        else:
            best_labels[k].update({i: v[i]})
# best_labels
best_labs_all[dat] = best_labels
# best_labs_all
# tab_best['distance'] = tab_best['distance'].map(dis_name)
# tab_best[['MinPts', 'Min Clt Size', 'Min Samples']] = tab_best[['MinPts', 'Min Clt Size', 'Min Samples']].fillna(-1).astype(int).astype(str).replace('-1','')
# tab_best.columns = tab_best.columns.map(ord_name)
# tab_best.drop(columns=[r'$k$ (% Noise)']).set_index(['Score', 'Method'])
tab = prepare_best_table(tab_best, dis_name, ord_name)
tab

chains_eq_2


Unnamed: 0_level_0,Unnamed: 1_level_0,Distance,Dim,$k$,SL,CH,DB,Entropy,Eps,MinPts,Min Clt Size,Min Samples
Score,Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SL,DBSCAN,Euclidean,300,2,0.971257,3354.887297,0.290183,0.080688,0.82,5,-,-
SL,HDBSCAN,Euclidean,50,2,0.886469,808.641864,0.616822,0.202499,,-,5,34
SL,KMeans,Euclidean,300,2,0.134493,796.314484,5.126896,0.683153,,-,-,-
CH,DBSCAN,Euclidean,300,2,0.971257,3354.887297,0.290183,0.080688,0.82,5,-,-
CH,KMeans,Euclidean,50,2,0.123197,966.62677,4.727751,0.672388,,-,-,-
CH,HDBSCAN,Euclidean,50,2,0.879582,826.450913,0.625766,0.241777,,-,2,27
DB,DBSCAN,Euclidean,300,2,0.968049,2435.317202,0.261363,0.038351,0.76,2,-,-
DB,HDBSCAN,Euclidean,50,2,0.886469,808.641864,0.616822,0.202499,,-,5,32
DB,KMeans,Euclidean,300,20,-0.003933,104.039509,2.850879,1.602286,,-,-,-
Entropy,DBSCAN,WMD,50,2,0.582316,12.698288,1.067944,0.000871,0.54,2,-,-


In [160]:
tab.\
    to_html(na_rep='-', float_format="%.3f")

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th></th>\n      <th>Distance</th>\n      <th>Dim</th>\n      <th>$k$</th>\n      <th>SL</th>\n      <th>CH</th>\n      <th>DB</th>\n      <th>Entropy</th>\n      <th>Eps</th>\n      <th>MinPts</th>\n      <th>Min Clt Size</th>\n      <th>Min Samples</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th>Method</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th rowspan="3" valign="top">SL</th>\n      <th>DBSCAN</th>\n      <td>Euclidean</td>\n      <td>300</td>\n      <td>2</td>\n      <td>0.971</td>\n      <td>3354.887</td>\n      <td>0.290</td>\n      <td>0.081</td>\n      <td>0.820</td>\n      <td>5</td>\n      <td>-</td>\n      <td>-</td>\n    </tr>\n    <tr>\n      <th>HDBSCAN

In [161]:
dat = next(iterator_datasets)
tab_best = pd.DataFrame()
print(dat)
for score in eval_scores:
    if score in ['db_score', 'entropy']:
        asc = True
    else:
        asc= False
    tmp1 = pd.concat([
        df_kmeans.loc[df_kmeans['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_dbscan.loc[df_dbscan['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_hdbscan.loc[df_hdbscan['dataset']==dat,:].sort_values(score,ascending=asc).head(1)
    ])
    tmp1['focus'] = score
    tmp1['focus'] = tmp1['focus'].map(ord_name)
    # tmp1['distance'] = tmp1['distance'].map(dis_name)
    tmp1 = tmp1.sort_values(score, ascending=asc)
    tab_best = pd.concat([tab_best, tmp1.loc[:,ord_cols+['focus']].dropna(how='all',axis=1)])

best_labels_ = {}
for score in eval_scores:
    for idx, row in tab_best.iterrows():
        if row['method'] == 'KMeans':
            label = labels_km(row, dat)
        elif row['method'] == 'DBSCAN':
            label = labels_db(row, dat)
        elif row['method'] == 'HDBSCAN':
            label = labels_hd(row, dat)
        # print(row['focus'])
        # print(label)
        if row['focus'] in best_labels_:
            best_labels_[row['focus']].append(label)
        else:
            best_labels_[row['focus']] = [label]

best_labels_ = {k:list(set(v)) for k, v in best_labels_.items()}
best_labels = {}
for k, v in best_labels_.items():
    for i in range(len(v)):
        if k not in best_labels:
            best_labels[k] = {i: v[i]}
        else:
            best_labels[k].update({i: v[i]})
# best_labels
best_labs_all[dat] = best_labels
# best_labs_all
# tab_best['distance'] = tab_best['distance'].map(dis_name)
# tab_best[['MinPts', 'Min Clt Size', 'Min Samples']] = tab_best[['MinPts', 'Min Clt Size', 'Min Samples']].fillna(-1).astype(int).astype(str).replace('-1','')
# tab_best.columns = tab_best.columns.map(ord_name)
# tab_best.drop(columns=[r'$k$ (% Noise)']).set_index(['Score', 'Method'])
tab = prepare_best_table(tab_best, dis_name, ord_name)
tab

chains_eq_3


Unnamed: 0_level_0,Unnamed: 1_level_0,Distance,Dim,$k$,SL,CH,DB,Entropy,Eps,MinPts,Min Clt Size,Min Samples
Score,Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SL,DBSCAN,WMD,50,2,12.712286,1.056969,0.003072,,0.27,2,-,-
SL,HDBSCAN,L2 Norm,300,2,0.832007,30.547183,0.537805,0.076197,,-,2,6
SL,KMeans,Euclidean,300,2,0.125059,235.633476,5.859281,0.677666,,-,-,-
CH,DBSCAN,Euclidean,50,2,0.967235,4375.641036,0.140199,0.137406,0.78,4,-,-
CH,KMeans,Euclidean,300,2,0.125059,235.633476,5.859281,0.677666,,-,-,-
CH,HDBSCAN,L2 Norm,300,4,0.376214,180.775041,0.895671,0.133833,,-,2,5
DB,DBSCAN,WMD,50,2,12.712286,1.056969,0.003072,,0.25,2,-,-
DB,HDBSCAN,L2 Norm,300,2,0.832007,30.547183,0.537805,0.076197,,-,2,6
DB,KMeans,Euclidean,300,19,0.056731,29.395593,2.522628,1.361204,,-,-,-
Entropy,DBSCAN,Cosine,50,2,0.01915,1.177548,0.906909,0.001262,0.43,15,-,-


In [162]:
tab.\
    to_html(na_rep='-', float_format="%.3f")

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th></th>\n      <th>Distance</th>\n      <th>Dim</th>\n      <th>$k$</th>\n      <th>SL</th>\n      <th>CH</th>\n      <th>DB</th>\n      <th>Entropy</th>\n      <th>Eps</th>\n      <th>MinPts</th>\n      <th>Min Clt Size</th>\n      <th>Min Samples</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th>Method</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th rowspan="3" valign="top">SL</th>\n      <th>DBSCAN</th>\n      <td>WMD</td>\n      <td>50</td>\n      <td>2</td>\n      <td>12.712</td>\n      <td>1.057</td>\n      <td>0.003</td>\n      <td>-</td>\n      <td>0.270</td>\n      <td>2</td>\n      <td>-</td>\n      <td>-</td>\n    </tr>\n    <tr>\n      <th>HDBSCAN</th>\n      

In [163]:
dat = next(iterator_datasets)
tab_best = pd.DataFrame()
print(dat)
for score in eval_scores:
    if score in ['db_score', 'entropy']:
        asc = True
    else:
        asc= False
    tmp1 = pd.concat([
        df_kmeans.loc[df_kmeans['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_dbscan.loc[df_dbscan['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_hdbscan.loc[df_hdbscan['dataset']==dat,:].sort_values(score,ascending=asc).head(1)
    ])
    tmp1['focus'] = score
    tmp1['focus'] = tmp1['focus'].map(ord_name)
    # tmp1['distance'] = tmp1['distance'].map(dis_name)
    tmp1 = tmp1.sort_values(score, ascending=asc)
    tab_best = pd.concat([tab_best, tmp1.loc[:,ord_cols+['focus']].dropna(how='all',axis=1)])

best_labels_ = {}
for score in eval_scores:
    for idx, row in tab_best.iterrows():
        if row['method'] == 'KMeans':
            label = labels_km(row, dat)
        elif row['method'] == 'DBSCAN':
            label = labels_db(row, dat)
        elif row['method'] == 'HDBSCAN':
            label = labels_hd(row, dat)
        # print(row['focus'])
        # print(label)
        if row['focus'] in best_labels_:
            best_labels_[row['focus']].append(label)
        else:
            best_labels_[row['focus']] = [label]

best_labels_ = {k:list(set(v)) for k, v in best_labels_.items()}
best_labels = {}
for k, v in best_labels_.items():
    for i in range(len(v)):
        if k not in best_labels:
            best_labels[k] = {i: v[i]}
        else:
            best_labels[k].update({i: v[i]})
# best_labels
best_labs_all[dat] = best_labels
# best_labs_all
# tab_best['distance'] = tab_best['distance'].map(dis_name)
# tab_best[['MinPts', 'Min Clt Size', 'Min Samples']] = tab_best[['MinPts', 'Min Clt Size', 'Min Samples']].fillna(-1).astype(int).astype(str).replace('-1','')
# tab_best.columns = tab_best.columns.map(ord_name)
# tab_best.drop(columns=[r'$k$ (% Noise)']).set_index(['Score', 'Method'])
tab = prepare_best_table(tab_best, dis_name, ord_name)
tab

chains_ge_4_lt_10


Unnamed: 0_level_0,Unnamed: 1_level_0,Distance,Dim,$k$,SL,CH,DB,Entropy,Eps,MinPts,Min Clt Size,Min Samples
Score,Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SL,DBSCAN,Euclidean,50,2,0.971391,4648.157274,0.07774,0.074447,0.5,2,-,-
SL,HDBSCAN,Euclidean,300,2,0.95208,595.956277,0.444803,0.074447,,-,2,29
SL,KMeans,Euclidean,300,2,0.130799,250.298321,5.986984,0.672911,,-,-,-
CH,DBSCAN,Euclidean,50,2,0.965472,8049.609924,0.120699,0.211335,0.69,8,-,-
CH,HDBSCAN,Euclidean,300,2,0.95208,595.956277,0.444803,0.074447,,-,2,28
CH,KMeans,Euclidean,50,2,0.026429,255.962747,6.04197,0.692301,,-,-,-
DB,DBSCAN,Euclidean,50,2,0.971391,4648.157274,0.07774,0.074447,0.5,2,-,-
DB,HDBSCAN,Euclidean,300,2,0.944887,261.491551,0.335176,0.074023,,-,2,25
DB,KMeans,Euclidean,300,18,0.060717,43.332352,2.735548,1.354116,,-,-,-
Entropy,DBSCAN,Cosine,50,2,0.058241,0.828526,1.081101,0.00108,0.43,7,-,-


In [164]:
tab.\
    to_html(na_rep='-', float_format="%.3f")

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th></th>\n      <th>Distance</th>\n      <th>Dim</th>\n      <th>$k$</th>\n      <th>SL</th>\n      <th>CH</th>\n      <th>DB</th>\n      <th>Entropy</th>\n      <th>Eps</th>\n      <th>MinPts</th>\n      <th>Min Clt Size</th>\n      <th>Min Samples</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th>Method</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th rowspan="3" valign="top">SL</th>\n      <th>DBSCAN</th>\n      <td>Euclidean</td>\n      <td>50</td>\n      <td>2</td>\n      <td>0.971</td>\n      <td>4648.157</td>\n      <td>0.078</td>\n      <td>0.074</td>\n      <td>0.500</td>\n      <td>2</td>\n      <td>-</td>\n      <td>-</td>\n    </tr>\n    <tr>\n      <th>HDBSCAN<

In [165]:
dat = next(iterator_datasets)
tab_best = pd.DataFrame()
print(dat)
for score in eval_scores:
    if score in ['db_score', 'entropy']:
        asc = True
    else:
        asc= False
    tmp1 = pd.concat([
        df_kmeans.loc[df_kmeans['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_dbscan.loc[df_dbscan['dataset']==dat, :].sort_values(score,ascending=asc).head(1),
        df_hdbscan.loc[df_hdbscan['dataset']==dat,:].sort_values(score,ascending=asc).head(1)
    ])
    tmp1['focus'] = score
    tmp1['focus'] = tmp1['focus'].map(ord_name)
    # tmp1['distance'] = tmp1['distance'].map(dis_name)
    tmp1 = tmp1.sort_values(score, ascending=asc)
    tab_best = pd.concat([tab_best, tmp1.loc[:,ord_cols+['focus']].dropna(how='all',axis=1)])

best_labels_ = {}
for score in eval_scores:
    for idx, row in tab_best.iterrows():
        if row['method'] == 'KMeans':
            label = labels_km(row, dat)
        elif row['method'] == 'DBSCAN':
            label = labels_db(row, dat)
        elif row['method'] == 'HDBSCAN':
            label = labels_hd(row, dat)
        # print(row['focus'])
        # print(label)
        if row['focus'] in best_labels_:
            best_labels_[row['focus']].append(label)
        else:
            best_labels_[row['focus']] = [label]

best_labels_ = {k:list(set(v)) for k, v in best_labels_.items()}
best_labels = {}
for k, v in best_labels_.items():
    for i in range(len(v)):
        if k not in best_labels:
            best_labels[k] = {i: v[i]}
        else:
            best_labels[k].update({i: v[i]})
# best_labels
best_labs_all[dat] = best_labels
# best_labs_all
# tab_best['distance'] = tab_best['distance'].map(dis_name)
# tab_best[['MinPts', 'Min Clt Size', 'Min Samples']] = tab_best[['MinPts', 'Min Clt Size', 'Min Samples']].fillna(-1).astype(int).astype(str).replace('-1','')
# tab_best.columns = tab_best.columns.map(ord_name)
# tab_best.drop(columns=[r'$k$ (% Noise)']).set_index(['Score', 'Method'])
tab = prepare_best_table(tab_best, dis_name, ord_name)
tab

chains_ge_10


Unnamed: 0_level_0,Unnamed: 1_level_0,Distance,Dim,$k$,SL,CH,DB,Entropy,Eps,MinPts,Min Clt Size,Min Samples,NN
Score,Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
SL,DBSCAN,Euclidean,300,2,0.974457,3260.838531,0.149425,0.102108,0.63,2,-,-,
SL,HDBSCAN,Euclidean,300,2,0.742529,85.182302,0.488786,0.215816,,-,3,25,
SL,KMeans,Euclidean,300,2,0.151042,1.693616,0.740964,0.001799,,-,-,-,
CH,DBSCAN,Euclidean,50,2,0.971363,4488.032424,0.080403,0.102967,0.55,2,-,-,
CH,HDBSCAN,L2 Norm,300,2,0.7201,236.593386,0.525198,0.454319,,-,4,22,
CH,KMeans,Euclidean,50,2,0.04345,130.748332,5.806007,0.615378,,-,-,-,
DB,DBSCAN,Euclidean,50,2,0.971363,4488.032424,0.080403,0.102967,0.55,2,-,-,
DB,HDBSCAN,Euclidean,300,2,0.742529,85.182302,0.488786,0.215816,,-,3,25,
DB,KMeans,Euclidean,300,3,0.146484,1.79102,0.726094,0.003599,,-,-,-,
Entropy,KMeans,Euclidean,300,2,0.151042,1.693616,0.740964,0.001799,,-,-,-,


In [166]:
tab.\
    to_html(na_rep='-', float_format="%.3f")

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th></th>\n      <th>Distance</th>\n      <th>Dim</th>\n      <th>$k$</th>\n      <th>SL</th>\n      <th>CH</th>\n      <th>DB</th>\n      <th>Entropy</th>\n      <th>Eps</th>\n      <th>MinPts</th>\n      <th>Min Clt Size</th>\n      <th>Min Samples</th>\n      <th>NN</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th>Method</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th rowspan="3" valign="top">SL</th>\n      <th>DBSCAN</th>\n      <td>Euclidean</td>\n      <td>300</td>\n      <td>2</td>\n      <td>0.974</td>\n      <td>3260.839</td>\n      <td>0.149</td>\n      <td>0.102</td>\n      <td>0.630</td>\n      <td>2</td>\n      <td>-</td>\n      <td>-</td>\n  

In [167]:
best_labs_all

{'chains_eq_2': {'SL': {0: 'labels_km_chains_eq_2_300_02_euclidean.pkl',
   1: 'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl',
   2: 'labels_hdbscan_chains_eq_2_50_05_34_euclidean.pkl'},
  'CH': {0: 'labels_km_chains_eq_2_50_02_euclidean.pkl',
   1: 'labels_hdbscan_chains_eq_2_50_02_27_euclidean.pkl',
   2: 'labels_dbscan_chains_eq_2_300_0.82_05_euclidean.pkl'},
  'DB': {0: 'labels_dbscan_chains_eq_2_300_0.76_02_euclidean.pkl',
   1: 'labels_hdbscan_chains_eq_2_50_05_32_euclidean.pkl',
   2: 'labels_km_chains_eq_2_300_20_euclidean.pkl'},
  'Entropy': {0: 'labels_km_chains_eq_2_50_02_euclidean.pkl',
   1: 'labels_dbscan_chains_eq_2_50_0.54_02_wmd.pkl',
   2: 'labels_hdbscan_chains_eq_2_300_02_33_euclidean.pkl'}},
 'chains_eq_3': {'SL': {0: 'labels_hdbscan_chains_eq_3_300_02_06_l2.pkl',
   1: 'labels_km_chains_eq_3_300_02_euclidean.pkl',
   2: 'labels_dbscan_chains_eq_3_50_0.27_02_wmd.pkl'},
  'CH': {0: 'labels_dbscan_chains_eq_3_50_0.78_04_euclidean.pkl',
   1: 'labels_km_chains_

In [168]:
clusters_all = {}
for dat in datasets:
    clusters = {}
    for i, scr in best_labs_all[dat].items():
        for j, lab in scr.items():
            p = [p for p in labels_dir.glob(lab.lower())][0]
            if i not in clusters:
                clusters[i] = {j:pd.read_pickle(p)}
            else:
                clusters[i].update({j:pd.read_pickle(p)})
            # if dat in clusters:
            #     clusters[dat].append(pd.read_pickle(Path(labels_dir, lab)))
            # else:
            #     clusters[dat] = [pd.read_pickle(Path(labels_dir, lab))]
    clusters_all[dat] = clusters

In [169]:
lab

'labels_km_chains_ge_10_300_02_euclidean.pkl'

In [170]:
parsed_emails = {dat:pd.read_pickle(Path(data_dir,f'parsed_emails_{dat}.pkl')) for dat in datasets}

In [171]:
results_all = {}
for dat in datasets:
    results = {}
    for i, a in clusters_all[dat].items():
        for j, b in a.items():
            tmp = parsed_emails[dat].copy()
            tmp['label'] = b
            if i not in results:
                results[i] = {j:tmp}
            else:
                results[i].update({j:tmp})
    results_all[dat] = results

In [172]:
def get_cluster_tables(df):
    return df.rename(columns={'index':'k', 'label':'Counts'}).pivot_table(index='k', columns=['Score', 'Method'], fill_value=-1).astype(str).replace('-1', np.nan)

In [173]:
iterator_datasets = iter(datasets)

In [174]:
dat = next(iterator_datasets)
print(dat)
df_counts = pd.DataFrame()
for i, a in results_all[dat].items():
    for j, b in a.items():
        # print(best_labs_all[dat][i][j].split('_'))
        # print(i, j)
        # print(b['label'].value_counts())
        # if j == 0:
        tmp3 = pd.DataFrame(b['label'].value_counts()).reset_index()
        tmp3['Score'] = i
        tmp3['Method'] = best_labs_all[dat][i][j].split('_')[1]

        df_counts = pd.concat([df_counts, tmp3], ignore_index=True)
        # else:
        #     pass
           
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables(df_counts)
df_cls_tab

chains_eq_2


Unnamed: 0_level_0,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts
Score,CH,CH,CH,DB,DB,DB,Entropy,Entropy,Entropy,SL,SL,SL
Method,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km
k,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
-1,23522.0,23505.0,,23525.0,23510.0,,5.0,23494.0,,23522.0,23510.0,
0,5.0,22.0,9500.0,314.0,17.0,637.0,23834.0,2.0,9500.0,5.0,17.0,8372.0
1,314.0,314.0,14341.0,2.0,314.0,2418.0,2.0,345.0,14341.0,314.0,314.0,15469.0
2,,,,,,690.0,,,,,,
3,,,,,,7833.0,,,,,,
4,,,,,,7906.0,,,,,,
5,,,,,,1.0,,,,,,
6,,,,,,1.0,,,,,,
7,,,,,,1.0,,,,,,
8,,,,,,3414.0,,,,,,


In [175]:
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k']).T.\
df_cls_tab.to_html(na_rep='')

'<table border="1" class="dataframe">\n  <thead>\n    <tr>\n      <th></th>\n      <th colspan="12" halign="left">Counts</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th colspan="3" halign="left">CH</th>\n      <th colspan="3" halign="left">DB</th>\n      <th colspan="3" halign="left">Entropy</th>\n      <th colspan="3" halign="left">SL</th>\n    </tr>\n    <tr>\n      <th>Method</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n    </tr>\n    <tr>\n      <th>k</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>-1</th>\n      <td>23522</td>\n      <td

In [176]:
dat = next(iterator_datasets)
print(dat)
df_counts = pd.DataFrame()
for i, a in results_all[dat].items():
    for j, b in a.items():
        # print(best_labs_all[dat][i][j].split('_'))
        # print(i, j)
        # print(b['label'].value_counts())
        # if j == 0:
        tmp3 = pd.DataFrame(b['label'].value_counts()).reset_index()
        tmp3['Score'] = i
        tmp3['Method'] = best_labs_all[dat][i][j].split('_')[1]
        df_counts = pd.concat([df_counts, tmp3], ignore_index=True)
        # else:
            # pass

# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables(df_counts)
df_cls_tab

chains_eq_3


Unnamed: 0_level_0,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts
Score,CH,CH,CH,DB,DB,DB,Entropy,Entropy,Entropy,SL,SL,SL
Method,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km
k,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
-1,8639.0,8344.0,,,8632.0,,863.0,411.0,,,8632.0,
0,126.0,6.0,3615.0,8766.0,135.0,1.0,7905.0,8356.0,3615.0,8766.0,135.0,3615.0
1,4.0,2.0,5154.0,3.0,2.0,4168.0,1.0,2.0,5154.0,3.0,2.0,5154.0
2,,415.0,,,,1.0,,,,,,
3,,2.0,,,,1.0,,,,,,
4,,,,,,1.0,,,,,,
5,,,,,,3.0,,,,,,
6,,,,,,557.0,,,,,,
7,,,,,,3.0,,,,,,
8,,,,,,3.0,,,,,,


In [177]:
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k']).T.\
df_cls_tab.to_html(na_rep='')

'<table border="1" class="dataframe">\n  <thead>\n    <tr>\n      <th></th>\n      <th colspan="12" halign="left">Counts</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th colspan="3" halign="left">CH</th>\n      <th colspan="3" halign="left">DB</th>\n      <th colspan="3" halign="left">Entropy</th>\n      <th colspan="3" halign="left">SL</th>\n    </tr>\n    <tr>\n      <th>Method</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n    </tr>\n    <tr>\n      <th>k</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>-1</th>\n      <td>8639</td>\n      <td>

In [178]:
dat = next(iterator_datasets)
print(dat)
df_counts = pd.DataFrame()
for i, a in results_all[dat].items():
    for j, b in a.items():
        # print(best_labs_all[dat][i][j].split('_'))
        # print(i, j)
        # print(b['label'].value_counts())
        # if j == 0:
        tmp3 = pd.DataFrame(b['label'].value_counts()).reset_index()
        tmp3['Score'] = i
        tmp3['Method'] = best_labs_all[dat][i][j].split('_')[1]
        df_counts = pd.concat([df_counts, tmp3], ignore_index=True)
        # else:
        #     pass

# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables(df_counts)
df_cls_tab

chains_ge_4_lt_10


Unnamed: 0_level_0,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts
Score,CH,CH,CH,DB,DB,DB,Entropy,Entropy,Entropy,SL,SL,SL
Method,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km
k,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
-1,9529.0,9535.0,,9535.0,9534.0,,278.0,179.0,,9535.0,9535.0,
0,139.0,2.0,4639.0,139.0,140.0,5.0,9397.0,9495.0,3868.0,139.0,2.0,3868.0
1,8.0,139.0,5037.0,2.0,2.0,4621.0,1.0,2.0,5808.0,2.0,139.0,5808.0
2,,,,,,357.0,,,,,,
3,,,,,,338.0,,,,,,
4,,,,,,1.0,,,,,,
5,,,,,,9.0,,,,,,
6,,,,,,7.0,,,,,,
7,,,,,,2370.0,,,,,,
8,,,,,,1789.0,,,,,,


In [179]:
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k']).T.\
df_cls_tab.to_html(na_rep='')

'<table border="1" class="dataframe">\n  <thead>\n    <tr>\n      <th></th>\n      <th colspan="12" halign="left">Counts</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th colspan="3" halign="left">CH</th>\n      <th colspan="3" halign="left">DB</th>\n      <th colspan="3" halign="left">Entropy</th>\n      <th colspan="3" halign="left">SL</th>\n    </tr>\n    <tr>\n      <th>Method</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n    </tr>\n    <tr>\n      <th>k</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>-1</th>\n      <td>9529</td>\n      <td>

In [180]:
dat = next(iterator_datasets)
print(dat)
df_counts = pd.DataFrame()
for i, a in results_all[dat].items():
    for j, b in a.items():
        # print(best_labs_all[dat][i][j].split('_'))
        # print(i, j)
        # print(b['label'].value_counts())
        # if j == 0:
        tmp3 = pd.DataFrame(b['label'].value_counts()).reset_index()
        tmp3['Score'] = i
        tmp3['Method'] = best_labs_all[dat][i][j].split('_')[1]
        df_counts = pd.concat([df_counts, tmp3], ignore_index=True)
        # else:
        #     pass

# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables(df_counts)
df_cls_tab

chains_ge_10


Unnamed: 0_level_0,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts,Counts
Score,CH,CH,CH,DB,DB,DB,Entropy,Entropy,Entropy,SL,SL,SL
Method,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km,dbscan,hdbscan,km
k,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
-1,5230.0,5182.0,,5230.0,5199.0,,60.0,777.0,,5229.0,5199.0,
0,92.0,118.0,3698.0,92.0,7.0,5322.0,5262.0,4545.0,1.0,93.0,7.0,1.0
1,2.0,24.0,1626.0,2.0,118.0,1.0,2.0,2.0,5323.0,2.0,118.0,5323.0
2,,,,,,1.0,,,,,,


In [181]:
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k']).T.\
df_cls_tab.to_html(na_rep='')

'<table border="1" class="dataframe">\n  <thead>\n    <tr>\n      <th></th>\n      <th colspan="12" halign="left">Counts</th>\n    </tr>\n    <tr>\n      <th>Score</th>\n      <th colspan="3" halign="left">CH</th>\n      <th colspan="3" halign="left">DB</th>\n      <th colspan="3" halign="left">Entropy</th>\n      <th colspan="3" halign="left">SL</th>\n    </tr>\n    <tr>\n      <th>Method</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n      <th>dbscan</th>\n      <th>hdbscan</th>\n      <th>km</th>\n    </tr>\n    <tr>\n      <th>k</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>-1</th>\n      <td>5230</td>\n      <td>

In [182]:

for d in clusters.values():
    for i, c in d.items():
        print(len(c))

5324
5324
5324
5324
5324
5324
5324
5324
5324
5324
5324
5324


In [187]:
def get_cluster_tables2(df):
    return df.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])

def get_cluster_counts(dat, results, labels):
    df_counts = pd.DataFrame()
    for i, a in results[dat].items():
        for j, b in a.items():
            tmp3 = pd.DataFrame(b['label'].value_counts()).reset_index()
            tmp3['Score'] = i
            tmp3['Method'] = labels[dat][i][j].split('_')[1]

            df_counts = pd.concat([df_counts, tmp3], ignore_index=True)
    return df_counts

In [188]:
iterator_datasets = iter(datasets)

In [189]:
dat = next(iterator_datasets)
print(dat)
        
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables2(get_cluster_counts(dat, results_all, best_labs_all))
df_cls_tab

chains_eq_2


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Counts
Score,Method,k,Unnamed: 3_level_1
SL,km,1,15469
SL,km,0,8372
SL,dbscan,-1,23522
SL,dbscan,1,314
SL,dbscan,0,5
SL,hdbscan,-1,23510
SL,hdbscan,1,314
SL,hdbscan,0,17
CH,km,1,14341
CH,km,0,9500


In [190]:
dat = next(iterator_datasets)
print(dat)
        
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables2(get_cluster_counts(dat, results_all, best_labs_all))
df_cls_tab

chains_eq_3


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Counts
Score,Method,k,Unnamed: 3_level_1
SL,hdbscan,-1,8632
SL,hdbscan,0,135
SL,hdbscan,1,2
SL,km,1,5154
SL,km,0,3615
SL,dbscan,0,8766
SL,dbscan,1,3
CH,dbscan,-1,8639
CH,dbscan,0,126
CH,dbscan,1,4


In [191]:
dat = next(iterator_datasets)
print(dat)
        
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables2(get_cluster_counts(dat, results_all, best_labs_all))
df_cls_tab

chains_ge_4_lt_10


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Counts
Score,Method,k,Unnamed: 3_level_1
SL,hdbscan,-1,9535
SL,hdbscan,1,139
SL,hdbscan,0,2
SL,km,1,5808
SL,km,0,3868
SL,dbscan,-1,9535
SL,dbscan,0,139
SL,dbscan,1,2
CH,km,1,5037
CH,km,0,4639


In [192]:
dat = next(iterator_datasets)
print(dat)
        
# df_counts.rename(columns={'index':'k', 'label':'Counts'}).set_index(['Score', 'Method', 'k'])
df_cls_tab = get_cluster_tables2(get_cluster_counts(dat, results_all, best_labs_all))
df_cls_tab

chains_ge_10


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Counts
Score,Method,k,Unnamed: 3_level_1
SL,hdbscan,-1,5199
SL,hdbscan,1,118
SL,hdbscan,0,7
SL,km,1,5323
SL,km,0,1
SL,dbscan,-1,5229
SL,dbscan,0,93
SL,dbscan,1,2
CH,hdbscan,-1,5182
CH,hdbscan,0,118


## Chains len 3

In [43]:
best_labs_all[dat]

{'SL': {0: 'labels_hdbscan_chains_ge_10_300_03_25_euclidean.pkl',
  1: 'labels_km_chains_ge_10_300_02_euclidean.pkl',
  2: 'labels_dbscan_chains_ge_10_300_0.63_02_euclidean.pkl'},
 'CH': {0: 'labels_hdbscan_chains_ge_10_300_04_22_l2.pkl',
  1: 'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl',
  2: 'labels_km_chains_ge_10_50_02_euclidean.pkl'},
 'DB': {0: 'labels_hdbscan_chains_ge_10_300_03_25_euclidean.pkl',
  1: 'labels_dbscan_chains_ge_10_50_0.55_02_euclidean.pkl',
  2: 'labels_km_chains_ge_10_300_03_euclidean.pkl'},
 'Entropy': {0: 'labels_hdbscan_chains_ge_10_50_02_34_cosine.pkl',
  1: 'labels_dbscan_chains_ge_10_300_5.43320*_02_euclidean.pkl',
  2: 'labels_km_chains_ge_10_300_02_euclidean.pkl'}}

In [44]:
best_labs_all[dat]['SL'][2]

'labels_dbscan_chains_ge_10_300_0.63_02_euclidean.pkl'

In [57]:
dat

'chains_ge_10'

In [48]:
labs_array = pd.read_pickle(Path(labels_dir, best_labs_all[dat]['SL'][2]))

In [52]:
df_best = parsed_emails[dat]
df_best['cluster'] = labs_array

In [69]:
df_best

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
955,Hi Gerald: We have executed agreements with...,True,23316,21,Re: CA Data Sheet,kay.young@enron.com,gerald.nemec@enron.com,930038100,-1
1580,"G, How is it going? Been a while since we sp...",False,127447,27,,gerald.nemec@enron.com,gtownsend@manorisd.net,934198740,-1
1608,"GT, The theme of the party sounds excellent. ...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934288740,-1
1668,"GT, I will be taking Friday off. Probably dr...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934794240,-1
1867,"G, I will be there about 9 pm tonight. My ce...",True,127447,27,Re:,gerald.nemec@enron.com,gtownsend@manorisd.net,935769600,-1
...,...,...,...,...,...,...,...,...,...
250216,nothing more than what the rags say..............,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023218609,-1
250217,i'm going to las vegas in august for a couple ...,True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219055,-1
250218,My last fun trip was to portland. Going to se...,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023219293,-1
250219,"dude, serious? Portland...portland is soooooo...",True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219865,-1


In [56]:
df_best[df_best['cluster'] != -1]['Chain'].value_counts()


131203    26
122336     7
125390     5
127857     4
122191     4
129255     3
130920     2
212224     2
130987     2
219868     2
129004     2
125120     2
122292     2
130003     2
124001     2
130453     1
121444     1
124647     1
126965     1
129545     1
130276     1
129318     1
124662     1
126887     1
127175     1
108478     1
128612     1
128525     1
96466      1
130941     1
202347     1
127808     1
111694     1
123544     1
126810     1
218433     1
122161     1
130620     1
128697     1
130942     1
130986     1
200957     1
126676     1
Name: Chain, dtype: int64

In [66]:
df_best['Chain'].max()

219868

In [70]:
df_best.sort_values(by=['Timestamp','Chain'])

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,cluster
955,Hi Gerald: We have executed agreements with...,True,23316,21,Re: CA Data Sheet,kay.young@enron.com,gerald.nemec@enron.com,930038100,-1
1580,"G, How is it going? Been a while since we sp...",False,127447,27,,gerald.nemec@enron.com,gtownsend@manorisd.net,934198740,-1
1608,"GT, The theme of the party sounds excellent. ...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934288740,-1
1668,"GT, I will be taking Friday off. Probably dr...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934794240,-1
1867,"G, I will be there about 9 pm tonight. My ce...",True,127447,27,Re:,gerald.nemec@enron.com,gtownsend@manorisd.net,935769600,-1
...,...,...,...,...,...,...,...,...,...
250216,nothing more than what the rags say..............,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023218609,-1
250217,i'm going to las vegas in august for a couple ...,True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219055,-1
250218,My last fun trip was to portland. Going to se...,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023219293,-1
250219,"dude, serious? Portland...portland is soooooo...",True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219865,-1


In [71]:
for idx, row in df_best.sort_values(by=['Timestamp','Chain'])[df_best['cluster'] != -1].iterrows():
    print('Chain ID:', f'{row["Chain"]:06d}' ,'Index:', idx)
    print('Subject:', row['Subject'])
    print('Sender:', row['Sender'], 'Recipient: ', row['Recipients'])
    print('Message:',row['Message'])
    print('\n')

Chain ID: 124647 Index: 24123
Subject: Re:
Sender: chris.dorland@enron.com Recipient:  mmolloy@oebi.com
Message: Perhaps...


Chain ID: 130941 Index: 31926
Subject: Re:
Sender: matthew.lenhart@enron.com Recipient:  shelliott@dttus.com
Message: sure.  maybe.


Chain ID: 130003 Index: 49710
Subject: RE:
Sender: mark.guzman@enron.com Recipient:  katie.trullinger@wfsg.com
Message: So.........


Chain ID: 200957 Index: 50356
Subject: RE: FW: What's up?
Sender: katie.trullinger@wfsg.com Recipient:  mark.guzman@enron.com
Message: Cool.




Chain ID: 130986 Index: 50804
Subject: Re:
Sender: matthew.lenhart@enron.com Recipient:  val.generes@ac.com
Message: thanks.


Chain ID: 130942 Index: 58487
Subject: RE:
Sender: matthew.lenhart@enron.com Recipient:  shirley.s.elliott@citicorp.com
Message: d-i-r-t-y


Chain ID: 130003 Index: 59640
Subject: RE:
Sender: mark.guzman@enron.com Recipient:  katie.trullinger@wfsg.com
Message: No.  


Chain ID: 128697 Index: 63984
Subject: Re:
Sender: jeff.dasovich@

  for idx, row in df_best.sort_values(by=['Timestamp','Chain'])[df_best['cluster'] != -1].iterrows():


In [91]:
def print_emails(df):
    for idx, row in df.iterrows():
        print('Chain ID:', f'{row["Chain"]:06d}' ,'Index:', idx)
        print('Date:', datetime.fromtimestamp(row['Timestamp']))
        print('Subject:', row['Subject'])
        print('Sender:', row['Sender'], ', Recipient: ', row['Recipients'])
        print('Message:',row['Message'])
        print('\n')

In [78]:
df_best.groupby('cluster')['Chain'].count()

cluster
-1    5229
 0      93
 1       2
Name: Chain, dtype: int64

In [79]:
df_best.loc[df_best['Chain']==131203,'Chain_len']

173523    798
173526    798
173531    798
173535    798
173538    798
         ... 
241753    798
241755    798
241759    798
241761    798
241941    798
Name: Chain_len, Length: 798, dtype: object

In [95]:
df_best.loc[df_best['cluster']==1,'Chain'].value_counts()

131203    1
122336    1
Name: Chain, dtype: int64

In [92]:
print_emails(df_best.loc[df_best['cluster']==1])

Chain ID: 131203 Index: 224024
Date: 2002-01-07 21:54:47
Subject: RE:
Sender: mike.maggi@enron.com , Recipient:  michelle.nelson@enron.com
Message: ok

 


Chain ID: 122336 Index: 229806
Date: 2002-01-17 17:36:36
Subject: RE:
Sender: mike.maggi@enron.com , Recipient:  amanda.rybarski@enron.com
Message: ok

 




In [96]:
df_best.loc[df_best['cluster']==0,'Chain'].value_counts()

131203    25
122336     6
125390     5
127857     4
122191     4
129255     3
130920     2
212224     2
130987     2
219868     2
129004     2
125120     2
122292     2
130003     2
124001     2
130453     1
121444     1
124647     1
126965     1
129545     1
130276     1
129318     1
124662     1
126887     1
127175     1
108478     1
128612     1
128525     1
96466      1
130941     1
202347     1
127808     1
111694     1
123544     1
126810     1
218433     1
122161     1
130620     1
128697     1
130942     1
130986     1
200957     1
126676     1
Name: Chain, dtype: int64

In [93]:
print_emails(df_best.loc[df_best['cluster']==0])

Chain ID: 124647 Index: 24123
Date: 2000-07-11 15:14:00
Subject: Re:
Sender: chris.dorland@enron.com , Recipient:  mmolloy@oebi.com
Message: Perhaps...


Chain ID: 130941 Index: 31926
Date: 2000-08-25 16:04:00
Subject: Re:
Sender: matthew.lenhart@enron.com , Recipient:  shelliott@dttus.com
Message: sure.  maybe.


Chain ID: 130003 Index: 49710
Date: 2000-11-06 11:20:00
Subject: RE:
Sender: mark.guzman@enron.com , Recipient:  katie.trullinger@wfsg.com
Message: So.........


Chain ID: 200957 Index: 50356
Date: 2000-11-07 13:31:00
Subject: RE: FW: What's up?
Sender: katie.trullinger@wfsg.com , Recipient:  mark.guzman@enron.com
Message: Cool.




Chain ID: 130986 Index: 50804
Date: 2000-11-08 13:30:00
Subject: Re:
Sender: matthew.lenhart@enron.com , Recipient:  val.generes@ac.com
Message: thanks.


Chain ID: 130942 Index: 58487
Date: 2000-11-30 13:33:00
Subject: RE:
Sender: matthew.lenhart@enron.com , Recipient:  shirley.s.elliott@citicorp.com
Message: d-i-r-t-y


Chain ID: 130003 Index: 5

In [76]:
for idx, row in df_best.sort_values(by=['Timestamp','Chain']).loc[(df_best['Chain'] == 131203) &(df_best['cluster'] != -1),:].iterrows():
    print('Chain ID:', f'{row["Chain"]:06d}' ,'Index:', idx)
    print('Date:', datetime.fromtimestamp(row['Timestamp']))
    print('Subject:', row['Subject'])
    print('Sender:', row['Sender'], ', Recipient: ', row['Recipients'])
    print('Message:',row['Message'])

Chain ID: 131203 Index: 200565
Date: 2001-11-19 15:41:01
Subject: RE:
Sender: mike.maggi@enron.com , Recipient:  michelle.nelson@enron.com
Message: terrible, yours?

 
Chain ID: 131203 Index: 200584
Date: 2001-11-19 15:49:12
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: good.

 
Chain ID: 131203 Index: 200607
Date: 2001-11-19 15:55:52
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: whatever.

 
Chain ID: 131203 Index: 200658
Date: 2001-11-19 16:16:43
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: why?  

 
Chain ID: 131203 Index: 200791
Date: 2001-11-19 16:56:32
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: you're cooking?

 
Chain ID: 131203 Index: 200799
Date: 2001-11-19 16:59:32
Subject: RE:
Sender: michelle.nelson@enron.com , Recipient:  mike.maggi@enron.com
Message: that's cute.

 
Chain ID: 131203 Index: 2

In [49]:
labs_array

array([-1, -1, -1, ..., -1, -1, -1], dtype=int32)

In [41]:
results_all[dat]['SL']

{0:                                                   Message  Reply   Chain  \
 955     Hi Gerald:  We have  executed agreements  with...   True   23316   
 1580    G,  How is it going?  Been a while since we sp...  False  127447   
 1608    GT,  The theme of the party sounds excellent. ...   True  127447   
 1668    GT,  I will be taking Friday off.  Probably dr...   True  127447   
 1867    G,  I will be there about 9 pm tonight.  My ce...   True  127447   
 ...                                                   ...    ...     ...   
 250216  nothing more than what the rags say..............   True  104153   
 250217  i'm going to las vegas in august for a couple ...   True  104153   
 250218  My last fun trip was to portland.  Going to se...   True  104153   
 250219  dude, serious?  Portland...portland is soooooo...   True  104153   
 250221  I'll work on it.  Vegas works, as a start........   True  104153   
 
        Chain_len             Subject                  Sender  \
 955  

In [37]:
tmp = results['chains_eq_2'][0]

KeyError: 'chains_eq_2'

In [18]:
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
144,Wish we could go - but we're off to Ft. Lauder...,True,75963,2,Re: Friday,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,909762180,1
145,Hey Marc - any chance you guys might like to j...,True,75963,2,Re: Friday,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,909762960,1
157,This message was returned to me - it looks lik...,False,192107,2,Undeliverable message,mark.taylor@enron.com,per.sekse@enron.com,910973340,1
158,I think you can go straight to performance rev...,True,192107,2,Re: Undeliverable message,mark.taylor@enron.com,per.sekse@enron.com,911468460,1
165,Anita:\n\nI seem to remember that our traders ...,False,98742,2,LNG hedging for China,mark.taylor@enron.com,anita.fam@enron.com,911501880,0
...,...,...,...,...,...,...,...,...,...
250917,"Jeff,\n\nThe files are in DesertSkyCurtail in ...",False,107371,2,May curtailment spread,mark.fisher@enron.com,jeff.duff@enron.com,1025701860,0
250920,"Mark,\n\nI checked the sums with what was sent...",True,107371,2,Re: May curtailment spread,jeff.duff@enron.com,mark.fisher@enron.com,1025704560,1
250970,"Tim,\n\nIn Oct 2001 I produced the attached re...",False,74063,2,Fluvanna and Trew Ranch reports,mark.fisher@enron.com,"tim.derrick@enron.com, jeff.duff@enron.com, je...",1026400320,1
250971,"Thanks. I will use this report, and we should...",True,74063,2,Re: Fluvanna and Trew Ranch reports,tim.derrick@enron.com,"mark.fisher@enron.com, jeff.duff@enron.com, je...",1026403800,1


In [19]:
tmp = results['chains_eq_2'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
784,Thank you for your help. I look forward to he...,True,209486,2,RE: derivatives documentation software,tana.jones@enron.com,ian.howells@documentum.com,928482540,0
1336,Of all the weekends---Doug's and our family ar...,True,93957,2,Re: Is anyone using the Perd the weekend of 8/...,richard.sanders@enron.com,"mrmslane@aol.com, namuathome@aol.com, namuatho...",933001800,0
2179,Do you want to set up a time to meet tomorrow?...,False,190057,2,Tuesday Meeting,mark.taylor@enron.com,mark.dilworth@enron.com,937246140,0
3599,I'll be happy to schedule him. What do you think?,True,117527,2,Re: Neil Mayer,richard.sanders@enron.com,julia.murray@enron.com,943281420,0
4089,$575k,True,84439,2,Re: Havamann Arbitration PRIVILEGED AND CONFID...,richard.sanders@enron.com,john.nowlan@enron.com,945155460,1
...,...,...,...,...,...,...,...,...,...
246817,FYI...\n\n,False,15992,2,FW: Assignments for March 23,john.watson@pdq.net,kimberly.watson@enron.com,1015950664,1
247669,Cool.\n\n,True,49462,2,RE: Dominion Transmission Notices,chris.germany@enron.com,kathryn.bussell@enron.com,1016652403,1
249374,IN? WHEN?\n\n,True,213777,2,RE: man night again?,joe.parks@enron.com,"brianc@saltgrass.com, erwollam@hotmail.com, bc...",1020173281,1
249960,http://hometown.aol.com/trogg522/myhomepage/in...,False,43396,2,Daddy's little Angel,chris.germany@enron.com,jfoard@coral-energy.com,1022245763,1


In [20]:
tmp = results['chains_eq_3'][0]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
142,"\nHey Paul, how is it going?? Attached you'll...",False,87415,3,How are you?,educanto@msn.com,d..thomas@enron.com,883935960,0
421,"Maria,\n\nThe Clearing docs we got in from the...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925474740,0
424,"Mark,\n\nDoes this mean that you would prefer ...",True,49280,3,Re: Documentation from OM,maria.nartey@enron.com,"mark.elliott@enron.com, richard.sage@enron.com...",925482120,0
425,"Maria,\n\nNot necessarily - it is just that th...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925485840,0
503,Wow - that is one nasty looking storm out ther...,False,112512,3,Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926502600,0
...,...,...,...,...,...,...,...,...,...
250676,She is going to print all the Appalachian Prod...,True,16029,3,RE: Assistant to print contracts,chris.germany@enron.com,"ed.mcmichael@enron.com, ruth.concannon@enron.com",1024576950,0
250686,OK to both. Let's use Heather Choate too if i...,True,16029,3,RE: Assistant to print contracts,ed.mcmichael@enron.com,"chris.germany@enron.com, ruth.concannon@enron.com",1024588182,0
250700,does that mean i need to cover\n \n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602537,0
250703,9369 TOMORROW\n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602851,0


In [21]:
tmp = results['chains_eq_3'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
142,"\nHey Paul, how is it going?? Attached you'll...",False,87415,3,How are you?,educanto@msn.com,d..thomas@enron.com,883935960,0
421,"Maria,\n\nThe Clearing docs we got in from the...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925474740,0
424,"Mark,\n\nDoes this mean that you would prefer ...",True,49280,3,Re: Documentation from OM,maria.nartey@enron.com,"mark.elliott@enron.com, richard.sage@enron.com...",925482120,0
425,"Maria,\n\nNot necessarily - it is just that th...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925485840,0
503,Wow - that is one nasty looking storm out ther...,False,112512,3,Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926502600,0
...,...,...,...,...,...,...,...,...,...
250676,She is going to print all the Appalachian Prod...,True,16029,3,RE: Assistant to print contracts,chris.germany@enron.com,"ed.mcmichael@enron.com, ruth.concannon@enron.com",1024576950,0
250686,OK to both. Let's use Heather Choate too if i...,True,16029,3,RE: Assistant to print contracts,ed.mcmichael@enron.com,"chris.germany@enron.com, ruth.concannon@enron.com",1024588182,0
250700,does that mean i need to cover\n \n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602537,0
250703,9369 TOMORROW\n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602851,0


In [22]:
tmp = results['chains_eq_3'][2]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
508,713-853-7459,True,112512,3,Re: Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926513940,0
7314,Who's Dana?,True,56107,3,Re: EOL Credit Responses 2/2,tana.jones@enron.com,leslie.hansen@enron.com,949680720,0
12044,O.K. Don't forget!!\n\n,True,164027,3,Re: Saturday Breakfast,pyoung@pdq.net,tana.jones@enron.com,954509640,0
37844,503-464-3740,True,45105,3,Re: Deals #417310 & #417311,mark.guzman@enron.com,kimberly.hundl@enron.com,969522000,0
39459,vkamins@enron.com\n\nvkaminski@aol.com,False,131437,3,,vince.kaminski@enron.com,phil.sisneros@enron.com,970066800,0
...,...,...,...,...,...,...,...,...,...
245587,"Metamucil, baby!\n\n",True,6759,3,RE: A PREVIEW OF COMING ATTRACTIONS,chet_fenner@bmc.com,joe.parks@enron.com,1014996627,0
245991,>,False,30867,3,Centana Letter Agreement.DOC,sproctor@akllp.com,joe.parks@enron.com,1015350736,0
246930,WHat?\n\n,True,94117,3,RE: It's Happening!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1016028995,0
248301,"Doug, money! yes?",False,51146,3,Duke Field Services,joe.parks@enron.com,doug.sewell@enron.com,1017243513,0


In [23]:
tmp = results['chains_ge_4_lt_10'][0]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
302,Are you guys around this weekend? Any particu...,False,199022,4,Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918222960,0
303,I'm flying solo this weekend. No particular p...,True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918225720,0
305,"Happy hour with staff, not family :-(",True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918234000,0
462,not a thing yet,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,926337960,0
604,sorry to say but I haven't heard a thing,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,927200280,0
...,...,...,...,...,...,...,...,...,...
250874,Good point. That will be good time to come in...,True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035400,0
250875,"Wooo, what a day! Blood-red screen, except fo...",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035638,0
250876,its called liquidation\n\n,True,203187,4,RE: YOU CAN THANK ME LATER,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1025035715,0
250879,"Si, Se?or Paras!\n\n \n\n",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025036494,1


In [24]:
tmp = results['chains_ge_4_lt_10'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
302,Are you guys around this weekend? Any particu...,False,199022,4,Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918222960,0
303,I'm flying solo this weekend. No particular p...,True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918225720,0
305,"Happy hour with staff, not family :-(",True,199022,4,Re: Weekend,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,918234000,0
462,not a thing yet,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,926337960,0
604,sorry to say but I haven't heard a thing,True,69384,7,Re: Exxon,elizabeth.sager@enron.com,john.malowney@enron.com,927200280,0
...,...,...,...,...,...,...,...,...,...
250874,Good point. That will be good time to come in...,True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035400,0
250875,"Wooo, what a day! Blood-red screen, except fo...",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025035638,0
250876,its called liquidation\n\n,True,203187,4,RE: YOU CAN THANK ME LATER,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1025035715,0
250879,"Si, Se?or Paras!\n\n \n\n",True,203188,6,RE: YOU CAN THANK ME LATER,chet_fenner@bmc.com,joe.parks@enron.com,1025036494,0


In [25]:
tmp = results['chains_ge_4_lt_10'][2]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
11343,michaelpshannon@yahoo.com,True,123037,5,Re:,benjamin.rogers@enron.com,brandon.neff@enron.com,953893500,0
14882,http://www.lonestarford.com/newcars/expedition...,False,131016,4,,mike.carson@enron.com,mcarson@gtemail.net,956919300,0
21849,Weasel!!,True,121380,4,Re:,benjamin.rogers@enron.com,7028587@skytel.com,962005560,0
28491,Thanks!,True,123326,4,Re:,benjamin.rogers@enron.com,jonathan.hoff@enron.com,965724480,0
33998,37176,True,130927,6,Re:,matthew.lenhart@enron.com,paul.lucci@enron.com,968165640,0
...,...,...,...,...,...,...,...,...,...
247810,:-)\n\n,True,32686,6,RE: Citrix application?,jimmy.manguba@enron.com,chris.germany@enron.com,1016728973,0
248442,I'm LOOOOOKING!!!!\n\n,True,150570,4,RE: Questions We Need Dominion To Answer,chris.germany@enron.com,sproctor@akllp.com,1017430736,0
250606,http://bible.gospelcom.net/,False,124830,4,,chris.germany@enron.com,trogg522@aol.com,1024425791,0
250704,TOMARROW.\n\n \n\n,True,80625,4,RE: Go Baby!,chet_fenner@bmc.com,joe.parks@enron.com,1024602900,0


In [26]:
tmp = results['chains_ge_10'][0]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
24123,Perhaps...,True,124647,30,Re:,chris.dorland@enron.com,mmolloy@oebi.com,963321240,0
31926,sure. maybe.,True,130941,28,Re:,matthew.lenhart@enron.com,shelliott@dttus.com,967212240,0
49710,So.........,True,130003,48,RE:,mark.guzman@enron.com,katie.trullinger@wfsg.com,973506000,0
50356,Cool.\n\n,True,200957,12,RE: FW: What's up?,katie.trullinger@wfsg.com,mark.guzman@enron.com,973600260,0
50804,thanks.,True,130986,13,Re:,matthew.lenhart@enron.com,val.generes@ac.com,973686600,0
...,...,...,...,...,...,...,...,...,...
237701,http://breeders.dogbreedinfo.com/index.php?a_i...,False,126887,41,,eric.bass@enron.com,shanna.husser@enron.com,1012422754,0
239211,when?\n\n,True,122336,233,RE:,amanda.rybarski@enron.com,mike.maggi@enron.com,1012585956,0
241746,sorry!\n\n,True,131203,798,RE:,michelle.nelson@enron.com,mike.maggi@enron.com,1013011133,0
244551,Permanently?\n\n,True,127175,36,RE:,frank.hayden@enron.com,joe.parks@enron.com,1014416162,0


In [27]:
tmp = results['chains_ge_10'][1]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
24123,Perhaps...,True,124647,30,Re:,chris.dorland@enron.com,mmolloy@oebi.com,963321240,0
31926,sure. maybe.,True,130941,28,Re:,matthew.lenhart@enron.com,shelliott@dttus.com,967212240,0
49710,So.........,True,130003,48,RE:,mark.guzman@enron.com,katie.trullinger@wfsg.com,973506000,0
50356,Cool.\n\n,True,200957,12,RE: FW: What's up?,katie.trullinger@wfsg.com,mark.guzman@enron.com,973600260,0
50804,thanks.,True,130986,13,Re:,matthew.lenhart@enron.com,val.generes@ac.com,973686600,0
...,...,...,...,...,...,...,...,...,...
237701,http://breeders.dogbreedinfo.com/index.php?a_i...,False,126887,41,,eric.bass@enron.com,shanna.husser@enron.com,1012422754,0
239211,when?\n\n,True,122336,233,RE:,amanda.rybarski@enron.com,mike.maggi@enron.com,1012585956,0
241746,sorry!\n\n,True,131203,798,RE:,michelle.nelson@enron.com,mike.maggi@enron.com,1013011133,0
244551,Permanently?\n\n,True,127175,36,RE:,frank.hayden@enron.com,joe.parks@enron.com,1014416162,0


In [47]:
tmp = results['chains_ge_10'][2]
tmp[tmp['label'] > -1]

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,label
955,Hi Gerald: We have executed agreements with...,True,23316,21,Re: CA Data Sheet,kay.young@enron.com,gerald.nemec@enron.com,930038100,0
1580,"G, How is it going? Been a while since we sp...",False,127447,27,,gerald.nemec@enron.com,gtownsend@manorisd.net,934198740,0
1608,"GT, The theme of the party sounds excellent. ...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934288740,0
1668,"GT, I will be taking Friday off. Probably dr...",True,127447,27,RE:,gerald.nemec@enron.com,gtownsend@manorisd.net,934794240,0
1867,"G, I will be there about 9 pm tonight. My ce...",True,127447,27,Re:,gerald.nemec@enron.com,gtownsend@manorisd.net,935769600,0
...,...,...,...,...,...,...,...,...,...
250216,nothing more than what the rags say..............,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023218609,0
250217,i'm going to las vegas in august for a couple ...,True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219055,0
250218,My last fun trip was to portland. Going to se...,True,104153,10,RE: MID C Question,doug.sewell@enron.com,lisa.gang@enron.com,1023219293,0
250219,"dude, serious? Portland...portland is soooooo...",True,104153,10,RE: MID C Question,lisa.gang@enron.com,doug.sewell@enron.com,1023219865,0


In [150]:
email_dir = Path(Path.cwd().parent, Path('data/raw/maildir'))
data_dir = Path(Path.cwd().parent, Path('data/interim'))


def parse_emails(path):
    with open(path, 'r', encoding='windows-1252') as f:
        parsed_email = email.message_from_file(f)
    return parsed_email


def get_parsed_emails(paths):
    emails = []
    for i, path in enumerate(paths):
        eml = parse_emails(path)
        tms = int(parse(eml['Date']).timestamp())
        emails.append((i, eml, tms))
    return emails


def get_parsed_emails(paths, dic=None):
    for path in paths:
        eml = parse_emails(path)
        temp = {k:v for k, v in eml.items() + [('Message', eml.get_payload()), ('Timestamp', int(parse(eml['Date']).timestamp()))]}
        for k,v in dic.items():
            dic[k].append(temp.get(k))
    return dic


def remove_spaces(string):
    if string is not None:        
        string = re.sub('\s+', ' ', string)
        string = string.split(', ')
    return string


def get_chain(data):
    df_ = data.loc[:, ['key', 'Timestamp', 'Reply']].sort_values(by=['key', 'Reply', 'Timestamp'])
    chains = {}
    counter = 0
    for idx, row in df_.iterrows():
        key = f"{row['key']}_{counter:03d}"
        if key not in chains:
            counter = 0
            key = f"{row['key']}_{counter:03d}"
            chains[key] = {'length': 1, 'email_ids': [idx]}
        else:
            if row['Reply']:
                chains[key]['length'] += 1
                chains[key]['email_ids'].append(idx)
            else:
                counter += 1
                key = f"{row['key']}_{counter:03d}"
                chains[key] = {'length': 1, 'email_ids': [idx]}

    chains_new = {}
    for i, row in enumerate(chains):
        chains_new[i] = {'chain_id': row} | chains[row]

    return chains_new


def get_longest_chain(chain):
    MAX = 0
    longest_chain = []
    for k, v in chain.items():
        if v['length'] > MAX:
            MAX = v['length']
            longest_chain = (k, v['length'], v['email_ids'])
        elif v['length'] == MAX:
            if isinstance(longest_chain, tuple):
                longest_chain = [longest_chain, (k, v['length'] ,v['email_ids'])]
            else:
                longest_chain.append((k, v['length'] ,v['email_ids']))
        
    return longest_chain


def assign_chain_id(data, chain):
    data.loc[:,'Chain'] = None
    data.loc[:,'Chain_len'] = None
    for k,v in chain.items():
        data.loc[v['email_ids'], 'Chain'] = k
        data.loc[v['email_ids'], 'Chain_len'] = int(v['length'])

In [151]:
clean_emails = []
for path in email_dir.rglob('*.'):
    if 'all_documents' not in str(path.parent) and 'discussion_threads' not in str(path.parent):
        clean_emails.append(path)
email_dict = {
    'Message-ID': [],
    'Date': [],
    'From': [],
    'To': [],
    'Subject': [],
    'Cc': [],
    'Mime-Version': [],
    'Content-Type': [],
    'Content-Transfer-Encoding': [],
    'Bcc': [],
    'X-From': [],
    'X-To': [],
    'X-cc': [],
    'X-bcc': [],
    'X-Folder': [],
    'X-Origin': [],
    'X-FileName': [],
    'Message': [],
    'Timestamp': []
}
df = pd.DataFrame(get_parsed_emails(clean_emails, email_dict))

In [152]:
df

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Message,Timestamp
0,<32259334.1075852468311.JavaMail.evans@thyme>,"Thu, 4 Oct 2001 15:05:17 -0700 (PDT)",john.shelk@enron.com,"richard.shapiro@enron.com, linda.robertson@enr...",Summary of Administration Comments on Bingaman...,,1.0,text/plain; charset=us-ascii,7bit,,"Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,\nI have read through the 19 pages of Administ...,1002233117
1,<16152007.1075852468365.JavaMail.evans@thyme>,"Tue, 25 Sep 2001 09:25:07 -0700 (PDT)",john.shelk@enron.com,"richard.shapiro@enron.com, d..steffes@enron.co...",EPSA/EEI on Reliability,"linda.robertson@enron.com, carin.nersesian@enr...",1.0,text/plain; charset=us-ascii,7bit,"linda.robertson@enron.com, carin.nersesian@enr...","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS...","Robertson, Linda </O=ENRON/OU=NA/CN=RECIPIENTS...",,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,\nThis follows up on Rick's inquiry late last ...,1001435107
2,<26474922.1075852468285.JavaMail.evans@thyme>,"Fri, 5 Oct 2001 08:21:31 -0700 (PDT)",john.shelk@enron.com,charles.yeung@enron.com,Reliability and Security Arguments (RTOs),"janel.guerrero@enron.com, d..steffes@enron.com...",1.0,text/plain; charset=us-ascii,7bit,"janel.guerrero@enron.com, d..steffes@enron.com...","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Yeung, Charles </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Guerrero, Janel </O=ENRON/OU=NA/CN=RECIPIENTS/...",,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,\nThis responds to Charles's voice mail and th...,1002295291
3,<10118998.1075852468340.JavaMail.evans@thyme>,"Fri, 28 Sep 2001 12:11:10 -0700 (PDT)",john.shelk@enron.com,"joe.connor@enron.com, richard.ingersoll@enron....",RE: NERC Statements on Impact of Security Thre...,,1.0,text/plain; charset=us-ascii,7bit,,"Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Connor, Joe </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...",,,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,I agree with Joe. The IOUs will point to NERC...,1001704270
4,<24576280.1075861591387.JavaMail.evans@thyme>,"Fri, 2 Nov 2001 05:33:16 -0800 (PST)",john.shelk@enron.com,"d..steffes@enron.com, linda.robertson@enron.co...",Barton Staff Meeting,"john.shelk@enron.com, richard.shapiro@enron.com",1.0,text/plain; charset=us-ascii,quoted-printable,"john.shelk@enron.com, richard.shapiro@enron.com","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...","Steffes, James D. </O=ENRON/OU=NA/CN=RECIPIENT...","Shelk, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=J...",,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Co...",Steffes-J,JSTEFFE (Non-Privileged).pst,Yesterday I spent about 45 minutes with the th...,1004707996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330684,<1689472.1075857281689.JavaMail.evans@thyme>,"Wed, 22 Nov 2000 03:18:00 -0800 (PST)",fool@motleyfool.com,benjamin.rogers@enron.com,Investing Basics: Gathering Company Information,,1.0,text/plain; charset=ANSI_X3.4-1968,quoted-printable,,The Motley Fool <Fool@MotleyFool.com>,benjamin.rogers@enron.com,,,\Benjamin_Rogers_Dec2000_4\Notes Folders\Motle...,Rogers-B,brogers.nsf,______________________________________________...,974891880
330685,<31985452.1075857282898.JavaMail.evans@thyme>,"Tue, 31 Oct 2000 01:12:00 -0800 (PST)",fool@motleyfool.com,benjamin.rogers@enron.com,Breakfast With The Fool: Liftoff at Expedia,,1.0,text/plain; charset=us-ascii,7bit,,The Motley Fool <Fool@MotleyFool.com>,benjamin.rogers@enron.com,,,\Benjamin_Rogers_Dec2000_4\Notes Folders\Motle...,Rogers-B,brogers.nsf,______________________________________________...,972983520
330686,<4043728.1075852093007.JavaMail.evans@thyme>,"Tue, 29 May 2001 15:21:21 -0700 (PDT)",fool@motleyfool.com,benjamin.rogers@enron.com,"FoolWatch: Tom Gardner, Microsoft and College ...",,1.0,text/plain; charset=us-ascii,7bit,,The Motley Fool <Fool@MotleyFool.com>@ENRON <I...,"Rogers, Benjamin </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\BROGERS (Non-Privileged)\Rogers, Benjamin\Mot...",ROGERS-B,BROGERS (Non-Privileged).pst,\n=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3...,991174881
330687,<24053764.1075857283068.JavaMail.evans@thyme>,"Thu, 26 Oct 2000 02:29:00 -0700 (PDT)",fool@motleyfool.com,benjamin.rogers@enron.com,Breakfast With The Fool: InfoSpace Makes Money...,,1.0,text/plain; charset=us-ascii,7bit,,The Motley Fool <Fool@MotleyFool.com>,benjamin.rogers@enron.com,,,\Benjamin_Rogers_Dec2000_4\Notes Folders\Motle...,Rogers-B,brogers.nsf,______________________________________________...,972552540


In [153]:
df.loc[200599,:]

Message-ID                        <8671127.1075854943082.JavaMail.evans@thyme>
Date                                     Mon, 15 Oct 2001 16:18:06 -0700 (PDT)
From                                                           ksumme@isda.org
To                                                              board@isda.org
Subject                                            Draft Memorandum to Members
Cc                                                            rpickel@isda.org
Mime-Version                                                               1.0
Content-Type                                      text/plain; charset=us-ascii
Content-Transfer-Encoding                                                 7bit
Bcc                                                           rpickel@isda.org
X-From                                        Kimberly Summe <KSumme@isda.org>
X-To                                               ISDA BOARD <BOARD@isda.org>
X-cc                                          Robert

In [62]:
df.loc[tmp[tmp['label'] > -1].index,'Message-ID']

58487     <21931006.1075843765071.JavaMail.evans@thyme>
67266     <20343021.1075852783231.JavaMail.evans@thyme>
71590     <11703650.1075840283161.JavaMail.evans@thyme>
74646     <32721671.1075857427471.JavaMail.evans@thyme>
77957     <22797037.1075861578908.JavaMail.evans@thyme>
112864    <14579785.1075855357313.JavaMail.evans@thyme>
120152    <15211290.1075854177642.JavaMail.evans@thyme>
139558     <7508741.1075840715401.JavaMail.evans@thyme>
139594     <2632422.1075840703129.JavaMail.evans@thyme>
167723    <17416328.1075846724449.JavaMail.evans@thyme>
176322    <28504577.1075852177724.JavaMail.evans@thyme>
178335    <24770644.1075862329775.JavaMail.evans@thyme>
200599     <8671127.1075854943082.JavaMail.evans@thyme>
209198     <5654181.1075840041726.JavaMail.evans@thyme>
213331     <8138183.1075856157586.JavaMail.evans@thyme>
223166     <7885015.1075855113677.JavaMail.evans@thyme>
229021     <4947823.1075847137140.JavaMail.evans@thyme>
241232     <9979189.1075847982114.JavaMail.evans

In [None]:
for idx, row in df.loc[tmp[tmp['label'] > -1].index,'Message'].iteritems():
    print(row)

thank you so much.  so the news aint great for utilities?

-----Original Message-----
From: Jeff.Dasovich@enron.com [mailto:Jeff.Dasovich@enron.com]
Sent: Wednesday, January 03, 2001 2:14 PM
To: Kari Dohn
Subject: Additional Materials



Greetings Kari:

Forgive the delay.  Much going on today, PUC draft decision in particular.
The draft does not look promising for the utilities' financial position.

Attached are our comments on the Governor's Proposals and some more detail
on the demand-reduction proposal.  We continue to work on the Nord Pool
research for you and will turn that around as quickly as we can.

Again, don't hesitate to contact me if there's anything else I can help
with, or if there's anything else that you need. (415.782.7822)

Best,
Jeff

(See attached file: Comments on Governor's Proposals 010301 .doc)(See
attached file: Demand buy-down proposal.doc)


Let me think about this.
I love you.
Mom

I did receive your e-mail.

Ken Lay





"Michael Milken" <mmilken@knowledg

In [67]:
df.loc[tmp[tmp['label'] > -1].index,'Message-ID']

58487     <21931006.1075843765071.JavaMail.evans@thyme>
67266     <20343021.1075852783231.JavaMail.evans@thyme>
71590     <11703650.1075840283161.JavaMail.evans@thyme>
74646     <32721671.1075857427471.JavaMail.evans@thyme>
77957     <22797037.1075861578908.JavaMail.evans@thyme>
112864    <14579785.1075855357313.JavaMail.evans@thyme>
120152    <15211290.1075854177642.JavaMail.evans@thyme>
139558     <7508741.1075840715401.JavaMail.evans@thyme>
139594     <2632422.1075840703129.JavaMail.evans@thyme>
167723    <17416328.1075846724449.JavaMail.evans@thyme>
176322    <28504577.1075852177724.JavaMail.evans@thyme>
178335    <24770644.1075862329775.JavaMail.evans@thyme>
200599     <8671127.1075854943082.JavaMail.evans@thyme>
209198     <5654181.1075840041726.JavaMail.evans@thyme>
213331     <8138183.1075856157586.JavaMail.evans@thyme>
223166     <7885015.1075855113677.JavaMail.evans@thyme>
229021     <4947823.1075847137140.JavaMail.evans@thyme>
241232     <9979189.1075847982114.JavaMail.evans

In [73]:
df.loc[58487,'From']

'kari.dohn@gov.ca.gov'

In [72]:
tmp.loc[58487,:]

Message                            d-i-r-t-y
Reply                                   True
Chain                                 130942
Chain_len                                122
Subject                                  RE:
Sender             matthew.lenhart@enron.com
Recipients    shirley.s.elliott@citicorp.com
Timestamp                          975587580
label                                      0
Name: 58487, dtype: object

In [156]:
df[df['Message-ID']=='<32497489.1075861029532.JavaMail.evans@thyme>']

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Message,Timestamp
115113,<32497489.1075861029532.JavaMail.evans@thyme>,"Thu, 7 Mar 2002 14:45:49 -0800 (PST)",management.ubsw@enron.com,robert.badeer@enron.com,FW: Employee Transfer,,1.0,text/plain; charset=us-ascii,7bit,,UBSW Energy Information Risk Management </O=EN...,"Badeer, Robert </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,,"\Robert_Badeer_Mar2002_1\Badeer, Robert\Inbox",Badeer-R,rbadeer (Non-Privileged).pst,"Robert,\n\nPer the email below, access to Sita...",1015541149


In [159]:
for i, col in enumerate(df.columns):
    print(i, col)

0 Message-ID
1 Date
2 From
3 To
4 Subject
5 Cc
6 Mime-Version
7 Content-Type
8 Content-Transfer-Encoding
9 Bcc
10 X-From
11 X-To
12 X-cc
13 X-bcc
14 X-Folder
15 X-Origin
16 X-FileName
17 Message
18 Timestamp


In [158]:
df.loc[df['Message-ID'=='<32497489.1075861029532.JavaMail.evans@thyme>','Message']

115113    Robert,\n\nPer the email below, access to Sita...
Name: Message, dtype: object


In [158]:
df.loc[df['Message'=='<32497489.1075861029532.JavaMail.evans@thyme>', 'Message']

115113    Robert,\n\nPer the email below, access to Sita...
Name: Message, dtype: object


In [164]:
print(df.iloc[115113, 14])

\Robert_Badeer_Mar2002_1\Badeer, Robert\Inbox


In [161]:
print(df.iloc[115113, 17])

Robert,

Per the email below, access to Sitara has been granted.

ID = rbadeer
PW = changeme

Thanks!
Leah

 -----Original Message-----
From: 	Severson, Russ  
Sent:	Thursday, March 07, 2002 4:37 PM
To:	UBSW Energy Information Risk Management
Subject:	RE: Employee Transfer

West Trading

 -----Original Message-----
From: 	UBSW Energy Information Risk Management  
Sent:	Thursday, March 07, 2002 4:26 PM
To:	Severson, Russ
Subject:	FW: Employee Transfer

Russ,

Please approve access for Robert Badeer for Sitara and TDS.  Please let me know what access he should have in Sitara now that he is in Houston.

Thanks!
Leah

 -----Original Message-----
From: 	Rangel, Ina  
Sent:	Thursday, March 07, 2002 1:15 PM
To:	UBSW Energy Information Risk Management
Cc:	UBSW Energy IT Security and Controls; Marcinkowski, Danielle
Subject:	Employee Transfer

Robert Badeer is currently working in our Portland office and will be coming to work in Houston office permanently.    He needs to be setup on the follow