In [46]:
import pickle as pkl

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pathlib import Path

from gensim.models.doc2vec import Doc2Vec
from hdbscan import HDBSCAN
from sklearn.decomposition import PCA, TruncatedSVD
from scipy.spatial.distance import pdist, cosine

# Figure defaults for the whole notebook: 100 dpi and automatic layout fitting.
plt.rcParams.update({
    'figure.dpi': 100,
    'figure.autolayout': True,
})

In [2]:
# Project directories, resolved relative to the parent of the current working
# directory (the notebook is expected to run from a subfolder of the project).
project_root = Path.cwd().parent
data_dir = project_root / 'data/interim'
labels_dir = data_dir / 'labels_0'
models_dir = project_root / 'models'

In [13]:
# Load every saved Doc2Vec model matching 'd2v*.model' and keep only its
# document-vector matrix, keyed by the file stem minus its first four characters.
vectors = {
    model_path.stem[4:]: Doc2Vec.load(str(model_path)).dv.vectors
    for model_path in models_dir.glob('d2v*.model')
}

In [6]:
# File stems of the label pickles to keep when loading from labels_dir.
best_km_sl_names = [
    'labels_km_chains_eq_3_300_02_euclidean',
    'labels_km_chains_ge_10_50_02_euclidean',
    'labels_km_chains_eq_3_50_02_euclidean',
    'labels_km_chains_ge_4_lt_10_300_02_euclidean',
    'labels_km_chains_ge_10_300_02_euclidean',
    'labels_km_chains_ge_4_lt_10_50_02_euclidean',
]

In [28]:
def read_pickle(path):
    """Unpickle the object stored at ``path`` and return it as a NumPy array.

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    numpy.ndarray
        The unpickled object passed through ``np.asarray``.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, 'rb') as stream:
        return np.asarray(pkl.load(stream))

In [29]:
# Load the label arrays whose file stem is listed in best_km_sl_names.
# The key drops the first two and last two underscore-separated tokens of the
# stem, e.g. 'labels_km_chains_eq_3_50_02_euclidean' -> 'chains_eq_3_50'.
clusters = {
    '_'.join(label_path.stem.split('_')[2:-2]): read_pickle(label_path)
    for label_path in labels_dir.iterdir()
    if label_path.stem in best_km_sl_names
}

In [35]:
# Group the 'chains_eq_3_50' document vectors by their cluster label:
# clustered[label] is the list of vectors assigned to that label.
eq3_key = 'chains_eq_3_50'
n_clusters = set(clusters[eq3_key])  # the distinct labels (a set, despite the name)
clustered = {}
for row, vector in enumerate(vectors[eq3_key]):
    clustered.setdefault(clusters[eq3_key][row], []).append(vector)

In [57]:
# One DataFrame per 'parsed_emails*' pickle found directly in data_dir.
emails_list = list(map(pd.read_pickle, data_dir.glob('parsed_emails*')))

In [76]:
# Map '<file stem minus its first 14 characters><suffix>' to a DataFrame, once
# for each of the suffixes '_50' and '_300'. The pickle is re-read for every
# (file, suffix) pair, so each key owns an independent DataFrame object.
emails = {
    f'{email_path.stem[14:]}{suffix}': pd.read_pickle(email_path)
    for email_path in data_dir.glob('parsed_emails*')
    for suffix in ['_50', '_300']
}

In [83]:
# Attach each label array as a 'clusters' column on the DataFrame stored under
# the same key in `emails`.
for name, labels in clusters.items():
    emails[name]['clusters'] = labels

In [88]:
# Display the frame stored under 'chains_eq_3_50' (the recorded output includes
# the `clusters` column).
emails['chains_eq_3_50']

Unnamed: 0,Message,Reply,Chain,Chain_len,Subject,Sender,Recipients,Timestamp,clusters
142,"\nHey Paul, how is it going?? Attached you'll...",False,87415,3,How are you?,educanto@msn.com,d..thomas@enron.com,883935960,1
421,"Maria,\n\nThe Clearing docs we got in from the...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925474740,1
424,"Mark,\n\nDoes this mean that you would prefer ...",True,49280,3,Re: Documentation from OM,maria.nartey@enron.com,"mark.elliott@enron.com, richard.sage@enron.com...",925482120,1
425,"Maria,\n\nNot necessarily - it is just that th...",True,49280,3,Re: Documentation from OM,mark.elliott@enron.com,"maria.nartey@enron.com, richard.sage@enron.com...",925485840,1
503,Wow - that is one nasty looking storm out ther...,False,112512,3,Morning!,mark.taylor@enron.com,marc.r.cutler@bankamerica.com,926502600,1
...,...,...,...,...,...,...,...,...,...
250676,She is going to print all the Appalachian Prod...,True,16029,3,RE: Assistant to print contracts,chris.germany@enron.com,"ed.mcmichael@enron.com, ruth.concannon@enron.com",1024576950,0
250686,OK to both. Let's use Heather Choate too if i...,True,16029,3,RE: Assistant to print contracts,ed.mcmichael@enron.com,"chris.germany@enron.com, ruth.concannon@enron.com",1024588182,0
250700,does that mean i need to cover\n \n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602537,0
250703,9369 TOMORROW\n\n,True,80624,3,RE: Go Baby!,joe.parks@enron.com,"'fenner@enron.com, chet_fenner@bmc.com",1024602851,0


In [89]:
# File names of the '*.pkl' files stored in the 'big' subfolder of data_dir.
[pkl_path.name for pkl_path in (data_dir / 'big').glob('*.pkl')]

['parsed_emails_chains_replies.pkl',
 'parsed_emails_chains_all.pkl',
 'parsed_emails_chains_split_0.pkl',
 'parsed_emails_chains_gt_1.pkl',
 'parsed_emails_chains_split_3.pkl',
 'parsed_emails_chains_split_2.pkl',
 'parsed_emails_chains_split_1.pkl']

In [240]:
# Load the full parsed-email table.
df_all = pd.read_pickle(Path(data_dir, 'big', 'parsed_emails_chains_all.pkl'))
# Uncomment to drop rows whose Message is missing before counting:
# df_all = df_all.dropna(subset='Message')

# Rows per Chain_len value divided by that Chain_len value.
# presumably this is the number of chains of each length - TODO confirm
chain_len_counts = df_all['Chain_len'].value_counts()
chains_count = chain_len_counts / chain_len_counts.index

# Name the index level and the value column explicitly: the labels that
# reset_index() picks by default depend on the pandas version (value_counts
# names its index after the column from pandas 2.0 on), which would make the
# 'index' lookups below raise KeyError.
df_ct = (
    chains_count
    .astype(int)
    .rename_axis('index')
    .reset_index(name=0)
    # Most frequent first; ties broken by the smaller Chain_len value.
    .sort_values([0, 'index'], ascending=[False, True])
    .set_index('index')
)

In [241]:
# Row counts of df_all per Chain_len bucket: 1, 2, 3, 4-9 and 10 or more.
# The result recorded under this cell equals the n_msg_raw list defined later.
chain_len = df_all['Chain_len']
[
    len(df_all[bucket])
    for bucket in (
        chain_len == 1,
        chain_len == 2,
        chain_len == 3,
        (chain_len >= 4) & (chain_len < 10),
        chain_len >= 10,
    )
]

[203172, 24018, 8838, 9708, 5332]

In [239]:
# Same per-bucket row counts of df_all (Chain_len 1, 2, 3, 4-9, 10 or more).
# The result recorded under this cell equals the n_msg_cln list defined later;
# presumably df_all had its missing-Message rows dropped for this run - TODO confirm.
chain_len = df_all['Chain_len']
bucket_masks = (
    chain_len == 1,
    chain_len == 2,
    chain_len == 3,
    (chain_len >= 4) & (chain_len < 10),
    chain_len >= 10,
)
[len(df_all[mask]) for mask in bucket_masks]

[196212, 23841, 8769, 9676, 5324]

In [247]:
# Per-bucket row counts copied by hand from the outputs of the two
# bucket-count cells above (buckets: Chain_len 1, 2, 3, 4-9, 10 or more).
n_msg_raw = [203172, 24018, 8838, 9708, 5332]
n_msg_cln = [196212, 23841, 8769, 9676, 5324]

In [255]:
# Percentage of rows lost between the raw and the cleaned counts, per bucket.
# Scale to percent *before* rounding: rounding the fraction and then
# multiplying by 100 re-introduces float error (e.g. 3.4000000000000004).
for n_cln, n_raw in zip(n_msg_cln, n_msg_raw):
    print(round((1 - n_cln / n_raw) * 100, 1))

3.4000000000000004
0.7000000000000001
0.8
0.3
0.2


In [199]:
# Number of df_all rows whose Chain_len is exactly 1.
df_all[df_all['Chain_len'] == 1].shape[0]

196212

In [232]:
# Sum of ('index' label * count) over df_ct rows from position 10 onward.
# NOTE(review): after reset_index() the `.loc[10:]` slice selects by row
# position in df_ct's sort order (count descending), not by chain length, and
# it skips the first ten rows - confirm this is the intended cut-off.
ct_flat = df_ct.reset_index().loc[10:]
sum(ct_flat['index'] * ct_flat[0])

4892

In [230]:
# Label-based slice on df_ct's 'index' labels: every row from the top of the
# frame through the row labelled 9 (inclusive).
df_ct.loc[:9]

Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
1,203172
2,12009
3,2946
4,1077
5,463
6,222
7,116
8,67
9,45


In [231]:
# Last five rows of df_ct in its current sort order.
df_ct.iloc[-5:]

Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
122,1
123,1
233,1
290,1
798,1


In [94]:
# NOTE(review): the output recorded under this cell is a bare integer, which is
# not what displaying a DataFrame produces - the cell was presumably edited
# after it last ran. Re-run it to refresh the output.
df_all

220309

In [190]:
# Half the number of 'chains_eq_2_50' rows that have a non-missing Message.
eq2_with_message = emails['chains_eq_2_50'].dropna(subset='Message')
len(eq2_with_message) / 2

11920.5

In [186]:
# Print the row count of every frame in `emails`.
# NOTE(review): if rows with a missing Message should be excluded, use
# len(v.dropna(subset='Message')) instead - confirm the intended count.
for k, v in emails.items():
    print(k, len(v))

chains_eq_2_50 23841
chains_eq_2_300 23841
chains_ge_10_50 5324
chains_ge_10_300 5324
chains_ge_4_lt_10_50 9676
chains_ge_4_lt_10_300 9676
chains_eq_3_50 8769
chains_eq_3_300 8769


In [177]:
# Stems of the 'parsed*' files found directly in data_dir.
[parsed_path.stem for parsed_path in data_dir.glob('parsed*')]

['parsed_emails_chains_eq_2',
 'parsed_emails_chains_ge_10',
 'parsed_emails_chains_ge_4_lt_10',
 'parsed_emails_chains_eq_3']

In [17]:
# Project `d2vec` onto its first two principal components and report the share
# of variance those two components explain.
# NOTE(review): `d2vec` is not defined in any visible cell - presumably one of
# the arrays in `vectors`; confirm before re-running.
# random_state pins the randomized SVD solver (when PCA selects it) so that the
# projection and the reported ratio are repeatable across runs.
pca = PCA(n_components=2, random_state=0)
X = pca.fit_transform(d2vec)
sum(pca.explained_variance_ratio_)

0.06073138676583767

In [21]:
# PCA with all components kept; this rebinds the `pca` and `X` names used by
# the two-component PCA cell.
# NOTE(review): `d2vec` is not defined in any visible cell - confirm its source.
# NOTE(review): with every component kept, explained_variance_ratio_ should sum
# to about 1.0, yet the recorded output is ~256.4 - the cell presumably summed
# explained_variance_ when it last ran. Re-run to refresh the output.
pca = PCA()
X = pca.fit_transform(d2vec)
sum(pca.explained_variance_ratio_)

256.4158363714814

In [18]:
# Two-component truncated SVD of `d2vec` and the share of variance it explains.
# NOTE(review): `d2vec` is not defined in any visible cell - presumably one of
# the arrays in `vectors`; confirm before re-running.
# TruncatedSVD uses a randomized solver by default, so random_state is pinned
# to make the projection and the reported ratio repeatable across runs.
tsvd = TruncatedSVD(n_components=2, random_state=0)
Y = tsvd.fit_transform(d2vec)
sum(tsvd.explained_variance_ratio_)

0.06014697067439556