In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import numpy as np
import pandas as pd
import pickle as pkl
import src.bayesian as bayes
import src.tools as tools
from pathlib import Path

# Directory holding the CSV inputs and the pickled artefacts. The trailing
# slash matters: paths are built from it by plain string concatenation.
path_to_data = 'data/'


def _load_csv(file_name):
    """Read one comma-separated file with a header row from `path_to_data`."""
    return pd.read_csv(path_to_data + file_name, sep=',', header=0)


training = _load_csv('training_set.csv')
training_info = _load_csv('training_info.csv')
test = _load_csv('test_set.csv')
test_info = _load_csv('test_info.csv')

# Cache file for the probability tables estimated from the training set.
data_file = Path(path_to_data + 'data.p')

if data_file.is_file():
    print('Loading probabilities...')
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # was produced by this notebook.
    with data_file.open('rb') as cache_file:
        data = pkl.load(cache_file)
    print('Done')
else:
    print('Computing probabilities...')
    print('Computing recipient prior')
    p_r = bayes.compute_recipient_prior(training_info)
    print('Computing sender likelihood given recipient')
    p_s_r = bayes.compute_sender_likelihood_given_recipient(training, training_info)
    print('Computing mail likelihood given recipient and sender')
    p_w, p_w_r, p_w_r_s, r_s = bayes.compute_mail_likelihood_given_recipient_and_sender(training, training_info)
    # Bundle every table under the key the later cells look it up by.
    data = {
        'p_r': p_r,
        'p_s_r': p_s_r,
        'p_w_r_s': p_w_r_s,
        'p_w_r': p_w_r,
        'p_w': p_w,
        'r_s': r_s,
    }
    # The context manager guarantees the cache is flushed and closed.
    with data_file.open('wb') as cache_file:
        pkl.dump(data, cache_file)
    print('Done')

Loading probabilities...
Done


In [45]:
%%time
# Score the test mails with the cached probability tables.
# NOTE(review): a/b/c look like mixing weights (they sum to 1) and k like a
# cut-off — confirm their meaning in src.bayesian.
res = bayes.compute_results(test, test_info, data, a=0.6, b=0.2, c=0.2, k=100)
# Persist the raw results; the context manager closes the file handle, which
# the bare open() call used to leak.
with open(path_to_data + 'bayesian_results.p', 'wb') as out_file:
    pkl.dump(res, out_file)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
CPU times: user 14min 8s, sys: 6.13 s, total: 14min 14s
Wall time: 14min 41s


In [46]:
from src.postprocess import write_results_probas

# Output directory passed to the project's result writer.
path_to_results = 'results/'
# Hand `res` to the project helper; presumably it writes
# results/bayesian.txt — confirm the format in src.postprocess.
write_results_probas(res, path_to_results, 'bayesian.txt')

In [47]:
# Preview a few entries only: echoing the whole dict is too large to render
# (the last run of this cell had to be interrupted).
list(res.items())[:3]

KeyboardInterrupt: 

In [43]:
# Count the entries of `res` whose first-field values add up to exactly zero.
count = sum(
    1
    for scored in res.values()
    if sum(pair[0] for pair in scored) == 0
)
count, len(res)

(0, 2362)

In [48]:
# Keep only the second field of every entry (the address in the displayed
# results), dropping the score in front of it.
pred = {mid: [entry[1] for entry in ranked] for mid, ranked in res.items()}

# NOTE(review): this overwrites the pickle of the raw `res` written by the
# scoring cell, which uses the same file name — confirm that is intended.
# The context manager closes the handle the bare open() call used to leak.
with open(path_to_data + 'bayesian_results.p', 'wb') as out_file:
    pkl.dump(pred, out_file)

['barry.tycholiz@enron.com',
 'tyrell.harrison@enron.com',
 'gerald.nemec@enron.com',
 'mara.bronstein@enron.com',
 'mawhitt@aol.com',
 'steve.walton@enron.com',
 'paul.lucci@enron.com',
 'theresa.staab@enron.com',
 'christina_122367@hotmail.com',
 'jay.reitmeyer@enron.com',
 'jake.thomas@enron.com',
 'j..legler@enron.com',
 'steven.p.south@enron.com',
 'j..bump@enron.com',
 'dave.fuller@enron.com',
 'heidi.dubose@enron.com',
 'stephanie.miller@enron.com',
 'frank.ermis@enron.com',
 'paul.kaufman@enron.com',
 'jason.williams@enron.com',
 'mike.grigsby@enron.com',
 'amy.felling@enron.com',
 'tim.murphy@elpaso.com',
 'eric.moon@enron.com',
 'jwhitt@gvec.net',
 'brian.redmond@enron.com',
 'sue.dolan@xemkt.com',
 'phillip.k.ellen@enron.com',
 'barbara.gray@enron.com',
 'm..landwehr@enron.com',
 'andrew.edison@enron.com',
 'mark.whitt@enron.com',
 'mawhitt@gvec.ney',
 'f..calger@enron.com',
 'scott.josey@enron.com',
 'paul.miller@enron.com',
 'brian.bierbach@enron.com',
 'robert.neustaedter

In [29]:
from heapq import heappush, heappop
# Rank the entries stored for one mail from largest to smallest.
t_ = res['325459']
# sorted(reverse=True) yields the same order as popping a min-heap and
# prepending each item (tuples compare field by field), without rebuilding
# the result list on every step.
r_ = sorted(t_, reverse=True)
r_

[(0.055929022335236868, 'chinn@millercanfield.com'),
 (0.050892631320865134, 'akurzer@tfsbrokers.com'),
 (0.029120461437296893, 'daniel.diamond@enron.com'),
 (0.024789271504881361, 'stephanie.panus@enron.com'),
 (0.0084473945863536206, 'tana.jones@enron.com'),
 (0.0014496253103108763, 'brenda.crabtree@sabre.com'),
 (0.0014090679087738387, 'rick.shoup@enron.com'),
 (0.0014090679087738387, 'rdezenzo@quallaby.com'),
 (0.0014090679087738387, 'dan.cummings@enron.com'),
 (0.0013889479848816382, 'erica.bess@enron.com')]

In [None]:
# Show the entry at position 5 of `res` (iteration order) together with its key.
for position, key in enumerate(res):
    if position == 5:
        print(res[key], key)
        break

# Count the entries whose first-field values add up to exactly zero.
c_ = sum(
    1
    for scored in res.values()
    if sum(pair[0] for pair in scored) == 0
)
c_, len(res)

In [None]:
# Size of the collection stored under each key of the r_s table
# (presumably recipients per sender — confirm in src.bayesian).
r_s = data['r_s']
# The former bare `r_s.keys()` statement was dropped: its value was neither
# displayed nor used.
n = [len(group) for group in r_s.values()]
print(sorted(n))
print(sum(n))

In [None]:
# Body of the first test_info row whose mid equals 286748.
mail_ = test_info.loc[test_info['mid'] == 286748, 'body'].iloc[0]

In [None]:
# Print the sender of every row of `test` whose mids field lists mail 286748.
for sender, mids in zip(test['sender'], test['mids']):
    if '286748' in mids.split():
        print(sender)

# Hard-coded sender used by the checks below.
sender_ = 'sandra.f.brawner@enron.com'
recipients_ = data['r_s'][sender_]

# Spot-check the first recipient only (the loop stops after one pass): the
# two bare lookups fail loudly if a table has no entry for it.
for r in recipients_:
    data['p_r'][r]
    data['p_s_r'][r][sender_]
    print(len(data['p_w_r'][r].keys()))
    break

In [None]:
# Bare lookup: the value is discarded (only a cell's last expression is
# displayed), so this line merely fails if the key is missing.
data['p_r']['tim.belden@enron.com']
# Displayed result: the keys stored under this address in the p_s_r table.
data['p_s_r']['tim.belden@enron.com'].keys()

In [None]:
# Run the predictor for the first recipient only; do nothing when there is none.
_no_recipient = object()
first_recipient = next(iter(recipients_), _no_recipient)
if first_recipient is not _no_recipient:
    print(bayes.predict(first_recipient, sender_, mail_, data))

In [None]:
# Scratch check: natural log of a very small value (about -92.1).
np.log(1e-40)

In [None]:
# Scratch check: union1d returns the sorted unique union, so index 0 is 'a'.
np.union1d(['a', 'b', 'c'], ["a", "d"])[0]

In [None]:
# Whitespace-separated recipients of the first training_info row with mid 285336.
truth = (
    training_info
    .loc[training_info['mid'] == 285336, 'recipients']
    .iloc[0]
    .split()
)
# One flag per entry of res['285336']: is its second field among those recipients?
[candidate[1] in truth for candidate in res['285336']]

In [None]:
# NOTE(review): rebinds `res` and calls compute_results without the `data`
# table and the a/b/c/k keywords that the scoring cell passes — presumably
# written against an older signature; confirm before re-running.
res = bayes.compute_results(training, training_info)

In [None]:
import pickle as pkl
# Pickle `res`; the context manager closes the handle that the bare open()
# call used to leak.
with open('res.pkl', 'wb') as out_file:
    pkl.dump(res, out_file)

In [None]:
# NOTE(review): called without the `data` table and the a/b/c/k keywords
# that the scoring cell passes — presumably an older signature; confirm.
res2 = bayes.compute_results(test, test_info)

In [None]:
import pickle as pkl
# Pickle `res2`; the context manager closes the handle that the bare open()
# call used to leak.
with open('res2.pkl', 'wb') as out_file:
    pkl.dump(res2, out_file)

In [None]:
# The original expression was unfinished (nothing after `==`, a SyntaxError)
# and called `.receivers()`, which pandas does not provide; the recipients
# are read from the `recipients` column instead.
mid_to_inspect = 285336  # TODO: set to the mail id under inspection
training_info.loc[training_info['mid'] == mid_to_inspect, 'recipients']

In [None]:
# Run compute_results on a one-row frame (row label 0 of `training`).
# NOTE(review): `all_recipients` and `probs` are not defined in any visible
# cell, and the argument list differs from the scoring cell — confirm.
bayes.compute_results(training.loc[[0]], training_info, all_recipients, probs)

In [None]:
# Ratio of the number of rows in training_info to the number in test_info.
len(training_info)/len(test_info)

In [None]:
# Display the rows labelled 1 and 2 of the training frame.
training.loc[[1,2]]

In [None]:
# Number of whitespace-separated tokens in the 'mids' field of row 10.
len(training.loc[10, 'mids'].split())

In [None]:
from heapq import heappop, heappush
# Scratch demo: push two values on a min-heap, drop the smallest, show the rest.
h = []
for value in (2, 3):
    heappush(h, value)
heappop(h)  # removes 2, the smallest element
h

In [None]:
# Print every key of `res` followed by the value stored under it.
for key, value in res.items():
    print(key, value)

In [None]:

# NOTE(review): `all_recipients` is not defined in any visible cell and the
# argument list differs from the scoring cell — confirm before re-running.
res2 = bayes.compute_results(test, test_info, all_recipients, data)
# The context manager closes the handle that the bare open() call used to leak.
with open('res2.pkl', 'wb') as out_file:
    pkl.dump(res2, out_file)

#training_info[training_info['mid'] == ].receivers()

#mail_probable = list(training_info[training_info['mid'] == 158713].body)[0]
#mail_unprobable = list(training_info[training_info['mid'] == 60].body)[0]
#print(bayes.predict('karen.buckley@enron.com', 'jason.wolfe@enron.com', mail_probable, probs))
#print(bayes.predict('karen.buckley@enron.com', 'jason.wolfe@enron.com', mail_unprobable, probs))

In [None]:
# Scratch check: a float zero is falsy, so nothing is printed.
if 0.0000:
    print('a')

In [None]:
# Scratch check: `/` is true division, so this is one third of 2.1 (about 0.7).
1/3 * 2.1

In [None]:
# Scratch check: 2 raised to the power 3.
np.power(2, 3)

In [2]:
from heapq import heappush, heappop

# Scratch demo: a min-heap hands its items back in ascending order.
h = []
for value in (2, 4, 3):
    heappush(h, value)

# Drain the heap, printing the smallest remaining value each time.
while len(h) > 0:
    smallest = heappop(h)
    print(smallest)


2
3
4


In [4]:
# Scratch check: fractional exponent, 3 ** (4/3).
pow(3, 4/3)

4.3267487109222245