In [1]:
from matplotlib import pyplot as plt
import numpy as np
import operator
import pandas as pd
import pickle
import random
import scipy
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors

import src.tools as tools
import src.recencytools as recency
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.knntools as knntools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Input and output directories, relative to the notebook's working directory.
path_to_data = 'data/'
path_to_results = 'results/'

# Training / test set tables.
training = pd.read_csv(path_to_data + 'training_set.csv', header=0, sep=',')
test = pd.read_csv(path_to_data + 'test_set.csv', header=0, sep=',')

# Per-message tables: columns mid, date, body (plus recipients for training),
# as shown in the previews below.
# NOTE(review): parse_dates=True asks pandas to parse the *index* as dates,
# not the 'date' column -- confirm whether the column was meant to be parsed.
training_info = pd.read_csv(path_to_data + 'training_info.csv',
                            header=0, sep=',', parse_dates=True)
test_info = pd.read_csv(path_to_data + 'test_info.csv',
                        header=0, sep=',', parse_dates=True)

In [3]:
# Preview only the first rows: the frame holds tens of thousands of messages,
# so displaying it whole bloats the notebook without adding information.
training_info.head(10)

Unnamed: 0,mid,date,body,recipients
0,60,2000-07-25 08:14:00,Legal has been assessing the risks of doing bl...,robert.badeer@enron.com murray.o neil@enron.co...
1,66,2000-08-03 02:56:00,Attached is a spreadsheet to estimate export f...,kim.ward@enron.com robert.badeer@enron.com mur...
2,74,2000-08-15 05:37:00,Kevin/Bob: Here is a quick rundown on the cons...,robert.badeer@enron.com john.massey@enron.com ...
3,80,2000-08-20 14:12:00,check this out and let everyone know what s up...,robert.badeer@enron.com jeff.richter@enron.com
4,83,2000-08-22 08:17:00,Further to your letter to us (addressed to Mr....,pgillman@schiffhardin.com kamarlantes@calpx.co...
5,105,2000-08-24 11:42:00,The new version of the EnronOnline website is ...,dave.samuels@enron.com andy.zipper@enron.com l...
6,132,2000-08-27 23:11:00,Check this out. I think that we need to be si...,robert.badeer@enron.com greg.wolfe@enron.com l...
7,136,2000-08-28 02:50:00,We have had some confusion recently with respe...,robert.badeer@enron.com m..forney@enron.com ti...
8,144,2000-08-28 08:15:00,I will be traveling to Calgary on Tuesday and ...,robert.badeer@enron.com murray.o neil@enron.co...
9,167,2000-08-29 11:18:00,Please note that the EnronOnline Phase 2 train...,erik.simpson@enron.com paul.racicot@enron.com ...


In [4]:
# Preview only the first rows instead of dumping the whole test frame.
test_info.head(10)

Unnamed: 0,mid,date,body
0,1577,2001-11-19 06:59:51,Note: Stocks of heating oil are very high for...
1,1750,2002-03-05 08:46:57,"Kevin Hyatt and I are going for ""sghetti"" at S..."
2,1916,2002-02-13 14:17:39,This was forwarded to me and it is funny. - Wi...
3,2094,2002-01-22 11:33:56,I will be in to and happy to assist too. I ma...
4,2205,2002-01-11 07:12:19,Thanks. I needed a morning chuckle.
5,2297,2002-01-11 14:37:19,Note: Westpath Expansion plans filed at NEBTr...
6,5300,2001-11-26 14:13:01,Here are Peggy s slides. -----Original Message...
7,5333,2001-11-19 13:44:18,Here s the information. -----Original Message-...
8,6583,2002-01-18 05:00:48,I would like to know where and how this is goi...
9,7460,2001-11-12 16:43:31,"Richard: Per Elliot s e-mail below, do you hav..."


In [3]:
# Build token_dict from the training_info frame. The helper prints its own
# progress (see the output below).
# NOTE(review): presumably maps each mid to its tokenised body -- confirm in
# src/preprocess.py.
token_dict = preprocess.body_dict_from_panda(training_info)

Constructing dictionnary from dataframe...
0 / 43613
10000 / 43613
20000 / 43613
30000 / 43613
40000 / 43613
done !


In [4]:
# Locations of the pickled TF-IDF artefacts; the cells further down write
# these files after fitting and read them back when reloading.
path_to_tfidf_matrix = 'results/tfidf_matrix'
path_to_tfidf_model = 'results/tfidf_model'

In [None]:
# NOTE(review): scratch cell that was never executed. It only creates an
# enumerate iterator over token_dict and displays its repr; nothing is
# assigned, so later cells do not depend on it.
enumerate(token_dict)

If no saved model is available at `path_to_tfidf_model`,
uncomment the next cell, run it, and then comment it out again.

In [5]:
tfidf_model, tfidf_matrix, tfidf_mids = tfidftools.get_tfidf(token_dict, 0.001, 0.10)

# Save to files
with open(path_to_tfidf_model, 'wb') as infile:
    pickle.dump(tfidf_model, infile)
with open(path_to_tfidf_matrix, 'wb') as infile:
    pickle.dump(tfidf_matrix, infile)

In [14]:
with open(path_to_tfidf_model, 'rb') as infile:
    tfidf_model = pickle.load(infile)
    
with open(path_to_tfidf_matrix, 'rb') as infile:
    tfidf_matrix = pickle.load(infile)

In [6]:
# Neighbourhood size; forwarded to the helper as nb_similars and reused in
# the output file names below.
nb_neighbors = 50

# Score candidate recipients for every test message from the TF-IDF
# artefacts and the training/test message tables.
mid_recipient_scores = knntools.compute_similarity_scores(
    tfidf_model,
    tfidf_matrix,
    tfidf_mids,
    training_info,
    test_info,
    nb_similars=nb_neighbors,
)




In [37]:
# Inspect the raw scores: a dict keyed by mid whose values map a recipient
# address to a score (see the output below).
# NOTE(review): this displays the entire structure; consider previewing a
# single entry instead to keep the notebook output small.
mid_recipient_scores

{40962: defaultdict(<function src.knntools.compute_similarity_scores.<locals>.<lambda>>,
             {'alan.comnes@enron.com': 0.31383164376770456,
              'aleck.dadson@enron.com': 0.31383164376770456,
              'allison.navin@enron.com': 0.31383164376770456,
              'amr.ibrahim@enron.com': 0.31383164376770456,
              'barbara.hueter@enron.com': 0.31383164376770456,
              'bernadette.hawkins@enron.com': 0.31383164376770456,
              'bevin.hunter@enron.com': 0.31383164376770456,
              'bill.moore@enron.com': 0.31383164376770456,
              'carin.nersesian@enron.com': 0.31383164376770456,
              'carmen.perez@enron.com': 0.31383164376770456,
              'carolyn.cooney@enron.com': 0.31383164376770456,
              'charles.yeung@enron.com': 0.31383164376770456,
              'chauncey.hood@enron.com': 0.31383164376770456,
              'chris.long@enron.com': 0.31383164376770456,
              'christi.nicolay@enron.com': 0.31

In [7]:
# NOTE(review): nb_recipients is not referenced by any other visible cell;
# the call below passes keep_all=True rather than this value.
nb_recipients = 10
# Convert the per-mid recipient scores into knn_dic, the structure that is
# pickled and exported to CSV in the following cells.
knn_dic = knntools.similar_dic_to_standard(mid_recipient_scores, keep_all=True)

In [8]:
# Pickle knn_dic under a file name that embeds the neighbourhood size.
path_to_ranks = 'ranks_test/knn-k-{nb_neighbors}-rec-all'.format(
    nb_neighbors=nb_neighbors)
with open(path_to_ranks, 'wb') as ranks_file:
    pickle.dump(knn_dic, ranks_file)

In [14]:
# Export knn_dic through postprocess.write_results_ranked, passing the
# results directory and a CSV file name that embeds nb_neighbors.
postprocess.write_results_ranked(
    knn_dic,
    path_to_results,
    'knn-k-{nb_neighbors}.csv'.format(nb_neighbors=nb_neighbors),
)