In [1]:
import os
import sys
import time
import random
import operator
import pandas as pd
from collections import Counter
import numpy as np

from src.graph import *
from src.recommendation import *
from src.tools import *

from sklearn.metrics.pairwise import cosine_similarity

- Training set: 43,613 messages -> mids 404873
- Test set: 2,362 messages

In [2]:
path_to_data = 'data/'

# ----------------------------------------------------------------------
# Read the four CSV inputs (comma separated, first line is the header).
# ----------------------------------------------------------------------

# Sender-level tables: later cells read column 0 as the sender and column 1
# as a space-separated list of message ids.
training = pd.read_csv(path_to_data + 'training_set.csv', sep=',', header=0)
test = pd.read_csv(path_to_data + 'test_set.csv', sep=',', header=0)

# Message-level tables: the training one is queried through its 'mid' and
# 'recipients' columns further down.
training_info = pd.read_csv(path_to_data + 'training_info.csv', sep=',', header=0)
test_info = pd.read_csv(path_to_data + 'test_info.csv', sep=',', header=0)

In [3]:
################################
# create some handy structures #
################################

# Map each sender to the list of message ids (kept as strings) they sent.
# Column 0 of the training set is the sender, column 1 the space-separated
# message ids.
emails_ids_per_sender = {
    sender: mids.split(' ')
    for sender, mids in zip(training.iloc[:, 0], training.iloc[:, 1])
}

# save all unique sender names
all_senders = emails_ids_per_sender.keys()

# Index the recipients string by message id once, so each lookup in the loop
# below is a dict access instead of a boolean-mask scan over all of
# training_info (which made the loop quadratic in the number of messages).
# drop_duplicates keeps the first row per mid, i.e. the same row the previous
# "take the first match" lookup selected.
recipients_per_mid = (
    training_info.drop_duplicates(subset='mid')
    .set_index('mid')['recipients']
    .to_dict()
)

# address_books:    sender -> [(recipient, count), ...] sorted by count, descending
# neighbors:        sender -> flat list of recipients, one entry per
#                   (message, recipient) pair, so repeats are expected
# conversation_ids: (sender, recipient) -> ids of the messages sent by
#                   sender to recipient
address_books = {}
neighbors = {}
conversation_ids = {}

for i, (sender, ids) in enumerate(emails_ids_per_sender.items()):
    recs_temp = []
    for my_id in ids:
        # keep only legitimate email addresses
        recipients = [
            rec for rec in recipients_per_mid[int(my_id)].split(' ')
            if '@' in rec
        ]
        recs_temp.extend(recipients)
        for rec in recipients:
            conversation_ids.setdefault((sender, rec), []).append(my_id)

    # most_common() sorts by count (descending) with a stable sort, exactly
    # like sorted(..., key=itemgetter(1), reverse=True) on the counter items.
    address_books[sender] = Counter(recs_temp).most_common()
    neighbors[sender] = recs_temp

    # lightweight progress indicator: one line every 10 senders
    if i % 10 == 0:
        print(i)

# save all unique recipient names
all_recs = list({rec for book in address_books.values() for rec, _ in book})

# save all unique user names (senders and recipients together)
all_users = list(set(all_senders).union(all_recs))

0
10
20
30
40
50
60
70
80
90
100
110
120


In [4]:
# Per-message lookups built from the two info tables. Later cells index them
# by integer message id and pass the stored values to the vectorizer as
# documents (presumably the message bodies - confirm in src).
train_dict = body_dict_from_panda(training_info)
test_dict = body_dict_from_panda(test_info)
# Object built from the training messages; later cells only rely on its
# .transform() method (presumably a fitted TF-IDF vectorizer - confirm in
# src). This call took about 80 s in the recorded run.
tfidf = get_tfidf(train_dict)

Constructing dictionnary from dataframe...
0 / 43613
10000 / 43613
20000 / 43613
30000 / 43613
40000 / 43613
done !
Constructing dictionnary from dataframe...
0 / 2362
done !
func:get_tfidf took: 79.6069 sec


In [5]:
# Cache of centroids keyed by (sender, recipient); get_recommandation fills it
# lazily. Re-running this cell empties the cache.
centroid_dict = {}

In [6]:
def get_recommandation(mail_body, mail_date, sender, k=10):
    """Rank a sender's known recipients by similarity to a new message.

    Every recipient the sender wrote to in the training data is scored with
    the cosine similarity between the vectorized ``mail_body`` and the
    centroid of the training messages the sender sent to that recipient.
    Centroids are computed on first use and memoized in the module-level
    ``centroid_dict``.

    Args:
        mail_body: text of the message to predict recipients for.
        mail_date: unused; kept so existing callers keep working.
        sender: sender address, used as key into ``neighbors``.
        k: maximum number of recipients to return.

    Returns:
        Up to ``k`` recipient addresses, most similar first. An empty list
        is returned when the sender has no known recipients (including a
        sender absent from ``neighbors``).
    """
    # neighbors[sender] holds one entry per (message, recipient) pair, so the
    # same address shows up many times. dict.fromkeys drops the repeats while
    # keeping first-seen order, so ties in the final sort break as before.
    candidates = list(dict.fromkeys(neighbors.get(sender, ())))
    if not candidates:
        return []

    # The query vector does not depend on the candidate: vectorize it once.
    mail_vec = vectorizer.transform([mail_body])

    sims = {}
    for rec in candidates:
        key = (sender, rec)
        if key not in centroid_dict:
            centroid_dict[key] = centroid(conversation_ids[key])
        # cosine_similarity returns a 1x1 array here; index the scalar
        # explicitly instead of relying on float() of a 2-D array.
        sims[rec] = float(cosine_similarity(mail_vec, centroid_dict[key])[0, 0])
    return sorted(sims, key=sims.get, reverse=True)[:k]


def centroid(mids):
    """Return the mean vector of the given training messages.

    Args:
        mids: iterable of training message ids (ints or numeric strings);
            each one is looked up in the module-level ``train_dict``.

    Returns:
        The column-wise mean of the vectorized messages as a plain
        ``numpy.ndarray`` (one row when ``vectorizer.transform`` yields a
        2-D sparse matrix, as is assumed here - confirm in src).
    """
    doc_vectors = vectorizer.transform([train_dict[int(mid)] for mid in mids])
    # Averaging a scipy sparse matrix yields a numpy.matrix, which recent
    # scikit-learn releases refuse as input; hand back a regular ndarray.
    return np.asarray(doc_vectors.mean(axis=0))


In [7]:
##############################
# Centroids tfidf
##############################
# get_recommandation and centroid read the vectorizer through this global.
vectorizer = tfidf

# NOTE(review): adj_mat is not used by any cell shown here - kept in case a
# later cell relies on it.
adj_mat = build_graph(address_books)

# Column 0 of the test set is the sender, column 1 the space-separated ids of
# the emails for which recipient prediction is needed.
ids_per_test_sender = [
    (sender, [int(my_id) for my_id in mids.split(' ')])
    for sender, mids in zip(test.iloc[:, 0], test.iloc[:, 1])
]
# Total used by the progress line, derived from the data instead of being
# hard-coded.
n_to_predict = sum(len(ids_predict) for _, ids_predict in ids_per_test_sender)

start = time.time()
i = 1

# sender -> [list of message ids, list of predicted-recipient lists],
# the two inner lists being aligned index by index.
predictions_per_sender = {}
for sender, ids_predict in ids_per_test_sender:
    centroid_preds = []

    for id_predict in ids_predict:
        # the date argument is ignored by get_recommandation, hence the 0
        recom = get_recommandation(test_dict[id_predict], 0, sender)
        centroid_preds.append(recom)
        print("%s/%s Time %s" % (i, n_to_predict, time.time() - start))
        i += 1
    predictions_per_sender[sender] = [ids_predict, centroid_preds]
    

['c..aucoin@enron.com', 'don.black@enron.com', 'rogers.herndon@enron.com', 'ed.mcmichael@enron.com', 'lloyd.will@enron.com', 'robert.superty@enron.com', 'louise.kitchen@enron.com', 'c..gossett@enron.com', 'chris.gaskill@enron.com', 'corry.bentley@enron.com']
1/2362 Time 6.00635027885437
['darren.espey@enron.com', 'biliana.pehlivanova@enron.com', 'seung-taek.oh@enron.com', 'david.baumbach@enron.com', 'amanda.colpean@enron.com', 'don.baughman@enron.com', 'paul.schiavone@enron.com', 'chuck.ames@enron.com', 'elizabeth.shim@enron.com', 'russell.ballato@enron.com']
2/2362 Time 9.97250485420227
['jean.mrha@enron.com', 'chris.gaskill@enron.com', 'doug.gilbert-smith@enron.com', 'dana.davis@enron.com', 'lloyd.will@enron.com', 'thomas.a.martin@enron.com', 'scott.neal@enron.com', 'hunter.s.shively@enron.com', 'corry.bentley@enron.com', 'fletcher.j.sturm@enron.com']
3/2362 Time 13.210574388504028
['darron.c.giron@enron.com', 'jeff.king@enron.com', 'adrianne.engler@enron.com', 'matt.smith@enron.com'

In [8]:
#################################################
# write predictions in proper format for Kaggle #
#################################################

path_to_results = 'results/'

# Make sure the output directory exists so a fresh checkout does not fail
# with FileNotFoundError after the long prediction run.
os.makedirs(path_to_results, exist_ok=True)

# Text mode with an explicit encoding; newline='\n' turns off platform
# newline translation, so the bytes on disk are the same as with the former
# binary-mode writes.
with open(path_to_results + 'predictions_centroid_tfidf.txt', 'w',
          encoding='utf-8', newline='\n') as my_file:
    my_file.write('mid,recipients\n')
    # each value is [message ids, predicted-recipient lists], aligned by index
    for ids, preds_per_mail in predictions_per_sender.values():
        for mid, my_preds in zip(ids, preds_per_mail):
            # one line per message: "<mid>,<rec1> <rec2> ..."
            my_file.write(str(mid) + ',' + ' '.join(my_preds) + '\n')