In [1]:
from collections import defaultdict
import itertools
from matplotlib import pyplot as plt
import math
import numpy as np
import operator
import pandas as pd
import pickle
import random
import re
import scipy
import string
from tqdm import tqdm_notebook


import src.knntools as knntools
import src.fusiontools as fusiontools
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.tools as tools
import src.recencytools as recency
import src.scoring as scoring
import src.textembeddingtools as texttools
import src.graphwordstools as graphtools
import src.manualprocessingtools as manualtools

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Load dataset

In [40]:
# Input data directory and results directory (relative to the notebook).
path_to_data = 'data/'
path_to_results = 'results/'

# Main training / test tables.
training = pd.read_csv(
    path_to_data + 'training_set.csv',
    sep=',',
    header=0,
)
test = pd.read_csv(
    path_to_data + 'test_set.csv',
    sep=',',
    header=0,
)

# Companion "*_info" tables.
# NOTE(review): parse_dates=True only asks pandas to try parsing the index, and
# no index_col is given here, so it likely has no effect -- confirm whether a
# specific date column was meant to be parsed.
training_info = pd.read_csv(
    path_to_data + 'training_info.csv',
    sep=',',
    parse_dates=True,
    header=0,
)
test_info = pd.read_csv(
    path_to_data + 'test_info.csv',
    sep=',',
    parse_dates=True,
    header=0,
)

In [41]:
# Split the loaded training data into a training part and a validation part.
# train_frac=0.95 is forwarded to scoring.get_train_val (presumably the share of
# data kept for training -- TODO confirm in src/scoring).
(train_info,
 train_email_ids_per_sender,
 val_info,
 val_email_ids_per_sender) = scoring.get_train_val(
    training,
    training_info,
    train_frac=0.95,
)

Processing training !

Processing val !



In [42]:
# Build the body dictionaries for the training and validation splits; they are
# iterated later in the notebook as (mid, body) pairs.
train_body_dict, val_body_dict = (
    preprocess.body_dict_from_panda(split_info)
    for split_info in (train_info, val_info)
)





## Retrieve the list of all known email addresses

This allows us to filter the email addresses found in a message body, optionally keeping only those that were present as recipients in the training set.

In [43]:
# Every recipient returned by preprocess.get_all_recipients_from_df for the
# training split, lower-cased, as a list.
all_recipients_in_train = [
    recipient.lower()
    for recipient in preprocess.get_all_recipients_from_df(train_info)
]




In [44]:
# Report the length of the known-recipient list.
print('{nb_rec} were found in the training set'.format(
    nb_rec=len(all_recipients_in_train),
))

9597 were found in the training set


In [45]:
# NOTE(review): val_body_dict is already assigned from this exact call in an
# earlier cell of this notebook; this looks like a redundant recomputation --
# confirm and consider removing the cell.
val_body_dict = preprocess.body_dict_from_panda(val_info)




## In body email prediction

Finds the email addresses that appear in the body of each message and predicts that those addresses are among its recipients.



In [46]:
# Derive a lookup from the per-sender validation email ids. Judging by the
# helper's name it maps message id -> sender (TODO confirm against
# preprocess.get_mid_sender_dict).
val_mid_sender_dic = preprocess.get_mid_sender_dict(val_email_ids_per_sender)

### Filter emails present in known recipient list 

In [47]:
# Email addresses extracted from the validation bodies, with the known training
# recipients passed as the candidate list.
val_emails_in_content = manualtools.get_filtered_emails_dic(
    val_body_dict,
    val_mid_sender_dic,
    candidate_list=all_recipients_in_train,
)




In [48]:
# Keep, for every validation message id that has an entry in
# val_emails_in_content, that entry as the prediction.
# Membership is tested on the mapping itself rather than on `.keys()` (which is
# a linear list scan on Python 2), and the unused body value is no longer
# unpacked.
val_in_body_predictions = {
    mid: val_emails_in_content[mid]
    for mid in val_body_dict
    if mid in val_emails_in_content
}

In [49]:
# Score the in-body predictions against the validation split with
# scoring.compute_prediction_mad and show the resulting value.
current_score = scoring.compute_prediction_mad(val_in_body_predictions, val_info)
print(current_score)

0.0416666666667


###  Keep all emails in body (no filter) 

In [50]:
# Same in-body prediction as before, but with candidate_list=None, i.e. without
# passing the known-recipient list as a filter.
val_emails_in_content = manualtools.get_filtered_emails_dic(
    val_body_dict, val_mid_sender_dic, candidate_list=None)

# One prediction per validation message id that has an entry in
# val_emails_in_content. Direct membership on the mapping replaces the
# `.keys()` lookup, and the unused body value is no longer unpacked.
val_in_body_predictions = {
    mid: val_emails_in_content[mid]
    for mid in val_body_dict
    if mid in val_emails_in_content
}

current_score = scoring.compute_prediction_mad(val_in_body_predictions, val_info)
print(current_score)


0.0180055401662


We see that filtering out the emails not present in the candidate list significantly improves the results (0.042 versus 0.018), but that the overall performance is still pretty poor.

In [53]:
# Repeat the unfiltered (candidate_list=None) in-body prediction on the
# training split and score it against train_info.
train_mid_sender_dic = preprocess.get_mid_sender_dict(train_email_ids_per_sender)
train_emails_in_content = manualtools.get_filtered_emails_dic(
    train_body_dict, train_mid_sender_dic, candidate_list=None)

# One prediction per training message id that has an entry in
# train_emails_in_content. Direct membership on the mapping replaces the
# `.keys()` lookup, and the unused body value is no longer unpacked.
train_in_body_predictions = {
    mid: train_emails_in_content[mid]
    for mid in train_body_dict
    if mid in train_emails_in_content
}

current_score = scoring.compute_prediction_mad(train_in_body_predictions, train_info)
print(current_score)


0.0747570571871


For the emails in the training set that contain recipients in the body, we obtain slightly better prediction results (0.075, versus 0.018 on the validation set with the same unfiltered extraction).

## Keyword prediction

- Extracts the string after a given keyword

- Finds train emails with matching string sequence

- Adds 1 to recipient score for each recipient of the train email

- Predicts recipient with highest scores

In [55]:
# Keywords whose following text is used for the keyword-based prediction.
keywords = [
    'From:',
    'FW:',
    'Subject',
]

In [61]:
# Length of the string extracted after each keyword; forwarded as
# `extracted_length` (units are whatever manualtools.get_keyword_prediction
# expects).
extracted_subject_length = 20

# Score the keyword-based prediction separately for each keyword.
for keyword in keywords:
    # Pass the configured length instead of a second hard-coded 20, so that
    # changing extracted_subject_length actually changes the experiment.
    keyword_predictions = manualtools.get_keyword_prediction(
        train_body_dict, val_body_dict, train_info, keyword,
        extracted_length=extracted_subject_length)
    current_score = scoring.compute_prediction_mad(keyword_predictions, val_info)
    print('"{key}" score : {score}'.format(key=keyword, score=current_score))


"From:" score : 0.11337176196376941

"FW:" score : 0.015641293013555786

"Subject" score : 0.04579764121663563


As expected, the most informative field is 'From:'.

We notice that, even when it is present in the e-mail, none of the keywords allows us by itself to make competitive predictions.