In [1]:
from collections import defaultdict
import itertools
from matplotlib import pyplot as plt
import math
import numpy as np
import operator
import pandas as pd
import pickle
import random
import re
import scipy
import string
from tqdm import tqdm_notebook


import src.knntools as knntools
import src.fusiontools as fusiontools
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.tools as tools
import src.recencytools as recency
import src.scoring as scoring
import src.textembeddingtools as texttools
import src.graphwordstools as graphtools
import src.manualprocessingtools as manualtools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Locations used by the notebook: CSV inputs are read from `path_to_data`;
# `path_to_results` is defined here for later use.
path_to_data = 'data/'
path_to_results = 'results/'

# Training split: one CSV for the set itself, one for the per-message info.
training = pd.read_csv(
    path_to_data + 'training_set.csv',
    sep=',',
    header=0,
)
# NOTE(review): per the pandas docs, parse_dates=True only attempts to parse
# the index; confirm any date column is converted where it is needed.
training_info = pd.read_csv(
    path_to_data + 'training_info.csv',
    sep=',',
    header=0,
    parse_dates=True,
)

# Test split, loaded with the same options as the training files.
test = pd.read_csv(
    path_to_data + 'test_set.csv',
    sep=',',
    header=0,
)
test_info = pd.read_csv(
    path_to_data + 'test_info.csv',
    sep=',',
    header=0,
    parse_dates=True,
)

In [3]:
# Split the labelled data into a training part and a validation part.
# train_frac=0.95 is forwarded to the helper (presumably the fraction kept
# for training -- confirm in src.scoring).
(
    train_info,
    train_email_ids_per_sender,
    val_info,
    val_email_ids_per_sender,
) = scoring.get_train_val(training, training_info, train_frac=0.95)

Processing training !


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"



Processing val !


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





In [5]:
# Build the body dictionaries for both splits with the same preprocess helper.
train_body_dict, val_body_dict = (
    preprocess.body_dict_from_panda(train_info),
    preprocess.body_dict_from_panda(val_info),
)





In [6]:
# Collect the recipients seen in the training split, lower-cased so that
# later comparisons are case-insensitive.
all_recipients_in_train = [
    address.lower()
    for address in preprocess.get_all_recipients_from_df(train_info)
]




In [7]:
# Rebuild the validation body dictionary from val_info.
# NOTE(review): this repeats the call already made in an earlier cell;
# it looks redundant -- confirm before removing.
val_body_dict = preprocess.body_dict_from_panda(
    val_info
)




## In-body email prediction

Finds the email addresses that appear in the body of each message and predicts that those addresses are among its recipients.

In [9]:
# Lookup built from the per-sender email ids of the validation split
# (presumably message id -> sender; confirm in src.preprocess).
val_mid_sender_dic = preprocess.get_mid_sender_dict(
    val_email_ids_per_sender
)

In [10]:
# Extract the email addresses mentioned in each validation body.
# No candidate list is supplied for the validation split.
val_emails_in_content = manualtools.get_filtered_emails_dic(
    val_body_dict,
    val_mid_sender_dic,
    candidate_list=None,
)




In [20]:
# Give every validation message an entry: the addresses extracted from its
# body when there are any, otherwise a fresh empty prediction list.
# dict.get replaces the `in ....keys()` test plus if/else, and iterating the
# dictionary directly avoids unpacking a body value that was never used.
val_in_body_predictions = {
    mid: val_emails_in_content.get(mid, [])
    for mid in val_body_dict
}

In [24]:
# Score the in-body predictions (one entry per validation message) against
# val_info and show the result.
current_score = scoring.compute_prediction_mad(
    val_in_body_predictions,
    val_info,
)
print(current_score)

0.00259819654593


In [11]:
# Same extraction for the training split, this time passing the lower-cased
# training recipients as the candidate list.
train_mid_sender_dic = preprocess.get_mid_sender_dict(
    train_email_ids_per_sender
)
train_emails_in_content = manualtools.get_filtered_emails_dic(
    train_body_dict,
    train_mid_sender_dic,
    candidate_list=all_recipients_in_train,
)




In [12]:
# Score the raw extraction result directly, i.e. only the messages present
# in val_emails_in_content, against val_info.
current_score = scoring.compute_prediction_mad(
    val_emails_in_content,
    val_info,
)
print(current_score)

0.0156971375808


In [106]:
# Persist the addresses extracted from the validation bodies.
# The dumped object is val_emails_in_content: the bare name
# `emails_in_content` is not defined by any cell of this notebook (it would
# raise NameError on a fresh kernel), and the target file name
# ('emails_in_val') refers to the validation split.
with open('variables/emails_in_val', 'wb') as outfile:
    pickle.dump(val_emails_in_content, outfile)

## Keyword prediction

- Extracts the string after a given keyword

- Finds train emails with matching string sequence

- Adds 1 to recipient score for each recipient of the train email

- Predicts the recipients with the highest scores

In [13]:
# Keyword to look for in the bodies and the length passed to the extraction
# helper as extracted_length.
keyword = 'From:'
extracted_subject_length = 10

# One keyword dictionary per split, built with identical settings.
# NOTE(review): no later visible cell reads these two dictionaries.
val_subject_dic = manualtools.get_keyword_dic(
    val_body_dict,
    keyword=keyword,
    extracted_length=extracted_subject_length,
)
train_subject_dic = manualtools.get_keyword_dic(
    train_body_dict,
    keyword=keyword,
    extracted_length=extracted_subject_length,
)

In [14]:
# Predict validation recipients from training messages using the keyword.
# NOTE(review): extracted_length is the literal 20 here, whereas
# extracted_subject_length is set to 10 -- confirm which one is intended.
keyword_predictions = manualtools.get_keyword_prediction(
    train_body_dict,
    val_body_dict,
    train_info,
    keyword,
    extracted_length=20,
)




In [15]:
# Score the keyword-based predictions against val_info.
current_score = scoring.compute_prediction_mad(
    keyword_predictions,
    val_info,
)
print(current_score)

0.026816672489


In [21]:
# Combine the two prediction sources with reciprocal re-ranking.
# The second argument (10) is passed positionally; presumably the number of
# recipients kept per message -- confirm in src.fusiontools.
models = [
    keyword_predictions,
    val_in_body_predictions,
]
fusion_predictions = fusiontools.reciprocal_rerank(
    models,
    10,
)

In [22]:
# Score the fused predictions against val_info.
fusion_score = scoring.compute_prediction_mad(
    fusion_predictions,
    val_info,
)
print(fusion_score)

0.0260003097134
