In [1]:
from collections import defaultdict
import itertools
from matplotlib import pyplot as plt
import math
import numpy as np
import operator
import pandas as pd
import pickle
import random
import re
import scipy
import string
from tqdm import tqdm_notebook


import src.knntools as knntools
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.tools as tools
import src.recencytools as recency
import src.scoring as scoring
import src.textembeddingtools as texttools
import src.graphwordstools as graphtools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Folder holding the input CSV files, and folder receiving the submissions.
path_to_data = 'data/'
path_to_results = 'results/'

# Training split: the sender table and the per-mail information table.
# NOTE(review): no index_col is given, so parse_dates=True most likely parses
# nothing here; name the date column explicitly if parsed dates are needed.
training = pd.read_csv(
    path_to_data + 'training_set.csv',
    sep=',',
    header=0,
)
training_info = pd.read_csv(
    path_to_data + 'training_info.csv',
    sep=',',
    parse_dates=True,
    header=0,
)

# Test split, loaded with the same options as the training split.
test = pd.read_csv(
    path_to_data + 'test_set.csv',
    sep=',',
    header=0,
)
test_info = pd.read_csv(
    path_to_data + 'test_info.csv',
    sep=',',
    parse_dates=True,
    header=0,
)

In [6]:
# Build the body dictionary of each split from its info table
# (presumably mail id -> body text; see preprocess.body_dict_from_panda).
training_body_dict, test_body_dict = (
    preprocess.body_dict_from_panda(info_frame)
    for info_frame in (training_info, test_info)
)





In [3]:
# Group the mail ids of each split by sender; the results are iterated below
# as (sender, mail ids) pairs.
training_email_ids_per_sender, test_email_ids_per_sender = (
    preprocess.get_email_ids_per_sender(sender_frame)
    for sender_frame in (training, test)
)





In [7]:
# Tokenise the mail bodies of each split; the resulting dictionaries are
# looked up by mail id in the cells below.
training_token_dict, test_token_dict = (
    texttools.get_token_dict(body_dict)
    for body_dict in (training_body_dict, test_body_dict)
)





# Prepare models

In [10]:
# Build one TW-IDF model per sender from that sender's training mails.
# Every dictionary below is keyed by sender and read back when predicting.
senders_mean_lengths = {}
senders_idf_dics = {}
senders_idf_words = {}
senders_twidf_matrixs = {}
senders_twidf_mids = {}

# The IDF statistics are computed on the whole training corpus, so they are
# the same for every sender: compute them once instead of once per sender.
# NOTE(review): assumes tfidftools.get_idf_dic is deterministic and that its
# results are never mutated downstream, since every sender now shares the
# same objects -- confirm.
# NOTE(review): the per-sender names suggest a per-sender IDF (built from
# sender_token_dict) may have been intended; the corpus-wide behaviour of the
# original code is kept here -- confirm which one is wanted.
sender_idf_dic, sender_idf_words = tfidftools.get_idf_dic(training_token_dict)

pbar_senders = tqdm_notebook(training_email_ids_per_sender.items())
for sender, sender_mids in pbar_senders:
    # Membership is tested once per training mail, so use a set rather than
    # scanning the sender's id collection on every test.
    sender_mid_set = set(sender_mids)
    sender_token_dict = {
        mid: body
        for mid, body in training_token_dict.items()
        if int(mid) in sender_mid_set
    }

    # Document-length information restricted to this sender's mails
    sender_doc_lengths_dic, sender_average_doc_len = texttools.get_doc_length_info(sender_token_dict)
    senders_mean_lengths[sender] = sender_average_doc_len

    # Corpus-wide IDF information, stored per sender so the prediction cell
    # can keep looking it up by sender
    senders_idf_dics[sender] = sender_idf_dic
    senders_idf_words[sender] = sender_idf_words

    # TW-IDF matrix of this sender's mails and the mail ids returned with it.
    # The meaning of the trailing 0.003 is defined by
    # graphtools.get_twidf_matrix (not visible from this notebook).
    sender_twidf_matrix, sender_twidf_mids = graphtools.get_twidf_matrix(
        sender_token_dict, sender_doc_lengths_dic,
        sender_average_doc_len, sender_idf_dic,
        sender_idf_words, 0.003)
    senders_twidf_mids[sender] = sender_twidf_mids
    senders_twidf_matrixs[sender] = sender_twidf_matrix
    





## Make predictions

In [11]:
# Score candidate recipients for every test mail: fetch the closest training
# mails of the same sender (nb_close is passed as nb_similars) and add each
# neighbour's similarity to the score of every recipient of that neighbour.
nb_close = 50
mid_recipient_scores = {}

pbar_senders = tqdm_notebook(test_email_ids_per_sender.items())
for sender, sender_test_mids in pbar_senders:
    # Model pieces stored for this sender by the model-preparation cell
    sender_average_doc_len = senders_mean_lengths[sender]
    sender_idf_dic = senders_idf_dics[sender]
    sender_idf_words = senders_idf_words[sender]
    sender_twidf_mids = senders_twidf_mids[sender]
    sender_twidf_matrix = senders_twidf_matrixs[sender]

    for mid in sender_test_mids:
        email_tokens = test_token_dict[mid]
        query_vector = graphtools.get_twidf_vector_from_tokens(
            sender_idf_dic, sender_idf_words,
            email_tokens, sender_average_doc_len)
        similars = knntools.find_similar(
            query_vector, sender_twidf_matrix, nb_similars=nb_close)

        # Position 0 of each match indexes sender_twidf_mids, giving the
        # training mail id; position 1 is the similarity score.
        neighbour_mids = [sender_twidf_mids[match[0]] for match in similars]
        neighbour_scores = [match[1] for match in similars]

        # Accumulate, per recipient, the similarity of every neighbouring
        # training mail that was sent to that recipient.
        recipient_scores = defaultdict(lambda: 0)
        for train_mid, train_score in zip(neighbour_mids, neighbour_scores):
            for recipient in preprocess.get_recipients(training_info, train_mid):
                recipient_scores[recipient] += train_score
        mid_recipient_scores[mid] = recipient_scores





In [12]:
# Convert the per-mail recipient scores into the structure that is pickled
# and written out below, asking the helper for nb_recipients=100.
twidf_predictions = knntools.similar_dic_to_standard(
    mid_recipient_scores,
    nb_recipients=100,
)

In [13]:
# Pickle the predictions; the file name records the number of neighbours used.
path_to_ranks = 'ranks_test/twidf-knn-k-senders-{nb_neighbors}-rec'.format(
    nb_neighbors=nb_close,
)
with open(path_to_ranks, 'wb') as ranks_file:
    pickle.dump(twidf_predictions, ranks_file)

In [15]:
# Write the ranked predictions as a CSV file under path_to_results.
# NOTE(review): the file name hard-codes "50" and "0_003", which duplicate
# nb_close and the 0.003 passed to graphtools.get_twidf_matrix; keep them in
# sync by hand when either value changes.
postprocess.write_results_ranked(
    twidf_predictions,
    path_to_results,
    'twidf_sender_50_nn_tw_0_003.csv',
)