# K-nearest neighbor model with twidf

Here the k-NN search is not restricted by sender: nearest neighbours are queried across the entire training dataset.

In [26]:
from collections import defaultdict
import itertools
import math
import operator
import os
import pickle
import random
import re
import string

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import scipy
from tqdm import tqdm_notebook

import src.knntools as knntools
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.tools as tools
import src.recencytools as recency
import src.scoring as scoring
import src.textembeddingtools as texttools
import src.graphwordstools as graphtools

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# Relative locations of the raw CSV inputs and of the generated outputs.
path_to_data = 'data/'
path_to_results = 'results/'

# Parsing options shared by all four reads: comma-separated, header on row 0.
csv_options = dict(sep=',', header=0)

# Training split: `training_set.csv` and its companion `training_info.csv`.
# NOTE(review): with no `index_col`, `parse_dates=True` only asks pandas to try
# parsing the index as dates, so no data column is converted here -- confirm
# that this is what downstream code expects.
training = pd.read_csv(path_to_data + 'training_set.csv', **csv_options)
training_info = pd.read_csv(path_to_data + 'training_info.csv',
                            parse_dates=True, **csv_options)

# Test split, read with the same options as the training files.
test = pd.read_csv(path_to_data + 'test_set.csv', **csv_options)
test_info = pd.read_csv(path_to_data + 'test_info.csv',
                        parse_dates=True, **csv_options)

In [28]:
# Build the train / validation variables used by every later cell from the
# labelled training files.
# NOTE(review): train_frac=0.95 presumably keeps 95% of the data for training
# and the rest for validation -- confirm in scoring.get_train_val. If the split
# is random, no seed is set in this notebook.
(train_info, train_email_ids_per_sender,
 val_info, val_email_ids_per_sender) = scoring.get_train_val(
    training,
    training_info,
    train_frac=0.95,
)

Processing training !

Processing val !



In [29]:
# Run the same body-extraction helper on the training frame, then on the
# validation frame (evaluated in that order).
train_body_dict, val_body_dict = (
    preprocess.body_dict_from_panda(split_info)
    for split_info in (train_info, val_info)
)





In [30]:
# Turn each body dictionary into a token dictionary: training first, then
# validation.
train_token_dict, val_token_dict = map(
    texttools.get_token_dict,
    (train_body_dict, val_body_dict),
)





## Compute the average document length in tokens

In [31]:
# The helper returns a pair, unpacked here into the length dictionary and the
# average length for the training tokens.
train_length_info = texttools.get_doc_length_info(train_token_dict)
train_doc_lengths_dic, train_average_doc_len = train_length_info

In [32]:
# IDF statistics are derived from the training tokens only; the same two
# objects are reused later to vectorise the validation emails.
train_idf_info = tfidftools.get_idf_dic(train_token_dict)
train_idf_dic, train_idf_words = train_idf_info




In [33]:
# Sanity check: number of entries in the training IDF dictionary.
idf_entry_count = len(train_idf_dic)
print(idf_entry_count)

7161


## Compute training twidf vectors and other needed variables

In [41]:
%%time
twidf_matrix, twidf_mids = graphtools.get_twidf_matrix(train_token_dict, train_doc_lengths_dic,
                                                       train_average_doc_len, train_idf_dic,
                                                       train_idf_words, 0.003)



CPU times: user 13min 52s, sys: 27.8 s, total: 14min 20s
Wall time: 15min 4s


In [42]:
# Length statistics for the validation tokens.
# NOTE(review): despite its name, `average_test_length` is computed on the
# validation set. Neither it nor `val_doc_lengths_dic` is used by the cells
# visible below (the validation vectors are built with
# `train_average_doc_len`) -- confirm that this is intended.
val_length_info = texttools.get_doc_length_info(val_token_dict)
val_doc_lengths_dic, average_test_length = val_length_info

## Compute validation twidf vectors

In [43]:
# Vectorise the validation emails using the *training* IDF statistics and the
# *training* average document length.
val_vectors = graphtools.get_twidf_vectors_from_tokens(
    train_idf_dic,
    train_idf_words,
    val_token_dict,
    train_average_doc_len,
)




# Compute recipient scores for each mid

The score of a candidate recipient is computed by summing the cosine distances between the email and those of its nearest training documents in which the candidate is indeed a recipient.

In [44]:
# Score candidate recipients for every validation email from the training
# documents, querying the whole training matrix with nb_similars=100.
mid_recipient_scores = knntools.compute_twidf_similarity_scores(
    twidf_matrix,
    twidf_mids,
    val_vectors,
    train_info,
    nb_similars=100,
)




In [45]:
# Convert the per-email recipient scores into the prediction format expected
# by the scoring helper (nb_recipients=50), then score them against the
# validation information.
twidf_predictions = knntools.similar_dic_to_standard(
    mid_recipient_scores,
    nb_recipients=50,
)
current_score = scoring.compute_prediction_mad(
    twidf_predictions,
    val_info,
)

In [46]:
# Show the validation score returned by scoring.compute_prediction_mad.
print(current_score)

0.208050623216


In [53]:
# Persist the validation predictions with pickle so they can be reloaded
# without recomputing them.
# NOTE(review): the file name advertises k=50, but the similarity query in this
# notebook uses nb_similars=100 (50 is the nb_recipients cut-off). The name is
# left unchanged so existing consumers of this file keep working -- confirm
# which value it is meant to record.
path_to_ranks = 'ranks_val/twidf-knn-k-{nb_neighbors}-rec'.format(nb_neighbors=50)

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs(os.path.dirname(path_to_ranks), exist_ok=True)

# The handle is opened for writing, hence `outfile`.
with open(path_to_ranks, 'wb') as outfile:
    pickle.dump(twidf_predictions, outfile)