In [1]:
from matplotlib import pyplot as plt
import numpy as np
import operator
import pandas as pd
import pickle
import random
import scipy
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors

import src.tools as tools
import src.recencytools as recency
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.knntools as knntools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Root folders: CSV inputs are read from path_to_data, outputs go to path_to_results.
path_to_data = 'data/'
path_to_results = 'results/'

# Training split: the set file and its companion info file.
training = pd.read_csv(
    path_to_data + 'training_set.csv',
    sep=',',
    header=0,
)
# NOTE(review): parse_dates=True only asks pandas to parse the index, and no
# index_col is given here, so any date column presumably stays as plain
# strings -- confirm whether an explicit column list was intended.
training_info = pd.read_csv(
    path_to_data + 'training_info.csv',
    sep=',',
    parse_dates=True,
    header=0,
)

# Test split, read with the same options as the training files.
test = pd.read_csv(
    path_to_data + 'test_set.csv',
    sep=',',
    header=0,
)
test_info = pd.read_csv(
    path_to_data + 'test_info.csv',
    sep=',',
    parse_dates=True,
    header=0,
)

In [3]:
# Build token_dict from the training_info dataframe; it is the input of the
# TF-IDF fitting step further down.
# NOTE(review): presumably maps each message id to its body text -- confirm
# against src.preprocess.body_dict_from_panda.
token_dict = preprocess.body_dict_from_panda(training_info)

Constructing dictionnary from dataframe...
0 / 43613
10000 / 43613
20000 / 43613
30000 / 43613
40000 / 43613
done !


In [4]:
# Pickle locations used to save and reload the TF-IDF model and the TF-IDF matrix.
path_to_tfidf_model, path_to_tfidf_matrix = (
    'results/tfidf_model',
    'results/tfidf_matrix',
)

## Create tfidf representation

If no saved model is available at `path_to_tfidf_model`, run the next cell once,
then comment it out.

In [5]:
# Fit the TF-IDF representation from token_dict. get_tfidf returns three
# objects: the fitted model, the TF-IDF matrix and tfidf_mids, all of which
# are passed to knntools.compute_similarity_scores in the K-NN step.
# NOTE(review): 0.001 and 0.10 are presumably lower/upper document-frequency
# cut-offs -- confirm against src.tfidftools.get_tfidf.
tfidf_model, tfidf_matrix, tfidf_mids = tfidftools.get_tfidf(token_dict, 0.001, 0.10)

# Save to files
with open(path_to_tfidf_model, 'wb') as outfile:
    pickle.dump(tfidf_model, outfile)
with open(path_to_tfidf_matrix, 'wb') as outfile:
    pickle.dump(tfidf_matrix, outfile)

# tfidf_mids is persisted as well: the K-NN step needs it together with the
# model and the matrix, and without a saved copy it only exists in kernel
# memory, so it cannot be restored after a restart without recomputing.
path_to_tfidf_mids = path_to_tfidf_matrix + '_mids'
with open(path_to_tfidf_mids, 'wb') as outfile:
    pickle.dump(tfidf_mids, outfile)

Otherwise, run the cell below directly to load the saved TF-IDF model and matrix.

In [14]:
# Reload the pickled TF-IDF model and matrix instead of recomputing them.
# pickle.load can execute arbitrary code embedded in the file, so only load
# files that were produced by this notebook.
with open(path_to_tfidf_model, 'rb') as infile:
    tfidf_model = pickle.load(infile)

with open(path_to_tfidf_matrix, 'rb') as infile:
    tfidf_matrix = pickle.load(infile)

# The K-NN step also needs tfidf_mids. Restore it from its own pickle when
# that file exists; otherwise keep whatever is already in the kernel and say
# so explicitly, rather than letting the K-NN cell fail later with a bare
# NameError.
path_to_tfidf_mids = path_to_tfidf_matrix + '_mids'
try:
    with open(path_to_tfidf_mids, 'rb') as infile:
        tfidf_mids = pickle.load(infile)
except FileNotFoundError:
    if 'tfidf_mids' not in globals():
        print('tfidf_mids was not found at ' + path_to_tfidf_mids +
              ' and is not defined in this kernel: run the cell that creates'
              ' the tfidf representation before the K-NN step.')

## K-NN

Compute recipients by finding k-nearest tfidf neighbors over **all** the emails (not only the ones sent by the same sender) 

In [6]:
# Number of similar e-mails requested per query (forwarded as nb_similars).
nb_neighbors = 50

# Score candidate recipients for the test e-mails from their TF-IDF
# neighbours; the result feeds knntools.similar_dic_to_standard.
mid_recipient_scores = knntools.compute_similarity_scores(
    tfidf_model,
    tfidf_matrix,
    tfidf_mids,
    training_info,
    test_info,
    nb_similars=nb_neighbors,
)





In [7]:
# NOTE(review): nb_recipients is not referenced by any of the visible cells
# (the conversion below is called with keep_all=True) -- confirm whether it
# is still needed elsewhere.
nb_recipients = 10
# Convert the similarity scores into knn_dic, the structure that is pickled
# and written out as a submission in the following cells.
# NOTE(review): keep_all=True presumably keeps every scored recipient instead
# of truncating the list -- confirm against src.knntools.
knn_dic = knntools.similar_dic_to_standard(mid_recipient_scores, keep_all=True)

## Save results for fusion and submission

In [8]:
# Pickle knn_dic under ranks_test/; the file name records the neighbour count
# used to produce it.
path_to_ranks = 'ranks_test/knn-k-{nb_neighbors}-rec-all'.format(
    nb_neighbors=nb_neighbors)
with open(path_to_ranks, 'wb') as ranks_file:
    pickle.dump(knn_dic, ranks_file)

In [14]:
# Write knn_dic as a CSV in path_to_results; the file name embeds the
# neighbour count.
postprocess.write_results_ranked(
    knn_dic,
    path_to_results,
    'knn-k-{nb_neighbors}.csv'.format(nb_neighbors=nb_neighbors),
)