In [1]:
from collections import defaultdict
import itertools
import math
import operator
import os
import pickle
import random
import re
import string

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import scipy
from tqdm import tqdm_notebook


import src.knntools as knntools
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.tools as tools
import src.recencytools as recency
import src.scoring as scoring
import src.textembeddingtools as texttools
import src.graphwordstools as graphtools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Input and output locations, kept as plain string prefixes.
path_to_data = 'data/'
path_to_results = 'results/'

# Train / test tables: comma-separated, header on the first row.
training = pd.read_csv(
    path_to_data + 'training_set.csv',
    sep=',',
    header=0)
test = pd.read_csv(
    path_to_data + 'test_set.csv',
    sep=',',
    header=0)

# Matching *_info tables, read with parse_dates=True.
# NOTE(review): without index_col, parse_dates=True only tries to parse the
# index, so any date column presumably stays as plain strings -- confirm that
# this is what the downstream helpers expect.
training_info = pd.read_csv(
    path_to_data + 'training_info.csv',
    sep=',',
    parse_dates=True,
    header=0)
test_info = pd.read_csv(
    path_to_data + 'test_info.csv',
    sep=',',
    parse_dates=True,
    header=0)

In [3]:
# Split the labelled data into a training part and a validation part;
# train_frac=0.95 is handed to scoring.get_train_val as the training fraction.
(train_info,
 train_email_ids_per_sender,
 val_info,
 val_email_ids_per_sender) = scoring.get_train_val(
    training, training_info, train_frac=0.95)

Processing training !

Processing val !



In [4]:
# Build the body dictionaries of both splits with the same preprocessing helper
# (training first, then validation).
train_body_dict, val_body_dict = (
    preprocess.body_dict_from_panda(train_info),
    preprocess.body_dict_from_panda(val_info),
)





In [5]:
# Turn each split's body dictionary into a token dictionary
# (training first, then validation).
train_token_dict, val_token_dict = (
    texttools.get_token_dict(body_dict)
    for body_dict in (train_body_dict, val_body_dict)
)





## Compute the average document length (in tokens)

In [7]:
# Document-length statistics of the training split: a per-document dictionary
# and the average length (measured in tokens, per the section heading).
(train_doc_lengths_dic,
 train_average_doc_len) = texttools.get_doc_length_info(train_token_dict)

In [8]:
# IDF statistics computed from the training tokens only; both results are
# reused below for the training matrix and for the validation vectors.
(train_idf_dic,
 train_idf_words) = tfidftools.get_idf_dic(train_token_dict)




In [9]:
# Number of entries in the IDF dictionary built from the training tokens.
print(len(train_idf_dic))

7161


10979/|/ 26%|| 10979/41432 [00:20<00:56, 540.37it/s]

In [28]:
%%time
# Build the TW-IDF matrix of the training documents, together with twidf_mids
# (presumably the message ids aligned with the matrix rows -- TODO confirm).
# This is the slow step of the notebook: the recorded run took ~24 min of wall time.
# NOTE(review): 0.003 is an unexplained constant passed as the last positional
# argument -- check graphtools.get_twidf_matrix for its meaning and name it.
twidf_matrix, twidf_mids = graphtools.get_twidf_matrix(train_token_dict, train_doc_lengths_dic,
                                                       train_average_doc_len, train_idf_dic,
                                                       train_idf_words, 0.003)


# Optional cache (disabled): uncomment to pickle the matrix, converted to a
# scipy CSR sparse matrix, and the ids under 'variables/'.
# with open('variables/twidf_matrix', 'wb') as outfile:
#     pickle.dump(scipy.sparse.csr_matrix(twidf_matrix), outfile)
    
# with open('variables/twidf_mids', 'wb') as outfile:
#     pickle.dump(twidf_mids, outfile)

          1063/|/  3%|| 1063/41432 [00:43<27:15, 24.68it/s]CPU times: user 22min 32s, sys: 28.5 s, total: 23min 1s
Wall time: 24min 33s


In [24]:
# Optional (disabled): reload the pickled TW-IDF matrix and ids from 'variables/'
# instead of recomputing them.
# NOTE(review): the values are loaded into twidf_matrix_ / twidf_mids_ (trailing
# underscore), which are not the names the following cells use.
# Only unpickle files you produced yourself: pickle.load can run arbitrary code.
# with open('variables/twidf_matrix', 'rb') as outfile:
#     twidf_matrix_ = pickle.load(outfile)
    
# with open('variables/twidf_mids', 'rb') as outfile:
#     twidf_mids_ = pickle.load(outfile)

In [None]:
# val_doc_lengths_dic, average_test_length = texttools.get_doc_length_info(val_token_dict)

In [33]:
# Vectorise the validation tokens using the statistics of the training split
# (its IDF dictionary / word list and its average document length).
val_vectors = graphtools.get_twidf_vectors_from_tokens(
    train_idf_dic,
    train_idf_words,
    val_token_dict,
    train_average_doc_len)

In [32]:
# Score the validation vectors against the training TW-IDF matrix,
# with nb_similars set to 50.
mid_recipient_scores = knntools.compute_twidf_similarity_scores(
    twidf_matrix,
    twidf_mids,
    val_vectors,
    train_info,
    nb_similars=50)

In [29]:
# Needs mid_recipient_scores from the similarity-score cell: run that cell
# first (the saved NameError output comes from running this one before it).
# Convert the similarity scores into the standard prediction format, with
# nb_recipients set to 100, ...
twidf_predictions = knntools.similar_dic_to_standard(
    mid_recipient_scores, nb_recipients=100)
# ... then evaluate the predictions against the validation split.
current_score = scoring.compute_prediction_mad(
    twidf_predictions, val_info)

NameError: name 'mid_recipient_scores' is not defined

In [52]:
# Validation score returned by scoring.compute_prediction_mad for the
# TW-IDF k-NN predictions.
print(current_score)

0.206168659357


In [53]:
# Persist the validation predictions of the TW-IDF k-NN model with pickle.
# nb_neighbors only labels the file: keep it equal to the nb_similars value
# passed to knntools.compute_twidf_similarity_scores.
nb_neighbors = 50
path_to_ranks = 'ranks_val/twidf-knn-k-{nb_neighbors}-rec'.format(nb_neighbors=nb_neighbors)

# open() does not create missing directories, so make sure the target exists.
ranks_dir = os.path.dirname(path_to_ranks)
if not os.path.isdir(ranks_dir):
    os.makedirs(ranks_dir)

# Binary write handle for the pickle stream.
with open(path_to_ranks, 'wb') as outfile:
    pickle.dump(twidf_predictions, outfile)