In [54]:
from collections import defaultdict
from gensim import corpora, models, similarities
from matplotlib import pyplot as plt
import numpy as np
import operator
import pandas as pd
import pickle
import random
import scipy

from tqdm import tqdm_notebook

import src.knntools as knntools
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.tools as tools
import src.recencytools as recency
import src.scoring as scoring
import src.textembeddingtools as texttools

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Directory prefixes: the CSV inputs below are read from `path_to_data`;
# `path_to_results` is not used in this cell (presumably the output folder).
path_to_data = 'data/'
path_to_results = 'results/'

# Training split.
# NOTE(review): parse_dates=True only asks pandas to parse the *index* as
# dates; no index_col is given here, so confirm whether a date column was
# meant to be parsed instead.
training = pd.read_csv(
    path_to_data + 'training_set.csv', header=0, sep=',')
training_info = pd.read_csv(
    path_to_data + 'training_info.csv', header=0, sep=',', parse_dates=True)

# Test split, loaded with the same options as its training counterpart.
test = pd.read_csv(
    path_to_data + 'test_set.csv', header=0, sep=',')
test_info = pd.read_csv(
    path_to_data + 'test_info.csv', header=0, sep=',', parse_dates=True)

In [4]:
# Derive the train / validation frames and their per-sender email-id
# structures from the raw training tables.
# train_frac=0.95 presumably keeps 95% for training -- TODO confirm in
# scoring.get_train_val.
(train_info, train_email_ids_per_sender,
 val_info, val_email_ids_per_sender) = scoring.get_train_val(
    training, training_info, train_frac=0.95)

Processing training !

Processing val !



In [5]:
# Build `body_dict` from the training frame. Going by the helper's name and its
# log output this is a dictionary derived from the dataframe -- presumably
# message id -> body text (TODO confirm in preprocess.body_dict_from_panda).
body_dict = preprocess.body_dict_from_panda(train_info)

Constructing dictionnary from dataframe...

done !


In [None]:
# Preview only the first rows: rendering the whole training frame bloats the
# saved notebook and hides the narrative; a handful of rows is enough to check
# the columns.
train_info.head()

In [6]:
# Path handed to texttools.get_token_dict in the next cell -- presumably where
# the tokenised emails are cached on disk (TODO confirm).
token_dict_path = 'variables/token_dict_training'


In [7]:
# `token_dict` is used below as a mapping whose keys serve as message ids and
# whose values are the per-email token sequences.
# NOTE(review): how `token_dict_path` is used (load vs. compute-and-save) is
# decided inside texttools.get_token_dict -- confirm there.
token_dict = texttools.get_token_dict(token_dict_path, body_dict)

In [28]:
# Count threshold forwarded to remove_rare_words; also reused later to name
# the cached HDP model file.
rarity_thres = 2

# Tokenised emails with rare words filtered out by texttools.remove_rare_words.
email_list = texttools.remove_rare_words(
    list(token_dict.values()), threshold_count=rarity_thres)

# Message ids in the same iteration order as the emails above, plus the
# position -> message id lookup used when querying the similarity index.
mids = list(token_dict)
idx_to_mids = dict(enumerate(mids))




In [19]:
# Map every distinct word of the filtered emails to an integer id.
word_id_dic = corpora.Dictionary(email_list)

# Bag-of-words corpus: one [(word_id, count), ...] list per email, in the
# same order as `email_list`.
email_corpus = list(map(word_id_dic.doc2bow, email_list))

In [21]:
# Vocabulary size: number of distinct tokens registered in the dictionary.
print(len(word_id_dic))

84894


In [None]:
# Show only a handful of (token, id) pairs: the mapping holds one entry per
# vocabulary word (tens of thousands), so printing it whole floods the output.
print(list(word_id_dic.token2id.items())[:10])

In [None]:
%%time
# Times a direct HDP fit on the bag-of-words corpus.
# NOTE(review): `model` is not referenced by any later cell visible here; the
# model used afterwards (`hdp_model`) comes from texttools.compute_hdp_model,
# so this cell looks like a timing experiment -- confirm before keeping it in
# a Run-All path.
model = models.HdpModel(email_corpus, id2word=word_id_dic)

In [26]:
# The file name embeds the rare-word threshold so that runs with different
# thresholds use different paths.
model_vars_path = 'variables/hdp_similarities_{thres_nb}_words_out'.format(
    thres_nb=rarity_thres)

# HDP model and its similarity index.
# overwrite=False presumably reuses whatever is already stored at
# model_vars_path -- TODO confirm in texttools.compute_hdp_model.
hdp_model, hdp_sims = texttools.compute_hdp_model(
    email_corpus,
    word_id_dic,
    model_vars_path,
    overwrite=False,
)
    

In [68]:
# Smoke test: look up the k=10 entries most similar to an ad-hoc email body.
email_body = "This is an awesomely incredible test for human kind wow wow wow ow "

# NOTE(review): this rebinds `mids`, which an earlier cell set to the full list
# of token_dict keys; `idx_to_mids` is already built so the lookup is
# unaffected, but re-running cells out of order may see the shorter list.
mids, scores = texttools.get_k_similars(
    hdp_model,
    hdp_sims,
    word_id_dic,
    idx_to_mids,
    email_body,
    k=10,
)