In [1]:
from collections import defaultdict
import datetime
from gensim import corpora, models, similarities
from matplotlib import pyplot as plt
import numpy as np
import operator
import pandas as pd
import pickle
import random
import re
import scipy
from sklearn.preprocessing import MultiLabelBinarizer

from tqdm import tqdm_notebook

import src.knntools as knntools
import src.postprocess as postprocess
import src.preprocess as preprocess
import src.tfidftools as tfidftools
import src.tools as tools
import src.recencytools as recency
import src.scoring as scoring
import src.textembeddingtools as texttools

%load_ext autoreload
%autoreload 2
%matplotlib inline

## Random forests with topic and day-of-week features

Because of RAM limitations, we did not experiment with a global model and instead limited ourselves to sender-specific models.

In [2]:
# Input directory (CSV files) and output directory (written result files).
path_to_data = 'data/'
path_to_results = 'results/'

# Read options shared by the four CSV files.
csv_options = {'sep': ',', 'header': 0}

training = pd.read_csv(path_to_data + 'training_set.csv', **csv_options)

# NOTE(review): per the pandas docs, parse_dates=True only tries to parse the
# index, so any date column is presumably still read as plain strings -- confirm
# that the downstream date helpers expect that.
training_info = pd.read_csv(
    path_to_data + 'training_info.csv', parse_dates=True, **csv_options)

test = pd.read_csv(path_to_data + 'test_set.csv', **csv_options)

test_info = pd.read_csv(
    path_to_data + 'test_info.csv', parse_dates=True, **csv_options)

In [3]:
# Apply the same src.preprocess helper to both splits (training first, then
# test); judging by its name it groups the email ids of each sender.
training_email_ids_per_sender, test_email_ids_per_sender = (
    preprocess.get_email_ids_per_sender(sender_table)
    for sender_table in (training, test)
)





Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


In [4]:
# Build one body dictionary per split from the *_info frames
# (training first, then test).
training_body_dict, test_body_dict = (
    preprocess.body_dict_from_panda(info_frame)
    for info_frame in (training_info, test_info)
)






In [5]:
# Turn each body dictionary into a token dictionary (training first, then test).
training_token_dict, test_token_dict = (
    texttools.get_token_dict(body_dict)
    for body_dict in (training_body_dict, test_body_dict)
)





In [6]:
# Both splits use the same rarity threshold, so it is spelled out only once.
token_rarity_threshold = 3

training_email_list, training_idx_to_mids = texttools.token_dicts_to_token_lists(
    training_token_dict,
    rarity_threshold=token_rarity_threshold,
)
test_email_list, test_idx_to_mids = texttools.token_dicts_to_token_lists(
    test_token_dict,
    rarity_threshold=token_rarity_threshold,
)

In [7]:
# Give every word of the training emails a unique integer id.
training_word_id_dic = corpora.Dictionary(training_email_list)

# Encode each email as a bag of words: [[(word_id, count), ...], ...].
# The test emails are encoded with the dictionary built on the training emails.
training_email_corpus = list(map(training_word_id_dic.doc2bow, training_email_list))
test_email_corpus = list(map(training_word_id_dic.doc2bow, test_email_list))

In [8]:
# Number of LDA topics; reused below for the model, the per-sender features and the output file names.
nb_topics = 200
# Number of training emails in the bag-of-words corpus.
nb_training_emails = len(training_email_corpus)
print(nb_training_emails)

43613


In [9]:
# File used to persist the LDA model between runs.
stored_model_results = ['variables/training-lda-200.p']

# NOTE(review): use_saved / save presumably make compute_model reuse the model
# stored at model_results_path when it exists and write it otherwise -- confirm
# in src/textembeddingtools.
lda_model = texttools.compute_model(
    training_email_corpus,
    training_word_id_dic,
    model='lda',
    nb_topics=nb_topics,
    use_saved=True,
    save=True,
    model_results_path=stored_model_results[0],
)

In [10]:
# Per-sender features for the training emails, computed with the LDA model,
# plus the matching per-sender index -> message-id lookup.
(training_senders_mid_features_dict,
 training_senders_idx_to_mid_dic) = texttools.get_sender_model_features_from_tokens(
    training_email_ids_per_sender,
    training_token_dict,
    training_word_id_dic,
    lda_model,
    nb_topics,
)




In [11]:
# Cache of the per-sender LDA features of the training set.
#
# Each cache file is loaded when it exists. When it is missing (first run on a
# fresh checkout) the values computed by the previous cell are written to it
# instead, so the cell no longer needs its dump code toggled by hand and no
# longer crashes when the cache has not been created yet.
#
# NOTE: unpickling can execute arbitrary code; only load cache files that were
# produced by this notebook.
training_lda_sender_features_path = 'variables/training-lda-200-sender-features-dic.p'

try:
    with open(training_lda_sender_features_path, 'rb') as infile:
        training_senders_mid_features_dict = pickle.load(infile)
except FileNotFoundError:
    # No cache yet: persist the freshly computed features for the next run.
    with open(training_lda_sender_features_path, 'wb') as outfile:
        pickle.dump(training_senders_mid_features_dict, outfile)

training_lda_sender_idx_to_mid_path = 'variables/training-lda-200-sender-idx-to-mids-dic.p'

try:
    with open(training_lda_sender_idx_to_mid_path, 'rb') as infile:
        training_senders_idx_to_mid_dic = pickle.load(infile)
except FileNotFoundError:
    # No cache yet: persist the freshly computed index -> message-id lookup.
    with open(training_lda_sender_idx_to_mid_path, 'wb') as outfile:
        pickle.dump(training_senders_idx_to_mid_dic, outfile)

In [12]:
# Per-sender features for the test emails; the dictionary and the LDA model
# passed in are the ones fitted on the training emails.
(test_senders_mid_features_dict,
 test_senders_idx_to_mid_dic) = texttools.get_sender_model_features_from_tokens(
    test_email_ids_per_sender,
    test_token_dict,
    training_word_id_dic,
    lda_model,
    nb_topics,
)




In [13]:
# Per-sender date features (the day-of-week features announced in the section
# title, presumably), built with the same index -> message-id lookups as the
# topic features.
training_day_features_dic = recency.get_sender_sparse_date_info(
    training_email_ids_per_sender,
    training_senders_idx_to_mid_dic,
    training_info,
)
test_day_features_dic = recency.get_sender_sparse_date_info(
    test_email_ids_per_sender,
    test_senders_idx_to_mid_dic,
    test_info,
)

In [14]:
# Combine the topic features and the date features of each split;
# the topic features come first in both lists.
training_stacked_features_dict = texttools.create_stacked_feature_dic([
    training_senders_mid_features_dict,
    training_day_features_dic,
])
test_stacked_features_dict = texttools.create_stacked_feature_dic([
    test_senders_mid_features_dict,
    test_day_features_dic,
])

In [15]:
# Training targets: per-sender recipient encodings (one-hot, per the helper
# name) and the lookup from encoding index back to recipient.
(training_sender_recipients_binaries,
 training_sender_idx_to_recipients) = preprocess.get_one_hot_sender_recipients(
    training_senders_idx_to_mid_dic,
    training_info,
)




In [16]:
# Number of trees; also reused in the names of the output files below.
nb_tree = 20

# Train on the stacked training features and predict for the test emails.
# NOTE(review): min_samples_split / min_samples_leaf are presumably forwarded
# to the underlying tree estimator -- confirm in src/textembeddingtools.
test_tree_predictions = texttools.tree_train_predict(
    training_stacked_features_dict,
    training_sender_recipients_binaries,
    training_sender_idx_to_recipients,
    test_stacked_features_dict,
    test_senders_idx_to_mid_dic,
    nb_tree=nb_tree,
    min_samples_split=20,
    min_samples_leaf=10,
)




## Save results

In [17]:
# Pickle the raw predictions; the file name encodes the number of trees and topics.
path_to_ranks = 'ranks_test/trees-tree-nb-{nb_tree}-{nb_topics}'.format(
    nb_tree=nb_tree,
    nb_topics=nb_topics,
)
with open(path_to_ranks, 'wb') as outfile:
    pickle.dump(test_tree_predictions, outfile)

In [18]:
# Name of the results file; it encodes the number of trees and topics.
results_filename = 'trees-tree-nb-{nb_tree}-topics-{nb_topics}.txt'.format(
    nb_tree=nb_tree,
    nb_topics=nb_topics,
)
postprocess.write_results_ranked(test_tree_predictions, path_to_results, results_filename)

In [34]:
# Sanity check: number of entries in the test predictions.
nb_test_predictions = len(test_tree_predictions)
print(nb_test_predictions)

2362
