# Doc2Vec Model Training

## Import Modules

In [1]:
import pickle as pkl
import random

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from collections import Counter
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

## Load Data

In [2]:
data_dir = Path(Path.cwd().parent, 'data/interim')
models_dir = Path(Path.cwd().parent, 'src/models')

In [3]:
def load_emails(size=int(1e4), seed=42):
    with open(Path(data_dir, 'message.pkl'), 'rb') as handle:
          messages = pkl.load(handle)
    
    filtered_emails = [(i, msg) for i, msg, t in messages if not str.isspace(msg) or msg != '']
    random.seed(seed)
    emails = random.sample(filtered_emails, k=size)
    return emails

In [4]:
emails = load_emails()

In [9]:
# Name: sample_emails_seed_size
with open(Path(data_dir, 'sample_emails_42_1e4.pkl'), 'wb') as handle:
    pkl.dump(emails, handle, protocol=pkl.HIGHEST_PROTOCOL)

## Train Model

In [5]:
tokenized_docs = [simple_preprocess(emails[i][1]) for i in range(len(emails))]
corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_docs)]

In [10]:
# Name: tagged_docs_seed_size
with open(Path(data_dir, 'tagged_docs_42_1e4.pkl'), 'wb') as handle:
    pkl.dump(corpus, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [6]:
vector_size = 300
window_size = 15
min_count = 1
train_epoch = 20
alpha = 0.25
min_alpha = 1e-5
model = Doc2Vec(vector_size=vector_size,
                window=window_size,
                alpha=alpha, 
                min_alpha=min_alpha,
                min_count=min_count,
                epochs=train_epoch,
                dm=0)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

## Assess Model

In [7]:
ranks = []
second_ranks = []
for doc_id in range(len(corpus)):
    inferred_vector = model.infer_vector(corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

counter = Counter(ranks)
print([(i, c, c/len(ranks)*100) for i, c in list(counter.most_common()[:10])])

print('Document ({}): «{}»\n'.format(doc_id, ' '.join(corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(corpus[sims[index][0]].words)))

doc_id = random.randint(0, len(corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(corpus[sim_id[0]].words)))

[(0, 822, 8.219999999999999), (1, 164, 1.6400000000000001), (2, 87, 0.8699999999999999), (3, 71, 0.7100000000000001), (5, 51, 0.51), (6, 48, 0.48), (4, 47, 0.47000000000000003), (8, 31, 0.31), (11, 31, 0.31), (9, 30, 0.3)]
Document (9999): «thanks for update pls keep sending info»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dbow,d300,n5,s0.001,t3>:

MOST (1337, 0.5746689438819885): «thanks btw the pager jab was aimed at you»

SECOND-MOST (7398, 0.5477059483528137): «vasant thanks for the invitation it works for me vince»

MEDIAN (4805, 0.16880661249160767): «alan furchtenicht montclair lane madison wi aefurcht facstaff wisc edu to mr ken lay writing to urge you to donate the millions of dollars you made from selling enron stock before the company declared bankruptcy to funds such as enron employee transition fund and reach that benefit the company employees who lost their retirement savings and provide relief to low income consumers in california who can afford to pay their energy bills

In [8]:
# Name code: model_vectorsize_window_mincount_epochs_alphamin
model.save(str(Path(models_dir, "dv.model_300_15_1_20_1e-5")))
print("Model Saved")

Model Saved
