# Doc2Vec Model Training

## Import Modules

In [1]:
import pickle as pkl
import random

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from collections import Counter
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

## Load Data

In [7]:
# Both directories live under the parent of the current working directory.
_project_root = Path.cwd().parent
data_dir = _project_root / 'data/interim'   # pickled inputs and intermediate artifacts
models_dir = _project_root / 'models'       # trained Doc2Vec models are saved here

In [12]:
def load_emails(size=int(1e4), seed=42):
    """Load the pickled messages and return a seeded random sample of them.

    Reads ``message.pkl`` from ``data_dir``, discards every record whose
    message text is empty or consists only of whitespace, and then draws
    ``size`` records without replacement.

    Args:
        size: Number of records to sample.
        seed: Value passed to ``random.seed`` before sampling. This reseeds
            the module-level ``random`` generator as a side effect.

    Returns:
        A list of ``size`` two-element tuples holding the first field of the
        stored record (presumably a message id) and the message text.

    Raises:
        ValueError: If ``size`` exceeds the number of non-blank messages
            (raised by ``random.sample``).
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(Path(data_dir, 'message.pkl'), 'rb') as handle:
        messages = pkl.load(handle)

    # Keep only messages with visible content. The conditions must be joined
    # with `and`: with `or`, '' passes the isspace test and '   ' passes the
    # != '' test, so nothing was ever filtered out.
    filtered_emails = [
        (i, msg)
        for i, msg, _ in messages
        if not str.isspace(msg) and msg != ''
    ]
    random.seed(seed)
    emails = random.sample(filtered_emails, k=size)
    return emails

In [6]:
# size = int(2e4)
# emails = load_emails(size=size)

# File naming scheme: sample_emails_<seed>_<size>
# with open(Path(data_dir, 'sample_emails_42_1e4.pkl'), 'wb') as handle:
#     pkl.dump(emails, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [13]:
# for i in range(int(1e4), int(1.01e5), int(0.5e4)):
#     emails = load_emails(size=i)
#     with open(Path(data_dir, f'sample_emails_42_{i}.pkl'), 'wb') as handle:
#         pkl.dump(emails, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [15]:
# with open(Path(data_dir, f'sample_emails_42_{int(2e4)}.pkl'), 'rb') as handle:
#     emails = pkl.load(handle)

## Train Model

In [16]:
# tokenized_docs = [simple_preprocess(emails[i][1]) for i in range(len(emails))]
# corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_docs)]

In [9]:
# File naming scheme: tagged_docs_<seed>_<size>
# with open(Path(data_dir, f'tagged_docs_42_{size}.pkl'), 'wb') as handle:
#     pkl.dump(corpus, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [17]:
# for i in range(int(1e4), int(1.01e5), int(0.5e4)):
#     with open(Path(data_dir, f'sample_emails_42_{i}.pkl'), 'rb') as handle:
#         emails = pkl.load(handle)
#     tokenized_docs = [simple_preprocess(emails[i][1]) for i in range(len(emails))]
#     corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_docs)]
#     with open(Path(data_dir, f'tagged_docs_42_{i}.pkl'), 'wb') as handle:
#         pkl.dump(corpus, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [8]:
# Train and save one Doc2Vec model per pre-built tagged corpus
# (corpus sizes 10,000 .. 100,000 in steps of 5,000).

# Hyperparameters are the same for every corpus size, so they are defined once
# rather than being re-assigned on each loop iteration.
vector_size = 300
window_size = 15
min_count = 1
train_epoch = 20
# NOTE(review): gensim's documented default for `alpha` is 0.025; confirm that
# a starting learning rate of 0.25 is intentional and not a typo.
alpha = 0.25
min_alpha = 1e-5

# model.save() does not create missing directories; make sure the target exists.
models_dir.mkdir(parents=True, exist_ok=True)

for i in range(int(1e4), int(1.01e5), int(0.5e4)):
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(Path(data_dir, f'tagged_docs_42_{i}.pkl'), 'rb') as handle:
        corpus = pkl.load(handle)

    # dm=0 selects the distributed bag-of-words (PV-DBOW) training algorithm.
    model = Doc2Vec(vector_size=vector_size,
                    window=window_size,
                    alpha=alpha,
                    min_alpha=min_alpha,
                    min_count=min_count,
                    epochs=train_epoch,
                    dm=0)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # The "300_15_1_20_1e-5" suffix mirrors vector_size, window_size, min_count,
    # train_epoch and min_alpha above; it is a literal, so update it by hand if
    # any of those values change.
    model.save(str(Path(models_dir, f'dv_42_{i}_300_15_1_20_1e-5.model')))
    print(f'Model with size {i} saved')

Model with size 10000 saved
Model with size 15000 saved
Model with size 20000 saved
Model with size 25000 saved
Model with size 30000 saved
Model with size 35000 saved
Model with size 40000 saved
Model with size 45000 saved
Model with size 50000 saved
Model with size 55000 saved
Model with size 60000 saved
Model with size 65000 saved
Model with size 70000 saved
Model with size 75000 saved
Model with size 80000 saved
Model with size 85000 saved
Model with size 90000 saved
Model with size 95000 saved
Model with size 100000 saved


_1e+04


In [None]:
# FIXME(review): unfinished scratch cell. Its original content, `Path(models_dir, '`,
# is an unterminated string literal and raises SyntaxError on "Run All".
# Kept as a comment until the intended path expression is known.

In [10]:
# vector_size = 300
# window_size = 15
# min_count = 1
# train_epoch = 20
# alpha = 0.25
# min_alpha = 1e-5
# model = Doc2Vec(vector_size=vector_size,
#                 window=window_size,
#                 alpha=alpha, 
#                 min_alpha=min_alpha,
#                 min_count=min_count,
#                 epochs=train_epoch,
#                 dm=0)
# model.build_vocab(corpus)
# model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

## Assess Model

In [None]:
# Self-similarity check: re-infer a vector for every training document and
# record at which position the document itself appears in the similarity
# ranking over all stored document vectors (rank 0 = most similar to itself).
# Uses whatever `model` and `corpus` are currently bound in the kernel.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    ranked_ids = [docid for docid, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Remember the runner-up (tag, similarity) pair for the comparison below.
    second_ranks.append(sims[1])

# The ten most frequent ranks as (rank, count, percentage of documents).
counter = Counter(ranks)
total_ranked = len(ranks)
top_ranks = counter.most_common()[:10]
print([(i, c, c / total_ranked * 100) for i, c in top_ranks])

# `doc_id` and `sims` still refer to the last document processed by the loop.
doc_text = ' '.join(corpus[doc_id].words)
print(f'Document ({doc_id}): «{doc_text}»\n')
print(f'SIMILAR/DISSIMILAR DOCS PER MODEL {model!s}:\n')
checkpoints = [
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('MEDIAN', len(sims) // 2),
    ('LEAST', len(sims) - 1),
]
for label, index in checkpoints:
    hit = sims[index]
    hit_text = ' '.join(corpus[hit[0]].words)
    print(f'{label!s} {hit!s}: «{hit_text}»\n')

# Pick a random training document ...
doc_id = random.randint(0, len(corpus) - 1)

# ... and print it next to its second-most-similar document.
train_text = ' '.join(corpus[doc_id].words)
print(f'Train Document ({doc_id}): «{train_text}»\n')
sim_id = second_ranks[doc_id]
similar_text = ' '.join(corpus[sim_id[0]].words)
print(f'Similar Document {sim_id}: «{similar_text}»\n')

In [None]:
# File name pattern: model_<vector size>_<window>_<min count>_<epochs>_<min alpha>
model_path = Path(models_dir, "dv.model_300_15_1_20_1e-5")
model.save(str(model_path))
print("Model Saved")

In [None]:
# TODO(review): placeholder for a clustering section. The original cell held the
# bare name `Cluster`, which is not defined anywhere in the visible notebook and
# would raise NameError when run.