# Doc2Vec Model Training

## Import Modules

In [1]:
import pickle as pkl
import random

import numpy as np
import pandas as pd

from collections import Counter
from datetime import datetime
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

## Load Data

In [2]:
data_dir = Path(Path.cwd().parent, 'data/interim')
d2v_dir = Path(data_dir, 'doc2vec')
models_dir = Path(Path.cwd().parent, 'models')

In [3]:
def save_tagged_docs(data, path, vector_size=300):
    file = Path(data_dir, 'tagged_docs_' + str(vector_size) + '_' + '_'.join(path.name.split('_')[2:]))
    with open(file, 'wb') as handle:
        pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL)
        

def train_doc2vec(data, path, vector_size=300):
    vector_size = vector_size
    window_size = 15
    min_count = 1
    train_epoch = 20
    alpha = 0.25
    min_alpha = 1e-5
    model = Doc2Vec(vector_size=vector_size,
                    window=window_size,
                    alpha=alpha, 
                    min_alpha=min_alpha,
                    min_count=min_count,
                    epochs=train_epoch,
                    dm=0)
    model.build_vocab(data)
    model.train(data, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(str(Path(models_dir, 'd2v_' + '_'.join(path.stem.split('_')[2:]) + '_' + str(vector_size) + '.model')))
    print('_'.join(path.name.split('_')[2:]) + ' model saved')
    

def get_doc2vec(path, **kwargs):
    df = pd.read_pickle(path)
    messages = df['Message'].dropna().to_numpy()
    tokenized_docs = [simple_preprocess(msg) for msg in messages]
    corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_docs)]
    t0 = datetime.now()
    if 'vector_size' in kwargs:
        save_tagged_docs(corpus, path, kwargs['vector_size'])
        train_doc2vec(corpus, path, kwargs['vector_size'])
    else:
        save_tagged_docs(corpus, path)
        train_doc2vec(corpus, path)
    t1 = datetime.now()
    print(f'Took {t1-t0}')

In [None]:
%%time
for path in sorted(data_dir.glob('parsed_emails*.pkl')):
    get_doc2vec(path)

In [None]:
%%time
for path in sorted(data_dir.glob('parsed_emails*.pkl')):
    get_doc2vec(path, vector_size=50)

In [9]:
for path in sorted(data_dir.glob('parsed_emails*.pkl'))[1:6]:
    get_doc2vec(path)

chains_eq_2.pkl model saved
Took 0:00:31.677726
chains_eq_3.pkl model saved
Took 0:00:11.961783
chains_ge_10.pkl model saved
Took 0:00:05.593118
chains_ge_4_lt_10.pkl model saved
Took 0:00:12.024035
chains_gt_1.pkl model saved
Took 0:01:02.423497


## Train Model

## Assess Model

In [None]:
ranks = []
second_ranks = []
for doc_id in range(len(corpus)):
    inferred_vector = model.infer_vector(corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

counter = Counter(ranks)
print([(i, c, c/len(ranks)*100) for i, c in list(counter.most_common()[:10])])

print('Document ({}): «{}»\n'.format(doc_id, ' '.join(corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(corpus[sims[index][0]].words)))

doc_id = random.randint(0, len(corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(corpus[sim_id[0]].words)))

In [3]:
import spacy
# from spacy.tokens import DocBin
# from sklearn import tree
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV

In [4]:
# spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf") # define your language model

In [6]:
sorted([path for path in data_dir.glob('parsed_emails*.pkl')])

[PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_all.pkl'),
 PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_gt_1.pkl'),
 PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_replies.pkl'),
 PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_split_0.pkl'),
 PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_split_1.pkl'),
 PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_split_2.pkl'),
 PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_split_3.pkl')]

In [7]:
df = pd.read_pickle('/home/miguel/Projects/tfm-nlp/data/interim/parsed_emails_chains_gt_1.pkl')

In [34]:
def get_terms(string):
    doc = nlp(string)
    terms = []
    for token in doc:
        if not token.is_stop and not token.is_punct and not token.is_space\
        and not token.like_url and not token.like_email and not token.is_currency\
        and not token.like_num and token.pos_ != 'X':
            terms.append(f'{token}_{token.pos_}_{token.lemma_}')
    return terms

In [38]:
get_terms(df.loc[142, 'Message'])

['Hey_INTJ_hey',
 'Paul_PROPN_Paul',
 'going_VERB_go',
 'Attached_VERB_attach',
 'find_VERB_find',
 'pics_NOUN_pic',
 'halloween_NOUN_halloween',
 'party_NOUN_party',
 'hope_VERB_hope',
 'like_VERB_like',
 'going_VERB_go',
 'Brasil_PROPN_Brasil',
 'today_NOUN_today',
 'days_NOUN_day',
 'guess_VERB_guess',
 'better_ADJ_well',
 'Let_VERB_let',
 'weekends_NOUN_weekend',
 'pretty_ADV_pretty',
 'laid_VERB_lay',
 'lately_ADV_lately',
 'miss_VERB_miss',
 'guys_NOUN_guy',
 'abraco_PROPN_abraco',
 'Eduardo_PROPN_Eduardo']

In [8]:
doc = nlp(df.loc[142, 'Message'])

In [40]:
df_terms = {}
df_terms['terms'] = df['Message'].apply(get_terms)

Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors


ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'float'>

In [None]:
# adjust attributes to your liking:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)

for doc in nlp.pipe(df['articleDocument'].str.lower()):
    doc_bin.add(doc)

# either save DocBin to a bytes object, or...
#bytes_data = doc_bin.to_bytes()

# save DocBin to a file on disc
file_name_spacy = 'output/preprocessed_documents.spacy'
doc_bin.to_disk(tfi_dir)

#Load DocBin at later time or on different system from disc or bytes object
#doc_bin = DocBin().from_bytes(bytes_data)
doc_bin = DocBin().from_disk(file_name_spacy)

docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))

tokenized_lemmatized_texts = [[token.lemma_ for token in doc 
                               if not token.is_stop and not token.is_punct and not token.is_space and not token.like_url and not token.like_email] 
                               for doc in docs]

# classifier to use
clf = tree.DecisionTreeClassifier()

# just some random target response
y = np.random.randint(2, size=len(docs))


vectorizer = TfidfVectorizer(ngram_range=(1, 1), lowercase=False, tokenizer=lambda x: x, max_features=3000)

pipeline = Pipeline([('vect', vectorizer), ('dectree', clf)])
parameters = {'dectree__max_depth':[4, 10]}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
gs_clf.fit(tokenized_lemmatized_texts, y)
print(gs_clf.best_estimator_.get_params()['dectree'])

In [None]:
# adjust attributes to your liking:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)

for doc in nlp.pipe(df['articleDocument'].str.lower()):
    doc_bin.add(doc)

# either save DocBin to a bytes object, or...
#bytes_data = doc_bin.to_bytes()

# save DocBin to a file on disc
file_name_spacy = 'output/preprocessed_documents.spacy'
doc_bin.to_disk(tfi_dir)

#Load DocBin at later time or on different system from disc or bytes object
#doc_bin = DocBin().from_bytes(bytes_data)
doc_bin = DocBin().from_disk(file_name_spacy)

docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))

tokenized_lemmatized_texts = [[token.lemma_ for token in doc 
                               if not token.is_stop and not token.is_punct and not token.is_space and not token.like_url and not token.like_email] 
                               for doc in docs]

# classifier to use
clf = tree.DecisionTreeClassifier()

# just some random target response
y = np.random.randint(2, size=len(docs))


vectorizer = TfidfVectorizer(ngram_range=(1, 1), lowercase=False, tokenizer=lambda x: x, max_features=3000)

pipeline = Pipeline([('vect', vectorizer), ('dectree', clf)])
parameters = {'dectree__max_depth':[4, 10]}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
gs_clf.fit(tokenized_lemmatized_texts, y)
print(gs_clf.best_estimator_.get_params()['dectree'])

## Direct TFIDF

In [4]:
tfi_dir

PosixPath('/home/miguel/Projects/tfm-nlp/data/interim/tfidf')

## Modified TFIDF with vector norm, lemma and pos