In [None]:
import gensim
import pandas as pd
import smart_open
from pathlib import Path
from collections import Counter

In [2]:
# Data locations, relative to the notebook's working directory:
# one directory each for the training and the test CSV.
train_data_dir = Path(r'./Data/train')
test_data_dir = Path(r'./Data/test')

train_file = train_data_dir / 'train.csv'
test_file = test_data_dir / 'test.csv'

In [3]:
def read_corpus(fname, tokens_only = False):
    """Yield one tokenized document per data row of a CSV file.

    The text is taken from the second column (position 1) of the CSV and
    tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Location of the CSV file; opened through ``smart_open`` with
            the iso-8859-1 encoding.
        tokens_only: When True, yield plain token lists. When False (the
            default), yield ``TaggedDocument`` objects whose single tag is
            the zero-based row position.

    Yields:
        A list of tokens, or ``TaggedDocument(tokens, [row_position])``.
    """
    # read_csv loads the whole file eagerly, so the handle is released here
    # instead of staying open for as long as the generator is alive.
    with smart_open.open(fname, encoding = "iso-8859-1") as f:
        texts = pd.read_csv(f).iloc[:, 1]

    for i, line in enumerate(texts):
        # Empty cells are read as NaN and non-text cells as numbers; coerce
        # both to text before tokenizing. A missing cell becomes an empty
        # document so row positions (and therefore tags) stay aligned.
        text = '' if pd.isna(line) else str(line)
        tokens = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [4]:
# Drain both generators into lists so the corpora can be re-iterated and
# indexed by position later on. Training rows become tagged documents;
# test rows stay plain token lists.
train_corpus = [*read_corpus(train_file)]
test_corpus = [*read_corpus(test_file, tokens_only = True)]

In [5]:
'''
Doc2Vec Model:
    vector_size: each document is represented by a 50-dimensional vector
    epochs: iterate over the training corpus 40 times
    min_count: discard words that appear fewer than 2 times in the whole corpus
'''
model = gensim.models.doc2vec.Doc2Vec(vector_size = 50, epochs = 40, min_count = 2)

In [6]:
'''
Vocabulary is a list of all unique words extracted from training corpus --> (Model.wv.index_to_key)
Additional attributes of each words                                     --> (Model.wv.get_vecattr())

ex: print(f"Word 'penalty' appeared {model.wv.get_vecattr('penalty', 'count')} times in the training corpus.")
'''
# Build the model's vocabulary from the tagged training documents.
# The training cell later passes model.corpus_count to train();
# presumably that count is populated by this call (confirm in the gensim docs).
model.build_vocab(train_corpus)

In [7]:
'''
OBS:    If for some reason a BLAS library isn’t available, training uses a fallback approach that takes 60x-120x longer, so even this tiny training will take minutes rather than seconds.
        (And, in that case, you should also notice a warning in the logging letting you know there’s something worth fixing).
        So, be sure your installation uses the BLAS-optimized Gensim if you value your time.
'''
# Train on the tagged training corpus, reusing the document count and the
# epoch setting already stored on the model rather than repeating literals.
model.train(train_corpus, total_examples = model.corpus_count, epochs = model.epochs)

In [10]:
'''
Note: Infer_vector does NOT take a string, but rather a list of string tokens, which should have already been tokenized.
'''
# Infer a vector for an ad-hoc document supplied as a list of tokens
# and keep the result in `vector`.
vector = model.infer_vector(['visa', 'inc', 'class'])

**Assessing Model**

In [11]:
'''
Sanity check: treat every training document as if it were unseen, infer a
vector for it, and rank all training documents by similarity to that vector.

    ranks:    position of each document in its own similarity ranking
              (0 means the model ranked the document closest to itself)
    sec_rank: the (doc_id, similarity) entry ranked second for each document
'''
ranks = []
sec_rank = []
for doc_id in range(len(train_corpus)):
    inf_vec = model.infer_vector(train_corpus[doc_id].words)
    # topn = len(model.dv) asks for the full ranking over every stored document vector
    sims = model.dv.most_similar([inf_vec], topn = len(model.dv))
    rank = [docId for docId, sim in sims].index(doc_id)
    # Store the self-rank (an int) so the distribution can be tallied,
    # and keep the runner-up entry in its own list.
    ranks.append(rank)
    sec_rank.append(sims[1])

In [32]:
# Report for the document left in `doc_id` / `sims` by the ranking loop:
# show its text, then the entries at the top, second, median and bottom of
# its similarity ranking.
query_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, query_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

last_position = len(sims) - 1
report_positions = (
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('MEDIAN', len(sims)//2),
    ('LEAST', last_position),
)
for label, index in report_positions:
    matched_id, score = sims[index]
    # Express the similarity score as a percentage with three decimals
    sim = str(round(score * 100, 3)) + '%'
    matched_text = ' '.join(train_corpus[matched_id].words)
    print(u'%s «%s»: «%s»\n' % (label, sim, matched_text))

Document (2141): «dfa comm strategy»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST «83.308%»: «goldman sachs mgd futures strat»

SECOND-MOST «83.086%»: «goldman sachs mangd futures strategy»

MEDIAN «44.818%»: «technology select sectorspdr etf iv»

LEAST «-55.678%»: «vanguard short term corp bond index fund etf»



-0.5567775964736938