In [1]:
import pandas as pd
import numpy as np
import json
import re
import os
import sys
import gensim

sys.path.append(os.getcwd())
from utility import *

In [2]:
import smart_open

In [3]:
def read_corpus(fname, tokens_only=False):
    """Lazily read a text file and yield one preprocessed document per line.

    Every line is tokenized with ``gensim.utils.simple_preprocess`` using
    ``min_len=3``.

    Args:
        fname: File location handed to ``smart_open.open``; read as UTF-8.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based index of the line within the file.

    Yields:
        One token list or one ``TaggedDocument`` per line, in file order.
    """
    with smart_open.open(fname, encoding="utf-8") as handle:
        for line_no, text in enumerate(handle):
            words = gensim.utils.simple_preprocess(text, min_len=3)
            # Training data needs a tag per document; inference input does not.
            yield (
                words
                if tokens_only
                else gensim.models.doc2vec.TaggedDocument(words, [line_no])
            )

In [4]:
# Training corpus: every line of bbchealth.txt becomes a TaggedDocument tagged
# with its line index (see read_corpus).
# The directory defaults to the original location but can be redirected by
# setting the TRAINING_DATA_DIR environment variable, so the notebook is not
# tied to one machine's mounted volume.
train_corpus = list(
    read_corpus(
        os.path.join(
            os.environ.get("TRAINING_DATA_DIR", "/Volumes/White/training"),
            "bbchealth.txt",
        )
    )
)

In [6]:
# Held-out corpus: every line of cbchealth.txt as a plain token list
# (tokens_only=True, so no tags are attached).
# The directory defaults to the original location but can be redirected by
# setting the TRAINING_DATA_DIR environment variable, so the notebook is not
# tied to one machine's mounted volume.
test_corpus = list(
    read_corpus(
        os.path.join(
            os.environ.get("TRAINING_DATA_DIR", "/Volumes/White/training"),
            "cbchealth.txt",
        ),
        tokens_only=True,
    )
)

In [8]:
# Doc2Vec hyperparameters (meanings per gensim's Doc2Vec documentation).
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,  # dimensionality of each document vector
    min_count=2,     # ignore words seen fewer than 2 times in the corpus
    epochs=40,       # passes over the corpus; read back via model.epochs when training
)

In [9]:
# Build the model's vocabulary from the tagged training documents; this must
# run before model.train is called on the same corpus.
model.build_vocab(train_corpus)

In [10]:
# Train on the tagged documents, reusing the document count and the epoch
# setting already stored on the model rather than repeating the literals.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)


In [12]:
# Self-similarity sanity check: re-infer a vector for every training document
# and record where that document's own tag lands in the ranking of all trained
# document vectors (rank 0 means it is its own nearest neighbour).
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

    # sims is a list of (tag, similarity) pairs; find this document's position.
    ranked_ids = [docid for docid, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Keep the entry at position 1 (the runner-up) for later inspection.
    second_ranks.append(sims[1])

In [13]:
import collections

# Histogram of self-similarity ranks: key = rank, value = number of training
# documents that obtained that rank.
counter = collections.Counter()
counter.update(ranks)
print(counter)

Counter({0: 299, 1: 101, 2: 79, 3: 73, 6: 52, 5: 49, 4: 47, 7: 44, 8: 43, 10: 42, 9: 40, 11: 37, 17: 37, 14: 37, 13: 33, 27: 33, 21: 32, 19: 31, 20: 30, 12: 30, 26: 29, 47: 29, 18: 28, 24: 28, 15: 27, 29: 26, 23: 26, 40: 24, 41: 24, 22: 24, 32: 23, 48: 23, 25: 22, 33: 22, 43: 22, 79: 21, 16: 21, 28: 21, 54: 20, 42: 20, 30: 19, 31: 18, 78: 18, 35: 18, 38: 18, 55: 17, 34: 17, 36: 17, 60: 17, 50: 17, 84: 17, 94: 17, 68: 17, 45: 17, 90: 16, 88: 16, 70: 16, 100: 16, 66: 16, 52: 15, 37: 15, 106: 15, 73: 15, 138: 15, 61: 15, 46: 15, 82: 14, 58: 14, 101: 14, 51: 14, 80: 14, 71: 14, 111: 14, 39: 14, 91: 14, 115: 14, 112: 14, 74: 13, 107: 13, 76: 13, 93: 13, 75: 13, 110: 13, 127: 13, 67: 13, 179: 13, 81: 13, 92: 13, 63: 13, 118: 13, 59: 13, 65: 13, 44: 12, 89: 12, 166: 12, 133: 12, 139: 12, 124: 12, 121: 12, 120: 12, 102: 12, 56: 12, 53: 12, 159: 11, 131: 11, 155: 11, 69: 11, 174: 11, 119: 11, 98: 11, 62: 11, 104: 11, 128: 11, 77: 11, 87: 11, 156: 11, 147: 10, 64: 10, 132: 10, 130: 10, 99: 10, 1

In [18]:
# Show the query document together with a few entries of its similarity
# ranking. doc_id and sims are whatever the ranking loop left behind, i.e. the
# values from its final iteration.
query_words = train_corpus[doc_id].words
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(query_words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Positions to sample from the ranking: best, runner-up, middle and worst.
checkpoints = (
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims) - 1),
)
for label, index in checkpoints:
    match = sims[index]  # (tag, similarity) pair
    matched_words = train_corpus[match[0]].words
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(matched_words)))

Document (3928): «diff manslaughter inquiry call»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (1913, 0.9228657484054565): «hospital deaths inquiry announced»

SECOND-MOST (3245, 0.9043088555335999): «trust apologises over dna failings»

MEDIAN (135, 0.8058278560638428): «testosterone boost could cut deaths»

LEAST (1849, -0.890856146812439): «audio uk mum spent to have girl»



In [19]:
import random

# Choose one training document at random. No vector is inferred in this cell:
# the neighbour is looked up in second_ranks, which holds one (tag, similarity)
# pair per training document.
last_index = len(train_corpus) - 1
doc_id = random.randint(0, last_index)

# Print the chosen document, then its second-most-similar training document.
train_words = ' '.join(train_corpus[doc_id].words)
print('Train Document ({}): «{}»\n'.format(doc_id, train_words))
sim_id = second_ranks[doc_id]  # despite the name, a (tag, similarity) pair
similar_words = ' '.join(train_corpus[sim_id[0]].words)
print('Similar Document {}: «{}»\n'.format(sim_id, similar_words))

Train Document (18): «public back tax rises to fund nhs»

Similar Document (532, 0.7212491631507874): «video could volunteers help crisis»



In [22]:
# Draw a random document from the test corpus, infer its vector, and rank every
# trained document vector against it.
last_test_index = len(test_corpus) - 1
doc_id = random.randint(0, last_test_index)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Print the test document, then the most / median / least similar training
# documents from the ranking.
test_words = ' '.join(test_corpus[doc_id])
print('Test Document ({}): «{}»\n'.format(doc_id, test_words))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
positions = (
    ('MOST', 0),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims) - 1),
)
for label, index in positions:
    match = sims[index]  # (tag, similarity) pair
    matched_words = train_corpus[match[0]].words
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(matched_words)))

Test Document (3053): «research to help you pick and choose best way to sneeze»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (99, 0.8253598213195801): «video what can make you happy»

MEDIAN (1166, 0.0705452412366867): «video robbie sings during wife labour»

LEAST (1929, -0.7627078294754028): «ebola deaths up by in west africa»



In [25]:
# Display the model's vocabulary helper object; as the last expression of the
# cell, only its repr is shown.
model.vocabulary

<gensim.models.doc2vec.Doc2VecVocab at 0x12a0f0390>

In [27]:
# Number of documents (lines) read into the test corpus.
len(test_corpus)

3741