In [23]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess as sp

In [24]:
import json
import utils

# Location of the scraped articles. Kept in one named constant so the path is
# easy to spot and change when the notebook runs on another machine.
DOCS_PATH = '/home/nick/anaconda3/bin/Tirocinio/doc2vec_tryouts/trend_analisys.json'

# JSON text is UTF-8; be explicit so decoding does not depend on the
# platform's default locale encoding.
with open(DOCS_PATH, 'r', encoding='utf-8') as file:
    docs_raw = json.load(file)
print("Length of docs:", len(docs_raw))

# search for duplicates
# The raw list stays available as `docs_raw`, so re-running this cell always
# starts from the data as loaded rather than from an already-filtered list.
docs = utils.delete_duplicates_from_list(docs_raw)
len(docs)

Length of docs: 293


293

### Let's compare 4 different models: the first one I came across, two intermediate ones, and the final model
Then we'll see how each of them performs on the basic inference test.

In [66]:
import multiprocessing
cores = multiprocessing.cpu_count()
# train the models 
train_corpus = [TaggedDocument(sp(doc['title']+doc['abstract']), [i]) for i, doc in enumerate(docs)]
print("Number of docs:",len(train_corpus))
models = [
    Doc2Vec(vector_size = 35,
                alpha= 0.030,
                min_alpha=0.00030,
                min_count=2, # words that appear less than twice in the corpus are ignored
                dm=0, comment='first model, 35 features') ,
    Doc2Vec(dm=0, vector_size= 50, window=10, negative=5, hs=0, min_count=2, sample=0,
            epochs= 150, workers=cores, alpha= 0.05, comment='alpha=0.05, dm=1, vec_size=40'),
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs= 40, workers=cores, comment='dm=0, vec_size=100'), 
    Doc2Vec(dm=0, vector_size=800, negative=5, hs=0, min_count=3, sample=0,
             epochs=45, workers=multiprocessing.cpu_count(), comment='dm=0, vec = 800, min_count=3')
]

for model in models:
    model.build_vocab(train_corpus)
    print(model, "vocabulary built, with size:", len(model.wv.vocab))
model = models[0]
for epoch in range(300):
    if(epoch%50==0):
        print('iteration {0}'.format(epoch))
    model.train(train_corpus,
                total_examples = model.corpus_count,
                epochs = model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

for i, model in enumerate(models):
    if i == 0:
        continue
    print("Training %s" % model)
    %time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
    

Number of docs: 293
Doc2Vec("first model, 35 features",dbow,d35,n5,mc2,s0.001,t3) vocabulary built, with size: 7507
Doc2Vec("alpha=0.05, dm=1, vec_size=40",dbow,d50,n5,mc2,t4) vocabulary built, with size: 7507
Doc2Vec("dm=0, vec_size=100",dbow,d100,n5,mc2,t4) vocabulary built, with size: 7507
Doc2Vec("dm=0, vec = 800, min_count=3",dbow,d800,n5,mc3,t4) vocabulary built, with size: 4747
iteration 0
iteration 50
iteration 100
iteration 150
iteration 200
iteration 250
Training Doc2Vec("alpha=0.05, dm=1, vec_size=40",dbow,d50,n5,mc2,t4)
CPU times: user 29.4 s, sys: 132 ms, total: 29.6 s
Wall time: 10.3 s
Training Doc2Vec("dm=0, vec_size=100",dbow,d100,n5,mc2,t4)
CPU times: user 9.08 s, sys: 56 ms, total: 9.14 s
Wall time: 3.07 s
Training Doc2Vec("dm=0, vec = 800, min_count=3",dbow,d800,n5,mc3,t4)
CPU times: user 34.2 s, sys: 208 ms, total: 34.4 s
Wall time: 10.9 s


In [41]:
# let's check if the model is at least decent,
# which means: is it able to at least recognize news/documents
# it has seen in training?
import random
# Pick a random document from the train corpus and infer a vector from the model
def basic_test(model, train_corpus, verbose=False, iterations = 0, seed=None):
    """Check how often the model recognises documents it was trained on.

    For each iteration a document is drawn at random (with replacement) from
    ``train_corpus``, a vector is inferred for its words, and the three most
    similar training documents are looked up. The draw counts as a hit when
    the drawn document is ranked first, and as a second-place hit when it is
    ranked second.

    Parameters
    ----------
    model : object exposing ``infer_vector(words)`` and
        ``docvecs.most_similar([vector], topn=...)``.
    train_corpus : sequence of items with a ``words`` attribute; document tags
        are expected to be the positions in this sequence.
    verbose : when true, print the drawn document and the titles of its
        nearest neighbours. Titles are read from the notebook-level ``docs``
        list, indexed by document tag.
    iterations : number of random draws; ``0`` means ``len(train_corpus)``.
    seed : optional seed for the document draws. ``None`` keeps using the
        global ``random`` module state, as before.

    Returns
    -------
    (correct, correct_2) : number of first-place and second-place hits.
    """
    if iterations == 0:
        iterations = len(train_corpus)
    # Nothing to sample or nothing to run: report it instead of failing with
    # a ZeroDivisionError when the success rate is computed.
    if not train_corpus or iterations <= 0:
        print("Model", model, "was not tested: empty corpus or no iterations requested")
        return 0, 0

    rng = random if seed is None else random.Random(seed)
    # Build the title lookup once, and only when it is going to be printed.
    titles = [doc['title'] for doc in docs] if verbose else None
    correct = 0
    correct_2 = 0

    for _ in range(iterations):
        doc_id = rng.randint(0, len(train_corpus) - 1)
        inferred_vector = model.infer_vector(train_corpus[doc_id].words)
        similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)
        if verbose:
            # show the 3 most similar document titles
            print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
            for doc_tag, similarity in similar_docs:
                print("\nSimilar Doc-->(doctag:{0},score:{1}):<<{2}>>".format(doc_tag, similarity, titles[doc_tag]))
        # Slicing keeps the comparison safe when fewer than two neighbours
        # are returned (e.g. a corpus with a single document).
        ranked_tags = [doc_tag for doc_tag, _similarity in similar_docs]
        if ranked_tags[:1] == [doc_id]:
            correct += 1
        elif ranked_tags[1:2] == [doc_id]:
            correct_2 += 1

    # print success rate
    print("Model", model, "had a success rate of:",(correct * 100 / iterations))
    print("Found the correct one in the second document {} out of {} times".format(correct_2, (iterations-correct)))
    return correct, correct_2

In [65]:
# Per-model accuracy as (first-guess %, second-guess % among first-guess misses).
correct = []
k = len(train_corpus)
for model in models:
    c, c2 = basic_test(model, train_corpus, iterations=k)
    misses = k - c
    # Guard both ratios: an empty corpus (k == 0) or a model that gets every
    # document right at first guess (misses == 0) would otherwise divide by zero.
    first_guess_rate = c * 100 / k if k else 0.0
    second_guess_rate = c2 * 100 / misses if misses else 0.0
    correct.append((first_guess_rate, second_guess_rate))


Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



Model Doc2Vec("first model, 35 features",dbow,d35,n5,mc2,s0.001,t3) had a success rate of: 0.0
Found the correct one in the second document 0 out of 293 times
Model Doc2Vec("alpha=0.05, dm=1, vec_size=40",dbow,d50,n5,mc2,t4) had a success rate of: 69.28327645051195
Found the correct one in the second document 82 out of 90 times
Model Doc2Vec("dm=0, vec_size=100",dbow,d100,n5,mc2,t4) had a success rate of: 70.30716723549489
Found the correct one in the second document 83 out of 87 times
Model Doc2Vec("dm=0, vec = 800, min_count=3",dbow,d800,n5,mc3,t4) had a success rate of: 70.30716723549489
Found the correct one in the second document 86 out of 87 times


In [61]:
import os

import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

# Credentials are read from the environment so that no secret is stored in the
# notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel;
# a missing variable fails here with a KeyError instead of at upload time.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and regenerated.
tls.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                         api_key=os.environ['PLOTLY_API_KEY'])

# One label per model, taken from the `comment` given at construction time.
# The traces below read `model_descr`; the list used to be bound only to
# `descr`, which raised a NameError on a fresh kernel.
model_descr = [m.comment for m in models]
descr = model_descr  # previous name, kept in case a later cell still uses it

# `correct` holds one (first-guess %, second-guess %) tuple per model.
trace0 = go.Bar(
    x = model_descr,
    y = [first for (first, second) in correct],
    name='Correct answers at first guess',
    marker=dict(
        color='rgb(49,130,189)'
    )
)

trace1 = go.Bar(
    x = model_descr,
    y = [second for (first, second) in correct],
    name='Correct answers at second guess',
    marker=dict(
        color='rgb(155, 244, 66)',
    )
)
data = [trace0, trace1]
layout = go.Layout(
    title = 'Basic Inference Test',
    xaxis=dict(
        tickfont=dict(
            size=10,
            color='rgb(107, 107, 107)',
        ),
        tickangle = -45
    ),
    yaxis=dict(
        title='Inference accuracy (%)',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    # Side-by-side bars: first-guess and second-guess accuracy per model.
    barmode='group',
    bargap=0.2,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic inference testing')

In [60]:
m = Doc2Vec(vector_size= 45, min_count=2,
            epochs= 20, workers=cores)
m.build_vocab(train_corpus)
%time m.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)

basic_test(m, train_corpus, iterations=len(train_corpus))

CPU times: user 12.2 s, sys: 144 ms, total: 12.4 s
Wall time: 4.4 s



Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



Model Doc2Vec("alpha=0.05, dm=1, vec_size=40",dm/m,d45,n5,w5,mc2,s0.001,t4) had a success rate of: 68.9419795221843
Found the correct one in the second document 88 out of 91 times


(202, 88)

# Conclusions:
This test doesn't really tell us much: it can only reveal a model that is completely broken.