# 3-doc-test
The test set consists of a number of document triples: each triple contains two similar documents plus a third document that is different from both of them. The model answers correctly if it recognizes the two similar documents as similar to each other (given a certain threshold) and also recognizes the third one as a different document.
This means that, later on, the two similar documents will be part of the same cluster, while the third one will be correctly excluded from it.

In [1]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess as sp

In [3]:
import json
import random

# Load the test triples. Judging from the dump of `cdocs` further down, the
# file holds a list of triples, each being a list of three document dicts.
# The file is read explicitly as UTF-8: the documents contain non-ASCII
# characters, and the platform default encoding is not guaranteed to decode
# them correctly.
with open('3-doc-test.json', 'r', encoding='utf-8') as file:
    cdocs = json.load(file)

print("Number of triples in the document:", len(cdocs))
# Flatten the triples into one list of documents.
docs = [doc for triple in cdocs for doc in triple]
# NOTE: the shuffle is unseeded, so the document order (and hence the integer
# tags derived from it when the training corpus is built) changes between runs.
random.shuffle(docs)
print("Total number of docs: ", len(docs))

#print([doc['title'] for doc in docs])
import utils
# Project-local helper; presumably keeps a single copy of each repeated
# document -- verify against utils.delete_duplicates_from_list.
docs = utils.delete_duplicates_from_list(docs)
print(len(docs))

Number of triples in the document: 102
Total number of docs:  306
108


## Train models on the test docs
This may not be the best option: we would probably get better results by training the models on a bigger corpus that contains, among other documents, these triples.

In [4]:
import multiprocessing
cores = multiprocessing.cpu_count()
# train the models on clean_dataset
train_corpus = [TaggedDocument(sp(doc['title']+doc['abstract']), [i]) for i, doc in enumerate(docs)]
print("Number of docs:",len(train_corpus))
models = [
    Doc2Vec(dm=0, vector_size=50, min_count=2,
            epochs= 40, workers=cores, comment='dm=0, vec_size=50, hs=1'), 
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs= 40, workers=cores, comment='dm=0, vec_size=100'), 
    Doc2Vec(dm=0, vector_size=800, negative=5, hs=0, min_count=3, sample=0,
             epochs=45, workers=multiprocessing.cpu_count(), comment='dm=0, vec = 800, min_count=3, epochs=45'),
    Doc2Vec(dm=0, vector_size=800, negative=10, hs=0, min_count=2, sample=0,
             epochs=40, workers=multiprocessing.cpu_count(), comment='dm=0, vec = 800, negative = 10')
]

for model in models:
    model.build_vocab(train_corpus)
    print(model, "vocabulary built, with size:", len(model.wv.vocab))
    
for i, model in enumerate(models):
    print("Training %s" % model)
    %time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
    

Number of docs: 108
Doc2Vec("dm=0, vec_size=50, hs=1",dbow,d50,n5,mc2,s0.001,t4) vocabulary built, with size: 3029
Doc2Vec("dm=0, vec_size=100",dbow,d100,n5,mc2,t4) vocabulary built, with size: 3029
Doc2Vec("dm=0, vec = 800, min_count=3, epochs=45",dbow,d800,n5,mc3,t4) vocabulary built, with size: 1925
Doc2Vec("dm=0, vec = 800, negative = 10",dbow,d800,n10,mc2,t4) vocabulary built, with size: 3029
Training Doc2Vec("dm=0, vec_size=50, hs=1",dbow,d50,n5,mc2,s0.001,t4)
CPU times: user 2.35 s, sys: 36 ms, total: 2.38 s
Wall time: 1.11 s
Training Doc2Vec("dm=0, vec_size=100",dbow,d100,n5,mc2,t4)
CPU times: user 2.72 s, sys: 24 ms, total: 2.74 s
Wall time: 1.1 s
Training Doc2Vec("dm=0, vec = 800, min_count=3, epochs=45",dbow,d800,n5,mc3,t4)
CPU times: user 8.73 s, sys: 120 ms, total: 8.85 s
Wall time: 3.32 s
Training Doc2Vec("dm=0, vec = 800, negative = 10",dbow,d800,n10,mc2,t4)
CPU times: user 16.6 s, sys: 76 ms, total: 16.7 s
Wall time: 5.81 s


In [5]:
import sklearn.metrics.pairwise as sk
def fetch_vector_from_data(doc):
    """Return the position of ``doc`` inside the module-level ``docs`` list.

    Despite the name, the result is an index rather than a vector; callers use
    it to look the vector up in a trained model. The first entry that compares
    equal to ``doc`` wins. When nothing matches, ``None`` is returned.

    Requires the global ``docs`` list to be populated beforehand.
    """
    matching_positions = (
        position for position, candidate in enumerate(docs) if candidate == doc
    )
    return next(matching_positions, None)


def _cosine(vec_a, vec_b):
    """Return the cosine similarity between two vectors as a scalar."""
    return sk.cosine_similarity([vec_a], [vec_b])[0][0]


def docstest(model, test_data, s_threshold = 0.75):
    """Score ``model`` on a list of document triples and print a summary.

    Each triple is expected to hold two similar documents (positions 0 and 1)
    followed by a different one (position 2). A triple counts as a correct
    guess when the similarity of 0-1 is at least ``s_threshold`` while the
    similarities of 0-2 and 1-2 are both below it.

    Args:
        model: trained model exposing ``docvecs``, indexed by the position
            returned by ``fetch_vector_from_data``.
        test_data: iterable with a length, format
            ``[..., [{}, {}, {}], [{}, {}, {}], ...]``.
        s_threshold: similarity at or above which two documents are
            considered similar (empirically chosen value).

    Raises:
        ValueError: if a triple does not contain exactly three documents.

    Returns:
        None. The counts are printed, not returned.
    """
    not_similar, not_diss1, not_diss2 = 0, 0, 0
    correct = 0
    for triple in test_data:
        # Use the vectors learned during training (no inference): each
        # document is mapped to its tag through its position in `docs`.
        vectors = [model.docvecs[fetch_vector_from_data(doc)] for doc in triple]
        if len(vectors) != 3:
            raise ValueError(
                "expected a triple of documents, got %d" % len(vectors))
        doc_a, doc_b, doc_c = vectors
        try:
            # Checks run lazily and in order, so each triple lands in exactly
            # one bucket: the first condition that fails decides it.
            if _cosine(doc_a, doc_b) < s_threshold:
                not_similar += 1
            elif _cosine(doc_a, doc_c) >= s_threshold:
                not_diss1 += 1
            elif _cosine(doc_b, doc_c) >= s_threshold:
                not_diss2 += 1
            else:
                # guessed right
                correct += 1
        except Exception as e:
            # Best effort: report the problem and move on to the next triple.
            # A failing triple is not added to any of the counters.
            print(e)
    total = len(test_data)
    # Guard against an empty test set, which would otherwise divide by zero.
    percent = correct * 100 / total if total else 0.0
    print("%s correct guesses over %s triples (%s percent)" %(correct, total, percent))
    print("Not similar: {}, not dissimilar 0-2: {}, not dissimilar 1-2: {}".format(not_similar, not_diss1, not_diss2))

In [6]:
# Evaluate every trained model twice: first with docstest's default
# threshold, then with a looser one of 0.64.
threshold_overrides = ({}, {'s_threshold': 0.64})
for model in models:
    for override in threshold_overrides:
        docstest(model, cdocs, **override)
    print('----------------')

12 correct guesses over 102 triples (11.764705882352942 percent)
Not similar: 90, not dissimilar 0-2: 0, not dissimilar 1-2: 0
27 correct guesses over 102 triples (26.470588235294116 percent)
Not similar: 75, not dissimilar 0-2: 0, not dissimilar 1-2: 0
----------------
10 correct guesses over 102 triples (9.803921568627452 percent)
Not similar: 92, not dissimilar 0-2: 0, not dissimilar 1-2: 0
17 correct guesses over 102 triples (16.666666666666668 percent)
Not similar: 85, not dissimilar 0-2: 0, not dissimilar 1-2: 0
----------------
9 correct guesses over 102 triples (8.823529411764707 percent)
Not similar: 93, not dissimilar 0-2: 0, not dissimilar 1-2: 0
17 correct guesses over 102 triples (16.666666666666668 percent)
Not similar: 85, not dissimilar 0-2: 0, not dissimilar 1-2: 0
----------------
11 correct guesses over 102 triples (10.784313725490197 percent)
Not similar: 91, not dissimilar 0-2: 0, not dissimilar 1-2: 0
22 correct guesses over 102 triples (21.568627450980394 percent

In [44]:
# Dump the raw triples to inspect the test data (the printed output is very long).
print(cdocs)

[[{'fonte_dati': 'trend_analisys', 'id': '1-http://www.lastampa.it/2018/09/22/tecnologia/dating-il-servizio-di-facebook-per-trovare-lanima-gemella-OyMOZmQeX2gUot3gqx2IcM/pagina.html', 'ta_id': 1, 'title': 'Dating, il servizio di Facebook per trovare l’anima gemella - La Stampa', 'abstract': 'Dating è il nuovo servizio di Facebook, che permette di trovare l’anima gemella . La funzionalità è in fase di sperimentazione, in Colombia. il servizio suggerisce i profili più adatti, basandosi sui dati che abbiamo deciso di condividere sul social network. \nAl contrario di Tinder, non serve far scorrere gli utenti per selezionarli, ma è possibile interagire con loro. Come funziona? Una volta disponibile, gli utenti dell’app per dispositivi mobili e che hanno compiuto 18 anni, potranno attivarlo direttamente dal feed notizie. La prima informazione da condividere è la propria localizzazione, in modo da poter visualizzare i profili della stessa città o località. Allo stesso tempo, sarà possibile pu

In [61]:
import os

import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

# Credentials are taken from the environment so that no API key is stored in
# the notebook. If the variables are not set, the call is skipped and any
# credentials file previously saved by plotly is used as-is.
# NOTE(review): an API key used to be hard-coded here; it has been exposed and
# should be revoked.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    tls.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

# One x-axis label per model, taken from the model's `comment`.
model_descr = [m.comment for m in models]
descr = model_descr  # previous name, kept so existing references still work

# NOTE(review): `correct` is not defined in this cell. From its use below it
# must be a sequence of (first_guess, second_guess) pairs, one per model --
# presumably collected from an earlier test run; confirm before re-running.
trace0 = go.Bar(
    x = model_descr,
    y = [first_guess for (first_guess, second_guess) in correct],
    name='Correct answers at first guess',
    marker=dict(
        color='rgb(49,130,189)'
    )
)

trace1 = go.Bar(
    x = model_descr,
    y = [second_guess for (first_guess, second_guess) in correct],
    name='Correct answers at second guess',
    marker=dict(
        color='rgb(155, 244, 66)',
    )
    
)
data = [trace0, trace1]
# Grouped bars: the two traces are drawn side by side for every model.
layout = go.Layout(
    title = 'Basic Inference Test',
    xaxis=dict(
        tickfont=dict(
            size=10,
            color='rgb(107, 107, 107)',
            
        ),
        tickangle = -45
    ),
    yaxis=dict(
        title='Inference accuracy (%)',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
   
    barmode='group',
    bargap=0.2,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic inference testing')

In [60]:
m = Doc2Vec(vector_size= 45, min_count=2,
            epochs= 20, workers=cores)
m.build_vocab(train_corpus)
%time m.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)

basic_test(m, train_corpus, iterations=len(train_corpus))

CPU times: user 12.2 s, sys: 144 ms, total: 12.4 s
Wall time: 4.4 s



Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



Model Doc2Vec("alpha=0.05, dm=1, vec_size=40",dm/m,d45,n5,w5,mc2,s0.001,t4) had a success rate of: 68.9419795221843
Found the correct one in the second document 88 out of 91 times


(202, 88)

# Conclusions:
This test doesn't really tell us much, unless the model is way off.