# Train Doc2Vec model on new data
# Title + Abstract

In [4]:
import gensim
from gensim.models.doc2vec import Doc2Vec
import json
import numpy as np

#filename = '2_different_fb-spazio.json'
filename = 'similar_documents_facebook.json'
# create corpus
with open(filename, 'r') as file:
    docs = json.load(file)

# Drop exact duplicates, keeping the first occurrence of each document.
# The earlier version called docs.index(i+1, len(docs), doc): list.index expects
# (value, start, stop), so with the arguments in that order every call raised,
# the bare `except` swallowed the error, and no duplicate was ever removed.
index_list = []    # positions of the documents equal to an earlier one
unique_docs = []
for i, doc in enumerate(docs):
    # equality test on the parsed JSON objects; quadratic, but the corpus is small
    if doc in unique_docs:
        index_list.append(i)
    else:
        unique_docs.append(doc)
docs = unique_docs

    
#filenames = ['mixed_docs_mix1.json', 'mixed_docs_mix2.json', 'mixed_docs_mix3.json']
#docs = []
#for filename in filenames:
#    with open(filename, 'r') as file:
#        docs += json.load(file)

# get train corpus: one TaggedDocument per doc, tagged with its position in `docs`.
# Title and abstract are joined with a space; plain concatenation glued the last
# word of the title to the first word of the abstract, producing a bogus token.
train_corpus = [
    gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(d['title'] + ' ' + d['abstract']), [i])
    for i, d in enumerate(docs)
]
print("Length of train corpus: ",len(train_corpus))

import multiprocessing

# Number of workers = CPU count reported by multiprocessing.cpu_count().
cores = multiprocessing.cpu_count()
# Abort early unless gensim reports FAST_VERSION above -1; per the message below,
# training would otherwise be very slow.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


# Two variants are trained side by side:
#   * PV-DBOW (Distributed Bag of Words), the counterpart of Word2Vec skip-gram;
#   * PV-DM (Distributed Memory), the counterpart of Word2Vec CBOW.
epochs = 45
vec_size = 100
alpha = 0.10
MODEL_NAME = "Models_Live_Test/d2v_abstract&title"

# Keyword arguments shared by both constructors.
shared_params = dict(vector_size=vec_size, negative=5, hs=0, min_count=2,
                     sample=0, epochs=epochs, workers=cores)

models = [
    # PV-DBOW plain (no explicit window or alpha)
    Doc2Vec(dm=0, comment='live data', **shared_params),
    # PV-DM w/ default averaging, explicit window and alpha
    Doc2Vec(dm=1, window=10, alpha=alpha, comment='alpha=0.1-live data', **shared_params),
]

# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
for model in models:
    print(model)
    model.build_vocab(train_corpus)
print("Vocabulary created!")

# train the models on the given data!
counter = 0
for model in models:
    print("Training %s" % model)
    %time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
    #model.save(MODEL_NAME+str(counter)+'.model')
    counter = counter + 1
#print("Models Saved")


Length of train corpus:  59
Doc2Vec("live data",dbow,d100,n5,mc2,t4)
Doc2Vec("alpha=0.1-live data",dm/m,d100,n5,w10,mc2,t4)
Vocabulary created!
Training Doc2Vec("live data",dbow,d100,n5,mc2,t4)
CPU times: user 1.33 s, sys: 12 ms, total: 1.34 s
Wall time: 805 ms
Training Doc2Vec("alpha=0.1-live data",dm/m,d100,n5,w10,mc2,t4)
CPU times: user 2.23 s, sys: 4 ms, total: 2.24 s
Wall time: 1.22 s


In [21]:
# Observation: the 'S-G' model finds many more clusters (higher precision) -model[1];
# the CBOW model finds fewer clusters with more docs in them.
# NOTE(review): models[1] is the entry built with dm=1 (PV-DM, described in the
# training cell as the CBOW-like variant) -- confirm which model the remark means.

import my_dbscan
model = models[1]
print(len(model.docvecs))
# Collect the document vectors by integer position, 0 .. len(model.docvecs) - 1.
doc_vecs = [model.docvecs[tag] for tag in range(len(model.docvecs))]
titles = [entry['title'] for entry in docs]
urls = [entry['url'] for entry in docs]
urls_cluster_list = my_dbscan.apply_dbscan(
    doc_vecs=doc_vecs,
    titles=titles,
    urls=urls,
    subset_length=70,
    eps=0.27,
    eps_increment=0.1,
    n_iterations=3,
    verbose=False)
# visualize clustering
import plotly.plotly as py
import plotly.tools as tls
import plotly.graph_objs as go
import random
import numpy as np
import utils
# using my api-key
# NOTE(review): the plotly username and API key are committed in plain text;
# consider rotating the key and loading it from outside the notebook.
tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')
i = 0
# Map the clustered urls back to titles, then build and show the figure.
cluster_titles = utils.getDocTitleFromUrl(docs, urls_cluster_list)
fig = utils.plot_clusters(cluster_titles)
py.iplot(fig, filename='a')

59
##Clusters##
Cluster: ['Facebook, i cofondatori di Instagram lasciano la società - Repubblica.it', 'Facebook, terremoto in Instagram: lasciano i due co-fondatori della app dopo scontro con Zuckerberg - Il Sole 24 ORE']
Cluster: ["Arriva Portal, l'assistente smart di Facebook - Repubblica.it", 'Facebook, Portal in arrivo la prossima settimana?']
Cluster: ['I fondatori di Instagram lasciano Facebook: ecco cosa faranno in futuro', 'Ecco perché i fondatori di Instagram hanno lasciato Facebook - Corriere.it']
Cluster: ['Facebook: ex moderatrice fa causa, traumatizzata da immagini - Hi-tech - ANSA.it', 'Usa, causa contro Facebook per inserzioni discriminatorie - Internet e Social - ANSA.it', "L'ex moderatrice di contenuti fa causa a Facebook: «Traumatizzata, ho filtrato post tossici» - Corriere.it"]
Cluster: ['Wsj, Facebook ha trattato con le banche per i dati degli utenti - Internet e Social - ANSA.it', 'Facebook chiede alle banche i dati dei clienti - Wired']
Cluster: ['Russiagate, Face

# Choose Eps for DBSCAN
Choose the eps value for the DBSCAN algorithm by plotting the neighbour distances over a mixed corpus of data (taken as a sample).
## Important assumption: no-duplicates
Duplicate docs are not a problem in a more thoroughly trained model, but here they may spoil the graph.

In [36]:
import json
"""
filenames = ['mixed_docs_mix1.json', 'mixed_docs_mix2.json']
docs = []
for filename in filenames:
    with open(filenames[0], 'r') as file:
        docs += json.load(file)

train_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
    d['title']+d['abstract']), [i]) for i, d in enumerate(docs) ]
model = Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
            epochs=epochs, workers=4, comment='live data')
model.build_vocab(train_corpus)
%time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
"""

# Plot the 'eps-value' graph used to pick the best eps for DBSCAN.


model = models[1]
# One vector per document currently held in `docs`, looked up by position.
doc_vectors = [model.docvecs[pos] for pos in range(len(docs))]
titles = [entry['title'] for entry in docs]

import plotly.plotly as py
import plotly.tools as tls
import plotly.graph_objs as go
import random
import numpy as np
# using my api-key
# NOTE(review): the plotly username and API key are committed in plain text;
# consider rotating the key and loading it from outside the notebook.
tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')
import utils

# First attempt: 2 as the first argument (the original note calls it min_count).
# choose_eps gives back a list of (doc_title, dist_from_kth_neighbour) pairs.
title_dist_tuples = utils.choose_eps(2, doc_vectors, titles)
data = utils.visualize_eps_graph(title_dist_tuples)
py.iplot(data, filename='a')

In [38]:
filename = '2_different_fb-spazio.json'
#filename = 'similar_documents_facebook.json'
# create corpus
with open(filename, 'r') as file:
    docs = json.load(file)
print("Length",len(docs))

# Drop every document whose title (ignoring case and surrounding whitespace)
# was already seen, keeping the first occurrence. A single pass with a set of
# normalised titles replaces the pairwise comparison of each document with all
# later ones and the list-membership filter that followed; the surviving
# documents are the same.
seen_titles = set()
index_list = []    # positions of the dropped duplicates
unique_docs = []
for j, doc in enumerate(docs):
    normalised_title = doc['title'].lower().strip()
    if normalised_title in seen_titles:
        index_list.append(j)
    else:
        seen_titles.add(normalised_title)
        unique_docs.append(doc)
docs = unique_docs
print("New length", len(docs))

# NOTE: this overwrites the input file in place with the de-duplicated list.
with open(filename, 'w') as out:
    json.dump(docs, out)
    

Length 69
New length 69


# Evaluate clusterization
## Over hand-clusterized set of docs

## 1. visualize clusterization

In [7]:
import json
import my_dbscan
import gensim
from gensim.models.doc2vec import Doc2Vec
import utils
import plotly.plotly as py
import plotly.tools as tls
import plotly.graph_objs as go
import random
import numpy as np
# using my api-key
tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

# load file containing clusters
with open('pre-clustered_docs.json', 'r') as json_data:
    cdocs = json.load(json_data)
# [ [,], [,], ....]
cdocs = [doc for cluster in cdocs for doc in cluster]
# clusterize this documents
train_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
    d['title']+d['abstract']), [i]) for i, d in enumerate(cdocs) ]
# dm = 0 / 1
model = Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
            epochs=epochs, workers=4, comment='live data')
model.build_vocab(train_corpus)
%time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)

doc_vecs = [model.docvecs[j] for j in range(len(model.docvecs))]
titles = [doc['title'] for doc in cdocs]
urls = [doc['url'] for doc in cdocs]
urls_cluster_list = my_dbscan.apply_dbscan(doc_vecs = doc_vecs, titles = titles, 
                                           urls = urls, subset_length = len(titles),
                                             eps = 0.27, eps_increment = 0.1, n_iterations = 3, verbose = False)


titles_clusters = utils.getDocTitleFromUrl(cdocs, urls_cluster_list)
data = utils.plot_clusters(titles_clusters)
py.iplot(data, filename='b')

CPU times: user 940 ms, sys: 8 ms, total: 948 ms
Wall time: 695 ms
##Clusters##
Cluster: ['Decolla Facebook Dating, il Tinder di Menlo Park: primi test in Colombia - Repubblica.it', 'Facebook: Tinder nel mirino, novità di Instagram e Whatsapp - Corriere.it', 'Facebook Dating: ecco come funziona l’anti-Tinder di Zuckerberg - Corriere.it', 'Social e bambini: YouTube assume nuovi moderatori e Facebook lancia Messenger Kids - Corriere.it', 'YouTube e i video con bambini «abusati» Google sotto accusa, ritirata la pubblicità - Corriere.it']
Cluster: ['Tesla sotto indagine per colpa dei tweet di Elon Musk: crollo in Borsa - Corriere.it', 'Elon Musk denuciato per truffa, Tesla crolla in borsa - Wired', 'Tesla, Elon Musk lascia la presidenza']
Cluster: ["L'equinozio d'autunno non è il 21 settembre: quest'anno arriva il 23 - Repubblica.it", "E' l'equinozio d'autunno - Spazio & Astronomia - ANSA.it"]
Cluster: ["iPhone Xs Max tira 3-4 volte più dell'Xs - Hi-tech - ANSA.it", 'Apple conferma "per er

## Evaluate clusterization
(Soft) test rules: go through each pre-defined cluster; if all of its elements were grouped together by the model, that cluster scores 100%; otherwise, the score is the share of its elements that were put together in the same cluster (at least 2 are required, otherwise the cluster scores 0).

In [27]:
# Reload the hand-made clusters in their nested form (one list of docs per
# cluster): the evaluation below walks the file cluster by cluster.
with open('pre-clustered_docs.json', mode='r') as json_data:
    cdocs = json.load(json_data)

def search_docs_in_clusters(doc_titles, clusters, verbose = True):
    """Find the cluster that contains the most of the given titles.

    doc_titles: list of titles to look for.
    clusters: list of clusters (each a container of titles) to search.
    verbose: when true, print a trace of every cluster and title examined.

    Returns a (count, cluster_index) pair for the cluster with the highest
    number of matching titles. When several clusters tie, the one with the
    highest index is returned. Raises IndexError if `clusters` is empty.
    """
    tallies = []
    for cluster_idx, members in enumerate(clusters):
        if verbose:
            print("##Searching into cluster", members, '##')
        hits = 0
        for wanted in doc_titles:
            if verbose:
                print("==Searching for",wanted,'==')
            if wanted not in members:
                continue
            if verbose:
                print(wanted[:10],'.. is in cluster!')
            hits += 1
        tallies.append((hits, cluster_idx))
    # Stable ascending sort on the hit count: the last entry is the best match,
    # and among equal counts the later cluster wins.
    ranked = sorted(tallies, key=lambda pair: pair[0])
    return ranked[-1]

# For every hand-made cluster, find the model cluster that holds most of its titles.
# The loop variable is deliberately not named `docs`: at notebook level that name
# is the corpus list read by the clustering and eps-graph cells, and rebinding it
# here left it pointing at the last hand-made cluster.
occurences = []
for cluster_docs in cdocs:
    occurences.append(search_docs_in_clusters([doc['title'] for doc in cluster_docs], titles_clusters, verbose=False))
print(occurences)

# compute percentages
correct = [c for (c, index) in occurences]
percentages = []

for n_correct, docs_list in zip(correct, cdocs):
    # if less than 2 docs were correctly classified, the 'answer' is considered not correct
    if n_correct < 2:
        percentages.append(0)
    else:
        percentages.append(n_correct * 100 / len(docs_list))
print(percentages)

# compute the mean of percentages as final accuracy of the model over the test set (in terms of clustering)
p_sum = sum(percentages)
print("Accuracy over evaluation set: ",p_sum/len(percentages), "%")


[(2, 7), (3, 0), (3, 1), (2, 2), (13, 3), (9, 5), (2, 0), (3, 6)]
[100.0, 100.0, 100.0, 100.0, 72.22222222222223, 100.0, 66.66666666666667, 75.0]
Accuracy over evaluation set:  89.2361111111111 %


# Conclusions:
From now on we will use these tools to evaluate each different model, possibly with an improved evaluation-set.
These (high) percentages must be interpreted with care: the test is not very strict in how it evaluates the model, since it does not require the clustering to match the evaluation set exactly, and this inevitably leads to higher accuracy figures. Why this choice? Essentially, because I do not take this hand-made clustering as ground truth: the clusters can, up to a certain degree, be freely contested, and sub-clusters may legitimately be formed in some cases.
This can be seen with the 4th cluster (iPhone): the model created another meaningful sub-cluster containing, in my personal interpretation, two docs about the new iPhone REVIEW, and yet it gets 'punished' by the score metric.
These kinds of situations highlight the difficulty of creating a ground-truth test-set, as well as a functional score system using this approach.


How can we improve the test results reliability? 
We can try to obtain a 'closer to ground-truth' test set by clustering only really similar docs (e.g. 'First Moon landing!', 'Man lands on the Moon!', 'Moon touchdown:'..); but while this would definitely improve the test results in terms of 'objective clustering', we would most definitely lose something in terms of representing real-world scenarios, by pushing the tested model to perform well over a set of docs which is pretty far from the standard set it will be analyzing in 'production'.
I personally consider this last point fundamental, so I will keep using this kind of document set to evaluate models and, at the same time, I will analyze the results both visually and with other tools (in particular, looking at the clusters the model generates: is there a way to say 'these clusters make sense, even though they were not included in the eval. set'?)