# Test models on live-like version documents

In [76]:
# fetch data
import json
filenames = ['blockchain.json', 'industria_4.0.json']

with open(filenames[0], 'r') as outfile:
    json_data = json.load(outfile)
#print("Length of the json file: {0}, type: {1}".format(len(json_data), type(json_data)))

## let's now retrieve the meaningful part of the json document
# response{}--->docs[]

docs = json_data['response']['docs']
print("Number of documents in new json: ",len(docs))

# open file 2 and do the same things
with open(filenames[1], 'r') as outfile:
    json_data = json.load(outfile)

docs = docs + json_data['response']['docs']
print("Number of documents in new json: ",len(docs))

Number of documents in new json:  96
Number of documents in new json:  604


In [77]:
## many documents have a failed abstract, let's remove them
to_check = ' Questo sito web utilizza cookie tecnici e, previo Suo consenso, cookie di profilazione,'
docs = [doc for i, doc in enumerate(docs) if not(to_check.strip() in doc['abstract'][0].strip())]

print("New length after removing docs: ", len(docs))



New length after removing docs:  412


In [78]:
## Adjust data format
for i, dictionary in enumerate(docs):
    for field in ['title', 'abstract', 'flattened_entities']:
        if isinstance(dictionary[field], list):
            # re-format data to hold string instead of single-list item
            docs[i][field] = dictionary[field][0]
# remove duplicates
for i, doc in enumerate(docs):
    if "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T" in doc['title']:
        del(docs[i])
"""
duplicates_indeces = []
for i, doc in enumerate(docs):
    for j in range(i+1, len(docs)):
        if docs[j]['title'] == doc['title']:
            duplicates_indeces.append(j)
print("Number of duplicates: ", len(duplicates_indeces))
docs = [doc for i, doc in enumerate(docs) if not(i in duplicates_indeces)]
print("New Length: ", len(docs))
"""
## randomize everything by shuffling the documents around
import random
random.shuffle(docs)

for doc in docs:
    if to_check.strip() in doc['abstract'].strip():
        print("cookie doc found")
print([d['abstract'] for d in docs[:1]])

New Length:  384
[' PA digitale, al via il “Premio 10×10 = cento progetti per cambiare la PA” promosso da Forum PA all’interno della manifestazione Forum PA 2017 (23 – 25 maggio). Il premio è rivolto a enti centrali e locali, regioni, province, strutture della sanità pubblica, aziende di SPL, multiutility, piccole e medie aziende innovative, Istituzioni scolastiche, università e Centri di ricerca: si chiede di presentare idee progettuali o prodotti in grado di promuovere l’innovazione digitale del paese attraverso la PA.\n Ecco gli ambiti entro i quali partecipare al Premio:\n  Cybersecurity, business continuity, crisis management, sicurezza dei sistemi informativi;  Smart city, dati e IOT;  Servizi online, servizi su mobile, pagamenti elettronici;  PA senza carta;  Scuola e educazione digitale;  Industria 4.0;  Agricoltura Intelligente: dematerializzazione, sburocratizzazione, agricoltura e digital transformation, IoT per la coltivazione;  Comunicazione verso cittadini e stakeholder; 

## Let's first try to infer vector from model; if that doesn't work much, let's train another model with this data

In [43]:
# load model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import  gensim

MODEL_NAME = 'TestModels/d2v_TA_abstract&title0.model'
MODEL_TWO = 'Models/d2v_TA_abstract&title0.model'
#model = Doc2Vec.load(MODEL_NAME)
model = Doc2Vec.load(MODEL_TWO)
inferred_vectors = []
# print out dimension of the vocabulary 
print(len(model.wv.vocab))
#print(model.most_similar(positive=['re', 'donna'], negative=['uomo']))

5923


In [44]:
# infer vectors from data
test_corpus = [gensim.utils.simple_preprocess(d['title']+d['abstract']) for d in docs]
print(len(test_corpus))
inferred_vectors = [model.infer_vector(doc) for doc in test_corpus]

384


# DBSCAN

In [21]:
from sklearn.cluster import DBSCAN

def perform_dbscan(eps = 0.4, min_samples = 4, metric = 'euclidean', algorithm = 'auto', data = None, verbose = True
                  , titles = None, print_noise = True):
    """perform DBSCAN over given data, using given parametrs. Returns dbscan object."""
    db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm).fit(data)

    #print("Core samples: ")
    #for i in db.core_sample_indices_ :
    #    print(titles[i]+"\n")

    # labels will print out the number of the cluster each example belongs to;
    # -1 if the vector is considered noise (not belonging to any cluster)
    #print("Labels: ", db.labels_)
    
    if verbose:
        print("##Clusters##")
        cluster = [[]]
        noise = []
        noise_r = []
        for i, label in enumerate(db.labels_):
            if label != -1:
                try:
                    cluster[label].append(titles[i])
                except Exception as e:
                    cluster.append([titles[i]])
            else:
                noise.append(titles[i])
                noise_r.append(i)
        for list_ in cluster:
            print("Cluster:", list_)
        if print_noise:
            print("Noise: ", noise)

        print("DBSCAN finished.\n")
    return db

In [45]:
# get docs titles
titles = [doc['title'] for doc in docs]
print(titles[:5])

['IoT, quasi mezzo miliardo di oggetti connessi "mobile": Cina in pole - CorCom', 'Fabbrica intelligente, 340 milioni per le Pmi del Sud - CorCom', "200 nuovi laureati per IBM: l'innovazione ha bisogno anche della Blockchain - Blockchain 4innovation", 'La progettazione virtuale di Siemens al Forum Meccatronica di Torino - Industry4Business', "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T"]


## Incremental DBSCAN over small subset
## TODO: check if noise is None

In [102]:
subset_length = 25
# subset of docs vectors 
subset = inferred_vectors[:subset_length]
subset_titles = titles[:subset_length]

eps = 0.25
eps_increment = .15
db = perform_dbscan(eps = 0.45, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = subset, verbose = True, titles = subset_titles, print_noise = True)
"""
# let's try and find other clusters in the noise data, with higher eps
noise_data = [subset[i] for i, label in enumerate(db.labels_) if label==-1]
noise_titles = [subset_titles[i] for i, label in enumerate(db.labels_) if label==-1]

db = perform_dbscan(eps = eps + eps_increment, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = noise_data, verbose = True, titles = noise_titles, print_noise = False)

noise_data = [subset[i] for i, label in enumerate(db.labels_) if label==-1]
noise_titles = [subset_titles[i] for i, label in enumerate(db.labels_) if label==-1]

db = perform_dbscan(eps = eps + eps_increment + eps_increment, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = noise_data, verbose = True, titles = noise_titles)"""

##Clusters##
Cluster: ['Skill e competenze per la digital servitization - Industry4Business', "Telecom: 5G, sicurezza e piattaforme per l'IoT", "Che cos'è TrustedChain e perché può cambiare la logica di gestione delle transazioni grazie alla Blockchain - Blockchain 4innovation", "Cos'è, come fare ed esempi concreti di Industria 4.0 - Internet4Things", 'Abbanoa punta sulla blockchain per certificare la lettura dei contatori - Blockchain 4innovation', 'Blockchain e Industry 4.0: Sap Leonardo porta la Data Intelligence nella Smart Factory - Blockchain 4innovation', 'SpidChain, identità digitale 4.0 per PA e aziende - Blockchain 4innovation', "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T", 'Ecco tutte le tecnologie Industry 4.0 prorogate dalla legge di Stabilità', 'Legal e blockchain - Blockchain 4innovation', "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T", "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studi

"\n# let's try and find other clusters in the noise data, with higher eps\nnoise_data = [subset[i] for i, label in enumerate(db.labels_) if label==-1]\nnoise_titles = [subset_titles[i] for i, label in enumerate(db.labels_) if label==-1]\n\ndb = perform_dbscan(eps = eps + eps_increment, min_samples = 2, metric = 'cosine', algorithm = 'auto',\n                    data = noise_data, verbose = True, titles = noise_titles, print_noise = False)\n\nnoise_data = [subset[i] for i, label in enumerate(db.labels_) if label==-1]\nnoise_titles = [subset_titles[i] for i, label in enumerate(db.labels_) if label==-1]\n\ndb = perform_dbscan(eps = eps + eps_increment + eps_increment, min_samples = 2, metric = 'cosine', algorithm = 'auto',\n                    data = noise_data, verbose = True, titles = noise_titles)"

In [46]:
subset_length = 20
# subset of docs vectors 
subset = inferred_vectors[:subset_length]
subset_titles = titles[:subset_length]

eps = 0.25
eps_increment = 0.13
# starting eps will be the sum of eps + eps_increment 
for i in range(3):
    eps = eps + eps_increment
    # decrease eps_increment a bit 
    #eps_increment = eps_increment - .02
    db = perform_dbscan(eps = eps, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                        data = subset, verbose = True, titles = subset_titles, print_noise = True)

    # let's try and find other clusters in the noise data, with higher eps
    subset = [subset[i] for i, label in enumerate(db.labels_) if label==-1]
    subset_titles = [subset_titles[i] for i, label in enumerate(db.labels_) if label==-1]
    if subset is None:
        break

##Clusters##
Cluster: []
Noise:  ['IoT, quasi mezzo miliardo di oggetti connessi "mobile": Cina in pole - CorCom', 'Fabbrica intelligente, 340 milioni per le Pmi del Sud - CorCom', "200 nuovi laureati per IBM: l'innovazione ha bisogno anche della Blockchain - Blockchain 4innovation", 'La progettazione virtuale di Siemens al Forum Meccatronica di Torino - Industry4Business', "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T", 'Metalmeccanica: le imprese faticano a trovare lavoratori con le giuste competenze - Industry4Business', 'Industry 4.0, che succede ai Competence center: ci sarà un bando', 'Nuove competenze per la Digital Servitization: i risultati del Focus Group ASAP - Industry4Business', 'Formazione 4.0 in Legge di Stabilità 2019, ecco che chiedono le imprese', 'Smart Contract e blockchain - Pagina 4 di 5 - Blockchain 4innovation', 'Libro Bianco FPA per l’innovazione nella Pubblica Amministrazione: le consultazioni sono aperte - Blockchain 4innovatio

# Train Model Approach
other idea, use entities as tags!

In [13]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import  gensim
# get train corpus
train_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
    d['title']+d['abstract']), [i]) for i, d in enumerate(docs) ]
print("Length of train corpus: ",len(train_corpus))

import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


# let's try training two models at once: Paragraph Vector - Distributed Memory (PV-DM), just like CBOW to W2V
# and Paragraph Vector - Distributed Bag of Words (PV-DBOW), analogous to W2V Skip-gram
epochs = 45
vec_size = 100
alpha = 0.10  # default= 0.030
MODEL_NAME = "Models_Live_Test/d2v_abstract&title"

models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
            epochs=epochs, workers=cores, comment='live data'),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size= vec_size, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs= epochs, workers=cores, alpha= alpha, comment='alpha=0.1-live data'),
]

# build our vocabulary of words (all the unique words encountered inside our corpus, needed for training)
for model in models:
    print(model)
    model.build_vocab(train_corpus)
print("Vocabulary created!")

# train the models on the given data!
counter = 0
for model in models:
    print("Training %s" % model)
    %time model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
    model.save(MODEL_NAME+str(counter)+'.model')
    counter = counter + 1
print("Models Saved")


Length of train corpus:  351
Doc2Vec("live data",dbow,d100,n5,mc2,t4)
Doc2Vec("alpha=0.1-live data",dm/m,d100,n5,w10,mc2,t4)
Vocabulary created!
Training Doc2Vec("live data",dbow,d100,n5,mc2,t4)
CPU times: user 33.4 s, sys: 220 ms, total: 33.6 s
Wall time: 11.5 s
Training Doc2Vec("alpha=0.1-live data",dm/m,d100,n5,w10,mc2,t4)
CPU times: user 51.9 s, sys: 204 ms, total: 52.1 s
Wall time: 15.9 s
Models Saved


## Incremental DBSCAN over model vecs

In [17]:
# load model
modelname = 'Models_Live_Test/d2v_abstract&title0.model'
model = Doc2Vec.load(modelname)

subset_length = 25
print(len(model.docvecs))
docvecs = [vec for vec in model.docvecs]
# inferred vectors should result in the same vec as above
#inferred_vectors  = [model.infer_vector(doc.words) for i, doc in enumerate(train_corpus) if i<subset_length]

# subset of docs vectors 
subset = docvecs[:subset_length]
subset_titles = titles[:subset_length]

eps = 0.35
eps_increment = .15
db = perform_dbscan(eps = eps, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = subset, verbose = True, titles = subset_titles)

# let's try and find other clusters in the noise data, with higher eps
noise_data = [subset[i] for i, label in enumerate(db.labels_) if label==-1]
noise_titles = [subset_titles[i] for i, label in enumerate(db.labels_) if label==-1]

db = perform_dbscan(eps = eps + eps_increment, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = noise_data, verbose = True, titles = noise_titles)

# let's try and find other clusters in the noise data, with higher eps
noise_data = [subset[i] for i, label in enumerate(db.labels_) if label==-1]
noise_titles = [subset_titles[i] for i, label in enumerate(db.labels_) if label==-1]

db = perform_dbscan(eps = eps + eps_increment + 0.1, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                    data = noise_data, verbose = True, titles = noise_titles)

351


KeyError: "tag '351' not seen in training corpus/invalid"

# Visualize clusters over whole data-set

In [8]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# load model
MODEL_NAME = 'TestModels/d2v_TA_abstract&title0.model'
MODEL_TWO = 'Models/d2v_TA_abstract&title0.model'
#model = Doc2Vec.load(MODEL_NAME)
model = Doc2Vec.load(MODEL_TWO)

inferred_vectors = [model.infer_vector(doc) for doc in test_corpus]
# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(inferred_vectors)

# PCA is effected by scale so you need to scale the features in your data before applying PCA. 
vec_size = 100
features = [i for i in range(vec_size)]

x = df.loc[:, features].values # get features values

# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)
# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

finalDf = principalDf 


In [10]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go
from scipy.spatial import distance

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

COMPONENT_ONE = "principal component 1"
COMPONENT_TWO = "principal component 2"
#centroids = kmeans.cluster_centers_
titles = [dictionary['title'] for dictionary in docs]
traces = []

# each trace will represent a point (squeezed vector from higher dimensions),
# and each point will have the title of the news assigned
for i in range(len(finalDf)):
    # assign a color to each point belonging to a specific cluster
    # computing distance from centroid
    #x = finalDf.loc[i:i, "principal component 1"]
    #y = finalDf.loc[i:i, "principal component 2"]
    x , y = finalDf.iat[i, 0], finalDf.iat[i, 1]
    color = 'rgba(0, 0, 180, 0.8)'
    """
    centroid_index = kmeans.predict([[x, y]])
    closest_centroid = centroids[centroid_index]
    #print(closest_centroid, centroids[0])
    if np.array_equal(closest_centroid, [centroids[0]]):
        color = 'blue'
    elif np.array_equal(closest_centroid, [centroids[1]]):
        color = 'pink'
    elif np.array_equal(closest_centroid, [centroids[2]]):
        color = 'yellow'
    elif np.array_equal(closest_centroid, [centroids[3]]):
        color = 'green'
    else:
        color = 'black'
    """
    
    trace0 = go.Scatter(
        x = [x], 
        y = [y],
        mode = 'markers',
            #name = 'blue markers',
        marker = dict(
            size = 7,
            color = color,
        ),
        text = str(titles[i])
    )
    traces.append(trace0)

# draw centroids
"""
c_colors = ['blue', 'pink', 'yellow', 'green', 'black']
for i in range(len(centroids)):
    c_trace = go.Scatter(
        x = [centroids[i, 0]],
        y = [centroids[i, 1]],
        mode = 'markers',
        marker = dict(
            size = 9,
            color = 'red',
        ),
        text = c_colors[i]
    )
    traces.append(c_trace)
"""
data = traces 
layout = dict(title = 'PCA Representantion of D2V on Title+Abstract',
            hovermode= 'closest',
            xaxis= dict(
                title= 'first component',
                ticklen= 5,
                gridwidth= 2,
            ),
            yaxis=dict(
                title= 'second component',
                ticklen= 5,
                gridwidth= 2,
            ),
            showlegend = False
        )
# Plot and embed in ipython notebook!
    
fig = dict(data = data, layout = layout)
py.iplot(fig, filename='live-test')

# Get Cluster Entities
Each cluster will be represented by a few meaningful entities, which summarize the cluster: 
these entities are chosen based on the most 'popular' among the documents which form a cluster

In [71]:
from collections import Counter

# efficient way of getting most common elements in a list (O(n))
def mostCommons(lst, n):
    """given a list, returns the n most common elements; in case of ties, it may not return the first occurence. """
    data = Counter(lst)
    item_count_list = data.most_common(n)

    return [item for (item, counter) in item_count_list]

def getClusterEntites(cluster_docs = None, n_entities = 3):
    """given all documents belonging to a cluster (as a list of dictionaries, each dictionary 
    representing a doc with its attributes), returns the most common 'n_entities' in the cluster.
    """
    
    # get list of flattened_entities from documents
    entities_field_name = 'flattened_entities'
    f_entities = [doc[entities_field_name] for doc in cluster_docs]
    
    # get the 'n_entities' most 'frequent' entity in the cluster
    return mostCommons(f_entities, n_entities)
    

In [80]:
# how to use getClusterEntities

print(type(docs[1]['flattened_entities']))
getClusterEntites(docs, 1)

<class 'str'>


['automazione cloud_manufacturing industria_4.0 internet_delle_cose rivoluzione_industriale']