In [1]:
verbose = True # set this to False if you don't need print()/log output displayed in the notebook

In [2]:
# needed libraries
import json
import random
#import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import  gensim
from collections import Counter
from sklearn.cluster import DBSCAN

# Load documents from JSON
Change this block to load from predefined filenames of your choice.

In [3]:
filenames = ['blockchain.json', 'industria_4.0.json']

# Load multiple files, assuming they all share the same data format.
docs = []
for filename in filenames:
    # Be explicit about the encoding: the documents contain accented Italian text,
    # and without it the decoding would depend on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as infile:
        json_data = json.load(infile)

    ## Retrieve the meaningful part of the json document:
    # response{}--->docs[]
    ## (that is the layout of the JSON docs received so far; change this part if the format changes)
    docs.extend(json_data['response']['docs'])
    if verbose:
        # running total after each file
        print("Number of documents: ",len(docs))

Number of documents:  96
Number of documents:  604


### This part can be ignored if we assume data is "clean" 

In [4]:
## Many documents have a failed abstract (it contains the cookie-notice text below): drop them.
to_check = ' Questo sito web utilizza cookie tecnici e, previo Suo consenso, cookie di profilazione,'
# strip the marker once instead of once per document
cookie_notice = to_check.strip()
# at this stage 'abstract' is still a list, so its first item is inspected
docs = [doc for doc in docs if cookie_notice not in doc['abstract'][0].strip()]

if verbose:
    print("New length after removing docs: ", len(docs))


New length after removing docs:  412


In [5]:
# List -> String
## Title, abstract and url arrive as lists, but they are more useful as plain strings.
for doc in docs:
    for field in ('title', 'abstract', 'url'):
        value = doc[field]
        if isinstance(value, list):
            # keep the first item of the list in place of the list itself
            doc[field] = value[0]

# Remove duplicates (of one particular doc, recognised by its title).
# TODO: remove all duplicates
duplicated_title = "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T"
docs = [doc for doc in docs if duplicated_title not in doc['title']]

if verbose:
    print("New Length: ", len(docs))

# Double check that no abstract containing the cookie notice is left.
notice_text = to_check.strip()
for doc in docs:
    if notice_text in doc['abstract'].strip():
        print("cookie doc found")
if verbose:
    # preview: first 200 characters of the first abstract
    preview = [doc['abstract'][:200] + '...' for doc in docs[:1]]
    print(preview)

New Length:  362
['L’annuncio è arrivato alla London Fintech News ed è stato rilanciato dalla testata The Fintech Times: il primo comunicatore crypto in versione Mobile per la Blockchain che con qualche forzatura si può...']


# Infer Vectors from documents

## Load model and prepare data structures

In [8]:
# Shuffle documents in place.
# NOTE: no seed is set, so the document order (and therefore any slice taken
# from the start of the list) differs from run to run.
random.shuffle(docs)

## !Change this if you want to rename model or change dir in the filesystem ##
MODEL_NAME = 'TestModels/d2v_TA_abstract&title0.model'
model = Doc2Vec.load(MODEL_NAME)

# print out the size of the model's vocabulary (number of known words)
if verbose:
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases only have `vocab`
    vocabulary = getattr(model.wv, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = model.wv.vocab
    print(len(vocabulary))

# Infer vectors from data.
# Preprocess the text first (simple_preprocess lowercases and tokenizes it).
# The explicit space keeps the last word of the title and the first word of the
# abstract from being glued together into a single token.
## title + abstract may change in future versions to flattened_entities, with a newer model!
test_corpus = [gensim.utils.simple_preprocess(d['title'] + ' ' + d['abstract']) for d in docs]
if verbose:
    print(len(test_corpus))

# list of vectors of docs, in the same order as `docs`
inferred_vectors = [model.infer_vector(doc) for doc in test_corpus]

# get docs titles, needed to return results correctly after clustering
titles = [doc['title'] for doc in docs]
# same thing for urls
urls = [doc['url'] for doc in docs]
if verbose:
    print(titles[:5])
    print(urls[:5])

9031
362
['"Polizza & IoT", la nuova abbinata delle assicurazioni: settore auto in pole position - CorCom', 'blockchain e digital transformation nella PA: focus su standard e governance - Blockchain 4innovation', 'Infocert, la ricerca sull’identità digitale punta sulla blockchain - Blockchain 4innovation', "Tech Data entra nell'alleanza globale per l'IoT", 'Industria 4.0, così finisce il "diritto pesante" del lavoro - CorCom']
['https://www.corrierecomunicazioni.it/industria-4-0/iot/polizza-iot-la-nuova-abbinata-delle-assicurazioni-settore-auto-in-pole-position/', 'https://www.blockchain4innovation.it/mercati/pubblica-amministrazione/blockchain-e-digital-transformation-nella-pa-focus-su-standard-e-governance/', 'https://www.blockchain4innovation.it/mercati/industria4-0/infocert-la-ricerca-sullidentita-digitale-punta-sulla-blockchain/', 'https://www.internet4things.it/industry-4-0/m2m/tech-data/', 'https://www.corrierecomunicazioni.it/industria-4-0/industria-40-cosi-finisce-il-diritto-p

# DBSCAN 
A density-based algorithm, used here to cluster the docs.

In [7]:
# helper that runs DBSCAN and optionally prints the clusters it found
def perform_dbscan(eps = 0.4, min_samples = 4, metric = 'euclidean', algorithm = 'auto', data = None, verbose = True
                  , titles = None, urls = None, print_noise = True):
    """Run DBSCAN over `data` with the given parameters.

    `urls` and `titles` are indexed by position, in parallel with `data`:
    `urls` is used to build the returned clusters, `titles` only for printing
    (when `verbose` is true). `print_noise` controls whether the titles of the
    noise points are printed as well.

    Returns a tuple (db, clusters): the fitted DBSCAN object and a dictionary
    mapping each cluster label to the list of urls of its members. Noise points
    (label -1) are left out of the dictionary.
    """
    noise_label = -1
    db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm).fit(data)

    # db.labels_ holds, for each example, the number of the cluster it belongs to,
    # or -1 when the vector is considered noise (not belonging to any cluster)
    clusters_to_ret = {}
    for position, label in enumerate(db.labels_):
        if label == noise_label:
            continue
        clusters_to_ret.setdefault(label, []).append(urls[position])

    if not verbose:
        return db, clusters_to_ret

    # printing is only worth it for a small number of docs (messy otherwise)
    print("##Clusters##")
    titles_by_label = {}
    noise_titles = []
    for position, label in enumerate(db.labels_):
        if label == noise_label:
            noise_titles.append(titles[position])
        else:
            titles_by_label.setdefault(label, []).append(titles[position])

    for cluster_titles in titles_by_label.values():
        print("Cluster: {}".format(cluster_titles))
    if print_noise:
        print("Noise: ", noise_titles)

    print("DBSCAN finished.\n")
    return db, clusters_to_ret

## Iterative/incremental DBSCAN 
This is the way I thought was best to use DBSCAN in our case.
TODO: find a heuristic to decide how many times to apply DBSCAN; right now it is only based on eps size.

## Apply DBSCAN to SUBSET; change the code here to use 'docs' instead of 'subset' in order to cluster all documents

In [10]:

subset_length = 20
# subset of docs vectors, with the matching titles and urls
subset = inferred_vectors[:subset_length]
subset_titles = titles[:subset_length]
sub_urls = urls[:subset_length]

eps = 0.27
eps_increment = 0.1
n_rounds = 3  # how many times DBSCAN is re-applied to the leftover noise
noise_bool = False
# This will contain all clusters found, each one as a list of urls,
# maintaining the order DBSCAN returned (first clusters will contain articles more related to each other).
final_clusters = []
# the first round runs with eps + eps_increment, then eps keeps growing by eps_increment
for round_idx in range(n_rounds):
    # the leftover noise is only printed on the last round
    noise_bool = (round_idx == n_rounds - 1)
    eps = eps + eps_increment

    # honour the notebook-level `verbose` flag instead of always printing
    db, clusters = perform_dbscan(eps = eps, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                        data = subset, verbose = verbose, titles = subset_titles, urls = sub_urls, print_noise = noise_bool)

    # TODO: ignore noise/'other' documents or return them?
    final_clusters.extend(clusters.values())

    # let's try and find other clusters in the noise data, with higher eps
    noise_positions = [pos for pos, label in enumerate(db.labels_) if label == -1]
    subset = [subset[pos] for pos in noise_positions]
    subset_titles = [subset_titles[pos] for pos in noise_positions]
    sub_urls = [sub_urls[pos] for pos in noise_positions]
    # A list comprehension is never None, so test for emptiness: once every
    # document has been assigned to a cluster there is nothing left to fit.
    if not subset:
        break

if verbose:
    print("Number of cluster found: ", len(final_clusters))
    for i, cluster in enumerate(final_clusters):
        print("Length of cluster {0}: {1}".format(i, len(cluster)))
# final clusters composition:
#[[cluster0_urls], [cluster1_urls], ...]

##Clusters##
Cluster: ['blockchain e digital transformation nella PA: focus su standard e governance - Blockchain 4innovation', 'Guida completa al Cloud computing: costi, implementazione, compliance e ROI', "Che cos'è TrustedChain e perché può cambiare la logica di gestione delle transazioni grazie alla Blockchain - Blockchain 4innovation"]
Cluster: ['Industria 4.0, il futuro del lavoro passa da formazione e governance - CorCom', 'Industria 4.0, Di Maio: "Incentivi confermati". Meno tasse per chi assume - CorCom', 'Industria 4.0 in crescita del 30%, Italia pronta alla svolta? - CorCom', 'Dall’IoT alla Servitizzazione. HBR: Gli Smart connected product cambiano la competizione - Industry4Business', 'AI servizi il 34 per cento del mercato IoT italiano - Industry4Business', 'Lavoro 4.0, Falciasecca: "Nuovo patto accademia-impresa" - CorCom']
DBSCAN finished.

##Clusters##
Cluster: ['"Polizza & IoT", la nuova abbinata delle assicurazioni: settore auto in pole position - CorCom', 'Apre al Po

## TODO: Save clusters to JSON using agreed format and Cluster entities as well

In [13]:
# Fields of the export format, with a preview of the values available so far.
# NOTE: getClusterEntites is defined in a later cell of this notebook
# ("Get Cluster Entities"); that cell must be run before this one.
#- sorgente_dati / data source (always "cluster")
#- ta_id
print('ta_id:', docs[0]['ta_id'])
#- start date ?
#- end date ?
#- documents (list of urls)
if final_clusters:
    print('urls first cluster:{} .. '.format( final_clusters[0][:2]))
else:
    print('No clusters found.')
#- entities (list of pairs: entity name, number of occurrences)
# The number of clusters varies from run to run, so make sure a second one exists.
if len(final_clusters) > 1:
    second_cluster_urls = set(final_clusters[1])
    cluster1_docs = [doc for doc in docs if doc['url'] in second_cluster_urls]

    print("Second cluster main entities: ", getClusterEntites(cluster1_docs, 4))
else:
    print("Fewer than two clusters found: no second cluster to summarize.")


ta_id: [152109]
urls first cluster:['https://www.blockchain4innovation.it/mercati/pubblica-amministrazione/blockchain-e-digital-transformation-nella-pa-focus-su-standard-e-governance/', 'https://www.internet4things.it/industry-4-0/guida-completa-al-cloud-computing-costi-implementazione-compliance-roi/'] .. 
Second cluster main entities:  [('Lavoro', 5), ('Internet delle cose', 5), ('Italia', 5), ('Servizio', 5)]


In [None]:
def exportClusterResults():
    """Save the result of the clustering operation to disk, as a JSON file, using the agreed format.

    The core results are the arrays 'documents': [__, __], containing the urls of each
    document in a cluster, and 'entities': [{}, {}].

    NOTE: not implemented yet. The function body consists of this docstring only,
    so calling it does nothing and returns None.
    """

# Get Cluster Entities
Each cluster will be represented by a few meaningful entities, which summarize the cluster:
these entities are the most 'popular' ones among the documents that form the cluster.

In [12]:
# Counter-based helper to get the most frequent elements of a list
def mostCommons(lst, n):
    """Return the n most frequent elements of lst as (element, count) pairs, most frequent first.

    Ties are resolved by Counter.most_common, so the element returned is not
    guaranteed to be the one occurring first in the list.
    """
    return Counter(lst).most_common(n)

def getClusterEntites(cluster_docs = None, n_entities = 3):
    """Return the 'n_entities' most common entities among the documents of a cluster.

    cluster_docs: all documents belonging to the cluster, as a list of dictionaries
        (each dictionary representing a doc with its attributes). None or an empty
        list yields an empty result.
    n_entities: how many entities to return.

    Entities are read from each doc's 'result_entities' field, which is expected to
    be a list of strings (a plain string would be iterated character by character).
    Docs where the field is missing or None contribute no entities.

    Returns a list of (entity, count) pairs, as produced by mostCommons.
    """
    entities_field_name = 'result_entities'

    # flatten the entities of all the documents into a single list,
    # tolerating a missing cluster and docs that carry no entities
    f_entities = [entity
                  for doc in (cluster_docs or [])
                  for entity in (doc.get(entities_field_name) or [])]

    # get the 'n_entities' most frequent entities in the cluster
    return mostCommons(f_entities, n_entities)
    

In [17]:
# how to use getClusterEntites, example:

## Convert flattened_entities from a whitespace-separated string to a list of strings.
## The type check and the conversion must look at the same field; docs where the
## field is absent or already a list are left untouched.
for doc in docs:
    flattened = doc.get('flattened_entities')
    if isinstance(flattened, str):
        doc['flattened_entities'] = flattened.split()
# NOTE: getClusterEntites reads the 'result_entities' field, not 'flattened_entities'.
getClusterEntites(docs, 4) # show the 4 most common entities in the whole dataset

# more realistic case

[('Tecnologia', 241),
 ('Azienda', 205),
 ('Industria 4.0', 190),
 ('Produzione', 156)]