In [1]:
verbose = True  # set to False to suppress the print()/log output in the notebook

In [26]:
# needed libraries
import json
import random
#import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import  gensim
from collections import Counter
from sklearn.cluster import DBSCAN

# Load documents from JSON
Change this block to load the documents from the file(s) of your choice.

In [15]:
filenames = ['blockchain.json', 'industria_4.0.json']

# Load every file into one list; all files are assumed to share the same data format.
docs = []
for filename in filenames:
    # JSON is UTF-8 by specification: state the encoding explicitly instead of relying on the
    # platform default, which can garble or fail to decode the accented Italian text.
    with open(filename, 'r', encoding='utf-8') as infile:
        json_data = json.load(infile)

    # The meaningful part of each JSON document lives under response{} ---> docs[].
    # That is the layout of the files received so far; change this part if the format changes.
    docs = docs + json_data['response']['docs']
    if verbose:
        print("Number of documents: ",len(docs))

Number of documents:  96
Number of documents:  604


### This part can be ignored if we assume data is "clean" 

In [16]:
## Many documents have a failed abstract (a cookie banner instead of real text): remove them.
to_check = ' Questo sito web utilizza cookie tecnici e, previo Suo consenso, cookie di profilazione,'

# Marker text with the surrounding whitespace removed, computed once for the whole filter.
banner = to_check.strip()
# The abstract is indexed with [0] because at this stage it is still wrapped in a list.
docs = [doc for doc in docs if banner not in doc['abstract'][0].strip()]

if verbose:
    print("New length after removing docs: ", len(docs))


New length after removing docs:  412


In [21]:
# List -> String
## Title and abstract arrive wrapped in a list, but they are more useful as plain strings.
for doc in docs:
    for field in ('title', 'abstract'):
        value = doc[field]
        if isinstance(value, list):
            # keep only the first item of the list
            doc[field] = value[0]

# Drop every document whose title contains the recurring title below
# (the original notebook treats these documents as duplicates).
duplicate_title = "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T"
docs = [doc for doc in docs if duplicate_title not in doc['title']]

if verbose:
    print("New Length: ", len(docs))

# Sanity check: report any cookie-banner abstract that survived the earlier filtering.
for doc in docs:
    if to_check.strip() in doc['abstract'].strip():
        print("cookie doc found")
if verbose:
    print([d['abstract'][:200]+'...' for d in docs[:1]])

New Length:  362
['L’annuncio è arrivato alla London Fintech News ed è stato rilanciato dalla testata The Fintech Times: il primo comunicatore crypto in versione Mobile per la Blockchain che con qualche forzatura si può...']


# Infer Vectors from documents

## Load model and prepare data structures

In [25]:
# Shuffle the documents in place. No seed is set in this cell, so the order
# (and therefore any subset taken later) may differ from run to run.
random.shuffle(docs)

## !Change this if you want to rename model or change dir in the filesystem ##
MODEL_NAME = 'TestModels/d2v_TA_abstract&title0.model'
model = Doc2Vec.load(MODEL_NAME)

# print out the size of the model vocabulary (number of known words)
if verbose:
    # gensim >= 4 exposes the vocabulary as `key_to_index`; `vocab` only exists on gensim 3.x
    vocabulary = getattr(model.wv, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = model.wv.vocab
    print(len(vocabulary))

# infer vectors from data
# preprocess data first (remove capitals, strange unicode chars..)
## title + abstract may change in future versions to flattened_entities, with a newer model!
# Join title and abstract with a space: plain concatenation would fuse the last word of the
# title with the first word of the abstract into a single bogus token.
test_corpus = [gensim.utils.simple_preprocess(d['title'] + ' ' + d['abstract']) for d in docs]
if verbose:
    print(len(test_corpus))

# list of vectors of docs, in the same order as `docs`
inferred_vectors = [model.infer_vector(doc) for doc in test_corpus]

# get docs titles, needed to return results correctly after clustering
titles = [doc['title'] for doc in docs]
if verbose:
    print(titles[:5])

9031
362
['Con Laboratorio RISE e Vendor verso le “Best Practice 4.0” - Industry4Business', 'Una Academy per insegnare Industry 4.0 alle medie imprese - Industry4Business', "Nokia volta pagina e punta (anche) sull'Iot: acquisita SpaceTime Insight - CorCom", 'Servitization e smart product così la Industry 4.0 diventa realtà', 'Per innovare la pubblica amministrazione serve lo smart working: ecco perché']


# DBSCAN 
DBSCAN is a density-based algorithm, used here to cluster the documents.

In [None]:
# helper that runs DBSCAN and prints out the resulting clusters
def perform_dbscan(eps = 0.4, min_samples = 4, metric = 'euclidean', algorithm = 'auto', data = None, verbose = True
                  , titles = None, print_noise = True):
    """Run DBSCAN over `data` with the given parameters and return the fitted DBSCAN object.

    Parameters
    ----------
    eps, min_samples, metric, algorithm :
        Forwarded unchanged to `DBSCAN(...)`.
    data :
        Samples to cluster, passed to `DBSCAN.fit`.
    verbose : bool
        When true, print the members of every cluster (and optionally the noise).
    titles : sequence, optional
        `titles[i]` is the printable name of sample `i`. When omitted, the sample
        indices are printed instead.
    print_noise : bool
        When true (and `verbose` is true), also print the samples labelled as noise.

    Returns
    -------
    The fitted DBSCAN object; `labels_` holds the cluster number of each sample,
    or -1 for samples considered noise (not belonging to any cluster).
    """

    db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm).fit(data)

    if verbose:
        labels = db.labels_
        if titles is None:
            # no names given: fall back to the sample indices
            titles = list(range(len(labels)))

        # Group by the label itself rather than by position in a list: labels are not
        # guaranteed to be first met in the order 0, 1, 2, ... while scanning the samples,
        # and positional appending would then put titles into the wrong cluster.
        clusters = {}
        noise = []
        for i, label in enumerate(labels):
            if label == -1:
                noise.append(titles[i])
            else:
                clusters.setdefault(label, []).append(titles[i])

        print("##Clusters##")
        for label in sorted(clusters):
            print("Cluster:", clusters[label])
        if print_noise:
            print("Noise: ", noise)

        print("DBSCAN finished.\n")
    return db

## Iterative/incremental DBSCAN 
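This is the way I found most suitable for using DBSCAN in our case: run it repeatedly, each time with a larger eps, on the documents left as noise by the previous run.
TODO: find a heuristic for how many times to apply DBSCAN; right now the number of runs is fixed and only eps changes.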

## Apply DBSCAN to a SUBSET; change the code below to use all of 'inferred_vectors' and 'titles' instead of the subset in order to cluster all documents

In [29]:

subset_length = 20
# subset of docs vectors, with the matching titles
subset = inferred_vectors[:subset_length]
subset_titles = titles[:subset_length]

eps = 0.25
eps_increment = 0.13
n_rounds = 3
noise_bool = False
# eps is increased before every run, so the first run uses eps + eps_increment
for round_idx in range(n_rounds):
    # Stop as soon as nothing is left to cluster: the subset is always a list (never None),
    # and DBSCAN cannot be fitted on an empty set of samples.
    if len(subset) == 0:
        break
    # print the leftover noise only on the last round
    if round_idx == n_rounds - 1:
        noise_bool = True
    eps = eps + eps_increment
    # decrease eps_increment a bit
    #eps_increment = eps_increment - .02
    db = perform_dbscan(eps = eps, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                        data = subset, verbose = True, titles = subset_titles, print_noise = noise_bool)

    # let's try and find other clusters in the noise data, with higher eps:
    # keep only the samples (and their titles) that this run labelled as noise
    noise_positions = [pos for pos, label in enumerate(db.labels_) if label == -1]
    subset = [subset[pos] for pos in noise_positions]
    subset_titles = [subset_titles[pos] for pos in noise_positions]

##Clusters##
Cluster: ['Con Laboratorio RISE e Vendor verso le “Best Practice 4.0” - Industry4Business', 'Servitization e smart product così la Industry 4.0 diventa realtà', 'Per innovare la pubblica amministrazione serve lo smart working: ecco perché', "Dall'Industria 4.0 all'impresa Smart: Pronti ad affrontare la sfida? - Industry4Business", "Storm Reply: nella monetizzazione del dato la nuova sfida per le imprese nell'era dell'IoT"]
Cluster: ['Industria 4.0, Ceresa (Fca): "Pronti a sperimentare il 5G" - CorCom', 'White Paper selection: Siemens spiega l’uso di MindSphere per l’IoT in ottica Industry 4.0', 'Industria 4.0, Tecnest: “Modello Italia forte perché human centered”']
DBSCAN finished.

##Clusters##
Cluster: ['Una Academy per insegnare Industry 4.0 alle medie imprese - Industry4Business', 'Metalmeccanica: le imprese faticano a trovare lavoratori con le giuste competenze - Industry4Business', 'Industria 4.0: creare le basi per il futuro del settore manifatturiero -', 'Industria

## TODO: Save the clusters, together with their cluster entities, to JSON using the agreed format

# Get Cluster Entities
Each cluster is represented by a few meaningful entities that summarize it:
these are the entities that occur most frequently across the documents forming the cluster.

In [31]:
# frequency ranking of the elements of a list, built on collections.Counter
def mostCommons(lst, n):
    """Return the `n` most frequent elements of `lst`, most frequent first.

    When several elements have the same count, the one that occurs first in
    `lst` is not guaranteed to be the one returned.
    """
    ranked = Counter(lst).most_common(n)
    return [element for element, _count in ranked]

def getClusterEntites(cluster_docs = None, n_entities = 3):
    """Return the `n_entities` most common entities among the documents of a cluster.

    Parameters
    ----------
    cluster_docs : list of dict, optional
        The documents belonging to the cluster, each a dictionary of attributes that
        contains a 'flattened_entities' field. That field may be a list of strings or a
        single whitespace-separated string. `None` is treated as an empty cluster.
    n_entities : int
        How many entities to return.

    Returns
    -------
    list
        The most frequent entities, most frequent first; empty for an empty cluster.

    Raises
    ------
    KeyError
        If a document has no 'flattened_entities' field.
    """
    entities_field_name = 'flattened_entities'

    if cluster_docs is None:
        cluster_docs = []

    # collect the entities of every document into one flat list
    f_entities = []
    for doc in cluster_docs:
        doc_entities = doc[entities_field_name]
        if isinstance(doc_entities, str):
            # Split a raw string into its entities; iterating over it directly would
            # count single characters instead of entities.
            doc_entities = doc_entities.split()
        f_entities.extend(doc_entities)

    # get the 'n_entities' most frequent entities in the cluster
    return mostCommons(f_entities, n_entities)
    

In [32]:
# Example usage of getClusterEntites.

## Normalise 'flattened_entities': turn a whitespace-separated string into a list of strings.
for doc in docs:
    entities = doc['flattened_entities']
    if isinstance(entities, str):
        doc['flattened_entities'] = entities.split()

# The 4 most common entities over the whole dataset
# (in real use, pass only the documents of a single cluster rather than all of them).
getClusterEntites(docs, 4)

['tecnologia', 'azienda', 'industria_4.0', 'internet_delle_cose']