# Load model, data, and test DBSCAN on it

In [7]:
# needed libraries
import json
import random
#import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
import  gensim
from collections import Counter
from sklearn.cluster import DBSCAN
import numpy as np

In [18]:
filenames = ['blockchain.json', 'industria_4.0.json']

# load multiple files, assuming same data format
docs = []
for filename in filenames:
    with open(filename, 'r') as outfile:
        json_data = json.load(outfile)

    ## let's now retrieve the meaningful part of the json document
    # response{}--->docs[] 

    docs = docs + json_data['response']['docs']
    print("Number of docs:",len(docs))
    ## many documents have a failed abstract, let's remove them
    to_check = ' Questo sito web utilizza cookie tecnici e, previo Suo consenso, cookie di profilazione,'
    docs = [doc for i, doc in enumerate(docs) if not(to_check.strip() in doc['abstract'][0].strip())]

    # remove duplicates (of a particular doc)
    # TODO: remove all duplicates
    docs = [doc for doc in docs
                if not("Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T" in doc['title'])]
    print("New length after removing docs: ", len(docs))
    
## Adjust data format: title, abstract and url came in as list, but they're more useful as strings
for i, dictionary in enumerate(docs):
    for field in ['title', 'abstract', 'url']:
        if isinstance(dictionary[field], list):
            # re-format data to hold string instead of single-list item
            docs[i][field] = dictionary[field][0]   
            
MODEL_NAME = 'TestModels/w2v_entities+abstract_model.model'
model = Word2Vec.load(MODEL_NAME)

Number of docs: 96
New length after removing docs:  91
Number of docs: 599
New length after removing docs:  362


In [17]:
# my function for performing dbscan and printing out cluster results
def perform_dbscan(eps = 0.4, min_samples = 4, metric = 'euclidean', algorithm = 'auto', data = None, verbose = True
                  , titles = None, urls = None, print_noise = True):
    """perform DBSCAN over given data, using given parametrs. Returns dbscan object and clusters dictionary."""
    
    db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm).fit(data)

    # labels will print out the number of the cluster each example belongs to;
    # -1 if the vector is considered noise (not belonging to any cluster)
    #print("Labels: ", db.labels_)

    # create data structure containing clusters
    clusters_to_ret = {label:[] for label in db.labels_ if label!=-1}
    
    for i, label in enumerate(db.labels_):
        if label != -1: #ignore noise points
            clusters_to_ret[label].append(urls[i])
        
    
    
    # only do this if you need to print out the result (messy for large number of docs)
    if verbose:
        print("##Clusters##")
        clusters = {label: [] for label in db.labels_ if label!=-1}
        noise = []
        for i, label in enumerate(db.labels_):
            if label != -1: 
                clusters[label].append(titles[i])
            else: # save noise points
                noise.append(titles[i])
                
        for label, list_ in clusters.items():
            print("Cluster: {}".format(list_))
        if print_noise:
            print("Noise: ", noise)

        print("DBSCAN finished.\n")
    return db, clusters_to_ret

# Infer Vectors from docs

In [34]:
def mean_of_vectors(vectors):
    """given a list of (entities) vectors, return the simplest mean of vectors."""
    
    if len(vectors)==0:
        print("This document doesn't contain any known entity")
        return np.array([])
    
    # all vectors will have the same number of elements (features), 
    # which is equal to model.size
    sum_vectors = np.zeros(np.shape(vectors[0]))
    for vec in vectors:
        sum_vectors = sum_vectors + vec
    return sum_vectors/len(vectors)

def infer_vector(entities, model):
    """Given a list of entities, returns the vector representing the documents from which the entities 
    were extracted from, wrt a given W2V model.
    
    entities: list of entities, our way of representing a document.
    model: w2v model.
    """
    
    # get word vector of each entity; ignores word if the model does not know it
    entities_vecs = []
    unknown_words = 0
    for e in entities:
        try:
            entities_vecs.append(model[e])
        except:
            unknown_words += 1 # ignore unknown word
    if unknown_words > 0:
        print("Number of unknown words for this doc: %s; known words %s"%(unknown_words, len(entities)-unknown_words))
    return mean_of_vectors(entities_vecs)

In [35]:
import numpy as np
import sklearn.metrics.pairwise as sk # for cosine_distance
# TODO: add possibility of passing metric to use as parameters

def get_pairwise_distances_matrix(docs, model, verbose = False):
    """"
        docs: list of documents, each represented as a list of entities.
        model: w2v model used to fetch the representation of entities as word vectors.
        verbose: print out operations.
        
        Returns the pairwise distances matrix between documents. 
        Distance between 2 docs will be computed by averaging the distances between all words
        composing the 2 documents.
        Metric used is the one used in group_averaging_distance.
    """
    # initialize distance matrix
    n = len(docs)
    distances_m = np.zeros((n, n))
    
    # un-wrap each set of entities (doc) and compute the distance betweem them all
    # this is all but efficient at the moment, okay for a debug version.
    for i, doc1 in enumerate(docs):
        if verbose: print("##Calculating distances from ", doc1)
        for j, doc2 in enumerate(docs):
            distances_m[i, j] = group_averaging_distance(entities_vector(doc1, model), entities_vector(doc2, model))
            if verbose: print("Distance between %s and %s: %s"%(doc1, doc2, distances_m[i, j]))
    return distances_m
    
def group_averaging_distance(doc1, doc2):
    """
        Computes and returns the distance between 2 'sets'/lists of vectors, 
        by computing the distance between a vector in doc1 and all the other in doc2,
        and averaging all these distances.
        Metric used to compute distance is the cosine_distance -by default-.
    """
    sum_of_distances = 0
    for vec1 in doc1:
        for vec2 in doc2:
            sum_of_distances += sk.cosine_distances([vec1], [vec2])
    return sum_of_distances/(len(doc1) * len(doc2))

def entities_vector(doc, model):
    """"
    Doc: document, represented as a list of entities.
    model: w2v model used to fetch representation of each vector.
    
    Given these two arguments, returns a list of vectors, each vector representing 
    an entity word.
    In case the model does NOT know the word in the list, it will be ignored.
    Might return an empty list.
    """
    unknown_words = 0
    list_ = []
    for word in doc:
        try:
            v = model[word]
            list_.append(v)
        except:
            unknown_words += 1
    print("Unknown words: %s; Known words: %s" %(unknown_words, len(doc)-unknown_words))
    return list_
    

In [33]:
def lower_case_list(list_):
    for i, word in enumerate(list_):
        list_[i] = word.lower()
    return list_
# shuffle docs to get a random sub-sample
random.shuffle(docs)

subsample_length = 10
subsample = docs[:subsample_length]
subsample_titles = [doc['title'] for doc in subsample]
sub_urls = [doc['url'] for doc in subsample]


# get flattened_entities for each document, AS LIST of words (not a single string)
#doc_entities = [doc['flattened_entities'].split() for doc in subsample]
doc_entities = [lower_case_list(doc['result_entities']) for doc in subsample]
#print(doc_entities[:1])

# now we have to 'convert' every doc to vector form
docs_vecs = [infer_vector(list_ent, model) for list_ent in doc_entities]

# eps-visually chosen
eps = 0.16

db, clusters = perform_dbscan(eps = eps, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                data = docs_vecs, verbose = True, titles = subsample_titles, urls = sub_urls, print_noise = True)

Number of unknown words for this doc: 18; known words 17
Number of unknown words for this doc: 38; known words 27
Number of unknown words for this doc: 24; known words 19
Number of unknown words for this doc: 21; known words 20
Number of unknown words for this doc: 5; known words 8
Number of unknown words for this doc: 28; known words 15
Number of unknown words for this doc: 25; known words 21
Number of unknown words for this doc: 38; known words 32
Number of unknown words for this doc: 7; known words 9
Number of unknown words for this doc: 1; known words 2
##Clusters##
Cluster: ['Stampa 3D in metallo, HP si lancia nella produzione di massa', 'Moreschini (Microsoft): Blockchain e Cloud accoppiata vincente per la PA - Blockchain 4innovation', "Nokia volta pagina e punta (anche) sull'Iot: acquisita SpaceTime Insight - CorCom", 'White Paper selection: Siemens spiega l’uso di MindSphere per l’IoT in ottica Industry 4.0', "Pontremoli, Dallara: la vera innovazione nell'IoT e nell'Industria 4



In [36]:
#print(docs[1])
a = np.array([1,2])
print(np.zeros(a.shape))

[0. 0.]


# DBSCAN with matrix of distances

In [40]:
matrix = get_pairwise_distances_matrix(doc_entities, model, False)

db, clusters = perform_dbscan(eps = 0.3, min_samples = 2, metric = 'cosine', algorithm = 'auto',
                data = matrix, verbose = True, titles = subsample_titles, urls = sub_urls, print_noise = True)



Unknown words: 18; Known words: 17
Unknown words: 18; Known words: 17
Unknown words: 18; Known words: 17
Unknown words: 38; Known words: 27
Unknown words: 18; Known words: 17
Unknown words: 24; Known words: 19
Unknown words: 18; Known words: 17
Unknown words: 21; Known words: 20
Unknown words: 18; Known words: 17
Unknown words: 5; Known words: 8
Unknown words: 18; Known words: 17
Unknown words: 28; Known words: 15
Unknown words: 18; Known words: 17
Unknown words: 25; Known words: 21
Unknown words: 18; Known words: 17
Unknown words: 38; Known words: 32
Unknown words: 18; Known words: 17
Unknown words: 7; Known words: 9
Unknown words: 18; Known words: 17
Unknown words: 1; Known words: 2
Unknown words: 38; Known words: 27
Unknown words: 18; Known words: 17
Unknown words: 38; Known words: 27
Unknown words: 38; Known words: 27
Unknown words: 38; Known words: 27
Unknown words: 24; Known words: 19
Unknown words: 38; Known words: 27
Unknown words: 21; Known words: 20
Unknown words: 38; Known w