# DBSCAN: Choose Eps value by plotting distances between points
### e.g. for min_count = 2, plot the distance from each point to its 2nd closest neighbour

In [2]:
# needed libraries
import json
import random
from gensim.models import Word2Vec
import  gensim
from collections import Counter
from sklearn.cluster import DBSCAN
import numpy as np
from collections import OrderedDict 

In [19]:
def choose_eps(min_count, docs_vecs, doc_titles):
    """Compute the sorted k-distance curve used to choose DBSCAN's eps.

    For every document, the distance to its k-th closest neighbour is
    computed, with k = min_count and the document itself excluded. Plotting
    the returned distances in order shows a 'knee' (or 'elbow'): the distance
    at which the slope changes sharply is a good eps candidate.

    docs_vecs: list of vectors to analyze, each representing a document.

    min_count: number of points needed to define a core point in DBSCAN.
        Values below 1 behave like 1 (nearest neighbour).

    doc_titles: list matching docs_vecs, containing the title of each doc.

    Returns a list of tuples (doc_title, distance from k-th neighbour),
    sorted by distance in ascending order. An empty docs_vecs yields [].

    Raises ValueError if a document has fewer than min_count neighbours.
    """
    # Row j of the matrix holds the distances from doc j to every doc.
    dist_matrix = get_pairwise_distances_matrix(docs_vecs)

    # 0-based rank of the wanted neighbour once the doc itself is removed.
    k = max(min_count - 1, 0)

    title_distance_pairs = []
    for j, doc_distances in enumerate(dist_matrix):
        # discard the distance between a doc and itself
        neighbour_distances = np.delete(doc_distances, j)
        if k >= neighbour_distances.size:
            raise ValueError(
                "min_count=%r needs at least %d neighbours per document, "
                "but only %d are available"
                % (min_count, k + 1, neighbour_distances.size)
            )
        # np.partition places the k-th smallest value at index k without
        # fully sorting the row.
        kth_distance = np.partition(neighbour_distances, k)[k]
        title_distance_pairs.append((doc_titles[j], kth_distance))

    # sort by the second element (distance); the sort is stable, so ties
    # keep the original document order
    title_distance_pairs.sort(key=lambda pair: pair[1])
    return title_distance_pairs

In [20]:
import numpy as np
import sklearn.metrics.pairwise as sk # for cosine_distance
# TODO: add possibility of passing metric to use as parameters
def get_pairwise_distances_matrix(docs):
    """Return the pairwise cosine-distance matrix between documents.

    docs: list of documents, each represented as a vector of the same length.

    Returns an (n, n) float64 numpy array whose entry [i, j] is the cosine
    distance between docs[i] and docs[j]. An empty list yields an empty
    (0, 0) array.

    The metric is always cosine distance.
    """
    n = len(docs)
    if n == 0:
        # sklearn's input validation rejects empty input, so keep returning
        # an empty matrix as before.
        return np.zeros((0, 0))

    # A single vectorised call replaces the n*n one-pair calls. It also avoids
    # assigning a 1x1 array into a scalar slot, which NumPy deprecates.
    # The cast keeps the result float64 even when the vectors are float32.
    return np.asarray(sk.cosine_distances(docs), dtype=np.float64)
    
#def get_kth_neighbour_distance(docvec, k):
    

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

# SECURITY: credentials must not be committed to source. They are read from the
# PLOTLY_USERNAME / PLOTLY_API_KEY environment variables instead; the API key
# that used to be hard-coded here should be revoked and regenerated.
import os

_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    tls.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# NOTE(review): when the variables are unset nothing is written here, so plotly
# is expected to rely on a previously stored credentials file — confirm.

In [36]:

# Toy data set: four 3-dimensional "documents"; the first and third coincide.
toy_vectors = [[1, 2, 4], [2, 2, 2], [1, 2, 4], [2, 3, 4]]
toy_titles = ["Primo", "Secondo", "Primo 2", "Terzo"]

b = get_pairwise_distances_matrix(toy_vectors)
print(b)

# Manual walk-through of one row at a time: drop the self-distance, then the
# nearest neighbour. The intermediate results are not used any further.
for row_index, distances in enumerate(b):
    without_self = np.delete(distances, row_index)
    without_nearest = np.delete(without_self, np.argmin(without_self))

a = choose_eps(3, toy_vectors, toy_titles)
print(a)

[[0.         0.1180829  0.         0.02747092]
 [0.1180829  0.         0.1180829  0.03509872]
 [0.         0.1180829  0.         0.02747092]
 [0.02747092 0.03509872 0.02747092 0.        ]]
[('Terzo', 0.03509871864598446), ('Primo', 0.11808289631180302), ('Secondo', 0.11808289631180302), ('Primo 2', 0.11808289631180302)]


## Infer vector given doc (as list of entities) and model

In [27]:
def mean_of_vectors(vectors):
    """Return the element-wise arithmetic mean of a non-empty list of vectors.

    The shape of the result is taken from the first vector.
    """
    zero_vector = np.zeros(np.shape(vectors[0]))
    # built-in sum folds left to right, starting from the zero vector
    total = sum(vectors, zero_vector)
    return total / len(vectors)

def infer_vector(entities, model):
    """Return the vector representing a document, given its entities.

    The document vector is the mean of the word vectors of its entities,
    looked up in a given W2V model. Entities unknown to the model are ignored.

    entities: list of entities, our way of representing a document.
    model: w2v model. A model exposing its vectors through a `wv` attribute,
        or any mapping-like object indexable by entity, is accepted.

    Returns whatever mean_of_vectors returns for the known entities' vectors.
    If no entity is known, mean_of_vectors is called with an empty list.
    """
    # Indexing the model directly is deprecated in gensim in favour of
    # `model.wv`; fall back to the object itself when there is no `wv`.
    word_vectors = getattr(model, 'wv', model)

    # get word vector of each entity
    entities_vecs = []
    for e in entities:
        try:
            entities_vecs.append(word_vectors[e])
        except KeyError:
            # unknown word: skip it. Only KeyError is ignored, so real
            # failures and interrupts are no longer silently swallowed.
            continue

    return mean_of_vectors(entities_vecs)

# Path of the saved Word2Vec model file, as passed to Word2Vec.load below.
# NOTE(review): the file name suggests a model trained on entities plus
# abstracts — confirm against the training notebook.
MODEL_NAME = 'TestModels/w2v_entities+abstract_model.model'
# Load the model once at module level; `model` is the global passed to
# infer_vector further down.
model = Word2Vec.load(MODEL_NAME)

## Real data test -OVER SUBSAMPLE OF DATA-

In [32]:
filenames = ['blockchain.json', 'industria_4.0.json']

# Text found in abstracts that failed to download (a cookie banner instead of
# the real abstract).
to_check = ' Questo sito web utilizza cookie tecnici e, previo Suo consenso, cookie di profilazione,'
# Title of one particular document that is known to be duplicated.
# TODO: remove all duplicates
duplicated_title = "Industry 4.0 (o industria 4.0): cos'è, notizie, normative, casi studio - I4T"

# load multiple files, assuming same data format
docs = []
for filename in filenames:
    # Read explicitly as UTF-8: the data contains accented characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(filename, 'r', encoding='utf-8') as infile:
        json_data = json.load(infile)

    # the meaningful part of the json document is response{} ---> docs[]
    docs = docs + json_data['response']['docs']
    print("Number of docs:", len(docs))

    # many documents have a failed abstract, let's remove them
    # (at this point 'abstract' and 'title' are still lists)
    failed_marker = to_check.strip()
    docs = [doc for doc in docs if failed_marker not in doc['abstract'][0].strip()]

    # remove duplicates (of a particular doc)
    docs = [doc for doc in docs if duplicated_title not in doc['title']]
    print("New length after removing docs: ", len(docs))

# Adjust data format: title, abstract and url came in as lists, but they're
# more useful as strings
for doc in docs:
    for field in ('title', 'abstract', 'url'):
        if isinstance(doc[field], list):
            # hold the string itself instead of a single-item list
            doc[field] = doc[field][0]

Number of docs: 96
New length after removing docs:  91
Number of docs: 599
New length after removing docs:  362


In [39]:
# Draw a random sub-sample: shuffle the documents in place, keep the first ones.
random.shuffle(docs)
subsample_length = 50
subsample = docs[:subsample_length]

subsample_titles = []
for entry in subsample:
    subsample_titles.append(entry['title'])

# flattened_entities is one whitespace-separated string per document;
# turn it into a LIST of words
doc_entities = []
for entry in subsample:
    doc_entities.append(entry['flattened_entities'].split())
print(doc_entities[:1])

# 'convert' every doc to vector form
docs_vecs = []
for entity_list in doc_entities:
    docs_vecs.append(infer_vector(entity_list, model))

# finally, call the eps-estimate function (k = 2)
title_dist_tuples = choose_eps(2, docs_vecs, subsample_titles)

[['aristotele', 'automazione', 'azienda', 'basta_così_negramaro', 'benessere', 'capitale_umano', 'costituzione', "credito_d'imposta", 'digitale_informatica', 'digitalizzazione', 'domanda_di_lavoro', 'euro', 'europa', 'fattore_produttivo', 'filiera', 'forza-lavoro', 'homo_sapiens', 'impresa', 'industria', 'industria_4.0', 'lavoro', 'luce', 'macroeconomia', 'milano', 'numero', 'numero_reale', 'parlamento', 'partenariato', 'piccola_e_media_impresa', 'politecnico_di_milano', 'produttività', 'sabbia', 'scetticismo_scientifico', 'sindacato', 'sistema', 'tecnologia', 'università', 'velocità', 'xxi_secolo']]



Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



# Visualize results

In [40]:
# Split the (title, distance) pairs into the two plot axes:
# x holds the document titles, y the distance to the k-th neighbour.
axis_titles = []
axis_distances = []
for doc_title, kth_distance in title_dist_tuples:
    axis_titles.append(doc_title)
    axis_distances.append(kth_distance)

trace = go.Scatter(x=axis_titles, y=axis_distances, mode='lines', name='lines')

data = [trace]

py.iplot(data, filename='dbscan-eps-choosing')

