# DBSCAN: Choose Eps value by plotting distances between points
### e.g. for min_count = 2, plot distances of 2nd closest neighbour of each point

In [12]:
# needed libraries
import json
import random
from gensim.models import Word2Vec
import  gensim
from collections import Counter
from sklearn.cluster import DBSCAN
import numpy as np
from collections import OrderedDict 

In [22]:
def choose_eps(min_count, docs_vecs, doc_titles):
    """Plot the graph and choose best eps based on the composition of our data: 
    to do so, we will need to compute the distances between every point in the data-space, and its
    2nd/3rd closest neighbour (based on 'min_count'). 
    Take the eps corresponding to a great change in the derivative of the plotted function ('knee' or 'elbow' shape).
    
    Docs_vecs is the list of vectors we will analyze, each representing a document.
    
    min_count is the number of points needed to define a core point in DBSCAN.
    
    doc_titles is a matching list (wrt to docs_vecs), containing the titles of each doc
    Returns ordered dict doc:distance_k_neighbour for every doc, for now.
    """
    
    # first thing to do: compute the matrix of all pairwise elements distances
    # warning: this code is not optimized
    dist_matrix = get_pairwise_distances_matrix(docs_vecs)
    
    # for each document vec, discard the other elements and keep the 
    # k-closest vector DISTANCE
    dictionary = {}
    j = 0
    for doc_distances in dist_matrix:
        # get a row of the matrix (vector of distances for doc_j)
        
        # first occurrence will always be discarded, since it's the distance from a doc to itself,
        doc_distances = np.delete(doc_distances, j)
        for i in range(min_count-1):
            # get the closest doc to it and discard it, we only need the k-th closest doc.
            doc_distances = np.delete(doc_distances, np.argmin(doc_distances))
        # now create the couple: {doc_name: distance from k-th neighbour}; couple is inverted for sorting
        dictionary[np.amin(doc_distances)] = doc_titles[j]
        j += 1
    # now that we have a dict with the needed distances, we need to order them and return them
    #return OrderedDict(sorted(dictionary))
    return dictionary

In [2]:
import numpy as np
import sklearn.metrics.pairwise as sk # for cosine_distance
# TODO: add possibility of passing metric to use as parameters
def get_pairwise_distances_matrix(docs):
    """"
        docs: list of documents, each represented as a vector.
        
        Returns the pairwise distances matrix between documents. 
    
        Metric used to compute the distance is cosine_distance -by default-.
    """
    # initialize distance matrix
    n = len(docs)
    distances_m = np.zeros((n, n))
    
    # compute the distance betweem each vector (doc)
    # this is all but efficient at the moment, okay for a debug version.
    for i, doc1 in enumerate(docs):
        for j, doc2 in enumerate(docs):
            distances_m[i, j] = sk.cosine_distances([doc1], [doc2])
    return distances_m
    
def get_kth_neighbour_distance(docvec, k):
    

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')

[31mtwisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [24]:
a = np.array([1, 2, 3])
print(np.argmin(a))
a = np.delete(a, np.argmin(a))
print(np.amin(a))

print(get_pairwise_distances_matrix([[1, 2, 4], [2, 2, 2], [1, 2, 4], [2, 3, 4]]))
print(choose_eps(2, [[1, 2, 4], [2, 2, 2], [1, 2, 4], [2, 3, 4]], ["Primo", "Secondo", "Primo 2", "Terzo"]))

0
2
[[0.         0.1180829  0.         0.02747092]
 [0.1180829  0.         0.1180829  0.03509872]
 [0.         0.1180829  0.         0.02747092]
 [0.02747092 0.03509872 0.02747092 0.        ]]
{0.027470921832270623: 'Terzo', 0.11808289631180302: 'Secondo'}
