# Clustering

In [3]:
!pip install kneed

Collecting kneed
  Downloading kneed-0.7.0-py2.py3-none-any.whl (9.4 kB)
Installing collected packages: kneed
Successfully installed kneed-0.7.0


In [8]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.0.6-cp38-cp38-manylinux2014_x86_64.whl (13.0 MB)
[K     |████████████████████████████████| 13.0 MB 27.8 MB/s eta 0:00:01
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading spacy_legacy-3.0.6-py2.py3-none-any.whl (12 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 125 kB/s  eta 0:00:01
[?25hCollecting thinc<8.1.0,>=8.0.3
  Downloading thinc-8.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
[K     |████████████████████████████████| 628 kB 117.6 MB/s eta 0:00:01
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp38-cp38-manylinux2014_x86_64.whl (35 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp38-cp38-manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 112.2 MB/s eta 0:00:01
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.5-cp38-cp38-manylinux2014_x86_64.whl (20 kB)
Co

In [11]:
import math
import numpy as np
import pandas as pd

from kneed import KneeLocator

from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA


import multiprocessing


In [271]:

def get_cpu_number():
    return multiprocessing.cpu_count()

In [272]:
def dimensionality_reduction(vectors, n_reduction=2):
    print('Number of dimensions is {}'.format(n_reduction))
    pca = PCA(n_components=n_reduction, svd_solver='full')
    pca.fit(vectors)
    return pca.transform(vectors)

In [273]:
def kneighbors(vectors):
    """
    Calculates average distances for k-nearest neighbors
    :return:
    """
    k = round(math.sqrt(len(vectors)))
    print('K-neighbours = {}'.format(k))
    nbrs = NearestNeighbors(n_neighbors=k, n_jobs=-1, algorithm='auto').fit(vectors) #-1 means using all processors       
    distances, indices = nbrs.kneighbors(vectors)
    distances = [np.mean(d) for d in np.sort(distances, axis=0)]
    return distances
    
def epsilon_search(vectors):
    """
    Search epsilon for the DBSCAN clusterization
    :return:
    """
    distances=kneighbors(vectors)
    kneedle = KneeLocator(distances, list(range(len(distances))), online=True)
    epsilon = np.mean(list(kneedle.all_elbows))
    if epsilon == 0.0:
        epsilon = np.mean(distances)
    return epsilon


def dbscan(df,min_samples=5,pca=True):
    """
    Execution of the DBSCAN clusterization algorithm.
    Returns cluster labels
    :return:
    """
    cpu_number=get_cpu_number()
    vectors=df['Vector'].values.tolist()
    if pca:
        vectors = dimensionality_reduction(vectors)
    distances=kneighbors(vectors)
    epsilon=epsilon_search(vectors)
    cluster_labels = DBSCAN(eps=epsilon,
    min_samples=min_samples,
     n_jobs=cpu_number) \
    .fit_predict(vectors)
    df['cluster'] = cluster_labels
    print('DBSCAN finished with {} clusters'.format(len(set(cluster_labels))))
    return pd.DataFrame.from_dict([item for item in df.groupby('cluster').apply(func=gb_regroup)], orient='columns').sort_values(by=['cluster_size'], ascending=False)
    #return df

def gb_regroup(gb):
    #indices = [i for sublist in gb['cluster'].values for i in sublist]
    states=[sublist for sublist in gb['State'].values]
    size = len(states)    
    return {'cluster_size': size,'state':states}

In [274]:
l=[np.random.rand(10) for i in np.arange(100)]
l[:3]

[array([0.67008292, 0.72511592, 0.51296091, 0.61591785, 0.39330138,
        0.22364256, 0.17759284, 0.02739236, 0.76093684, 0.67899768]),
 array([0.45597819, 0.78106331, 0.98242905, 0.73853574, 0.32355317,
        0.39674301, 0.89656203, 0.93764646, 0.4000217 , 0.28815555]),
 array([0.86648271, 0.56835921, 0.59024685, 0.63746128, 0.5057951 ,
        0.92872724, 0.59922935, 0.96047515, 0.39362159, 0.31709045])]

In [275]:
df['Vector'].values.tolist()[:3]

[array([0.28533164, 0.22794147, 0.7327608 , 0.84943759, 0.39375335,
        0.17845318, 0.38323975, 0.73863453, 0.37833823, 0.80228078]),
 array([0.05732098, 0.7027165 , 0.10137313, 0.66803885, 0.8875465 ,
        0.56129409, 0.40251905, 0.48516848, 0.4736999 , 0.8761976 ]),
 array([0.83864112, 0.73115569, 0.76582351, 0.9618685 , 0.4683687 ,
        0.85834234, 0.6093806 , 0.43627257, 0.0848338 , 0.66359205])]

In [276]:
p=[]
for i in np.arange(100):
    if(i<5):
        p.append('italy')
    else:
        p.append('france')

In [277]:
d={'State':p,'Vector':l}
df=pd.DataFrame(d)

In [278]:
df

Unnamed: 0,State,Vector
0,italy,"[0.6700829187231775, 0.7251159211614744, 0.512..."
1,italy,"[0.4559781907057495, 0.7810633087254356, 0.982..."
2,italy,"[0.866482712956828, 0.5683592105705176, 0.5902..."
3,italy,"[0.7515944457528813, 0.932530642604771, 0.0393..."
4,italy,"[0.850251534835107, 0.8510008624621102, 0.7290..."
...,...,...
95,france,"[0.5868373903395457, 0.7975425968159987, 0.328..."
96,france,"[0.8685134630415027, 0.28509358383769623, 0.47..."
97,france,"[0.3184864781692527, 0.022088163490382007, 0.5..."
98,france,"[0.6428394258971026, 0.34896819446639615, 0.90..."


In [280]:
cl=dbscan(df,pca=False)   

K-neighbours = 10
K-neighbours = 10
DBSCAN finished with 4 clusters
