## Load Library

In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import SpectralClustering

import warnings

warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
%matplotlib inline

## Load Data

In [2]:
input_file = '../data/results_3k_2hop.json'

# The input is JSON Lines: one JSON object ({repository_url: [dependency, ...]})
# per line.  A single json.load() over the whole file therefore fails with
# "JSONDecodeError: Extra data: line 2 column 1", so each line is parsed on
# its own and merged into one dictionary.
#
# Note: a repository URL that occurs on more than one line is stored only once
# (the last occurrence wins), so len(repo_data) can be smaller than the number
# of lines in the file.
repo_data = {}
with open(input_file, 'r', encoding='utf-8') as file:  # closed automatically
    for line in file:
        if not line.strip():
            # Tolerate blank / trailing empty lines instead of crashing.
            continue
        # In-place merge; avoids rebuilding the whole dict for every line.
        repo_data.update(json.loads(line))

print(f'Repository numbers: {len(repo_data)}')
print(f'''https://github.com/natcap/natgeo-dams': 
{repo_data['https://github.com/natcap/natgeo-dams']}''')

Repository numbers: 2960
https://github.com/natcap/natgeo-dams': 
['pypng', 'requests', 'alabaster', 'codecov', 'detox', 'docutils', 'flake8', 'httpbin', 'more-itertools', 'pysocks', 'pytest', 'pytest-cov', 'pytest-httpbin', 'pytest-mock', 'pytest-xdist', 'readme-renderer', 'sphinx', 'tox', 'apipkg', 'appdirs', 'atomicwrites', 'attrs', 'babel', 'bleach', 'blinker', 'brotlipy', 'certifi', 'cffi', 'chardet', 'click', 'configparser', 'contextlib2', 'coverage', 'decorator', 'distlib', 'dnspython', 'entrypoints', 'enum34', 'eventlet', 'execnet', 'filelock', 'flask', 'funcsigs', 'functools32', 'greenlet', 'idna', 'imagesize', 'importlib-metadata', 'importlib-resources', 'itsdangerous', 'jinja2', 'markupsafe', 'mccabe', 'mock', 'monotonic', 'pathlib2', 'pluggy', 'py', 'pycodestyle', 'pycparser', 'pyflakes', 'pygments', 'pytest-forked', 'pytz', 'raven', 'scandir', 'singledispatch', 'six', 'snowballstemmer', 'toml', 'typing', 'urllib3', 'virtualenv', 'webencodings', 'werkzeug', 'zipp']


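### Problem: why only 2960 repositories?

Possible cause (to be confirmed): the loader keeps a single entry per repository URL, so any URL that appears on several lines of the input file is merged into one entry.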

## Process Data
1. repository name -> index
2. dependency name -> index
3. build the dependency matrix: one row per repository, one column per dependency; shape (2960, 28477)

In [3]:
## Lookup tables in both directions: name -> index and index -> name.
## Indices are handed out in order of first appearance.
rep_name2index = {}
dep_name2index = {}
for repo_url, dep_names in repo_data.items():
    if repo_url not in rep_name2index:
        rep_name2index[repo_url] = len(rep_name2index)
    for dep_name in dep_names:
        if dep_name not in dep_name2index:
            dep_name2index[dep_name] = len(dep_name2index)

# Inverse mappings (index -> name)
rep_index2name = {index: name for name, index in rep_name2index.items()}
dep_index2name = {index: name for name, index in dep_name2index.items()}

rep_num = len(rep_name2index)
dep_num = len(dep_name2index)

# Binary incidence matrix: one row per repository, one column per dependency;
# a cell is 1 when the repository lists that dependency, 0 otherwise.
repo_mat = np.zeros((rep_num, dep_num))
for repo_url, dep_names in repo_data.items():
    row = rep_name2index[repo_url]
    for dep_name in dep_names:
        repo_mat[row, dep_name2index[dep_name]] = 1

## Clustering

In [4]:
## Pairwise cosine similarity between the rows (repositories) of repo_mat;
## the last expression displays the matrix shape together with its contents.
cos_sim_mat = cosine_similarity(repo_mat)
(cos_sim_mat.shape, cos_sim_mat)

((2960, 2960),
 array([[1.        , 0.09410436, 0.        , ..., 0.13905533, 0.14509525,
         0.12681432],
        [0.09410436, 1.        , 0.03566882, ..., 0.32928732, 0.22558942,
         0.31905175],
        [0.        , 0.03566882, 1.        , ..., 0.09325048, 0.15811388,
         0.10050378],
        ...,
        [0.13905533, 0.32928732, 0.09325048, ..., 1.        , 0.58976782,
         0.77787815],
        [0.14509525, 0.22558942, 0.15811388, ..., 0.58976782, 1.        ,
         0.38138504],
        [0.12681432, 0.31905175, 0.10050378, ..., 0.77787815, 0.38138504,
         1.        ]]))

In [5]:
# Display the shape and a preview of the repository x dependency matrix.
(repo_mat.shape, repo_mat)

((2960, 28477), array([[1., 1., 1., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]]))

###  1. Kmeans with repo_mat

In [None]:
%%time

# Sweep the number of clusters for K-Means on the raw repository x dependency
# matrix and record three internal validation scores for every k.
clusters = list(range(2,15))
c_h_scores,s_scores,d_scores = [],[],[]
for k in clusters:
    # random_state pins the k-means++ initialisation so that re-running the
    # cell reproduces the same labels, and therefore the same scores.
    y_pred = KMeans(n_clusters=k, init='k-means++', max_iter=500,
                    random_state=42).fit_predict(repo_mat)
    c_h_score= metrics.calinski_harabasz_score(repo_mat, y_pred) # maximize 
    s_score = metrics.silhouette_score(repo_mat, y_pred)         # [-1,1]  want it close to 1
    d_score = metrics.davies_bouldin_score(repo_mat, y_pred)     # minimize  want it close to 0
    c_h_scores.append(c_h_score)
    s_scores.append(s_score)
    d_scores.append(d_score)
    print("n_clusters=", k,"score1:",c_h_score,"score2:",s_score,"score3:",d_score)

n_clusters= 2 score1: 1037.3159999545803 score2: 0.6070374259660937 score3: 1.251053509803207


#### => result: cluster = 4 OR 5

###  2. Kmeans + PCA with repo_mat

In [None]:
%%time
# Reduce the matrix with PCA before clustering.  A float n_components in (0, 1)
# asks scikit-learn to keep as many components as needed to explain that
# fraction (here 95%) of the variance.
pca = PCA(0.95)
X_pca = pca.fit_transform(repo_mat)
print(X_pca.shape)

# Sweep the number of clusters for K-Means on the PCA-reduced data and record
# three internal validation scores for every k.
clusters = list(range(2,15))
c_h_scores,s_scores,d_scores = [],[],[]
for k in clusters:
    # random_state pins the k-means++ initialisation so that re-running the
    # cell reproduces the same labels, and therefore the same scores.
    y_pred = KMeans(n_clusters=k, init='k-means++', max_iter=500,
                    random_state=42).fit_predict(X_pca)
    c_h_score= metrics.calinski_harabasz_score(X_pca, y_pred) # maximize 
    s_score = metrics.silhouette_score(X_pca, y_pred)         # [-1,1]  want it close to 1
    d_score = metrics.davies_bouldin_score(X_pca, y_pred)     # minimize  want it close to 0
    c_h_scores.append(c_h_score)
    s_scores.append(s_score)
    d_scores.append(d_score)
    print("n_clusters=", k,"score1:",c_h_score,"score2:",s_score,"score3:",d_score)

#### => result: cluster = 3 OR 4

###  3. Kmeans with cos_mat

In [None]:
%%time
# Sweep the number of clusters for K-Means on the cosine-similarity matrix.
# Each row of cos_sim_mat (a repository's similarity to every other repository)
# is used as that repository's feature vector.
clusters = list(range(2,15))
c_h_scores,s_scores,d_scores = [],[],[]
for k in clusters:
    # random_state pins the k-means++ initialisation so that re-running the
    # cell reproduces the same labels, and therefore the same scores.
    y_pred = KMeans(n_clusters=k, init='k-means++', max_iter=500,
                    random_state=42).fit_predict(cos_sim_mat)
    c_h_score= metrics.calinski_harabasz_score(cos_sim_mat, y_pred) # maximize 
    s_score = metrics.silhouette_score(cos_sim_mat, y_pred)         # [-1,1]  want it close to 1
    d_score = metrics.davies_bouldin_score(cos_sim_mat, y_pred)     # minimize  want it close to 0
    c_h_scores.append(c_h_score)
    s_scores.append(s_score)
    d_scores.append(d_score)
    print("n_clusters=", k,"score1:",c_h_score,"score2:",s_score,"score3:",d_score)

#### => result: cluster = 3 OR 4

### Spectral Cluster 

Spectral clustering treats the data as a graph: the edge weight between two points is low when they are far apart and high when they are close together.

Strength:<br>
1) Spectral clustering only needs the similarity matrix between data points, so it is very effective for clustering sparse data, which is difficult for traditional clustering algorithms such as K-Means.<br>
2) Because it applies dimensionality reduction, it handles the clustering of high-dimensional data with lower complexity than traditional clustering algorithms.

Weakness:<br>
1) If the number of dimensions kept for the final clustering step is still very high, the dimensionality reduction has not gone far enough, and both the running speed and the clustering quality of spectral clustering suffer.<br>
2) The clustering result depends on the similarity matrix; different similarity matrices can lead to very different final clusterings.

In [None]:
%%time
# Sweep the number of clusters for spectral clustering, passing the
# cosine-similarity matrix directly as the precomputed affinity matrix.
# NOTE(review): the three scores below treat each row of cos_sim_mat as a
# feature vector, exactly as the K-Means sweeps on cos_sim_mat do.
clusters = list(range(2,15))
c_h_scores,s_scores,d_scores = [],[],[]
for k in clusters:
    # random_state makes the randomised parts of the estimator repeatable so
    # that re-running the cell reproduces the same labels and scores.
    y_pred = SpectralClustering(n_clusters=k, affinity='precomputed',
                                random_state=42).fit_predict(cos_sim_mat)
    c_h_score= metrics.calinski_harabasz_score(cos_sim_mat, y_pred) # maximize 
    s_score = metrics.silhouette_score(cos_sim_mat, y_pred)         # [-1,1]  want it close to 1
    d_score = metrics.davies_bouldin_score(cos_sim_mat, y_pred)     # minimize  want it close to 0
    c_h_scores.append(c_h_score)
    s_scores.append(s_score)
    d_scores.append(d_score)
    print("n_clusters=", k,"score1:",c_h_score,"score2:",s_score,"score3:",d_score)

# Warning observed on an earlier run:
# /opt/anaconda3/envs/Knowledge_Graph/lib/python3.6/site-packages/sklearn/manifold/_spectral_embedding.py:236: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
#   warnings.warn("Graph is not fully connected, spectral embedding"

#### => result: cluster = 10

### HDBSCAN Cluster 

Strength:
1. Compared with K-Means, HDBSCAN does not need the number of clusters to be declared in advance.
2. It can cluster dense data sets of any shape. In contrast, clustering algorithms such as K-Means are generally only suitable for convex data sets.
3. Abnormal points (noise) can be found while clustering, and the algorithm is not sensitive to abnormal points in the data set.
4. The clustering results do not depend on an initial guess. In contrast, the initial values chosen by clustering algorithms such as K-Means have a great influence on the clustering results.

In [None]:
%%time
import hdbscan
import numpy as np
from itertools import product

# Hyper-parameter grid: every combination of the four lists below is tried.
min_cluster_sizes = [5,10,15,20]
min_sampless = [10,20,30,40,50]
cluster_selection_epsilons = [0.1,0.3,0.5]
alphas = [0.25,0.5,0.75,1.0]

# product() walks the grid in the same order as four nested loops would
# (the right-most list, alphas, varies fastest).
for min_cluster_size, min_samples, cluster_selection_epsilon, alpha in product(
        min_cluster_sizes, min_sampless, cluster_selection_epsilons, alphas):
    print("min_cluster_size:",min_cluster_size,'min_samples:',min_samples,
          "cluster_selection_epsilon:",cluster_selection_epsilon,'alpha:',alpha)

    cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,min_samples=min_samples,
                              cluster_selection_epsilon=cluster_selection_epsilon,alpha=alpha)
    y_pred= cluster.fit_predict(cos_sim_mat)
    labels = cluster.labels_

    # Label -1 marks noise points and is not counted as a cluster.
    y_unique = np.unique(labels)
    n_clusters = y_unique.size - (1 if -1 in y_unique else 0)

    # The scoring functions need at least two distinct labels and raise a
    # ValueError otherwise; skip such settings so that one degenerate result
    # (for example "everything is noise") does not abort the whole search.
    if y_unique.size < 2:
        print("clusters:",n_clusters,"scores skipped: fewer than 2 distinct labels")
        print('='*50)
        continue

    # NOTE(review): noise points (label -1) are passed to the scores as if
    # they formed one more group; confirm this is the intended evaluation.
    c_h_score= metrics.calinski_harabasz_score(cos_sim_mat, y_pred) # maximize 
    s_score = metrics.silhouette_score(cos_sim_mat, y_pred)         # [-1,1]  want it close 1
    d_score = metrics.davies_bouldin_score(cos_sim_mat, y_pred)     # minimize 
    print("clusters:",n_clusters ,"score1:",c_h_score,"score2:",s_score,"score3:",d_score)
    print('='*50)

min_cluster_size: 10 min_samples: 40 cluster_selection_epsilon: 0.3 alpha: 1.0
clusters: 7 score1: 461.83448416490046 score2: 0.10818213466051357 score3: 1.29953322175402
                
min_cluster_size: 20 min_samples: 50 cluster_selection_epsilon: 0.3 alpha: 1.0
clusters: 5 score1: 516.2251302409094 score2: 0.07802924382561442 score3: 1.583326675965355
                