In [1]:
!pip install plotly



In [2]:
# array, dataframe manipulation
import math
import numpy as np
import os
import pandas as pd
from six import iteritems
from functools import reduce

# cluster
from sklearn import metrics
from sklearn.cluster import OPTICS, cluster_optics_dbscan,DBSCAN 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# plot
import plotly
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec


# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(9999)

In [None]:
# Conduct DBSCAN for comparison

def hyperparameter_tuning(df):
    """Derive clustering hyperparameters from the number of rows in ``df``.

    The heuristic works on ``scaled = len(df) // 10``:

    * ``min_cluster_size = round(scaled / 100)``
    * ``min_samples      = round(ln(scaled)) - 1``
    * ``xi               = min_samples / 100``

    Args:
        df: Sized container holding only the parameters used for clustering
            (typically a DataFrame). Only ``len(df)`` is read.

    Returns:
        Tuple ``(min_cluster_size, min_samples, xi)``.

    Raises:
        ValueError: If ``df`` has fewer than 10 rows; ``len(df) // 10`` is
            then 0 and its logarithm is undefined.

    NOTE(review): for small inputs the heuristic yields degenerate values
    (e.g. ``min_cluster_size == 0`` up to roughly 500 rows, and
    ``min_samples <= 0`` for a few dozen rows), which the clustering
    estimator will presumably reject -- validate before use.

    Reference:
        https://stackoverflow.com/questions/12893492/choosing-eps-and-minpts-for-dbscan-r
    """
    n_rows = len(df)
    if n_rows < 10:
        raise ValueError(
            "hyperparameter_tuning needs at least 10 rows, got {}".format(n_rows)
        )

    # Floor division drops the last decimal digit of the row count.
    scaled = n_rows // 10

    min_cluster_size = int(round(scaled / 100))
    min_samples = int(round(np.log(scaled))) - 1
    xi = min_samples / 100

    return min_cluster_size, min_samples, xi
  
############################################################################
def optic_plot(df_for_cluster, min_samples, xi, min_cluster_size):
    """Fit OPTICS on ``df_for_cluster`` and draw its reachability plot.

    Args:
        df_for_cluster: DataFrame whose ``.values`` are the features to cluster.
        min_samples: Forwarded to ``OPTICS(min_samples=...)``.
        xi: Forwarded to ``OPTICS(xi=...)``.
        min_cluster_size: Forwarded to ``OPTICS(min_cluster_size=...)``.

    Returns:
        The result of ``plt.show()``.
    """
    X = df_for_cluster.values

    clust = OPTICS(min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size)
    clust.fit(X)

    # Reorder reachability distances and labels into OPTICS' cluster ordering
    # so neighbouring points on the x-axis are neighbours in the ordering.
    ordering = clust.ordering_
    space = np.arange(len(X))
    reachability = clust.reachability_[ordering]
    labels = clust.labels_[ordering]

    plt.figure(figsize=(10, 7))
    G = gridspec.GridSpec(2, 3)
    ax1 = plt.subplot(G[0, :])

    # Reachability plot: one colour per cluster. The palette is cycled so that
    # clusters beyond the fifth are still drawn instead of silently omitted.
    colors = ['g.', 'r.', 'b.', 'y.', 'c.']
    for klass in np.unique(labels[labels >= 0]):
        members = labels == klass
        ax1.plot(space[members], reachability[members],
                 colors[klass % len(colors)], alpha=0.3)

    # Points labelled -1 are drawn in black.
    noise = labels == -1
    ax1.plot(space[noise], reachability[noise], 'k.', alpha=0.3)
    ax1.set_ylabel('Reachability (epsilon distance)')
    ax1.set_title('Reachability Plot')

    plt.tight_layout()
    return plt.show()

  
############################################################################
def dbscan_kmeans_c(cluster_type, scaled_df, raw_df, k=None, eps=None, min_samples=None, metric=None):
    """Cluster ``scaled_df`` with DBSCAN or KMeans and label ``raw_df``.

    DBSCAN is useful when the number of clusters should not be fixed up front.

    Choosing ``metric``: Manhattan distance suits input variables that are not
    similar in type (such as age, gender, height, etc.); Euclidean distance
    becomes a poor choice as the number of dimensions increases.

    Args:
        cluster_type: ``'dbscan'`` or ``'kmeans'``.
        scaled_df: Scaled data ready for clustering; expected to share the
            same rows (and index) as ``raw_df``.
        raw_df: Original data the cluster labels are attached to.
        k: Number of clusters; required when ``cluster_type == 'kmeans'``.
        eps: DBSCAN ``eps``; when ``None`` the estimator's default is used.
        min_samples: DBSCAN ``min_samples``; when ``None`` the estimator's
            default is used.
        metric: DBSCAN distance metric, e.g. ``'euclidean'``; when ``None``
            the estimator's default is used.

    Returns:
        ``raw_df`` with a leading ``'cluster'`` column holding the labels, or
        ``0`` (after printing a message) if an ``ArithmeticError`` occurs
        while fitting or assembling the result.

    Raises:
        ValueError: If ``cluster_type`` is not recognised, or ``k`` is missing
            for KMeans.
    """
    if cluster_type == 'dbscan':
        # Pass hyperparameters by keyword (scikit-learn makes everything after
        # ``eps`` keyword-only) and omit the ones left as None so the
        # estimator's own defaults apply instead of an invalid None.
        requested = {'eps': eps, 'min_samples': min_samples, 'metric': metric}
        params = {name: value for name, value in requested.items() if value is not None}
        model = DBSCAN(**params)
    elif cluster_type == 'kmeans':
        if k is None:
            raise ValueError("k must be provided when cluster_type is 'kmeans'")
        model = KMeans(n_clusters=k)
    else:
        raise ValueError(
            "cluster_type must be 'dbscan' or 'kmeans', got {!r}".format(cluster_type)
        )

    try:
        fitted = model.fit(scaled_df)

        # Attach labels on raw_df's own index: concat aligns on index, so a
        # fresh 0..n-1 index would misalign whenever raw_df is not indexed
        # 0..n-1 (e.g. after filtering rows).
        cluster_map = pd.DataFrame({'cluster': fitted.labels_}, index=raw_df.index)
        return pd.concat([cluster_map, raw_df], axis=1)
    except ArithmeticError:
        # Preserve the original best-effort contract: report and return 0.
        print("no data loaded...")
        return 0
