In [2]:
# pip install pyclustering

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyclustering
  Downloading pyclustering-0.10.1.2.tar.gz (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 34.8 MB/s 
Building wheels for collected packages: pyclustering
  Building wheel for pyclustering (setup.py) ... [?25l[?25hdone
  Created wheel for pyclustering: filename=pyclustering-0.10.1.2-py3-none-any.whl size=2395122 sha256=6a1af3ae814d237890b0ccc6784ef6f3f44c146cdfcb4e7e5ff730c57b063147
  Stored in directory: /root/.cache/pip/wheels/ea/87/6b/1e0568b5ba9dc6518a25338bae90bd8392f35206bb90bb10f1
Successfully built pyclustering
Installing collected packages: pyclustering
Successfully installed pyclustering-0.10.1.2


In [None]:
from numpy.random.mtrand import randint

import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
from sklearn import preprocessing
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from pyclustering.cluster.clarans import clarans as CLARANS
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth

from sklearn.metrics import silhouette_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from itertools import combinations

import seaborn as sns

import copy

import matplotlib.pyplot as plt
import numpy as np


# Change CLARANS result to ScikitLearn result
def clarans_label_converter(labels):
  """Flatten per-cluster index lists into one label-per-sample array.

  Args:
    labels: sequence of clusters, each a sequence of sample indices (the
      shape returned by ``get_clusters()`` in ``findCLARANS``).

  Returns:
    Integer numpy array whose length is the total number of listed indices;
    position ``idx`` holds the position of the cluster that listed ``idx``.
    Positions that no cluster lists are left uninitialised (``np.empty``).
  """
  n_points = sum(len(members) for members in labels)
  flat_labels = np.empty((n_points), dtype=int)

  for cluster_id, members in enumerate(labels):
    for point_index in members:
      flat_labels[point_index] = cluster_id
  return flat_labels

def findKMeans(X,cluster_k_value,cur_scaler):
  """Fit KMeans on ``X`` and record it as the best result if its silhouette wins.

  Args:
    X: already-scaled feature matrix to cluster.
    cluster_k_value: number of clusters passed to ``KMeans(n_clusters=...)``.
    cur_scaler: the scaler that produced ``X``; stored in ``best_scaler``
      when this run becomes the best one.

  Side effects:
    May update the module-level ``best_score``, ``best_scaler``,
    ``best_model``, ``best_k_`` and ``labels_``.
  """
  global best_score,best_k_,best_scaler,best_model,labels_
  model = KMeans(n_clusters=cluster_k_value)
  labels = model.fit_predict(X)

  # silhouette_score needs at least two distinct labels; skip degenerate fits
  # (same guard findDBSCAN applies).
  if np.unique(labels).size < 2:
    return

  score_result = silhouette_score(X, labels)
  # A strictly higher silhouette score replaces the current best options.
  if best_score < score_result:
    best_score = score_result
    best_scaler = cur_scaler
    # Keep the fitted estimator itself: a fresh KMeans() would lose both the
    # chosen n_clusters and the learned centroids.
    best_model = model
    best_k_ = cluster_k_value
    labels_ = copy.deepcopy(labels)

def findGMM(X,cluster_k_value,cur_scaler):
  """Fit a Gaussian mixture on ``X`` and record it if its silhouette wins.

  Args:
    X: already-scaled feature matrix to cluster.
    cluster_k_value: number of components passed to
      ``GaussianMixture(n_components=...)``.
    cur_scaler: the scaler that produced ``X``; stored in ``best_scaler``
      when this run becomes the best one.

  Side effects:
    May update the module-level ``best_score``, ``best_scaler``,
    ``best_model``, ``best_k_`` and ``labels_``.
  """
  global best_score,best_k_,best_scaler,best_model,labels_
  model = GaussianMixture(n_components=cluster_k_value)
  labels = model.fit_predict(X)

  # Every sample can end up in a single component; silhouette_score needs at
  # least two distinct labels, so skip such fits (same guard as findDBSCAN).
  if np.unique(labels).size < 2:
    return

  score_result = silhouette_score(X, labels)
  # A strictly higher silhouette score replaces the current best options.
  if best_score < score_result:
    best_score = score_result
    best_scaler = cur_scaler
    # Keep the fitted mixture: a fresh GaussianMixture() would lose both the
    # chosen n_components and the fitted parameters.
    best_model = model
    best_k_ = cluster_k_value
    labels_ = copy.deepcopy(labels)

def findCLARANS(X,cluster_k_value,cur_scaler,numlocal=2,maxneighbor=3):
  """Run CLARANS on ``X`` and record it if its silhouette wins.

  Args:
    X: already-scaled feature matrix; must support ``.tolist()``.
    cluster_k_value: number of clusters passed as ``number_clusters``.
    cur_scaler: the scaler that produced ``X``; stored in ``best_scaler``
      when this run becomes the best one.
    numlocal: forwarded to CLARANS (default 2, the previously hard-coded value).
    maxneighbor: forwarded to CLARANS (default 3, the previously hard-coded value).

  Side effects:
    May update the module-level ``best_score``, ``best_scaler``,
    ``best_model``, ``best_k_`` and ``labels_``.
  """
  global best_score,best_k_,best_scaler,best_model,labels_
  model = CLARANS(data=X.tolist(),number_clusters=cluster_k_value,
                  numlocal=numlocal, maxneighbor=maxneighbor)
  model.process()
  # CLARANS reports clusters as lists of sample indices; convert them to one
  # label per sample so silhouette_score can consume them.
  labels = clarans_label_converter(labels=model.get_clusters())

  # silhouette_score needs at least two distinct labels (same guard as findDBSCAN).
  if np.unique(labels).size < 2:
    return

  score_result = silhouette_score(X, labels)
  # A strictly higher silhouette score replaces the current best options.
  if best_score < score_result:
    best_score = score_result
    best_scaler = cur_scaler
    # Keep the processed instance instead of building a new, unprocessed one
    # (which also converted X to a list a second time).
    best_model = model
    best_k_ = cluster_k_value
    labels_ = copy.deepcopy(labels)

def findDBSCAN(X,cur_scaler,eps=0.5,min_samples=2):
  """Run DBSCAN on ``X`` and record it if its silhouette wins.

  Args:
    X: already-scaled feature matrix to cluster.
    cur_scaler: the scaler that produced ``X``; stored in ``best_scaler``
      when this run becomes the best one.
    eps: forwarded to DBSCAN (default 0.5, the previously hard-coded value).
    min_samples: forwarded to DBSCAN (default 2, the previously hard-coded value).

  Side effects:
    May update the module-level ``best_score``, ``best_scaler``,
    ``best_model``, ``best_k_`` and ``labels_``. ``best_k_`` is set to the
    number of distinct values in the predicted labels, so any noise label
    DBSCAN emits is counted as well.
  """
  global best_score,best_k_,best_scaler,best_model,labels_
  model = DBSCAN(eps=eps, min_samples=min_samples)
  labels = model.fit_predict(X)

  # When there is only one distinct label, skip scoring.
  distinct_label_count = np.unique(labels).size
  if distinct_label_count < 2:
    return

  score_result = silhouette_score(X, labels)
  # A strictly higher silhouette score replaces the current best options.
  if best_score < score_result:
    best_score = score_result
    best_scaler = cur_scaler
    # Keep the fitted estimator rather than a fresh, unfitted DBSCAN.
    best_model = model
    best_k_ = distinct_label_count
    labels_ = copy.deepcopy(labels)

def AutoML(X,model_name,scalers,cluster_k):
  """Try every scaler / k / model combination and let the finders keep the best.

  Args:
    X: raw (unscaled) feature data accepted by each scaler's ``fit_transform``.
    model_name: iterable of model names; recognised values are "KMeans",
      "GMM", "CLARANS" and "DBSCAN". Anything else prints "no model".
    scalers: sequence of scaler objects exposing ``fit_transform``.
    cluster_k: sequence of cluster counts to try for the k-based models.

  Side effects:
    Prints one progress line per (scaler, k) pair and delegates to
    ``findKMeans`` / ``findGMM`` / ``findCLARANS`` / ``findDBSCAN``, which
    update the module-level best-result variables.
  """
  total_case = len(scalers) * len(cluster_k)
  cur_case = 1

  for scaler in scalers:
    # Always scale the ORIGINAL data. Rebinding X here would feed each scaler
    # the output of the previous one.
    X_scaled = scaler.fit_transform(X)

    for k_position, k_value in enumerate(cluster_k):
      print("Progressing: ({}/{})".format(cur_case, total_case))
      cur_case += 1

      # Distinct loop variable for the models: reusing the k-loop index made
      # cluster_k be indexed by model position instead of by k.
      for name in model_name:
        if name == "KMeans":
          findKMeans(X_scaled, k_value, scaler)
        elif name == "GMM":
          findGMM(X_scaled, k_value, scaler)
        elif name == "CLARANS":
          findCLARANS(X_scaled, k_value, scaler)
        elif name == "DBSCAN":
          # DBSCAN takes no k, so one run per scaler is enough.
          if k_position == 0:
            findDBSCAN(X_scaled, scaler)
        else:
          print("no model")


# Load the housing dataset from the mounted Drive folder
base_src='./drive/MyDrive'
df = pd.read_csv(base_src+"/housing.csv")

# Preprocessing
# Discard every row that contains a missing value, then renumber rows from 0
df = df.dropna().reset_index(drop=True)


# Encode the categorical 'ocean_proximity' column as integer codes
print("Encoder: LabelEncoder")
label_encoder = LabelEncoder()
result = label_encoder.fit_transform(df['ocean_proximity'])
df['ocean_proximity'] = result

# Module-level "best so far" state that the find* functions update
best_scaler = best_model = labels_ = best_k_ = None
best_score = -1.0

# Median_house_value
median_house_value = df['median_house_value']

# Feature
# Keep an untouched copy of the preprocessed frame so the cluster labels can
# be joined back onto it after clustering
dft = copy.deepcopy(df)
print("Median_house_value:", df['median_house_value'].describe(), sep="\n")
print(df.to_numpy())

print("Median_house_value:", df['median_house_value'].describe(), sep="\n")

print("to_numpy: ", df.to_numpy(), sep="")
print("Columns: ", df.columns, sep="")

# Models searched in this run. CLARANS is supported by AutoML but left out here;
# the full list would be ["KMeans","GMM","DBSCAN","CLARANS"].
model_name=["KMeans","GMM","DBSCAN"]
AutoML(
  df,
  model_name,
  [StandardScaler(), RobustScaler(), MinMaxScaler(), MaxAbsScaler()],
  range(2,12),
)

# Report the winning combination recorded in the module-level variables
print("\nBest Scaler: ", best_scaler, sep="")
print("Best Model: ", best_model, sep="")
print("Score: ", best_score, sep="")
print("labels: ", labels_, sep="")
print("k: ", best_k_, sep="")


# Analyze
if labels_ is None:
  raise RuntimeError("No clustering result was recorded; run AutoML before analyzing")

# Extract the distinct cluster labels. Non-negative labels come first in
# ascending order and negative labels (e.g. a noise label) go last, so that
# clusters_df[i] always belongs to cluster_info[i].
cluster_labels = np.asarray(labels_)
distinct_labels = np.unique(cluster_labels)
cluster_info = np.concatenate(
  [distinct_labels[distinct_labels >= 0], distinct_labels[distinct_labels < 0]]
)

# Make one dataframe per cluster with a single boolean selection each.
# (DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# row by row is quadratic in the number of rows.)
clusters_df = [dft.loc[cluster_labels == label] for label in cluster_info]

print("Median_house_value:")
print(dft['median_house_value'].describe())

# Print describe() to analyze clusters
print("Cluster Info:", cluster_info)
for label, cluster_frame in zip(cluster_info, clusters_df):
  print("Cluster", label)
  print(cluster_frame.describe())
  print("\n")


Encoder: LabelEncoder
Median_house_value:
count     20433.000000
mean     206864.413155
std      115435.667099
min       14999.000000
25%      119500.000000
50%      179700.000000
75%      264700.000000
max      500001.000000
Name: median_house_value, dtype: float64
[[-1.2223e+02  3.7880e+01  4.1000e+01 ...  8.3252e+00  4.5260e+05
   3.0000e+00]
 [-1.2222e+02  3.7860e+01  2.1000e+01 ...  8.3014e+00  3.5850e+05
   3.0000e+00]
 [-1.2224e+02  3.7850e+01  5.2000e+01 ...  7.2574e+00  3.5210e+05
   3.0000e+00]
 ...
 [-1.2122e+02  3.9430e+01  1.7000e+01 ...  1.7000e+00  9.2300e+04
   1.0000e+00]
 [-1.2132e+02  3.9430e+01  1.8000e+01 ...  1.8672e+00  8.4700e+04
   1.0000e+00]
 [-1.2124e+02  3.9370e+01  1.6000e+01 ...  2.3886e+00  8.9400e+04
   1.0000e+00]]
Median_house_value:
count     20433.000000
mean     206864.413155
std      115435.667099
min       14999.000000
25%      119500.000000
50%      179700.000000
75%      264700.000000
max      500001.000000
Name: median_house_value, dtype: floa