In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import metrics
from sklearn.metrics.cluster import contingency_matrix
from sklearn.model_selection import train_test_split

In [53]:
from DBSCAN import DBSCAN as c_DBSCAN
from Agglomerative import Agglomerative as c_Agglomerative
c_KMeans = __import__('Draft K-Means').KMeans

In [54]:
df = pd.read_csv('dataset/iris.data', header=None)

In [55]:
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [56]:
x = df.drop([4], axis=1)
y = df[4]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [57]:
y.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [58]:
def get_cluster_mapping_to_target(x, y, label):
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, label))
    print("Completeness: %0.3f" % metrics.completeness_score(y, label))
    print("V-measure: %0.3f" % metrics.v_measure_score(y, label))
    print("Shilouette: %0.3f" % metrics.silhouette_score(x, label))

## DBSCAN

In [59]:
cdbscan_model = c_DBSCAN(x_train, 0.8, 2)
cdbscan_label = cdbscan_model.get_all()

In [60]:
cdbscan_map = get_cluster_mapping_to_target(x_train, y_train, cdbscan_label)

Homogeneity: 0.602
Completeness: 0.911
V-measure: 0.725
Shilouette: 0.520


In [61]:
model_dbscan = DBSCAN(eps=0.8, min_samples=2).fit(x_train)
model_dbscan.labels_

array([0, 1, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0,
       0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0,
       0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0,
       2, 0, 2, 0, 2, 0, 0, 0, 0, 0], dtype=int64)

In [62]:
dbscan_map = get_cluster_mapping_to_target(x_train, y_train, model_dbscan.labels_)

Homogeneity: 0.602
Completeness: 0.911
V-measure: 0.725
Shilouette: 0.520


## Agglomerative

### Single

In [63]:
cagglomerative_single_model = c_Agglomerative(x_train, 3, 'single')
cagglomerative_single_label = cagglomerative_single_model.get_all()

In [64]:
cagglomerative_single_map = get_cluster_mapping_to_target(x_train, y_train, cagglomerative_single_label)

Homogeneity: 0.602
Completeness: 0.911
V-measure: 0.725
Shilouette: 0.520


In [65]:
model_single = AgglomerativeClustering(linkage="single").fit(x_train)
model_single.labels_

array([0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [66]:
single_map = get_cluster_mapping_to_target(x_train, y_train, model_single.labels_)

Homogeneity: 0.590
Completeness: 1.000
V-measure: 0.742
Shilouette: 0.683


### Complete

In [67]:
cagglomerative_complete_model = c_Agglomerative(x_train, 3, 'complete')
cagglomerative_complete_label = cagglomerative_complete_model.get_all()

In [68]:
cagglomerative_complete_map = get_cluster_mapping_to_target(x_train, y_train, cagglomerative_complete_label)

Homogeneity: 0.808
Completeness: 0.831
V-measure: 0.820
Shilouette: 0.551


In [69]:
model_complete = AgglomerativeClustering(linkage="complete").fit(x_train)
model_complete.labels_

array([0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [70]:
complete_map = get_cluster_mapping_to_target(x_train, y_train, model_complete.labels_)

Homogeneity: 0.590
Completeness: 1.000
V-measure: 0.742
Shilouette: 0.683


### Average

In [71]:
cagglomerative_average_model = c_Agglomerative(x_train, 3, 'average')
cagglomerative_average_label = cagglomerative_average_model.get_all()

In [72]:
cagglomerative_average_map = get_cluster_mapping_to_target(x_train, y_train, cagglomerative_average_label)

Homogeneity: 0.626
Completeness: 0.834
V-measure: 0.715
Shilouette: 0.550


In [73]:
model_average = AgglomerativeClustering(linkage="average").fit(x_train)
model_average.labels_

array([0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [74]:
average_map = get_cluster_mapping_to_target(x_train, y_train, model_average.labels_)

Homogeneity: 0.590
Completeness: 1.000
V-measure: 0.742
Shilouette: 0.683


### Average Group

In [75]:
cagglomerative_avg_g_model = c_Agglomerative(x_train, 3, 'average_group')
cagglomerative_avg_g_label = cagglomerative_avg_g_model.get_all()

In [76]:
cagglomerative_avg_g_map = get_cluster_mapping_to_target(x_train, y_train, cagglomerative_avg_g_label)

Homogeneity: 0.646
Completeness: 0.806
V-measure: 0.717
Shilouette: 0.548


In [77]:
model_ward = AgglomerativeClustering(linkage="ward").fit(x_train)
model_ward.labels_

array([0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [78]:
ward_map = get_cluster_mapping_to_target(x_train, y_train, model_ward.labels_)

Homogeneity: 0.590
Completeness: 1.000
V-measure: 0.742
Shilouette: 0.683


## KMeans

In [79]:
ckmeans_model = c_KMeans(3, 1e-8, 1000)
ckmeans_label = ckmeans_model.fit_predict(x_train)

  1%|▍                                                                                | 6/1000 [00:00<00:17, 58.05it/s]


In [80]:
print(ckmeans_label)

[1, 1, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 1, 1, 0, 2, 0, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 2, 2, 0, 1, 0, 2, 1, 2, 1, 1, 0, 1, 2, 2, 2, 2, 2, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 2, 1, 1, 2, 1, 0, 1, 1, 2, 0, 2, 1, 1, 0, 2, 1, 2, 2, 2, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 0, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2, 0, 2, 1, 2, 1, 1, 1, 0, 1]


In [81]:
ckmeans_map = get_cluster_mapping_to_target(x_train, y_train, ckmeans_label)

Homogeneity: 0.782
Completeness: 0.801
V-measure: 0.792
Shilouette: 0.517


In [82]:
kmeans_model = KMeans(3, tol=1e-8)
kmeans_model.fit(x_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=1e-08, verbose=0)

In [83]:
kmeans_model.labels_

array([1, 1, 0, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 1, 2, 0, 2, 0, 1,
       1, 0, 0, 0, 2, 2, 1, 1, 0, 2, 1, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 1,
       2, 2, 2, 0, 0, 0, 0, 0, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 0, 1, 1,
       0, 1, 2, 1, 2, 0, 2, 0, 1, 2, 2, 0, 1, 0, 0, 0, 1, 0, 2, 2, 2, 1,
       1, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 1, 0, 2, 2, 0, 2, 1, 1, 0, 2,
       0, 2, 0, 1, 0, 2, 2, 2, 2, 1])

In [84]:
kmeans_map = get_cluster_mapping_to_target(x_train, y_train, kmeans_model.labels_)

Homogeneity: 0.756
Completeness: 0.771
V-measure: 0.764
Shilouette: 0.546
