In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [2]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import metrics
from sklearn.metrics.cluster import contingency_matrix
from sklearn.model_selection import train_test_split

In [3]:
from DBSCAN import DBSCAN as c_DBSCAN
from Agglomerative import Agglomerative as c_Agglomerative
from commons import euclidean_distance
c_KMeans = __import__('Draft K-Means').KMeans

In [4]:
df = pd.read_csv('dataset/iris.data', header=None)

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
x = df.drop([4], axis=1)
y = df[4]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
y.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [8]:
def print_info(y, label):
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, label))
    print("Completeness: %0.3f" % metrics.completeness_score(y, label))
    print("V-measure: %0.3f" % metrics.v_measure_score(y, label))
#     print("Shilouette: %0.3f" % metrics.silhouette_score(y, label))

In [9]:
def get_mapping_to_label(n_cluster, y, label):
    uc = np.unique(np.array(y))
    cluster = [{} for i in range(len(uc))]
    label = label.reset_index(drop=True)
    for i in range(len(y)):
        if y[i] is not None and y[i] >= 0:
            loc = np.where(uc == y[i])[0][0]
            if label[i] in cluster[loc]:
                cluster[loc][label[i]] += 1
            else:
                cluster[loc][label[i]] = 0
    map = {}
    for i in range(len(uc)):
        if cluster[i]:
            map[uc[i]] = max(cluster[i], key=cluster[i].get)
    return map

In [10]:
def apply_map_to_cluster(y, map):
    return [map[i] for i in y]

In [11]:
def get_accuracy(y_pred, y_test):
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_test[i]:
            count += 1
    return count / len(y_pred)

In [12]:
def sklean_predict(x_train, y_train, x_test):
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train)
    y_pred = []
    for point in x_test:
        min_dist = math.inf
        cluster = None
        for i in range(len(x_train)):
            distance = euclidean_distance(point, x_train[i])
            if distance < min_dist and y_train[i] >= 0:
                min_dist = distance
                cluster = y_train[i]
        y_pred.append(cluster)
    return y_pred

## DBSCAN

In [13]:
cdbscan_model = c_DBSCAN(x_train, 0.8, 2)
cdbscan_label = cdbscan_model.get_all()
cdbscan_map = get_mapping_to_label(3, cdbscan_label, y_train)

In [14]:
print_info(y_train, cdbscan_label)

Homogeneity: 0.574
Completeness: 0.904
V-measure: 0.702


In [15]:
cdbscan_pred = []
for i, row in x_test.iterrows():
    cdbscan_pred.append(cdbscan_model.predict(row))

print(
    'Our DBSCAN accuracy = ',
    get_accuracy(apply_map_to_cluster(cdbscan_pred, cdbscan_map), y_test.reset_index(drop=True))
)

Our DBSCAN accuracy =  0.7


In [16]:
model_dbscan = DBSCAN(eps=0.8, min_samples=2).fit(x_train)
dbscan_map = get_mapping_to_label(3, model_dbscan.labels_, y_train)

In [17]:
print_info(y_train, model_dbscan.labels_)

Homogeneity: 0.574
Completeness: 0.904
V-measure: 0.702


In [18]:
dbscan_pred = sklean_predict(x_train, model_dbscan.labels_, x_test)
print(
    'Our DBSCAN accuracy = ',
    get_accuracy(apply_map_to_cluster(dbscan_pred, dbscan_map), y_test.reset_index(drop=True))
)

Our DBSCAN accuracy =  0.7


## Agglomerative

### Single

In [19]:
cagglomerative_single_model = c_Agglomerative(x_train, 3, 'single')
cagglomerative_single_label = cagglomerative_single_model.get_all()
cagglomerative_single_map = get_mapping_to_label(3, cagglomerative_single_label, y_train)

In [20]:
print_info(y_train, cagglomerative_single_label)

Homogeneity: 0.574
Completeness: 0.904
V-measure: 0.702


In [21]:
cagglomerative_single_pred = []
for i, row in x_test.iterrows():
    cagglomerative_single_pred.append(cagglomerative_single_model.predict(row))

print(
    'Our Agglomerative Single accuracy = ',
    get_accuracy(apply_map_to_cluster(cagglomerative_single_pred, cagglomerative_single_map), y_test.reset_index(drop=True))
)

Our Agglomerative Single accuracy =  0.7


In [22]:
model_single = AgglomerativeClustering(3, linkage="single").fit(x_train)
agg_single_map = get_mapping_to_label(3, model_single.labels_, y_train)

In [23]:
print_info(y_train, model_single.labels_)

Homogeneity: 0.574
Completeness: 0.904
V-measure: 0.702


In [24]:
agg_single_pred = sklean_predict(x_train, model_single.labels_, x_test)
print(
    'Our DBSCAN accuracy = ',
    get_accuracy(apply_map_to_cluster(agg_single_pred, agg_single_map), y_test.reset_index(drop=True))
)

Our DBSCAN accuracy =  0.7


### Complete

In [25]:
cagglomerative_complete_model = c_Agglomerative(x_train, 3, 'complete')
cagglomerative_complete_label = cagglomerative_complete_model.get_all()
cagglomerative_complete_map = get_mapping_to_label(3, cagglomerative_complete_label, y_train)

In [26]:
print_info(y_train, cagglomerative_complete_label)

Homogeneity: 0.679
Completeness: 0.726
V-measure: 0.701


In [27]:
cagglomerative_complete_pred = []
for i, row in x_test.iterrows():
    cagglomerative_complete_pred.append(cagglomerative_complete_model.predict(row))

print(
    'Our Agglomerative Complete accuracy = ',
    get_accuracy(apply_map_to_cluster(cagglomerative_complete_pred, cagglomerative_complete_map), y_test.reset_index(drop=True))
)

Our Agglomerative Complete accuracy =  0.9


In [28]:
model_complete = AgglomerativeClustering(linkage="complete").fit(x_train)
agg_complete_map = get_mapping_to_label(3, model_complete.labels_, y_train)

In [29]:
print_info(y_train, model_complete.labels_)

Homogeneity: 0.375
Completeness: 0.594
V-measure: 0.460


In [30]:
agg_complete_pred = sklean_predict(x_train, model_complete.labels_, x_test)
print(
    'Our DBSCAN accuracy = ',
    get_accuracy(apply_map_to_cluster(agg_complete_pred, agg_complete_map), y_test.reset_index(drop=True))
)

Our DBSCAN accuracy =  0.7333333333333333


### Average

In [31]:
cagglomerative_average_model = c_Agglomerative(x_train, 3, 'average')
cagglomerative_average_label = cagglomerative_average_model.get_all()
cagglomerative_average_map = get_mapping_to_label(3, cagglomerative_average_label, y_train)

In [32]:
print_info(y_train, cagglomerative_average_label)

Homogeneity: 0.772
Completeness: 0.799
V-measure: 0.785


In [33]:
cagglomerative_average_pred = []
for i, row in x_test.iterrows():
    cagglomerative_average_pred.append(cagglomerative_average_model.predict(row))

print(
    'Our Agglomerative Average accuracy = ',
    get_accuracy(apply_map_to_cluster(cagglomerative_average_pred, cagglomerative_average_map), y_test.reset_index(drop=True))
)

Our Agglomerative Average accuracy =  0.9


In [34]:
model_average = AgglomerativeClustering(3, linkage="average").fit(x_train)
agg_average_map = get_mapping_to_label(3, model_average.labels_, y_train)

In [35]:
print_info(y_train, model_average.labels_)

Homogeneity: 0.772
Completeness: 0.799
V-measure: 0.785


In [36]:
agg_average_pred = sklean_predict(x_train, model_average.labels_, x_test)
print(
    'Our DBSCAN accuracy = ',
    get_accuracy(apply_map_to_cluster(agg_average_pred, agg_average_map), y_test.reset_index(drop=True))
)

Our DBSCAN accuracy =  0.9333333333333333


### Average Group

In [37]:
cagglomerative_avg_g_model = c_Agglomerative(x_train, 3, 'average_group')
cagglomerative_avg_g_label = cagglomerative_avg_g_model.get_all()
cagglomerative_avg_g_map = get_mapping_to_label(3, cagglomerative_avg_g_label, y_train)

In [38]:
print_info(y_train, cagglomerative_avg_g_label)

Homogeneity: 0.772
Completeness: 0.799
V-measure: 0.785


In [39]:
cagglomerative_avg_g_pred = []
for i, row in x_test.iterrows():
    cagglomerative_avg_g_pred.append(cagglomerative_avg_g_model.predict(row))

print(
    'Our Agglomerative Average Group accuracy = ',
    get_accuracy(apply_map_to_cluster(cagglomerative_avg_g_pred, cagglomerative_avg_g_map), y_test.reset_index(drop=True))
)

Our Agglomerative Average Group accuracy =  0.9


## KMeans

In [40]:
ckmeans_model = c_KMeans(3, 1e-8, 1000)
ckmeans_label = ckmeans_model.fit_predict(x_train)
ckmeans_map = get_mapping_to_label(3, ckmeans_label, y_train)

  0%|                                                                                                        | 0/1000 [00:00<?, ?it/s]


In [41]:
print_info(y_train, ckmeans_label)

Homogeneity: 0.522
Completeness: 0.668
V-measure: 0.586


In [42]:
ckmeans_pred = ckmeans_model.predict(x_test)

print(
    'Our KMeans accuracy = ',
    get_accuracy(apply_map_to_cluster(ckmeans_pred, ckmeans_map), y_test.reset_index(drop=True))
)

Our KMeans accuracy =  0.7333333333333333


In [43]:
kmeans_model = KMeans(3, tol=1e-8)
kmeans_model.fit(x_train)
kmeans_map = get_mapping_to_label(3, kmeans_model.labels_, y_train)

In [44]:
print_info(y_train, kmeans_model.labels_)

Homogeneity: 0.730
Completeness: 0.745
V-measure: 0.738


In [45]:
kmeans_pred = kmeans_model.predict(x_test)

print(
    'Scikit-learn KMeans accuracy = ',
    get_accuracy(apply_map_to_cluster(kmeans_pred, kmeans_map), y_test.reset_index(drop=True))
)

Scikit-learn KMeans accuracy =  0.9
