In [None]:
import openml
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder #For encoding categorical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN
from sklearn.metrics import accuracy_score,f1_score,adjusted_rand_score,silhouette_score
from joblib import Parallel,delayed
import time

# Download Datasets Using openml

In [None]:
# Fetch the Iris dataset from OpenML, asking for the target column separately.
iris = openml.datasets.get_dataset("iris")
iris_target_name = iris.default_target_attribute
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    dataset_format="dataframe", target=iris_target_name
)

# Re-attach the labels as a "class" column, then keep the first four columns
# as the clustering input.
iris_df["class"] = iris_label
iris_x = iris_df.iloc[:, :4]

iris_df

In [None]:
# Fetch the Wine dataset from OpenML, asking for the target column separately.
wine = openml.datasets.get_dataset("wine")
wine_target_name = wine.default_target_attribute
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    dataset_format="dataframe", target=wine_target_name
)

# Re-attach the labels as a "class" column, then keep the first thirteen columns
# as the clustering input.
wine_df["class"] = wine_label
wine_x = wine_df.iloc[:, :13]

wine_df

# Identify Data Types

In [None]:
# Print the column dtypes and non-null counts of the Iris frame.
iris_df.info()

In [None]:
# Print the column dtypes and non-null counts of the Wine frame.
wine_df.info()

# Transform categorical variable to numeric

In [None]:
# A single encoder instance is reused: every fit_transform call refits it, so
# after this cell `le` holds the mapping of the last dataset encoded (wine).
le = LabelEncoder()

iris_y, wine_y = [le.fit_transform(labels) for labels in (iris_label, wine_label)]

# Min-max normalise

In [None]:
# A single scaler instance is reused: every fit_transform call refits it, so
# each dataset is scaled with its own per-feature min/max.
scaler = MinMaxScaler()

iris_x_scaled, wine_x_scaled = [
    scaler.fit_transform(features) for features in (iris_x, wine_x)
]


# DataSets

In [None]:
# Display names used to tag result rows: index 0 is Iris, index 1 is Wine.
Dataset = ["Iris","Wine"]

# KMeans

In [None]:
def kmeans(x, y, parameters, random_state=None):
    """Fit K-Means for one hyper-parameter combination and score the result.

    Parameters
    ----------
    x : array-like
        Feature matrix to cluster.
    y : array-like
        Reference labels used by the F1 and adjusted Rand scores.
    parameters : sequence or pandas.Series
        ``(n_clusters, max_iter, n_init)`` in that order. Values are read by
        position and cast to ``int``.
    random_state : int or None, default None
        Forwarded to ``KMeans``; pass an integer for reproducible runs.

    Returns
    -------
    tuple
        ``(labels, f1, adjusted_rand, silhouette, elapsed_seconds)``.
    """
    # Read the values by iteration rather than ``parameters[0]``: integer keys
    # on a label-indexed Series are a deprecated positional fallback in pandas.
    n_clusters, max_iter, n_init = (int(value) for value in list(parameters)[:3])

    # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
    start_time = time.perf_counter()
    model = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        n_init=n_init,
        random_state=random_state,
    )
    y_kmeans = model.fit_predict(x)

    # NOTE(review): cluster ids are arbitrary, so F1 against the reference
    # labels is only meaningful when the ids happen to line up with the
    # classes; the adjusted Rand score is permutation-invariant.
    kmeans_f1score = f1_score(y, y_kmeans, average='weighted')
    kmeans_ars = adjusted_rand_score(y, y_kmeans)
    kmeans_sscore = silhouette_score(x, y_kmeans, metric='euclidean')
    kmeans_execution_time = time.perf_counter() - start_time
    return y_kmeans, kmeans_f1score, kmeans_ars, kmeans_sscore, kmeans_execution_time

In [None]:
# Candidate values for each K-Means hyper-parameter.
n_clusters = [2, 3, 4, 5, 6, 7]
max_iter = [200, 300, 400]
n_init = [5, 10, 15]

# Empty, integer-typed frame with one column per hyper-parameter.
kmean_parameters = pd.DataFrame(
    {column: [] for column in ("n_clusters", "max_iter", "n_init")}
).astype(int)


In [None]:
# Build the whole grid in one step: one row per (n_clusters, max_iter, n_init)
# combination, in nested-loop order (n_init varies fastest).
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call;
# building from a list of records is also idempotent when the cell is re-run.
kmean_parameters = pd.DataFrame(
    [
        {"n_clusters": i, "max_iter": ite, "n_init": n}
        for i in n_clusters
        for ite in max_iter
        for n in n_init
    ],
    columns=["n_clusters", "max_iter", "n_init"],
)

In [None]:
# Show the K-Means parameter grid.
kmean_parameters

In [None]:
# Evaluate every K-Means parameter row on both datasets, one joblib task per row
# (n_jobs=-1 asks joblib to use all available CPU cores).
final_iris_kmeans = Parallel(n_jobs=-1)(
    delayed(kmeans)(iris_x_scaled, iris_y, row)
    for _, row in kmean_parameters.iterrows()
)
final_wine_kmeans = Parallel(n_jobs=-1)(
    delayed(kmeans)(wine_x_scaled, wine_y, row)
    for _, row in kmean_parameters.iterrows()
)

In [None]:
# Empty results table for the K-Means runs; one column per reported field.
final_kmeans_df = pd.DataFrame(
    {
        column: []
        for column in (
            "Dataset",
            "n_clusters",
            "max_iter",
            "n_init",
            "f1 score",
            "Adjusted Random Score",
            "Silhouette Score",
            "Execution Time",
        )
    }
)

In [None]:
# Assemble the K-Means results table: one record per (dataset, parameter row),
# Iris rows first, then Wine rows.
# The frame is built once from a list of records because DataFrame.append was
# removed in pandas 2.0; this also makes the cell idempotent on re-run. Each
# record sets "Dataset" exactly once (the Wine dict used to carry the key twice).
kmeans_result_columns = [
    "Dataset",
    "n_clusters",
    "max_iter",
    "n_init",
    "f1 score",
    "Adjusted Random Score",
    "Silhouette Score",
    "Execution Time",
]
kmeans_records = [
    {
        "Dataset": dataset_name,
        **params,
        # result layout: (labels, f1, adjusted rand, silhouette, elapsed seconds)
        "f1 score": result[1],
        "Adjusted Random Score": result[2],
        "Silhouette Score": result[3],
        "Execution Time": result[4],
    }
    for dataset_name, results in zip(Dataset, (final_iris_kmeans, final_wine_kmeans))
    for params, result in zip(kmean_parameters.to_dict("records"), results)
]
final_kmeans_df = pd.DataFrame(kmeans_records, columns=kmeans_result_columns)
final_kmeans_df

# Agglomerative Clustering

In [None]:
def agglomerative(x, y, parameters):
    """Fit agglomerative clustering for one parameter combination and score it.

    Parameters
    ----------
    x : array-like
        Feature matrix to cluster.
    y : array-like
        Reference labels used by the F1 and adjusted Rand scores.
    parameters : sequence or pandas.Series
        ``(n_clusters, linkage)`` in that order, read by position.

    Returns
    -------
    tuple
        ``(labels, f1, adjusted_rand, silhouette, elapsed_seconds)``.
    """
    # Read the values by iteration rather than ``parameters[0]``: integer keys
    # on a label-indexed Series are a deprecated positional fallback in pandas.
    n_clusters, linkage = list(parameters)[:2]
    n_clusters = int(n_clusters)

    # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
    start_time = time.perf_counter()
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    y_agglomerative = model.fit_predict(x)

    # NOTE(review): cluster ids are arbitrary, so F1 against the reference
    # labels is only meaningful when the ids happen to line up with the classes.
    agglomerative_f1score = f1_score(y, y_agglomerative, average="weighted")
    agglomerative_ars = adjusted_rand_score(y, y_agglomerative)
    agglomerative_sscore = silhouette_score(x, y_agglomerative, metric="euclidean")
    agglomerative_execution_time = time.perf_counter() - start_time
    return y_agglomerative, agglomerative_f1score, agglomerative_ars, agglomerative_sscore, agglomerative_execution_time

In [None]:
# Candidate values for each agglomerative-clustering hyper-parameter.
n_clusters = [2, 3, 4, 5, 6, 7]
linkage = ["ward", "complete", "average", "single"]

# One row per (n_clusters, linkage) pair, linkage varying fastest.
# The frame is built once from a list of records: DataFrame.append was removed
# in pandas 2.0, and the former `.astype(int)` also targeted the string-valued
# "linkage" column.
agglomerative_parameters = pd.DataFrame(
    [{"n_clusters": i, "linkage": n} for i in n_clusters for n in linkage],
    columns=["n_clusters", "linkage"],
)

In [None]:
# Show the agglomerative-clustering parameter grid.
agglomerative_parameters

In [None]:
# Evaluate every agglomerative parameter row on both datasets, one joblib task
# per row (n_jobs=-1 asks joblib to use all available CPU cores).
final_iris_aggromilative = Parallel(n_jobs=-1)(
    delayed(agglomerative)(iris_x_scaled, iris_y, row)
    for _, row in agglomerative_parameters.iterrows()
)
final_wine_aggromilative = Parallel(n_jobs=-1)(
    delayed(agglomerative)(wine_x_scaled, wine_y, row)
    for _, row in agglomerative_parameters.iterrows()
)

In [None]:
# Empty results table for the agglomerative runs; one column per reported field.
final_aggromilative_df = pd.DataFrame(
    {
        column: []
        for column in (
            "Dataset",
            "n_clusters",
            "linkage",
            "f1 score",
            "Adjusted Random Score",
            "Silhouette Score",
            "Execution Time",
        )
    }
)

In [None]:
# Assemble the agglomerative results table: one record per (dataset, parameter
# row), Iris rows first, then Wine rows.
# The frame is built once from a list of records because DataFrame.append was
# removed in pandas 2.0; this also makes the cell idempotent on re-run.
agglomerative_result_columns = [
    "Dataset",
    "n_clusters",
    "linkage",
    "f1 score",
    "Adjusted Random Score",
    "Silhouette Score",
    "Execution Time",
]
agglomerative_records = [
    {
        "Dataset": dataset_name,
        **params,
        # result layout: (labels, f1, adjusted rand, silhouette, elapsed seconds)
        "f1 score": result[1],
        "Adjusted Random Score": result[2],
        "Silhouette Score": result[3],
        "Execution Time": result[4],
    }
    for dataset_name, results in zip(
        Dataset, (final_iris_aggromilative, final_wine_aggromilative)
    )
    for params, result in zip(agglomerative_parameters.to_dict("records"), results)
]
final_aggromilative_df = pd.DataFrame(
    agglomerative_records, columns=agglomerative_result_columns
)

final_aggromilative_df

# DBScan Clustering

In [None]:
def dbscan(x, y, parameters):
    """Fit DBSCAN for one parameter combination and score the clustering.

    Parameters
    ----------
    x : array-like
        Feature matrix to cluster.
    y : array-like
        Reference labels used by the F1 and adjusted Rand scores.
    parameters : sequence or pandas.Series
        ``(eps, min_samples)`` in that order, read by position. ``min_samples``
        is cast to ``int`` because a row taken from a mixed float/int frame
        arrives upcast to float.

    Returns
    -------
    tuple
        ``(labels, f1, adjusted_rand, silhouette, elapsed_seconds)``.
        ``silhouette`` is ``nan`` when fewer than two clusters (noise
        excluded) were found.
    """
    # Read the values by iteration rather than ``parameters[0]``: integer keys
    # on a label-indexed Series are a deprecated positional fallback in pandas.
    eps, min_samples = list(parameters)[:2]
    eps = float(eps)
    min_samples = int(min_samples)

    # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
    start_time = time.perf_counter()
    model = DBSCAN(eps=eps, min_samples=min_samples)
    y_dbscan = model.fit_predict(x)

    # NOTE(review): cluster ids are arbitrary, so F1 against the reference
    # labels is only meaningful when the ids happen to line up with the classes.
    dbscan_f1score = f1_score(y, y_dbscan, average="weighted")
    dbscan_ars = adjusted_rand_score(y, y_dbscan)

    # The -1 label is excluded from the cluster count (DBSCAN uses it for noise).
    dbscan_n_clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
    if dbscan_n_clusters >= 2:
        dbscan_sscore = silhouette_score(x, y_dbscan, metric="euclidean")
    else:
        # Too few clusters for a silhouette score: record NaN so the results
        # column stays numeric (previously this stored None and printed "111").
        dbscan_sscore = np.nan
    dbscan_execution_time = time.perf_counter() - start_time
    return y_dbscan, dbscan_f1score, dbscan_ars, dbscan_sscore, dbscan_execution_time

In [None]:
# Candidate values for each DBSCAN hyper-parameter.
eps = [0.1, 0.2, 0.3, 0.4, 0.5]
min_samples = [2, 3, 4, 5, 6, 7, 8]

# One row per (eps, min_samples) pair, min_samples varying fastest.
# The frame is built once from a list of records: DataFrame.append was removed
# in pandas 2.0, and the former `.astype(int)` declared the float-valued "eps"
# column as integer.
# Note: taking a whole row (e.g. `.iloc[i]`) upcasts both values to float.
dbscan_parameters = pd.DataFrame(
    [{"eps": i, "min_samples": n} for i in eps for n in min_samples],
    columns=["eps", "min_samples"],
)

dbscan_parameters

In [None]:
# Evaluate every DBSCAN parameter row on the Iris data, one joblib task per row
# (n_jobs=-1 asks joblib to use all available CPU cores).
final_iris_dbscan = Parallel(n_jobs=-1)(
    delayed(dbscan)(iris_x_scaled, iris_y, row)
    for _, row in dbscan_parameters.iterrows()
)

In [None]:
# Empty results table for the DBSCAN runs; one column per reported field.
final_dbscan_df = pd.DataFrame(
    {
        column: []
        for column in (
            "Dataset",
            "eps",
            "min_samples",
            "f1 score",
            "Adjusted Random Score",
            "Silhouette Score",
            "Execution Time",
        )
    }
)

In [None]:
# Assemble the DBSCAN results table for Iris: one record per parameter row.
# The frame is built once from a list of records because DataFrame.append was
# removed in pandas 2.0; this also makes the cell idempotent on re-run.
dbscan_result_columns = [
    "Dataset",
    "eps",
    "min_samples",
    "f1 score",
    "Adjusted Random Score",
    "Silhouette Score",
    "Execution Time",
]
dbscan_records = [
    {
        "Dataset": Dataset[0],
        **params,
        # result layout: (labels, f1, adjusted rand, silhouette, elapsed seconds)
        "f1 score": result[1],
        "Adjusted Random Score": result[2],
        "Silhouette Score": result[3],
        "Execution Time": result[4],
    }
    for params, result in zip(dbscan_parameters.to_dict("records"), final_iris_dbscan)
]
final_dbscan_df = pd.DataFrame(dbscan_records, columns=dbscan_result_columns)

In [None]:
# Show the DBSCAN results table.
final_dbscan_df

In [None]:
# n_clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
# n_noise = list(y_dbscan).count(-1)

# print('Estimated number of clusters: %d' % n_clusters)
# print('Estimated number of noise points: %d' % n_noise)

Unlike k-means, DBSCAN determines the number of clusters on its own.
It works by checking whether at least a minimum number of points (`min_samples`) lie within
a distance `eps` of one another; points that do are treated as part of the same cluster.
DBSCAN is very sensitive to feature scale, because `eps` is a single fixed distance threshold.

# Optics Clustering

# Gaussian mixtures Clustering

# Affinity propagation

In [None]:
# from sklearn.cluster import AffinityPropagation

# afp = AffinityPropagation(damping=0.9, max_iter=200, convergence_iter=15, copy=True, preference=-5, affinity='euclidean', verbose=False, random_state=None)
# y_afp=afp.fit_predict(x_scaled)
# y_afp

# Mean-shift 

# Spectral Clustering

# Ward hierarchical

In [None]:
from sklearn import metrics

# Toy example: the prediction agrees with the reference on the first five
# samples but puts the last sample in a cluster of its own.
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 0, 1, 1, 2]
metrics.rand_score(labels_true=labels_true, labels_pred=labels_pred)