In [None]:
import openml
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder #For encoding categorical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN
from sklearn.metrics import accuracy_score,f1_score,adjusted_rand_score,silhouette_score
from joblib import Parallel,delayed
import time

# Download Datasets Using openml

In [None]:
iris = openml.datasets.get_dataset("iris")
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    target=iris.default_target_attribute, dataset_format="dataframe"
)
iris_df["class"]=iris_label
iris_x=iris_df.iloc[:,:4]
iris_df

In [None]:
wine = openml.datasets.get_dataset("wine")
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    target= wine.default_target_attribute, dataset_format="dataframe"
)
wine_df["class"]=wine_label
wine_x=wine_df.iloc[:,:13]
wine_df

# Identify Data Types

In [None]:
iris_df.info()

In [None]:
wine_df.info()

# Transform categorical variable to numeric

In [None]:
le=LabelEncoder()

iris_y=le.fit_transform(iris_label)
wine_y=le.fit_transform(wine_label)

# Min-max normalise

In [None]:
scaler = MinMaxScaler()

iris_x_scaled=scaler.fit_transform(iris_x)
wine_x_scaled=scaler.fit_transform(wine_x)


# DataSets

In [27]:
Dataset = ["Iris","Wine"]

# KMeans

In [28]:
def algorithm (x, y, parameters):
    start_time = time.time()
    kmeans =KMeans(n_clusters=parameters[0], max_iter=parameters[1], n_init=parameters[2])
    y_predict = kmeans.fit_predict(x)
    f1score = f1_score(y, y_predict, average = 'weighted')
    ars = adjusted_rand_score(y, y_predict)
    sscore = silhouette_score(x, y_predict, metric='euclidean')
    execution_time = time.time() - start_time
    return y_predict, f1score, ars, sscore, execution_time

In [29]:
# def algorithm(x, y, algo):
#     start_time = time.time()
#     y_predict = algo.fit_predict(x)
#     f1score = f1_score(y, y_predict, average = 'weighted')
#     ars = adjusted_rand_score(y, y_predict)
#     sscore = silhouette_score(x, y_predict, metric='euclidean')
#     execution_time = time.time() - start_time
#     return y_predict, f1score, ars, sscore, execution_time

In [30]:
n_clusters=[2,3,4,5,6,7]
max_iter=[200,300,400]
n_init=[5,10,15]
    
kmean_parameters = pd.DataFrame({"n_clusters":[] , 
                                 "max_iter":[] , 
                                 "n_init":[]}).astype(int)

for i in n_clusters:
        for  ite in max_iter:
            for n in n_init:
                kmean_parameters = kmean_parameters.append({"n_clusters" : i, 
                                                            "max_iter" : ite, 
                                                            "n_init" : n},ignore_index=True)
                
                
kmean_parameters

Unnamed: 0,n_clusters,max_iter,n_init
0,2,200,5
1,2,200,10
2,2,200,15
3,2,300,5
4,2,300,10
5,2,300,15
6,2,400,5
7,2,400,10
8,2,400,15
9,3,200,5


In [31]:
# kmeans =KMeans(n_clusters=kmean_parameters.iloc[:z,:1].values, max_iter=kmean_parameters.iloc[:z,:2].values, n_init=kmean_parameters.iloc[:z,:3].values)

In [32]:
final_iris_kmeans = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, kmean_parameters.iloc[i]) for i in range(0, len(kmean_parameters)))
final_wine_kmeans = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, kmean_parameters.iloc[i]) for i in range(0, len(kmean_parameters)))

In [33]:
# final_iris_kmeans = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, kmeans) for z in range(0, len(kmean_parameters)))
# final_wine_kmeans = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, kmeans) for z in range(0, len(kmean_parameters)))

# final_iris_kmeans

In [34]:
final_kmeans_df=pd.DataFrame({
    "Dataset":[],
    "[n_clusters,max_iter,n_init]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [35]:
for i in range(0, len(kmean_parameters)):
    final_kmeans_df =final_kmeans_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,max_iter,n_init]":kmean_parameters.iloc[i].to_list(),
        "f1 score":final_iris_kmeans[i][1],
        "Adjusted Random Score":final_iris_kmeans[i][2],
        "Silhouette Score":final_iris_kmeans[i][3],
        "Execution Time":final_iris_kmeans[i][4],
        },ignore_index=True)
    
for i in range(0, len(kmean_parameters)):
    final_kmeans_df =final_kmeans_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,max_iter,n_init]":kmean_parameters.iloc[i].to_list(),
        "f1 score":final_wine_kmeans[i][1],
        "Adjusted Random Score":final_wine_kmeans[i][2],
        "Silhouette Score":final_wine_kmeans[i][3],
        "Execution Time":final_wine_kmeans[i][4],
        "Dataset":Dataset[1]
        },ignore_index=True)
    
final_kmeans_df

Unnamed: 0,Dataset,"[n_clusters,max_iter,n_init]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[2, 200, 5]",0.555556,0.568116,0.629468,0.017951
1,Iris,"[2, 200, 10]",0.555556,0.568116,0.629468,0.022939
2,Iris,"[2, 200, 15]",0.555556,0.568116,0.629468,0.063828
3,Iris,"[2, 300, 5]",0.555556,0.568116,0.629468,0.024931
4,Iris,"[2, 300, 10]",0.000000,0.568116,0.629468,0.047871
...,...,...,...,...,...,...
103,Wine,"[7, 300, 10]",0.435244,0.620315,0.212823,0.080783
104,Wine,"[7, 300, 15]",0.311285,0.507351,0.200496,0.100751
105,Wine,"[7, 400, 5]",0.244550,0.427808,0.127070,0.041898
106,Wine,"[7, 400, 10]",0.237831,0.487379,0.148404,0.072806


In [188]:
final_kmeans_df.to_csv('K-Means_Data.csv')

# Agglomerative Clustering

In [36]:
def algorithm (x, y, parameters):
    start_time = time.time()
    agglomerative =AgglomerativeClustering(n_clusters=parameters[0], linkage=parameters[1])
    y_predict = agglomerative.fit_predict(x)
    f1score = f1_score(y, y_predict, average = 'weighted')
    ars = adjusted_rand_score(y, y_predict)
    sscore = silhouette_score(x, y_predict, metric='euclidean')
    execution_time = time.time() - start_time
    return y_predict, f1score, ars, sscore, execution_time

In [37]:
n_clusters =[2,3,4,5,6,7]
linkage = ["ward", "complete", "average", "single"]

agglomerative_parameters = pd.DataFrame({
    "n_clusters":[] ,
    "linkage":[]}).astype(int)
  
for i in n_clusters:
        for  n in linkage:
                agglomerative_parameters = agglomerative_parameters.append({
                    "n_clusters" : i, 
                    "linkage" : n },
                    ignore_index=True)
                
agglomerative_parameters

Unnamed: 0,n_clusters,linkage
0,2,ward
1,2,complete
2,2,average
3,2,single
4,3,ward
5,3,complete
6,3,average
7,3,single
8,4,ward
9,4,complete


In [40]:
final_iris_aggromilative = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, agglomerative_parameters.iloc[i]) for i in range(0, len(agglomerative_parameters)))
final_wine_aggromilative = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, agglomerative_parameters.iloc[i]) for i in range(0, len(agglomerative_parameters)))

In [41]:
final_aggromilative_df=pd.DataFrame({
    "Dataset":[],
    "[n_clusters,linkage]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [42]:
for i in range(0, len(agglomerative_parameters)):
    final_aggromilative_df =final_aggromilative_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,linkage]":agglomerative_parameters.iloc[i].to_list(),
        "f1 score":final_iris_aggromilative[i][1],
        "Adjusted Random Score":final_iris_aggromilative[i][2],
        "Silhouette Score":final_iris_aggromilative[i][3],
        "Execution Time":final_iris_aggromilative[i][4],
        },ignore_index=True)
    
for i in range(0, len(agglomerative_parameters)):
    final_aggromilative_df =final_aggromilative_df.append({
        "Dataset":Dataset[1],
        "[n_clusters,linkage]":agglomerative_parameters.iloc[i].to_list(),
        "f1 score":final_iris_aggromilative[i][1],
        "Adjusted Random Score":final_iris_aggromilative[i][2],
        "Silhouette Score":final_iris_aggromilative[i][3],
        "Execution Time":final_iris_aggromilative[i][4],
        },ignore_index=True)
    
final_aggromilative_df

Unnamed: 0,Dataset,"[n_clusters,linkage]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[2, ward]",0.0,0.568116,0.629468,0.005985
1,Iris,"[2, complete]",0.20874,0.22342,0.303913,0.006984
2,Iris,"[2, average]",0.0,0.568116,0.629468,0.006971
3,Iris,"[2, single]",0.555556,0.568116,0.629468,0.009971
4,Iris,"[3, ward]",0.26506,0.719584,0.504349,0.00798
5,Iris,"[3, complete]",0.438971,0.706006,0.503067,0.005985
6,Iris,"[3, average]",0.883294,0.719584,0.504349,0.010972
7,Iris,"[3, single]",0.0,0.558371,0.530889,0.006969
8,Iris,"[4, ward]",0.82506,0.645423,0.432557,0.01097
9,Iris,"[4, complete]",0.007937,0.578129,0.393187,0.006981


# DBScan Clustering

In [181]:
def algorithm (x, y, parameters):
    start_time = time.time()
    dbscan= DBSCAN(eps = parameters[0], min_samples = parameters[1])
    y_predict = dbscan.fit_predict(x)
    f1score = f1_score(y, y_predict, average = "weighted")
    ars = adjusted_rand_score(y, y_predict)
    dbscan_n_clusters = len(set(y_predict)) - (1 if -1 in y_predict else 0)
    if dbscan_n_clusters>=2:
        sscore = silhouette_score(x, y_predict, metric="euclidean")
    else:
        sscore = print("None")
    execution_time = time.time() - start_time
    return y_predict, f1score,ars,sscore, execution_time

In [182]:
eps = [0.1,0.2,0.3,0.4]
min_samples = [2,3,4,5,6,7,8]

dbscan_parameters = pd.DataFrame({
    "eps":[],
    "min_samples" :[]
}).astype(int)

for i in eps:
    for n in min_samples:
        dbscan_parameters=dbscan_parameters.append({
            "eps" : i,
            "min_samples": n
        },ignore_index=True)
        
dbscan_parameters

Unnamed: 0,eps,min_samples
0,0.1,2.0
1,0.1,3.0
2,0.1,4.0
3,0.1,5.0
4,0.1,6.0
5,0.1,7.0
6,0.1,8.0
7,0.2,2.0
8,0.2,3.0
9,0.2,4.0


In [183]:
# dbscan= DBSCAN(eps = dbscan_parameters.iloc[0][0], min_samples = dbscan_parameters.iloc[0][1])

In [184]:
final_iris_dbscan = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, dbscan_parameters.iloc[i]) for i in range(0, len(dbscan_parameters)))

In [185]:
final_dbscan_df=pd.DataFrame({
    "Dataset":[],
    "[eps,min_samples]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [186]:
for i in range(0, len(dbscan_parameters)):
    final_dbscan_df =final_dbscan_df.append({
        "Dataset":Dataset[0],
        "[eps,min_samples]":dbscan_parameters.iloc[i].to_list(),
        "f1 score":final_iris_dbscan[i][1],
        "Adjusted Random Score":final_iris_dbscan[i][2],
        "Silhouette Score":final_iris_dbscan[i][3],
        "Execution Time":final_iris_dbscan[i][4],
        },ignore_index=True)

In [187]:
final_dbscan_df

Unnamed: 0,Dataset,"[eps,min_samples]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[0.1, 2.0]",0.505427,0.42626,0.230234,0.009019
1,Iris,"[0.1, 3.0]",0.505427,0.428689,0.202901,0.009016
2,Iris,"[0.1, 4.0]",0.473502,0.393767,0.134221,0.012994
3,Iris,"[0.1, 5.0]",0.406043,0.441407,0.057464,0.012012
4,Iris,"[0.1, 6.0]",0.461289,0.366987,0.057155,0.006983
5,Iris,"[0.1, 7.0]",0.408097,0.310786,0.088912,0.013962
6,Iris,"[0.1, 8.0]",0.399288,0.276562,0.062605,0.007035
7,Iris,"[0.2, 2.0]",0.580833,0.553879,0.388266,0.008003
8,Iris,"[0.2, 3.0]",0.555192,0.553582,0.555263,0.007025
9,Iris,"[0.2, 4.0]",0.555192,0.553582,0.555263,0.007494


In [None]:
# n_clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
# n_noise = list(y_dbscan).count(-1)

# print('Estimated number of clusters: %d' % n_clusters)
# print('Estimated number of noise points: %d' % n_noise)

Unlike k-means, DBSCAN will figure out the number of clusters. 
DBSCAN works by determining whether the minimum number of points are close enough to 
one another to be considered part of a single cluster. DBSCAN is very sensitive to scale since 
epsilon is a fixed value for the maximum distance between two points.

# Optics Clustering

# Gaussian mixtures Clustering

# Affinity propagation

In [None]:
# from sklearn.cluster import AffinityPropagation

# afp = AffinityPropagation(damping=0.9, max_iter=200, convergence_iter=15, copy=True, preference=-5, affinity='euclidean', verbose=False, random_state=None)
# y_afp=afp.fit_predict(x_scaled)
# y_afp

# Mean-shift 

# Spectral Clustering

# Ward hierarchical

In [None]:
from sklearn import metrics
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 0, 1, 1, 2]
metrics.rand_score(labels_true, labels_pred)