In [1]:
import openml
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder #For encoding categorical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN
from sklearn.metrics import accuracy_score,f1_score,adjusted_rand_score,silhouette_score
from joblib import Parallel,delayed
import time

# Download Datasets Using openml

In [2]:
iris = openml.datasets.get_dataset("iris")
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    target=iris.default_target_attribute, dataset_format="dataframe"
)
iris_df["class"]=iris_label
iris_x=iris_df.iloc[:,:4]
iris_df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
wine = openml.datasets.get_dataset("wine")
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    target= wine.default_target_attribute, dataset_format="dataframe"
)
wine_df["class"]=wine_label
wine_x=wine_df.iloc[:,:13]
wine_df

Unnamed: 0,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline,class
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


# Identify Data Types

In [4]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    float64 
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
 4   class        150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [5]:
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Alcohol                         178 non-null    float64 
 1   Malic_acid                      178 non-null    float64 
 2   Ash                             178 non-null    float64 
 3   Alcalinity_of_ash               178 non-null    float64 
 4   Magnesium                       178 non-null    uint8   
 5   Total_phenols                   178 non-null    float64 
 6   Flavanoids                      178 non-null    float64 
 7   Nonflavanoid_phenols            178 non-null    float64 
 8   Proanthocyanins                 178 non-null    float64 
 9   Color_intensity                 178 non-null    float64 
 10  Hue                             178 non-null    float64 
 11  OD280%2FOD315_of_diluted_wines  178 non-null    float64 
 12  Proline               

# Transform categorical variable to numeric

In [6]:
le=LabelEncoder()

iris_y=le.fit_transform(iris_label)
wine_y=le.fit_transform(wine_label)

# Min-max normalise

In [7]:
scaler = MinMaxScaler()

iris_x_scaled=scaler.fit_transform(iris_x)
wine_x_scaled=scaler.fit_transform(wine_x)


# DataSets

In [8]:
Dataset = ["Iris","Wine"]

In [9]:
def algo (x, y, algorithm, parameters):
    start_time = time.time()
    y_predict = algorithm.fit_predict(x)
    f1score = f1_score(y, y_predict, average = 'weighted')
    ars = adjusted_rand_score(y, y_predict)
    
#     if algorithm =="dbscan" and len(set(y_predict))<=1:
#         sscore = print("none")
#     else:
    sscore = silhouette_score(x, y_predict, metric='euclidean')
    execution_time = time.time() - start_time
    return y_predict, f1score, ars, sscore, execution_time

# KMeans

In [10]:
n_clusters=[2,3,4,5,6,7]
max_iter=[200,300,400]
n_init=[5,10,15]
    
kmean_parameters = pd.DataFrame({"n_clusters":[] , 
                                 "max_iter":[] , 
                                 "n_init":[]}).astype(int)

for i in n_clusters:
        for  ite in max_iter:
            for n in n_init:
                kmean_parameters = kmean_parameters.append({"n_clusters" : i, 
                                                            "max_iter" : ite, 
                                                            "n_init" : n},ignore_index=True)
                
                
kmean_parameters

Unnamed: 0,n_clusters,max_iter,n_init
0,2,200,5
1,2,200,10
2,2,200,15
3,2,300,5
4,2,300,10
5,2,300,15
6,2,400,5
7,2,400,10
8,2,400,15
9,3,200,5


In [11]:
kmeans =KMeans(n_clusters=kmean_parameters.iloc[0][0] , max_iter=kmean_parameters.iloc[0][1] , n_init=kmean_parameters.iloc[0][2])

In [12]:
final_iris_kmeans = Parallel(n_jobs=-1)(delayed(algo)(iris_x_scaled, iris_y, kmeans, kmean_parameters.iloc[i]) for i in range(0, len(kmean_parameters)))
final_wine_kmeans = Parallel(n_jobs=-1)(delayed(algo)(wine_x_scaled, wine_y, kmeans, kmean_parameters.iloc[i]) for i in range(0, len(kmean_parameters)))

In [13]:
final_kmeans_df=pd.DataFrame({
    "Dataset":[],
    "[n_clusters,max_iter,n_init]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [14]:
for i in range(0, len(kmean_parameters)):
    final_kmeans_df =final_kmeans_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,max_iter,n_init]":kmean_parameters.iloc[i].to_list(),
        "f1 score":final_iris_kmeans[i][1],
        "Adjusted Random Score":final_iris_kmeans[i][2],
        "Silhouette Score":final_iris_kmeans[i][3],
        "Execution Time":final_iris_kmeans[i][4],
        },ignore_index=True)
    
for i in range(0, len(kmean_parameters)):
    final_kmeans_df =final_kmeans_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,max_iter,n_init]":kmean_parameters.iloc[i].to_list(),
        "f1 score":final_wine_kmeans[i][1],
        "Adjusted Random Score":final_wine_kmeans[i][2],
        "Silhouette Score":final_wine_kmeans[i][3],
        "Execution Time":final_wine_kmeans[i][4],
        "Dataset":Dataset[1]
        },ignore_index=True)
    
final_kmeans_df

Unnamed: 0,Dataset,"[n_clusters,max_iter,n_init]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[2, 200, 5]",0.000000,0.568116,0.629468,0.054856
1,Iris,"[2, 200, 10]",0.000000,0.568116,0.629468,0.019947
2,Iris,"[2, 200, 15]",0.555556,0.568116,0.629468,0.071807
3,Iris,"[2, 300, 5]",0.555556,0.568116,0.629468,0.013963
4,Iris,"[2, 300, 10]",0.000000,0.568116,0.629468,0.019948
...,...,...,...,...,...,...
103,Wine,"[7, 300, 10]",0.218379,0.370227,0.298722,0.028924
104,Wine,"[7, 300, 15]",0.218379,0.370227,0.298722,0.054853
105,Wine,"[7, 400, 5]",0.218379,0.370227,0.298722,0.053856
106,Wine,"[7, 400, 10]",0.218379,0.370227,0.298722,0.059839


# Agglomerative Clustering

In [15]:
n_clusters =[2,3,4,5,6,7]
linkage = ["ward", "complete", "average", "single"]

agglomerative_parameters = pd.DataFrame({
    "n_clusters":[] ,
    "linkage":[]}).astype(int)
  
for i in n_clusters:
        for  n in linkage:
                agglomerative_parameters = agglomerative_parameters.append({
                    "n_clusters" : i, 
                    "linkage" : n },
                    ignore_index=True)
                
agglomerative_parameters

Unnamed: 0,n_clusters,linkage
0,2,ward
1,2,complete
2,2,average
3,2,single
4,3,ward
5,3,complete
6,3,average
7,3,single
8,4,ward
9,4,complete


In [16]:
agglomerative= AgglomerativeClustering(n_clusters = agglomerative_parameters.iloc[0][0], linkage = agglomerative_parameters.iloc[0][1])

In [17]:
final_iris_aggromilative = Parallel(n_jobs=-1)(delayed(algo)(iris_x_scaled, iris_y, agglomerative, agglomerative_parameters.iloc[i]) for i in range(0, len(agglomerative_parameters)))
final_wine_aggromilative = Parallel(n_jobs=-1)(delayed(algo)(wine_x_scaled, wine_y, agglomerative,  agglomerative_parameters.iloc[i]) for i in range(0, len(agglomerative_parameters)))

In [18]:
final_aggromilative_df=pd.DataFrame({
    "Dataset":[],
    "[n_clusters,linkage]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [19]:
for i in range(0, len(agglomerative_parameters)):
    final_aggromilative_df =final_aggromilative_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,linkage]":agglomerative_parameters.iloc[i].to_list(),
        "f1 score":final_iris_aggromilative[i][1],
        "Adjusted Random Score":final_iris_aggromilative[i][2],
        "Silhouette Score":final_iris_aggromilative[i][3],
        "Execution Time":final_iris_aggromilative[i][4],
        },ignore_index=True)
    
for i in range(0, len(agglomerative_parameters)):
    final_aggromilative_df =final_aggromilative_df.append({
        "Dataset":Dataset[1],
        "[n_clusters,linkage]":agglomerative_parameters.iloc[i].to_list(),
        "f1 score":final_iris_aggromilative[i][1],
        "Adjusted Random Score":final_iris_aggromilative[i][2],
        "Silhouette Score":final_iris_aggromilative[i][3],
        "Execution Time":final_iris_aggromilative[i][4],
        },ignore_index=True)
    
final_aggromilative_df

Unnamed: 0,Dataset,"[n_clusters,linkage]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[2, ward]",0.0,0.568116,0.629468,0.138631
1,Iris,"[2, complete]",0.0,0.568116,0.629468,0.137634
2,Iris,"[2, average]",0.0,0.568116,0.629468,0.141625
3,Iris,"[2, single]",0.0,0.568116,0.629468,0.128656
4,Iris,"[3, ward]",0.0,0.568116,0.629468,0.009974
5,Iris,"[3, complete]",0.0,0.568116,0.629468,0.008976
6,Iris,"[3, average]",0.0,0.568116,0.629468,0.006983
7,Iris,"[3, single]",0.0,0.568116,0.629468,0.011968
8,Iris,"[4, ward]",0.0,0.568116,0.629468,0.008975
9,Iris,"[4, complete]",0.0,0.568116,0.629468,0.008979


# DBScan Clustering

In [20]:
# def dbscan (x, y, parameters):
#     start_time = time.time()
#     dbscan= DBSCAN(eps = parameters[0], min_samples = parameters[1])
#     y_dbscan = dbscan.fit_predict(x)
#     dbscan_f1score = f1_score(y, y_dbscan, average = "weighted")
#     dbscan_ars = adjusted_rand_score(y, y_dbscan)
#     dbscan_n_clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
#     if dbscan_n_clusters>=2:
#         dbscan_sscore = silhouette_score(x, y_dbscan, metric="euclidean")
#     else:
#         dbscan_sscore = print("111")
#     dbscan_execution_time = time.time() - start_time
#     return y_dbscan, dbscan_f1score,dbscan_ars,dbscan_sscore, dbscan_execution_time

In [21]:
eps = [0.1,0.2,0.3,0.4,0.5]
min_samples = [2,3,4,5,6,7,8]

dbscan_parameters = pd.DataFrame({
    "eps":[],
    "min_samples" :[]
}).astype(int)

for i in eps:
    for n in min_samples:
        dbscan_parameters=dbscan_parameters.append({
            "eps" : i,
            "min_samples": n
        },ignore_index=True)
        
dbscan_parameters

Unnamed: 0,eps,min_samples
0,0.1,2.0
1,0.1,3.0
2,0.1,4.0
3,0.1,5.0
4,0.1,6.0
5,0.1,7.0
6,0.1,8.0
7,0.2,2.0
8,0.2,3.0
9,0.2,4.0


In [22]:
dbscan= DBSCAN(eps = dbscan_parameters.iloc[0][0], min_samples = dbscan_parameters.iloc[0][1])

In [23]:
final_iris_dbscan = Parallel(n_jobs=-1)(delayed(algo)(iris_x_scaled, iris_y, dbscan, dbscan_parameters.iloc[i]) for i in range(0, len(dbscan_parameters)))

In [24]:
final_dbscan_df=pd.DataFrame({
    "Dataset":[],
    "[eps,min_samples]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [25]:
for i in range(0, len(dbscan_parameters)):
    final_dbscan_df =final_dbscan_df.append({
        "Dataset":Dataset[0],
        "[eps,min_samples]":dbscan_parameters.iloc[i].to_list(),
        "f1 score":final_iris_dbscan[i][1],
        "Adjusted Random Score":final_iris_dbscan[i][2],
        "Silhouette Score":final_iris_dbscan[i][3],
        "Execution Time":final_iris_dbscan[i][4],
        },ignore_index=True)

In [26]:
final_dbscan_df

Unnamed: 0,Dataset,"[eps,min_samples]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[0.1, 2.0]",0.505427,0.42626,0.230234,0.057846
1,Iris,"[0.1, 3.0]",0.505427,0.42626,0.230234,0.038896
2,Iris,"[0.1, 4.0]",0.505427,0.42626,0.230234,0.057847
3,Iris,"[0.1, 5.0]",0.505427,0.42626,0.230234,0.013962
4,Iris,"[0.1, 6.0]",0.505427,0.42626,0.230234,0.00798
5,Iris,"[0.1, 7.0]",0.505427,0.42626,0.230234,0.009977
6,Iris,"[0.1, 8.0]",0.505427,0.42626,0.230234,0.00698
7,Iris,"[0.2, 2.0]",0.505427,0.42626,0.230234,0.007978
8,Iris,"[0.2, 3.0]",0.505427,0.42626,0.230234,0.008976
9,Iris,"[0.2, 4.0]",0.505427,0.42626,0.230234,0.006981


In [27]:
# n_clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
# n_noise = list(y_dbscan).count(-1)

# print('Estimated number of clusters: %d' % n_clusters)
# print('Estimated number of noise points: %d' % n_noise)

Unlike k-means, DBSCAN will figure out the number of clusters. 
DBSCAN works by determining whether the minimum number of points are close enough to 
one another to be considered part of a single cluster. DBSCAN is very sensitive to scale since 
epsilon is a fixed value for the maximum distance between two points.

# Optics Clustering

# Gaussian mixtures Clustering

# Affinity propagation

In [28]:
# from sklearn.cluster import AffinityPropagation

# afp = AffinityPropagation(damping=0.9, max_iter=200, convergence_iter=15, copy=True, preference=-5, affinity='euclidean', verbose=False, random_state=None)
# y_afp=afp.fit_predict(x_scaled)
# y_afp

# Mean-shift 

# Spectral Clustering

# Ward hierarchical

In [29]:
from sklearn import metrics
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 0, 1, 1, 2]
metrics.rand_score(labels_true, labels_pred)

0.8666666666666667