In [11]:
import openml
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder #For encoding categorical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN,AffinityPropagation,MeanShift,SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score,f1_score,adjusted_rand_score,silhouette_score
from joblib import Parallel,delayed
import time
import scipy

# Download Datasets Using openml

In [12]:
iris = openml.datasets.get_dataset("iris")
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    target=iris.default_target_attribute, dataset_format="dataframe"
)
iris_df["class"]=iris_label
iris_x=iris_df.iloc[:,:4]
iris_df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [13]:
wine = openml.datasets.get_dataset("wine")
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    target= wine.default_target_attribute, dataset_format="dataframe"
)
wine_df["class"]=wine_label
wine_x=wine_df.iloc[:,:13]
wine_df

Unnamed: 0,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline,class
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


# Identify Data Types

In [14]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    float64 
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
 4   class        150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [15]:
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Alcohol                         178 non-null    float64 
 1   Malic_acid                      178 non-null    float64 
 2   Ash                             178 non-null    float64 
 3   Alcalinity_of_ash               178 non-null    float64 
 4   Magnesium                       178 non-null    uint8   
 5   Total_phenols                   178 non-null    float64 
 6   Flavanoids                      178 non-null    float64 
 7   Nonflavanoid_phenols            178 non-null    float64 
 8   Proanthocyanins                 178 non-null    float64 
 9   Color_intensity                 178 non-null    float64 
 10  Hue                             178 non-null    float64 
 11  OD280%2FOD315_of_diluted_wines  178 non-null    float64 
 12  Proline               

# Transform categorical variable to numeric

In [16]:
le=LabelEncoder()

iris_y=le.fit_transform(iris_label)
wine_y=le.fit_transform(wine_label)

# Min-max normalise

In [17]:
scaler = MinMaxScaler()

iris_x_scaled=scaler.fit_transform(iris_x)
wine_x_scaled=scaler.fit_transform(wine_x)


# DataSets

In [18]:
Dataset = ["Iris","Wine"]

# Algorithms

In [19]:
def algorithm(x, y, algo, parameters):
    start_time = time.time()
    if algo=="kmeans":
        model=KMeans(n_clusters=parameters[0], max_iter=parameters[1], n_init=parameters[2])
    elif algo=="aglomerative":
        model= AgglomerativeClustering(n_clusters=parameters[0], linkage=parameters[1])
    elif algo=="dbscan":
        model= DBSCAN(eps = parameters[0], min_samples = parameters[1])
    elif algo=="ap":
        model= AffinityPropagation(preference=parameters[0], damping=parameters[1])
    elif algo=="meanshift":
        model=MeanShift(bandwidth =parameters[0])
    elif algo =="spectral":
        model=SpectralClustering(n_clusters =parameters[0], affinity=parameters[1],eigen_solver=parameters[2])
    elif algo == "gm":
        model=GaussianMixture(n_components=parameters[0], covariance_type=parameters[1])
        
    y_predict = model.fit_predict(x)
    f1score = f1_score(y, y_predict, average = 'weighted')
    ars = adjusted_rand_score(y, y_predict)
    n_clusters = len(set(y_predict)) - (1 if -1 in y_predict else 0)
    if n_clusters>=2 and n_clusters<=100:
        sscore = silhouette_score(x, y_predict, metric="euclidean")
    else:
        sscore = print("None")
    execution_time = time.time() - start_time
    return y_predict, f1score, ars, sscore, execution_time

# KMeans

In [20]:
n_clusters=[2,3,4,5,6,7,8,9,10,20,21,22,23,24,25]
max_iter=[200,300,400]
n_init=[5,10,15]
    
kmean_parameters = pd.DataFrame({"n_clusters":[] , 
                                 "max_iter":[] , 
                                 "n_init":[]}).astype(int)

for i in n_clusters:
        for  ite in max_iter:
            for n in n_init:
                kmean_parameters = kmean_parameters.append({"n_clusters" : i, 
                                                            "max_iter" : ite, 
                                                            "n_init" : n},ignore_index=True)
                
                
                
kmean_parameters

Unnamed: 0,n_clusters,max_iter,n_init
0,2,200,5
1,2,200,10
2,2,200,15
3,2,300,5
4,2,300,10
...,...,...,...
130,25,300,10
131,25,300,15
132,25,400,5
133,25,400,10


In [21]:
# parameters={"n_clusters":[2,3,4,5,6,7,8,9,10,20,21,22,23,24,25],"max_iter":[200,300,400],"n_init":[5,10,15]}
# df=pd.DataFrame(parameters)
# df = pd.DataFrame({k:pd.Series(v) for k,v in parameters.items()})
# for i,ite,n in parameters:
#         for  ite in max_iter:
#             for n in n_init:
#                 kmean_parameters =pd.DataFrame({"[n_clusters,max_iter,n_init]":[i,ite,n]},ignore_index=True)
                
                
                
# kmean_parameters

In [22]:
final_iris_kmeans = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, "kmeans", kmean_parameters.iloc[i]) for i in range(0, len(kmean_parameters)))
final_wine_kmeans = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, "kmeans", kmean_parameters.iloc[i]) for i in range(0, len(kmean_parameters)))

In [23]:
final_kmeans_df=pd.DataFrame({
    "Dataset":[],
    "[n_clusters,max_iter,n_init]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [24]:
for i in range(0, len(kmean_parameters)):
    final_kmeans_df =final_kmeans_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,max_iter,n_init]":kmean_parameters.iloc[i].to_list(),
        "f1 score":final_iris_kmeans[i][1],
        "Adjusted Random Score":final_iris_kmeans[i][2],
        "Silhouette Score":final_iris_kmeans[i][3],
        "Execution Time":final_iris_kmeans[i][4],
        },ignore_index=True)
    
for i in range(0, len(kmean_parameters)):
    final_kmeans_df =final_kmeans_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,max_iter,n_init]":kmean_parameters.iloc[i].to_list(),
        "f1 score":final_wine_kmeans[i][1],
        "Adjusted Random Score":final_wine_kmeans[i][2],
        "Silhouette Score":final_wine_kmeans[i][3],
        "Execution Time":final_wine_kmeans[i][4],
        "Dataset":Dataset[1]
        },ignore_index=True)
    
final_kmeans_df

Unnamed: 0,Dataset,"[n_clusters,max_iter,n_init]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[2, 200, 5]",0.555556,0.568116,0.629468,0.056847
1,Iris,"[2, 200, 10]",0.555556,0.568116,0.629468,0.085771
2,Iris,"[2, 200, 15]",0.555556,0.568116,0.629468,0.072807
3,Iris,"[2, 300, 5]",0.000000,0.568116,0.629468,0.056849
4,Iris,"[2, 300, 10]",0.000000,0.568116,0.629468,0.055848
...,...,...,...,...,...,...
265,Wine,"[25, 300, 10]",0.000000,0.168183,0.135724,0.166555
266,Wine,"[25, 300, 15]",0.098876,0.155383,0.142125,0.222407
267,Wine,"[25, 400, 5]",0.277744,0.169688,0.132516,0.080786
268,Wine,"[25, 400, 10]",0.097237,0.161102,0.140588,0.152593


In [16]:
final_kmeans_df.to_csv('K-Means_Data.csv')

# Agglomerative Clustering

In [25]:
n_clusters =[2,3,4,5,6,7,8,9,10,11,12,13,14,15]
linkage = ["ward", "complete", "average", "single"]

agglomerative_parameters = pd.DataFrame({
    "n_clusters":[] ,
    "linkage":[]}).astype(int)
  
for i in n_clusters:
        for  n in linkage:
                agglomerative_parameters = agglomerative_parameters.append({
                    "n_clusters" : i, 
                    "linkage" : n },
                    ignore_index=True)
                
agglomerative_parameters

Unnamed: 0,n_clusters,linkage
0,2,ward
1,2,complete
2,2,average
3,2,single
4,3,ward
5,3,complete
6,3,average
7,3,single
8,4,ward
9,4,complete


In [26]:
final_iris_aggromilative = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, "aglomerative", agglomerative_parameters.iloc[i]) for i in range(0, len(agglomerative_parameters)))
final_wine_aggromilative = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, "aglomerative", agglomerative_parameters.iloc[i]) for i in range(0, len(agglomerative_parameters)))

In [27]:
final_aggromilative_df=pd.DataFrame({
    "Dataset":[],
    "[n_clusters,linkage]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [28]:
for i in range(0, len(agglomerative_parameters)):
    final_aggromilative_df =final_aggromilative_df.append({
        "Dataset":Dataset[0],
        "[n_clusters,linkage]":agglomerative_parameters.iloc[i].to_list(),
        "f1 score":final_iris_aggromilative[i][1],
        "Adjusted Random Score":final_iris_aggromilative[i][2],
        "Silhouette Score":final_iris_aggromilative[i][3],
        "Execution Time":final_iris_aggromilative[i][4],
        },ignore_index=True)
    
for i in range(0, len(agglomerative_parameters)):
    final_aggromilative_df =final_aggromilative_df.append({
        "Dataset":Dataset[1],
        "[n_clusters,linkage]":agglomerative_parameters.iloc[i].to_list(),
        "f1 score":final_wine_aggromilative[i][1],
        "Adjusted Random Score":final_wine_aggromilative[i][2],
        "Silhouette Score":final_wine_aggromilative[i][3],
        "Execution Time":final_wine_aggromilative[i][4],
        },ignore_index=True)
    
final_aggromilative_df

Unnamed: 0,Dataset,"[n_clusters,linkage]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[2, ward]",0.000000,0.568116,0.629468,0.144613
1,Iris,"[2, complete]",0.208740,0.223420,0.303913,0.150599
2,Iris,"[2, average]",0.000000,0.568116,0.629468,0.155586
3,Iris,"[2, single]",0.555556,0.568116,0.629468,0.140622
4,Iris,"[3, ward]",0.265060,0.719584,0.504349,0.006980
...,...,...,...,...,...,...
107,Wine,"[14, single]",0.175392,-0.012347,-0.120288,0.006976
108,Wine,"[15, ward]",0.280516,0.288701,0.142652,0.007979
109,Wine,"[15, complete]",0.543024,0.515401,0.186428,0.006981
110,Wine,"[15, average]",0.343167,0.684635,0.178581,0.009975


In [21]:
final_aggromilative_df.to_csv('Aggromilative_Data.csv')

# DBScan Clustering

In [29]:
eps = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
min_samples = [2,3,4,5,6,7,8]

dbscan_parameters = pd.DataFrame({
    "eps":[],
    "min_samples" :[]
}).astype(int)

for i in eps:
    for n in min_samples:
        dbscan_parameters=dbscan_parameters.append({
            "eps" : i,
            "min_samples": n
        },ignore_index=True)
        
dbscan_parameters

Unnamed: 0,eps,min_samples
0,0.1,2.0
1,0.1,3.0
2,0.1,4.0
3,0.1,5.0
4,0.1,6.0
...,...,...
58,0.9,4.0
59,0.9,5.0
60,0.9,6.0
61,0.9,7.0


In [30]:
final_iris_dbscan = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, "dbscan", dbscan_parameters.iloc[i]) for i in range(0, len(dbscan_parameters)))
final_wine_dbscan = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, "dbscan", dbscan_parameters.iloc[i]) for i in range(0, len(dbscan_parameters)))

In [31]:
final_dbscan_df=pd.DataFrame({
    "Dataset":[],
    "[eps,min_samples]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [32]:
for i in range(0, len(dbscan_parameters)):
    final_dbscan_df =final_dbscan_df.append({
        "Dataset":Dataset[0],
        "[eps,min_samples]":dbscan_parameters.iloc[i].to_list(),
        "f1 score":final_iris_dbscan[i][1],
        "Adjusted Random Score":final_iris_dbscan[i][2],
        "Silhouette Score":final_iris_dbscan[i][3],
        "Execution Time":final_iris_dbscan[i][4],
        },ignore_index=True)

for i in range(0, len(dbscan_parameters)):
    final_dbscan_df =final_dbscan_df.append({
        "Dataset":Dataset[1],
        "[eps,min_samples]":dbscan_parameters.iloc[i].to_list(),
        "f1 score":final_wine_dbscan[i][1],
        "Adjusted Random Score":final_wine_dbscan[i][2],
        "Silhouette Score":final_wine_dbscan[i][3],
        "Execution Time":final_wine_dbscan[i][4],
        },ignore_index=True)
    
final_dbscan_df  

Unnamed: 0,Dataset,"[eps,min_samples]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[0.1, 2.0]",0.505427,0.426260,0.230234,0.036905
1,Iris,"[0.1, 3.0]",0.505427,0.428689,0.202901,0.029921
2,Iris,"[0.1, 4.0]",0.473502,0.393767,0.134221,0.038896
3,Iris,"[0.1, 5.0]",0.406043,0.441407,0.057464,0.027925
4,Iris,"[0.1, 6.0]",0.461289,0.366987,0.057155,0.006981
...,...,...,...,...,...,...
121,Wine,"[0.9, 4.0]",0.165031,0.000000,,0.012967
122,Wine,"[0.9, 5.0]",0.165031,0.000000,,0.007973
123,Wine,"[0.9, 6.0]",0.165031,0.000000,,0.006983
124,Wine,"[0.9, 7.0]",0.165031,0.000000,,0.007978


In [27]:
final_dbscan_df.to_csv('DBScan_Data.csv')

In [28]:
# n_clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
# n_noise = list(y_dbscan).count(-1)

# print('Estimated number of clusters: %d' % n_clusters)
# print('Estimated number of noise points: %d' % n_noise)

Unlike k-means, DBSCAN will figure out the number of clusters. 
DBSCAN works by determining whether the minimum number of points are close enough to 
one another to be considered part of a single cluster. DBSCAN is very sensitive to scale since 
epsilon is a fixed value for the maximum distance between two points.

# Optics Clustering

# Gaussian mixtures Clustering

In [43]:
n_components=[1,2,3,4,5,6,7,8,9,10]
covariance_type=["full", "tied","diag", "spherical"]

gm_parameters=pd.DataFrame({
    "n_components":[],
    "covariance_type":[]
}).astype(int)

for i in n_components:
    for n in covariance_type:
        gm_parameters=gm_parameters.append({
            "n_components":i,
            "covariance_type":n
        },ignore_index=True)
        
gm_parameters

Unnamed: 0,n_components,covariance_type
0,1,full
1,1,tied
2,1,diag
3,1,spherical
4,2,full
5,2,tied
6,2,diag
7,2,spherical
8,3,full
9,3,tied


In [44]:
final_iris_gm = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, "dbscan", dbscan_parameters.iloc[i]) for i in range(0, len(dbscan_parameters)))
final_wine_gm = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, "dbscan", dbscan_parameters.iloc[i]) for i in range(0, len(dbscan_parameters)))

In [45]:
final_gm_df=pd.DataFrame({
    "Dataset":[],
    "[n_components,covariance_type]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [46]:
for i in range(0, len(gm_parameters)):
    final_gm_df =final_gm_df.append({
        "Dataset":Dataset[0],
        "[n_components,covariance_type]":gm_parameters.iloc[i].to_list(),
        "f1 score":final_iris_gm[i][1],
        "Adjusted Random Score":final_iris_gm[i][2],
        "Silhouette Score":final_iris_gm[i][3],
        "Execution Time":final_iris_gm[i][4],
        },ignore_index=True)

for i in range(0, len(gm_parameters)):
    final_gm_df =final_gm_df.append({
        "Dataset":Dataset[1],
        "[n_components,covariance_type]":gm_parameters.iloc[i].to_list(),
        "f1 score":final_wine_gm[i][1],
        "Adjusted Random Score":final_wine_gm[i][2],
        "Silhouette Score":final_wine_gm[i][3],
        "Execution Time":final_wine_gm[i][4],
        },ignore_index=True)
    
final_gm_df  

Unnamed: 0,Dataset,"[n_components,covariance_type]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[1, full]",0.505427,0.426260,0.230234,0.007982
1,Iris,"[1, tied]",0.505427,0.428689,0.202901,0.007978
2,Iris,"[1, diag]",0.473502,0.393767,0.134221,0.011970
3,Iris,"[1, spherical]",0.406043,0.441407,0.057464,0.007980
4,Iris,"[2, full]",0.461289,0.366987,0.057155,0.006982
...,...,...,...,...,...,...
75,Wine,"[9, spherical]",0.204642,-0.011362,0.126727,0.011967
76,Wine,"[10, full]",0.204642,-0.011362,0.126727,0.008975
77,Wine,"[10, tied]",0.172301,-0.008542,,0.005985
78,Wine,"[10, diag]",0.173064,-0.007353,,0.005983


# Affinity propagation

In [31]:
preference = [-1,-3,-5,-7,-9,-11,-13]
damping = [0.5,0.7,0.9]

ap_parameters = pd.DataFrame({
    "preference":[],
    "damping" :[]
}).astype(int)

for i in preference:
    for n in damping:
        ap_parameters=ap_parameters.append({
            "preference":i,
            "damping" :n
        },ignore_index=True)
        
ap_parameters

Unnamed: 0,preference,damping
0,-1.0,0.5
1,-1.0,0.7
2,-1.0,0.9
3,-3.0,0.5
4,-3.0,0.7
5,-3.0,0.9
6,-5.0,0.5
7,-5.0,0.7
8,-5.0,0.9
9,-7.0,0.5


In [32]:
final_iris_ap = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, "ap", ap_parameters.iloc[i]) for i in range(0, len(ap_parameters)))
final_wine_ap = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, "ap", ap_parameters.iloc[i]) for i in range(0, len(ap_parameters)))

In [33]:
final_ap_df=pd.DataFrame({
    "Dataset":[],
    "[preference, damping]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [34]:
for i in range(0, len(ap_parameters)):
    final_ap_df =final_ap_df.append({
        "Dataset":Dataset[0],
        "[preference, damping]":ap_parameters.iloc[i].to_list(),
        "f1 score":final_iris_ap[i][1],
        "Adjusted Random Score":final_iris_ap[i][2],
        "Silhouette Score":final_iris_ap[i][3],
        "Execution Time":final_iris_ap[i][4],
        },ignore_index=True)
    
for i in range(0, len(ap_parameters)):
    final_ap_df =final_ap_df.append({
        "Dataset":Dataset[1],
        "[preference, damping]":ap_parameters.iloc[i].to_list(),
        "f1 score":final_wine_ap[i][1],
        "Adjusted Random Score":final_wine_ap[i][2],
        "Silhouette Score":final_wine_ap[i][3],
        "Execution Time":final_wine_ap[i][4],
        },ignore_index=True)
    
    
final_ap_df 

Unnamed: 0,Dataset,"[preference, damping]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[-1.0, 0.5]",0.297129,0.450074,0.315135,0.132643
1,Iris,"[-1.0, 0.7]",0.780876,0.664774,0.426983,0.076794
2,Iris,"[-1.0, 0.9]",0.559006,0.662306,0.435747,0.16057
3,Iris,"[-3.0, 0.5]",0.898281,0.744526,0.506163,0.140625
4,Iris,"[-3.0, 0.7]",0.906329,0.756532,0.489912,0.087775
5,Iris,"[-3.0, 0.9]",0.906329,0.756532,0.489912,0.127662
6,Iris,"[-5.0, 0.5]",0.899183,0.742975,0.480108,0.126663
7,Iris,"[-5.0, 0.7]",0.893333,0.727543,0.482575,0.082779
8,Iris,"[-5.0, 0.9]",0.906329,0.756532,0.489912,0.130652
9,Iris,"[-7.0, 0.5]",0.0,0.0,,0.484703


In [35]:
final_ap_df.to_csv('Affinity_Propagation_Data.csv')

# Mean-shift 

In [36]:
bandwidth = [0.2,0.3,0.4,0.5,0.6,0.7,0.8]

meanshift_parameters = pd.DataFrame({
    "bandwidth":[],
}).astype(int)

for i in bandwidth:
        meanshift_parameters=meanshift_parameters.append({
            "bandwidth":i,
        },ignore_index=True)
        
meanshift_parameters

Unnamed: 0,bandwidth
0,0.2
1,0.3
2,0.4
3,0.5
4,0.6
5,0.7
6,0.8


In [37]:
final_iris_meanshift = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, "meanshift",meanshift_parameters.iloc[i]) for i in range(0, len(meanshift_parameters)))
final_wine_meanshift = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, "meanshift",meanshift_parameters.iloc[i]) for i in range(0, len(meanshift_parameters)))

In [38]:
final_meanshift_df=pd.DataFrame({
    "Dataset":[],
    "[bandwidth]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [39]:
for i in range(0, len(meanshift_parameters)):
    final_meanshift_df =final_meanshift_df.append({
        "Dataset":Dataset[0],
        "[bandwidth]":meanshift_parameters.iloc[i].to_list(),
        "f1 score":final_iris_meanshift[i][1],
        "Adjusted Random Score":final_iris_meanshift[i][2],
        "Silhouette Score":final_iris_meanshift[i][3],
        "Execution Time":final_iris_meanshift[i][4],
        },ignore_index=True)
    
for i in range(0, len(meanshift_parameters)):
    final_meanshift_df =final_meanshift_df.append({
        "Dataset":Dataset[1],
        "[bandwidth]":meanshift_parameters.iloc[i].to_list(),
        "f1 score":final_wine_meanshift[i][1],
        "Adjusted Random Score":final_wine_meanshift[i][2],
        "Silhouette Score":final_wine_meanshift[i][3],
        "Execution Time":final_wine_meanshift[i][4],
        },ignore_index=True)
    
    
final_meanshift_df 

Unnamed: 0,Dataset,[bandwidth],f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,[0.2],0.494468,0.602107,0.347057,0.88663
1,Iris,[0.3],0.183575,0.593487,0.476486,1.126985
2,Iris,[0.4],0.0,0.568116,0.629468,0.750991
3,Iris,[0.5],0.0,0.568116,0.629468,0.63231
4,Iris,[0.6],0.166667,0.0,,0.693147
5,Iris,[0.7],0.166667,0.0,,1.319471
6,Iris,[0.8],0.166667,0.0,,0.624332
7,Wine,[0.2],0.011049,0.0,,0.418881
8,Wine,[0.3],0.079155,0.022753,,0.478719
9,Wine,[0.4],0.550369,0.266724,0.042969,0.861696


In [40]:
final_meanshift_df.to_csv('Meanshift_Data.csv')

# Spectral Clustering

In [41]:
n_clusters=[2,3,4,5,6,7,8]
affinity=["nearest_neighbors","rbf"]
eigen_solver=["arpack"]

    
spectral_parameters = pd.DataFrame({"n_clusters":[] , 
                                 "affinity":[] , 
                                 "eigen_solver":[]}).astype(int)

for i in n_clusters:
        for  af in affinity:
            for n in eigen_solver:
                spectral_parameters = spectral_parameters.append({"n_clusters" : i, 
                                                            "affinity" : af, 
                                                            "eigen_solver" : n},ignore_index=True)
                
                
spectral_parameters

Unnamed: 0,n_clusters,affinity,eigen_solver
0,2,nearest_neighbors,arpack
1,2,rbf,arpack
2,3,nearest_neighbors,arpack
3,3,rbf,arpack
4,4,nearest_neighbors,arpack
5,4,rbf,arpack
6,5,nearest_neighbors,arpack
7,5,rbf,arpack
8,6,nearest_neighbors,arpack
9,6,rbf,arpack


In [42]:
final_iris_spectral = Parallel(n_jobs=-1)(delayed(algorithm)(iris_x_scaled, iris_y, "spectral", spectral_parameters.iloc[i]) for i in range(0, len(spectral_parameters)))
final_wine_spectral = Parallel(n_jobs=-1)(delayed(algorithm)(wine_x_scaled, wine_y, "spectral", spectral_parameters.iloc[i]) for i in range(0, len(spectral_parameters)))

In [43]:
final_spectral_df=pd.DataFrame({
    "Dataset":[],
    "[n_clusters, affinity]":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[],
})

In [44]:
for i in range(0, len(spectral_parameters)):
    final_spectral_df =final_spectral_df.append({
        "Dataset":Dataset[0],
        "[n_clusters, affinity]":spectral_parameters.iloc[i].to_list(),
        "f1 score":final_iris_spectral[i][1],
        "Adjusted Random Score":final_iris_spectral[i][2],
        "Silhouette Score":final_iris_spectral[i][3],
        "Execution Time":final_iris_spectral[i][4],
        },ignore_index=True)
    
for i in range(0, len(spectral_parameters)):
    final_spectral_df =final_spectral_df.append({
        "Dataset":Dataset[1],
        "[n_clusters, affinity]":spectral_parameters.iloc[i].to_list(),
        "f1 score":final_wine_spectral[i][1],
        "Adjusted Random Score":final_wine_spectral[i][2],
        "Silhouette Score":final_wine_spectral[i][3],
        "Execution Time":final_wine_spectral[i][4],
        },ignore_index=True)
    
    
final_spectral_df 

Unnamed: 0,Dataset,"[n_clusters, affinity]",f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,Iris,"[2, nearest_neighbors, arpack]",0.555556,0.568116,0.629468,0.475728
1,Iris,"[2, rbf, arpack]",0.555556,0.568116,0.629468,0.451792
2,Iris,"[3, nearest_neighbors, arpack]",0.275862,0.744526,0.506163,0.496669
3,Iris,"[3, rbf, arpack]",0.473544,0.623063,0.486174,0.480718
4,Iris,"[4, nearest_neighbors, arpack]",0.082596,0.610785,0.401764,0.057846
5,Iris,"[4, rbf, arpack]",0.277778,0.54203,0.398204,0.06682
6,Iris,"[5, nearest_neighbors, arpack]",0.008772,0.478848,0.355756,0.052858
7,Iris,"[5, rbf, arpack]",0.024096,0.570905,0.319697,0.061836
8,Iris,"[6, nearest_neighbors, arpack]",0.480252,0.524279,0.336171,0.065825
9,Iris,"[6, rbf, arpack]",0.0,0.46867,0.286243,0.060836


In [45]:
final_spectral_df .to_csv('Spectral_Data.csv')

# Ward hierarchical

In [46]:
from sklearn import metrics
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 0, 1, 1, 2]
metrics.rand_score(labels_true, labels_pred)

0.8666666666666667

In [47]:
ap =MeanShift(bandwidth=0.75)
y_predict = ap.fit_predict(wine_x_scaled)

y_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 2, 1, 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)