In [1]:
import openml
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder #For encoding categorical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score,f1_score,adjusted_rand_score,silhouette_score
from joblib import Parallel,delayed
import time

# Download Dataset Using openml

In [2]:
dataset = openml.datasets.get_dataset("iris")
df, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="dataframe"
)
df["class"]=y
df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
df["class"].values

['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', ..., 'Iris-virginica', 'Iris-virginica', 'Iris-virginica', 'Iris-virginica', 'Iris-virginica']
Length: 150
Categories (3, object): ['Iris-setosa' < 'Iris-versicolor' < 'Iris-virginica']

# Identify Data Types

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    float64 
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
 4   class        150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [5]:
df.describe()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


# Transform categorical variable to numeric

In [6]:
le=LabelEncoder()
df[["class"]]=df[["class"]].apply(lambda col : le.fit_transform(col))
df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [7]:
y=df["class"].values
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [8]:
class_types = ('Iris-setosa','Iris-versicolor','Iris-virginica')
class_df = pd.DataFrame(class_types, columns=['Class_Types'])


class_df['Class_No'] = le.fit_transform(class_df['Class_Types'])
class_df=class_df.set_index("Class_Types")
class_df

Unnamed: 0_level_0,Class_No
Class_Types,Unnamed: 1_level_1
Iris-setosa,0
Iris-versicolor,1
Iris-virginica,2


# Min-max normalise

In [9]:
x=df.iloc[:,:4].values

scaler = MinMaxScaler()
scaler.fit(x)

x_scaled = scaler.transform(x)

# KMeans

In [10]:
def kmeans (x, y, parameters):
    start_time = time.time()
    kmeans = KMeans(n_clusters = parameters[0], max_iter = parameters[1], n_init = parameters[2])
    y_kmeans = kmeans.fit_predict(x_scaled)
    
    kmeans_f1score = f1_score(y, y_kmeans, average = 'weighted')
    kmeans_ars = adjusted_rand_score(y, y_kmeans)
    kmeans_sscore = silhouette_score(x_scaled, y_kmeans, metric='euclidean')
    kmeans_execution_time = time.time() - start_time
    return y_kmeans, kmeans_f1score, kmeans_ars, kmeans_sscore, kmeans_execution_time

In [11]:
n_clusters=[*[2,3,4],*[5,6,7]]
max_iter=[200,300,400]
n_init=[5,10,15]
    
kmean_parameters = pd.DataFrame({"n_clusters":[] , 
                                 "max_iter":[] , 
                                 "n_init":[]}).astype(int)


In [13]:
for i in n_clusters:
        for  ite in max_iter:
            for n in n_init:
                kmean_parameters = kmean_parameters.append(
                    {'n_clusters' : i, 'max_iter' : ite, 'n_init' : n},ignore_index=True
                )
                


In [14]:
kmean_parameters

Unnamed: 0,n_clusters,max_iter,n_init
0,2,200,5
1,2,200,10
2,2,200,15
3,2,300,5
4,2,300,10
5,2,300,15
6,2,400,5
7,2,400,10
8,2,400,15
9,3,200,5


In [22]:
final_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(x_scaled, y, kmean_parameters.iloc[i]) for i in range(0, 27))
final_kmeans

[(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
  0.5555555555555555,
  0.5681159420289855,
  0.6294675561906644,
  0.014960289001464844),
 (array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1

In [20]:
final_kmeans_df=pd.DataFrame({
    "n_clusters":[],
    "max_iter":[],
    "n_init":[],
    "f1 score":[],
    "Adjusted Random Score":[],
    "Silhouette Score":[],
    "Execution Time":[]
})

In [28]:
for i in range(0, len(final_kmeans)):
    final_kmeans_df =final_kmeans_df.append({
        "n_clusters":kmean_parameters["_clusters]
        "max_iter":kmean_parameters.iloc[2],
        "n_init":kmean_parameters.iloc[3],
        "f1 score":final_kmeans[i][1],
        "Adjusted Random Score":final_kmeans[i][2],
        "Silhouette Score":final_kmeans[i][3],
        "Execution Time":final_kmeans[i][4]
        },ignore_index=True)
    
final_kmeans_df

Unnamed: 0,n_clusters,max_iter,n_init,f1 score,Adjusted Random Score,Silhouette Score,Execution Time
0,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,0.555556,0.568116,0.629468,0.014960
1,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,0.555556,0.568116,0.629468,0.031916
2,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,0.000000,0.568116,0.629468,0.064827
3,n_clusters 2 max_iter 300 n_init ...,n_clusters 2 max_iter 300 n_init ...,n_clusters 2 max_iter 300 n_init ...,0.000000,0.568116,0.629468,0.020943
4,n_clusters 2 max_iter 300 n_init ...,n_clusters 2 max_iter 300 n_init ...,n_clusters 2 max_iter 300 n_init ...,0.555556,0.568116,0.629468,0.034909
...,...,...,...,...,...,...,...
76,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 300 n_init ...,0.137681,0.623093,0.444627,0.048870
77,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 300 n_init ...,0.744726,0.623093,0.444627,0.066350
78,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 300 n_init ...,0.744726,0.623093,0.444627,0.035420
79,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 200 n_init ...,n_clusters 2 max_iter 300 n_init ...,0.016878,0.623093,0.444627,0.038424


# KMean Clustering

In [None]:
kmeans = KMeans(n_clusters = [3,4,5,6,7], max_iter = [200,300,400], n_init = [5,10,15], init = 'k-means++')
y_kmeans=kmeans.fit_predict(x_scaled)

In [None]:
import matplotlib.pyplot as plt  
plt.plot(range(1, 11), sum_of_square_distance)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of Square_Distance') #within cluster sum of squares
plt.show()

In [None]:
kmeans = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_kmeans=kmeans.fit_predict(x_scaled)
y_kmeans

In [None]:
round(accuracy_score(y,y_kmeans),3)

In [None]:
plt.scatter(x_scaled[y_kmeans == 0, 0], x_scaled[y_kmeans == 0, 1], s = 100, c = 'purple', label = 'Iris-setosa')
plt.scatter(x_scaled[y_kmeans == 1, 0], x_scaled[y_kmeans == 1, 1], s = 100, c = 'orange', label = 'Iris-versicolour')
plt.scatter(x_scaled[y_kmeans == 2, 0], x_scaled[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Iris-virginica')

# Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

In [None]:
result=shc.linkage(x_scaled, method='ward')
plt.figure(figsize=(10,10))
shc.dendrogram(result)
plt.show()

In [None]:
agglomerative =AgglomerativeClustering(n_clusters=3, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='average')
y_agglomerative=agglomerative.fit_predict(x_scaled)
y_agglomerative

In [None]:
round(accuracy_score(y,y_agglomerative),3)

In [None]:
plt.scatter(x_scaled[y_agglomerative == 0, 0], x_scaled[y_agglomerative == 0, 1], s = 100, c = 'purple', label = 'Iris-setosa')
plt.scatter(x_scaled[y_agglomerative == 1, 0], x_scaled[y_agglomerative == 1, 1], s = 100, c = 'orange', label = 'Iris-versicolour')
plt.scatter(x_scaled[y_agglomerative == 2, 0], x_scaled[y_agglomerative == 2, 1], s = 100, c = 'green', label = 'Iris-virginica')

# DBScan Clustering

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
from sklearn.neighbors import NearestNeighbors

neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(x_scaled)
distances, indices = neighbors_fit.kneighbors(x_scaled)

distances = np.sort(distances, axis=0)
distances = distances[:,1]

plt.plot(distances)

In [None]:
dbscan=DBSCAN(eps=0.135,min_samples=8)
y_dbscan=dbscan.fit_predict(x_scaled)
y_dbscan

In [None]:
accuracy_score(y,y_dbscan)

In [None]:
n_clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
n_noise = list(y_dbscan).count(-1)

print('Estimated number of clusters: %d' % n_clusters)
print('Estimated number of noise points: %d' % n_noise)

In [None]:
colors = ['mediumorchid', 'maroon', 'forestgreen','navy','goldenrod']
vectorizer = np.vectorize(lambda x: colors[x % len(colors)])

plt.scatter(x_scaled[:,0], x_scaled[:,1], c=vectorizer(y_dbscan))

Unlike k-means, DBSCAN will figure out the number of clusters. 
DBSCAN works by determining whether the minimum number of points are close enough to 
one another to be considered part of a single cluster. DBSCAN is very sensitive to scale since 
epsilon is a fixed value for the maximum distance between two points.

# Optics Clustering

# Gaussian mixtures Clustering

# Affinity propagation

In [None]:
# from sklearn.cluster import AffinityPropagation

# afp = AffinityPropagation(damping=0.9, max_iter=200, convergence_iter=15, copy=True, preference=-5, affinity='euclidean', verbose=False, random_state=None)
# y_afp=afp.fit_predict(x_scaled)
# y_afp

# Mean-shift 

# Spectral Clustering

# Ward hierarchical

# f1_score , adjusted_rand_score and silhouette_score

In [None]:
df=pd.DataFrame({"Clustering Algoritham":
                 ['K-Means','Agglomerative','DBScan'],
                "f1_score":[(round(f1_score(y,y_kmeans,average='weighted'),3)),(round(f1_score(y,y_agglomerative,average='weighted'),3)),
                            (round(f1_score(y,y_dbscan,average='weighted'),3))],
                "silhouette_score":[(round(silhouette_score(x_scaled,y_kmeans,metric='sqeuclidean'),3)),(round(silhouette_score(x_scaled,y_agglomerative,metric='sqeuclidean'),3)),
                                   (round(silhouette_score(x_scaled,y_dbscan,metric='sqeuclidean'),3))],
                "adjusted_rand_score":[(round(adjusted_rand_score(y,y_kmeans),3)),(round(adjusted_rand_score(y,y_agglomerative),3)),
                                      (round(adjusted_rand_score(y,y_dbscan),3))]})

# ,'Optics','Gaussian mixtures','Affinity propagation',
#                   'Mean-shift','Spectral','Ward hierarchical','Birch','Self organising maps'
df

In [None]:
print(round(f1_score(y,y_kmeans,average='weighted'),3))
print(round(f1_score(y,y_dbscan,average='weighted'),3))
print(round(f1_score(y,y_agglomerative,average='weighted'),3))

In [None]:
round(silhouette_score(x_scaled,y_kmeans,metric='sqeuclidean'),3)

In [None]:
(round(adjusted_rand_score(y,y_kmeans),3))