<a href="https://colab.research.google.com/github/KirtanDwivedi/Performance-using-different-clustering-techniques-on-various-parameters/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pycaret &> /dev/null

In [2]:
from pycaret.utils import version
version()

'3.3.2'

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from pycaret.datasets import get_data
myDataSet = get_data("jewellery")

Unnamed: 0,Age,Income,SpendingScore,Savings
0,58,77769,0.791329,6559.829923
1,59,81799,0.791082,5417.661426
2,62,74751,0.702657,9258.992965
3,59,74373,0.76568,7346.334504
4,87,17760,0.348778,16869.50713


In [64]:
from pycaret.clustering import *
import pandas as pd

# Using K-Means Clustering

data = {}

# Process 'No Data Processing' with clusters 3, 4, 5
s = setup(myDataSet, verbose=False)
data['No Data Processing'] = {}
for c in [3, 4, 5]:
    kmeans = create_model('kmeans', num_clusters=c, verbose=False)
    metrics = pull()[["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]]
    data['No Data Processing'][c] = metrics.iloc[0].tolist()

data_table = {
    'Using Normalization': ['normalize'],
    'Using Transform': ['transform'],
    'Using PCA': ['pca'],
    'Using T+N': ['transformation', 'normalize'],
    'T+N+PCA': ['transformation', 'normalize', 'pca']
}

# Process each preprocessing method with clusters 3, 4, 5
for name, preprocess_steps in data_table.items():
    s = setup(myDataSet,
              normalize='normalize' in preprocess_steps,
              transformation='transform' in preprocess_steps,
              pca='pca' in preprocess_steps,
              verbose=False)
    data[name] = {}
    for c in [3, 4, 5]:
        kmeans = create_model('kmeans', num_clusters=c, verbose=False)
        metrics = pull()[["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]]
        data[name][c] = metrics.iloc[0].tolist()

# Prepare data for DataFrame
columns = []
rows = []

for preproc in data:
    for c in [3, 4, 5]:
        sil, cal, dav = data[preproc][c]
        columns.append(('Using K-Mean Clustering', preproc, c))
        rows.append([sil, cal, dav])

# Create DataFrame with MultiIndex columns
df = pd.DataFrame(rows, columns=['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])
df.index = pd.MultiIndex.from_tuples(columns)

print(df)

                                               Silhouette  Calinski-Harabasz  \
Using K-Mean Clustering No Data Processing  3      0.7393          3567.5370   
                                            4      0.7207          5011.8115   
                                            5      0.6221          4566.5936   
                        Using Normalization 3      0.6963          1066.5812   
                                            4      0.7581          1611.2647   
                                            5      0.8049          3671.3580   
                        Using Transform     3      0.7383          1843.8285   
                                            4      0.7809          3519.5778   
                                            5      0.6503          2976.4266   
                        Using PCA           3      0.7393          3567.5370   
                                            4      0.7217          5009.6748   
                                        

In [65]:
a= df.transpose()

In [68]:
# Using Hierarchical clustering

data = {}

# Process 'No Data Processing' with clusters 3, 4, 5
s = setup(myDataSet, verbose=False)
data['No Data Processing'] = {}
for c in [3, 4, 5]:
    hclust = create_model('hclust', num_clusters=c, verbose=False)
    metrics = pull()[["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]]
    data['No Data Processing'][c] = metrics.iloc[0].tolist()

data_table = {
    'Using Normalization': ['normalize'],
    'Using Transform': ['transform'],
    'Using PCA': ['pca'],
    'Using T+N': ['transformation', 'normalize'],
    'T+N+PCA': ['transformation', 'normalize', 'pca']
}

# Process each preprocessing method with clusters 3, 4, 5
for name, preprocess_steps in data_table.items():
    s = setup(myDataSet,
              normalize='normalize' in preprocess_steps,
              transformation='transform' in preprocess_steps,
              pca='pca' in preprocess_steps,
              verbose=False)
    data[name] = {}
    for c in [3, 4, 5]:
        hclust = create_model('hclust', num_clusters=c, verbose=False)
        metrics = pull()[["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]]
        data[name][c] = metrics.iloc[0].tolist()

# Prepare data for DataFrame
columns = []
rows = []

for preproc in data:
    for c in [3, 4, 5]:
        sil, cal, dav = data[preproc][c]
        columns.append(('Using Hierarchical Clustering', preproc, c))
        rows.append([sil, cal, dav])

# Create DataFrame with MultiIndex columns
df = pd.DataFrame(rows, columns=['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])
df.index = pd.MultiIndex.from_tuples(columns)

print(df)

                                                     Silhouette  \
Using Hierarchical Clustering No Data Processing  3      0.7391   
                                                  4      0.7060   
                                                  5      0.6908   
                              Using Normalization 3      0.6963   
                                                  4      0.7581   
                                                  5      0.8049   
                              Using Transform     3      0.7383   
                                                  4      0.7809   
                                                  5      0.6506   
                              Using PCA           3      0.7391   
                                                  4      0.7060   
                                                  5      0.6908   
                              Using T+N           3      0.6963   
                                                  4      0.758

In [69]:
b= df.transpose()

In [70]:
# Using K-Means Shift clustering

data = {}

# Process 'No Data Processing' with clusters 3, 4, 5
s = setup(myDataSet, verbose=False)
data['No Data Processing'] = {}
for c in [3, 4, 5]:
    meanshift = create_model('meanshift', num_clusters=c, verbose=False)
    metrics = pull()[["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]]
    data['No Data Processing'][c] = metrics.iloc[0].tolist()

data_table = {
    'Using Normalization': ['normalize'],
    'Using Transform': ['transform'],
    'Using PCA': ['pca'],
    'Using T+N': ['transformation', 'normalize'],
    'T+N+PCA': ['transformation', 'normalize', 'pca']
}

# Process each preprocessing method with clusters 3, 4, 5
for name, preprocess_steps in data_table.items():
    s = setup(myDataSet,
              normalize='normalize' in preprocess_steps,
              transformation='transform' in preprocess_steps,
              pca='pca' in preprocess_steps,
              verbose=False)
    data[name] = {}
    for c in [3, 4, 5]:
        meanshift = create_model('meanshift', num_clusters=c, verbose=False)
        metrics = pull()[["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]]
        data[name][c] = metrics.iloc[0].tolist()

# Prepare data for DataFrame
columns = []
rows = []

for preproc in data:
    for c in [3, 4, 5]:
        sil, cal, dav = data[preproc][c]
        columns.append(('Using K-Means Shift Clustering', preproc, c))
        rows.append([sil, cal, dav])

# Create DataFrame with MultiIndex columns
df = pd.DataFrame(rows, columns=['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'])
df.index = pd.MultiIndex.from_tuples(columns)

print(df)

                                                      Silhouette  \
Using K-Means Shift Clustering No Data Processing  3      0.7393   
                                                   4      0.7393   
                                                   5      0.7393   
                               Using Normalization 3      0.7325   
                                                   4      0.7325   
                                                   5      0.7325   
                               Using Transform     3      0.7383   
                                                   4      0.7383   
                                                   5      0.7383   
                               Using PCA           3      0.7393   
                                                   4      0.7393   
                                                   5      0.7393   
                               Using T+N           3      0.7325   
                                                

In [71]:
c= df.transpose()

In [72]:
table= pd.concat([a, b, c], ignore_index=False)

In [73]:
table.to_csv("data_table.csv")

In [74]:
from google.colab import files
files.download("data_table.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>