<a href="https://colab.research.google.com/github/Ksingla1423/Iris_Clustering/blob/main/102203080_Clustering_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [6]:
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
X = pd.read_csv(url, names=column_names)

X = X.drop(columns='species')

print(X.columns)

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')


In [7]:
def normalize(data):
    return MinMaxScaler().fit_transform(data)

def standardize(data):
    return StandardScaler().fit_transform(data)

def transform_and_normalize(data):
    data_std = standardize(data)
    return normalize(data_std)

def apply_pca(data, n_components=2):
    return PCA(n_components=n_components).fit_transform(data)


In [8]:
def evaluate_clustering(X, method_name, cluster_func, c_values=[3, 4, 5]):
    results = []

    for c in c_values:
        try:
            if method_name == 'MeanShift':
                model = cluster_func()
                labels = model.fit_predict(X)
            else:
                model = cluster_func(n_clusters=c)
                labels = model.fit_predict(X)

            sil = silhouette_score(X, labels)
            ch = calinski_harabasz_score(X, labels)
            db = davies_bouldin_score(X, labels)
            results.append((sil, ch, db))
        except Exception:
            results.append(('NA', 'NA', 'NA'))

    return results

In [9]:
preprocessing_methods = {
    "No Processing": X,
    "Normalization": normalize(X),
    "Transform": standardize(X),
    "PCA": apply_pca(X),
    "T+N": transform_and_normalize(X),
    "T+N+PCA": apply_pca(transform_and_normalize(X)),
}

c_values = [3, 4, 5]
columns = ['c=3', 'c=4', 'c=5']
index = pd.MultiIndex.from_product([['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins'], columns])

def run_all(method_name, cluster_func):
    all_results = []
    for method, data in preprocessing_methods.items():
        metrics = evaluate_clustering(data, method_name, cluster_func, c_values)
        # Flatten results row-wise
        row = []
        for result in metrics:
            row.extend(result)
        all_results.append(pd.Series(row, index=index, name=method))
    return pd.DataFrame(all_results)


In [10]:
# KMeans
kmeans_table = run_all("KMeans", lambda n_clusters: KMeans(n_clusters=n_clusters, n_init=10, random_state=42))

# Hierarchical Clustering
hierarchical_table = run_all("Hierarchical", lambda n_clusters: AgglomerativeClustering(n_clusters=n_clusters))

# Mean Shift Clustering
meanshift_table = run_all("MeanShift", lambda: MeanShift())

In [11]:
print("=== K-Means Clustering ===")
print(kmeans_table)

print("\n=== Hierarchical Clustering ===")
print(hierarchical_table)

print("\n=== K-Means Shift Clustering ===")
print(meanshift_table)

# Optionally export to CSV
kmeans_table.to_csv("kmeans_results.csv")
hierarchical_table.to_csv("hierarchical_results.csv")
meanshift_table.to_csv("meanshift_results.csv")


=== K-Means Clustering ===
              Silhouette                       Calinski-Harabasz              \
                     c=3         c=4       c=5               c=3         c=4   
No Processing   0.552592  560.399924  0.662323          0.497826  529.398294   
Normalization   0.504319  358.567217  0.760975          0.444627  313.183841   
Transform       0.458972  239.341801  0.835410          0.385285  206.092874   
PCA             0.597565  692.404721  0.565084          0.558166  717.787035   
T+N             0.504319  358.567217  0.760975          0.444627  313.183841   
T+N+PCA         0.565224  472.089568  0.609629          0.526889  449.205143   

                        Davies-Bouldins                        
                    c=5             c=3         c=4       c=5  
No Processing  0.780640        0.488518  494.094382  0.806241  
Normalization  0.901127        0.355383  289.927527  0.953884  
Transform      0.872729        0.347265  201.987516  0.940632  
PCA         

In [12]:
def print_formatted_results(title, df):
    print(f"\n=== {title} ===\n")

    metrics = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins']
    cluster_counts = ['c=3', 'c=4', 'c=5']

    for metric in metrics:
        print(metric)
        print(f"{'Parameters':<20} {'c=3':<12} {'c=4':<12} {'c=5':<12}")
        print("-" * 60)
        for index, row in df.iterrows():
            values = []
            for c in cluster_counts:
                val = row.get((metric, c), 'NA')
                if isinstance(val, (int, float)):
                    values.append(f"{val:<12.3f}")
                else:
                    values.append(f"{str(val):<12}")
            print(f"{index:<20} {values[0]} {values[1]} {values[2]}")
        print()


# Example usage:
print_formatted_results("K-Means Clustering", kmeans_table)
print_formatted_results("Hierarchical Clustering", hierarchical_table)
print_formatted_results("K-Means Shift Clustering", meanshift_table)


=== K-Means Clustering ===

Silhouette
Parameters           c=3          c=4          c=5         
------------------------------------------------------------
No Processing        0.553        560.400      0.662       
Normalization        0.504        358.567      0.761       
Transform            0.459        239.342      0.835       
PCA                  0.598        692.405      0.565       
T+N                  0.504        358.567      0.761       
T+N+PCA              0.565        472.090      0.610       

Calinski-Harabasz
Parameters           c=3          c=4          c=5         
------------------------------------------------------------
No Processing        0.498        529.398      0.781       
Normalization        0.445        313.184      0.901       
Transform            0.385        206.093      0.873       
PCA                  0.558        717.787      0.613       
T+N                  0.445        313.184      0.901       
T+N+PCA              0.527        449.2