In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_digits
from ucimlrepo import fetch_ucirepo


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_digits
from ucimlrepo import fetch_ucirepo

# Load Wine Dataset from UCI ML Repository
def load_wine_dataset():
    """Fetch the Wine dataset (UCI ML Repository id 109) and return its features.

    Returns:
        The ``data.features`` attribute of the object returned by
        ``fetch_ucirepo(id=109)``; targets and metadata are discarded.
    """
    repo_entry = fetch_ucirepo(id=109)
    return repo_entry.data.features

# Load Mall Customers Dataset from local file
def load_mall_customers_dataset(path="mall_customers.csv"):
    """Read the Mall Customers CSV and prepare it for clustering.

    Args:
        path: Location of the CSV file. Defaults to ``"mall_customers.csv"``
            (resolved against the current working directory), which is the
            file name that used to be hard-coded here.

    Returns:
        pandas.DataFrame: the CSV contents without the ``CustomerID`` column
        and, when a ``Gender`` column is present, with that column replaced
        by integer codes. Codes follow the sorted order of the distinct
        string values (e.g. ``Female`` -> 0, ``Male`` -> 1).
    """
    customers = pd.read_csv(path)

    # The ID only identifies rows and carries no clustering signal. Dropping
    # is tolerant of files that no longer contain the column, mirroring the
    # conditional handling of 'Gender' below.
    customers = customers.drop(columns=["CustomerID"], errors="ignore")

    if "Gender" in customers.columns:
        # sort=True hands out codes in sorted category order, i.e. the same
        # mapping LabelEncoder produced, without using a target encoder on
        # a feature column.
        gender_codes, _ = pd.factorize(customers["Gender"].astype(str), sort=True)
        customers["Gender"] = gender_codes

    return customers

# Load Digits Dataset
def load_digits_dataset():
    """Return the digits feature matrix wrapped in a DataFrame.

    Only the ``data`` attribute of the ``load_digits()`` result is kept, and
    no column names are passed to the DataFrame constructor.
    """
    return pd.DataFrame(load_digits().data)


In [None]:
# Materialise the three raw feature tables (wine, mall customers, digits)
wine_df, mall_df, digits_df = (
    load_wine_dataset(),
    load_mall_customers_dataset(),
    load_digits_dataset(),
)

In [None]:
def preprocess_data(df):
    """Standardise ``df`` with a freshly fitted ``StandardScaler``.

    Args:
        df: Table of features to scale.

    Returns:
        The result of ``StandardScaler().fit_transform(df)``.
    """
    return StandardScaler().fit_transform(df)

# Standardise each raw table so every feature contributes on the same scale
wine_scaled, mall_scaled, digits_scaled = (
    preprocess_data(frame) for frame in (wine_df, mall_df, digits_df)
)

In [None]:
# Function to apply K-Means and PCA
def apply_kmeans_pca(data, dataset_name, optimal_k=3, max_k=10, random_state=42):
    """Compare K-Means on the full feature space with K-Means on a 2-D PCA projection.

    The function
      1. draws an elbow plot (inertia for k = 1..max_k) on ``data``,
      2. clusters ``data`` with ``optimal_k`` clusters,
      3. projects ``data`` onto its first two principal components and prints
         the explained variance ratio,
      4. clusters the projection with the same ``optimal_k``,
      5. shows both labelings side by side in the PCA plane, and
      6. prints the silhouette score of each labeling and which one is higher.

    Args:
        data: Feature matrix to cluster (rows are samples).
        dataset_name: Label used in plot titles and printed messages.
        optimal_k: Number of clusters for the before/after comparison.
            Defaults to 3; pick it per dataset from the elbow plot.
        max_k: Largest k shown in the elbow plot (capped at the number of
            samples). Defaults to 10.
        random_state: Seed forwarded to every ``KMeans`` instance.

    Raises:
        ValueError: If ``optimal_k`` is not between 2 and ``n_samples - 1``,
            the range in which a silhouette score is defined.
    """
    n_samples = len(data)
    if not 2 <= optimal_k <= n_samples - 1:
        raise ValueError(
            f"optimal_k must be between 2 and n_samples - 1 ({n_samples - 1}), got {optimal_k}"
        )

    def make_kmeans(k):
        # n_init is set explicitly because its default differs between
        # scikit-learn releases; pinning it keeps results comparable and
        # avoids the related FutureWarning.
        return KMeans(n_clusters=k, random_state=random_state, n_init=10)

    # Elbow method on the original feature space. KMeans cannot fit more
    # clusters than there are samples, so cap the range.
    k_values = list(range(1, min(max_k, n_samples) + 1))
    inertia = [make_kmeans(k).fit(data).inertia_ for k in k_values]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(k_values, inertia, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.set_title(f'Elbow Method for {dataset_name}')
    plt.show()

    # Cluster in the original feature space.
    clusters_before_pca = make_kmeans(optimal_k).fit_predict(data)

    # Project onto the first two principal components.
    pca = PCA(n_components=2)
    data_pca = pca.fit_transform(data)
    print(f"Explained Variance Ratio for {dataset_name}:", pca.explained_variance_ratio_)

    # Cluster again, this time on the 2-D projection.
    clusters_after_pca = make_kmeans(optimal_k).fit_predict(data_pca)

    # Both panels share the PCA coordinates; only the colouring differs.
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))
    sns.scatterplot(x=data_pca[:, 0], y=data_pca[:, 1], hue=clusters_before_pca,
                    palette='viridis', ax=ax_before)
    ax_before.set_title(f"Clusters Before PCA - {dataset_name}")
    sns.scatterplot(x=data_pca[:, 0], y=data_pca[:, 1], hue=clusters_after_pca,
                    palette='viridis', ax=ax_after)
    ax_after.set_title(f"Clusters After PCA - {dataset_name}")
    plt.show()

    # Each labeling is scored in the space it was computed in.
    sil_before = silhouette_score(data, clusters_before_pca)
    sil_after = silhouette_score(data_pca, clusters_after_pca)
    print(f"Silhouette Score Before PCA for {dataset_name}: {sil_before}")
    print(f"Silhouette Score After PCA for {dataset_name}: {sil_after}")
    verdict = "improved" if sil_after > sil_before else "not improved"
    print(f"PCA has {verdict} clustering quality for {dataset_name} based on Silhouette Score.\n")

# Run the K-Means / PCA comparison once per scaled dataset
for scaled_data, label in (
    (wine_scaled, "Wine Dataset"),
    (mall_scaled, "Mall Customers Dataset"),
    (digits_scaled, "Digits Dataset"),
):
    apply_kmeans_pca(scaled_data, label)
