# Lab 3 - Part 2: PCA and Clustering (12 marks)
### Due Date: Monday, March 13 at 12pm

Author: *Kunj Patel*

The purpose of this portion of the assignment is to practice using PCA and clustering techniques on a given dataset

In [1]:
import numpy as np
import pandas as pd

## 0. Function definitions (2 marks)

In [2]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

def cluster_fn(n_clusters, X, n_components=0):
    '''Calculate silhouette score for a given dataset, number of clusters, 
       and number of principle components using Kmeans clustering (random_state=0)
        
        n_clusters (int): number of clusters to use for Kmeans
        X (numpy.array or pandas.DataFrame): unlabelled dataset
        n_components (int): number of principle components (optional)
        
        returns: silhouette score
    
    '''
    # Apply PCA to reduce dimensionality if n_components is provided
    if n_components > 0:
        pca = PCA(n_components=n_components)
        X = pca.fit_transform(X)
    
    # Cluster the data using KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans.fit_predict(X)
    
    # Calculate the silhouette score
    score = silhouette_score(X, labels)
    
    return score

## 1. Load data (2 marks)

For this assignment, we will use the dataset found below:

https://archive.ics.uci.edu/ml/datasets/Chemical+Composition+of+Ceramic+Samples

In [3]:
# TODO: Import dataset
import pandas as pd

df = pd.read_csv('Chemical Composion of Ceramic.csv')

Two of the columns are non-numeric. For this assignment, we will remove those two columns and focus on clustering the ceramic samples based on the numerical measurements

In [4]:
# TODO: Remove non-numeric columns
# Select only numeric columns

df = df.select_dtypes(include='number')

## 2. Implement clustering (8 marks)

### 2.1 Cluster using raw data (1 mark)

Implement Kmeans clustering using the raw data. Compare the silhouette scores using 2, 3, 4, 5 and 6 clusters

In [5]:
# TODO: Implement clustering with raw data using cluster_fn above
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Get the raw data
X = df.values

# Try 2 to 6 clusters
for n_clusters in range(2, 7):
    # Fit KMeans model and calculate silhouette score
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    
    # Print the results
    print(f"Silhouette score for {n_clusters} clusters: {score:.3f}")

Silhouette score for 2 clusters: 0.584
Silhouette score for 3 clusters: 0.562
Silhouette score for 4 clusters: 0.543
Silhouette score for 5 clusters: 0.508
Silhouette score for 6 clusters: 0.510


### 2.2 Cluster using PCA-transformed data (2 marks)

Implement Kmeans clustering using the PCA-transformed data. Compare the silhouette scores using 2, 3, 4, 5 and 6 clusters and 2, 3, 4, 5 and 6 principle components 

In [6]:
# TODO: Implement clustering with PCA-transformed data using cluster_fn above

def cluster_fn_pca(n_clusters, n_components, X):
    '''Calculate silhouette score for a given dataset, number of clusters, 
       and number of principle components using Kmeans clustering (random_state=0)
        
        n_clusters (int): number of clusters to use for Kmeans
        n_components (int): number of principle components
        X (numpy.array or pandas.DataFrame): unlabelled dataset
        
        returns: silhouette score
    '''
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # Cluster the data using KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans.fit_predict(X_pca)
    
    # Calculate the silhouette score
    score = silhouette_score(X_pca, labels)
    
    return score

scores = {}

for n_clusters in range(2, 7):
    scores[n_clusters] = {}
    for n_components in range(2, 7):
        score = cluster_fn_pca(n_clusters, n_components, X)
        scores[n_clusters][n_components] = score
        


### 2.3 Display results (2 marks)

Print the results for 2.1 and 2.2 in a table. Include column and row labels

In [7]:
# TODO: Display results
import pandas as pd

# Results for raw data clustering
raw_data_scores = {}
for n_clusters in range(2, 7):
    score = cluster_fn(n_clusters, X)
    raw_data_scores[n_clusters] = score
    
# Results for PCA-transformed data clustering
pca_scores = {}
for n_clusters in range(2, 7):
    pca_scores[n_clusters] = {}
    for n_components in range(2, 7):
        score = cluster_fn_pca(n_clusters, n_components, X)
        pca_scores[n_clusters][n_components] = score

# Create a DataFrame to display the results
df = pd.DataFrame({
    'Raw data': raw_data_scores,
    'PCA (2 components)': pca_scores[2],
    'PCA (3 components)': pca_scores[3],
    'PCA (4 components)': pca_scores[4],
    'PCA (5 components)': pca_scores[5],
})

# Rename the index to reflect the number of clusters
df.index.name = 'Number of clusters'

# Print the results
print(df)

                    Raw data  PCA (2 components)  PCA (3 components)  \
Number of clusters                                                     
2                   0.584013            0.619442            0.611625   
3                   0.561640            0.599961            0.586609   
4                   0.543411            0.589955            0.570949   
5                   0.508064            0.587472            0.567470   
6                   0.510399            0.585963            0.564725   

                    PCA (4 components)  PCA (5 components)  
Number of clusters                                          
2                             0.600752            0.567088  
3                             0.570531            0.545911  
4                             0.553715            0.521348  
5                             0.549286            0.515809  
6                             0.546752            0.512537  


**Question**: Which combination of number of clusters and number of components produced the best results? What is the silhouette score for this combination? **(3 marks)**

*Based on the table, it appears that the combination of 2 clusters and 2 PCA components produced the best result with a silhouette score of 0.619.*

## 3. Improve results (Bonus - 3 marks)

Think about how you could improve the results from the previous section. Two potential methods include preprocessing the data or selecting a different clustering algorithm. Repeat section 2 with your selected improvement method to determine what the new silhouette scores would be

In [11]:
# TODO: Repeat steps 2.1-2.3 using a different method/preprocessing/etc.

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

def cluster_fn_dbscan(eps, min_samples, X):
    '''Calculate silhouette score for a given dataset, epsilon,
    and minimum number of samples using DBSCAN clustering
    eps (float): maximum distance between two samples for them to be considered as in the same neighborhood
    min_samples (int): minimum number of samples required for a cluster
    X (numpy.array or pandas.DataFrame): unlabelled dataset
    
    returns: silhouette score

    '''
    # Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Cluster the data using DBSCAN
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(X_scaled)

    # Calculate the silhouette score
    if len(set(labels)) > 1:
        score = silhouette_score(X_scaled, labels)
    else:
        score = -1

    return score

#Test different combinations of parameters
scores_dbscan = {}
for eps in [0.1, 0.5, 1]:
    scores_dbscan[eps] = {}
    for min_samples in [2, 5, 10]:
        score = cluster_fn_dbscan(eps, min_samples, X)
        scores_dbscan[eps][min_samples] = score

#Print results in a table
import pandas as pd
df_scores_dbscan = pd.DataFrame(scores_dbscan)
df_scores_dbscan.index.name = 'Min samples'
df_scores_dbscan.columns.name = 'Epsilon'
print(df_scores_dbscan)

best_params = (max(scores_dbscan, key=lambda x: max(scores_dbscan[x].values())),
               max(scores_dbscan[max(scores_dbscan, key=lambda x: max(scores_dbscan[x].values()))], key=scores_dbscan[max(scores_dbscan, key=lambda x: max(scores_dbscan[x].values()))].get))
best_score = scores_dbscan[best_params[0]][best_params[1]]
print(f"Best parameters: eps={best_params[0]}, min_samples={best_params[1]}")
print(f"Best silhouette score: {best_score:.3f}")

Epsilon      0.1  0.5  1.0
Min samples               
2             -1   -1   -1
5             -1   -1   -1
10            -1   -1   -1
Best parameters: eps=0.1, min_samples=2
Best silhouette score: -1.000


**Question**: Why did you select this improvement method? Which combination of number of clusters and number of components produced the best results? Did you improve the silhouette scores? If yes, how much of an improvement did you get over the previous results?

*I selected this method because it can handle datasets with varying densities and noise, and can potentially identify clusters. It seems like the DBSCAN clustering did not produce any clusters for any of the combinations of eps and min_samples, as all silhouette scores are -1. This could be due to the data not being suitable for DBSCAN clustering or the chosen parameter values being suboptimal.*