# Data Mining and Machine Learning
##  Task 4 - Clustering

In [None]:
# Hyper-parameters (edit these values, then re-run the notebook)

# Number of principal components kept by PCA. The plotting cells pick a
# 2D or 3D scatter plot from this value, so set it to 2 or 3.
dims = 3

# Number of clusters requested from KMeans; the cluster plot is
# coloured by the resulting cluster assignment.
clusters = 8

# KMeans settings: the cap on iterations, and the random_state seed
# handed to KMeans so that repeated runs give the same result.
k_iter, k_rand_state = 300, 42

In [None]:
# Pre-processing reused from task 1 (as asked): load the train/test
# arrays from their .npy files and scale them with StandardScaler.

import numpy as np
from sklearn.preprocessing import StandardScaler

train_data, train_labels = np.load('x_train.npy'), np.load('y_train.npy')
test_data, test_labels = np.load('x_test.npy'), np.load('y_test.npy')

# The scaler is fitted on the training split only; the same fitted
# scaler is then applied to both the training and the test split.
scaler = StandardScaler().fit(train_data)
train_data_scaled = scaler.transform(train_data)
test_data_scaled = scaler.transform(test_data)

In [None]:
# Dimensionality reduction: build a PCA model that keeps `dims`
# components, fit it on the scaled training data and keep the
# projected (reduced) training data for plotting and clustering.

from sklearn.decomposition import PCA

pca = PCA(n_components=dims)
train_data_pca = pca.fit_transform(train_data_scaled)

In [None]:
# Scatter plot of the PCA-reduced training data, coloured by class label.
# The layout follows `dims`:
#   3 or more -> 3D plot of the first three components
#   2         -> 2D plot of the two components
#   1         -> the single component drawn along x, with every point at y = 0
# Any other value only prints a message.

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Marker styling shared by every variant of the plot.
class_style = dict(c=train_labels, cmap='viridis', edgecolor='k', s=50)

if dims >= 3:
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    scatter = ax.scatter(
        train_data_pca[:, 0],
        train_data_pca[:, 1],
        train_data_pca[:, 2],
        **class_style,
    )
    ax.set(
        title='PCA-reduced Data (3D)',
        xlabel='Principal Component 1',
        ylabel='Principal Component 2',
        zlabel='Principal Component 3',
    )
    fig.colorbar(scatter, ax=ax, label='Class')
    plt.show()
elif dims in (1, 2):
    first_pc = train_data_pca[:, 0]
    if dims == 2:
        y_values, y_label = train_data_pca[:, 1], 'Principal Component 2'
    else:
        # Only one component exists, so place every point on the line y = 0.
        y_values, y_label = np.zeros_like(first_pc), 'Class'

    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(first_pc, y_values, **class_style)
    ax.set(title='PCA-reduced Data', xlabel='Principal Component 1', ylabel=y_label)
    fig.colorbar(scatter, ax=ax, label='Class')
    plt.show()
else:
    print('Dimensionality is invalid.')

In [None]:
# Cluster the PCA-reduced training data with KMeans.

from sklearn.cluster import KMeans

# Every setting comes from the hyper-parameter cell: the number of
# clusters, the iteration cap and the random_state seed. If n_clusters
# were left out, KMeans would default to 8 clusters.
kmeans = KMeans(
    n_clusters=clusters,
    n_init='auto',
    max_iter=k_iter,
    random_state=k_rand_state,
)

# Fit the model on the reduced data, then ask the fitted model for the
# cluster index of every training point.
cluster_labels = kmeans.fit(train_data_pca).predict(train_data_pca)

In [None]:
# Scatter plot of the PCA-reduced training data, coloured by the KMeans
# cluster assigned to each point. As in the class-coloured plot, the
# layout follows `dims`:
#   3 or more -> 3D plot of the first three components
#   2         -> 2D plot of the two components
#   1         -> the single component drawn along x, with every point at y = 0
# Any other value only prints a message.
#
# Every variant draws a 'Cluster' colorbar so the colours can be read
# back to cluster indices (the 3D variant previously had none).

# Marker styling shared by every variant of the plot.
cluster_style = dict(c=cluster_labels, cmap='viridis', edgecolor='k', s=50)

if dims >= 3:
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    # Keep the returned mappable: the colorbar needs it to know the colour scale.
    cluster_scatter = ax.scatter(
        train_data_pca[:, 0],
        train_data_pca[:, 1],
        train_data_pca[:, 2],
        **cluster_style,
    )
    ax.set_title('Clustering on PCA-reduced Data (3D)')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    fig.colorbar(cluster_scatter, ax=ax, label='Cluster')
    plt.show()
elif dims == 2:
    fig, ax = plt.subplots(figsize=(8, 6))
    cluster_scatter = ax.scatter(train_data_pca[:, 0], train_data_pca[:, 1], **cluster_style)
    ax.set_title('Clustering on PCA-reduced Data')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    fig.colorbar(cluster_scatter, ax=ax, label='Cluster')
    plt.show()
elif dims == 1:
    fig, ax = plt.subplots(figsize=(8, 6))
    # Only one component exists, so place every point on the line y = 0.
    cluster_scatter = ax.scatter(
        train_data_pca[:, 0],
        np.zeros_like(train_data_pca[:, 0]),
        **cluster_style,
    )
    ax.set_title('Clustering on PCA-reduced Data')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Cluster')
    fig.colorbar(cluster_scatter, ax=ax, label='Cluster')
    plt.show()
else:
    print('Dimensionality is invalid.')

# Markdown Question

How do you think the visualisation will change if you used 3 PCA components?

#### Answer

Using 3 PCA components instead of 2 introduces a third dimension to the reduced data and will change the visualisation in the following ways:

Because the reduced data now has an additional dimension, it has to be plotted in a 3-dimensional space rather than a 2-dimensional one in order to display the clustering properly. The extra axis represents the third principal component.

The visualisation may look similar when viewed from a comparable angle (for example, looking straight down the third axis), but the additional dimension also shows how each individual point and each cluster relates to the new component.

Additionally, using 3 PCA components may provide a richer representation of the data, allowing for a more detailed exploration of its structure and relationships.