In [1]:
# Imports

import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import PCA
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
import pandas as pd

In [3]:
# Load Data

# Fetch MNIST as (images, labels) pairs for the train and test splits
(X_train, y_train), (X_test, y_test) = mnist.load_data()
n_train, n_test = X_train.shape[0], X_test.shape[0]

# Rescale pixel values with x / 127.5 - 1 so that images lie in the range [-1, 1]
X_train, X_test = X_train / 127.5 - 1, X_test / 127.5 - 1

# Flatten every image into one feature vector (28 x 28 -> 784 values per sample)
features_count = np.prod(X_train.shape[1:])
X_train_flatened = X_train.reshape((n_train, features_count))
X_test_flatened = X_test.reshape((n_test, features_count))


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


#### --- Task 1 --- ####

In [None]:
# PCA and Centroids
# A class centroid is the mean of the 2-D PCA coordinates of every training image of that digit.
# It is the single point that best represents the centre of that digit's cluster.

# Reduce the dimensionality of the data to 2 dimensions.
# random_state keeps the projection reproducible if scikit-learn's 'auto' policy
# selects the randomized SVD solver.
pca = PCA(n_components=2, random_state=0)
X_train_pca = pca.fit_transform(X_train_flatened)

# Create a DataFrame with the PCA data and digit labels
df_pca = pd.DataFrame(X_train_pca, columns=['PC1', 'PC2'])
df_pca['digit'] = y_train

# Compute centroids for each class by taking the mean of PC1 and PC2
centroids = df_pca.groupby('digit')[['PC1', 'PC2']].mean()

# Colour mapping: one fixed qualitative colour per digit, shared by points and centroids
color_sequence = px.colors.qualitative.Plotly
unique_digits = sorted(df_pca['digit'].unique())
color_map = {digit: color_sequence[i % len(color_sequence)] for i, digit in enumerate(unique_digits)}

# Plotly Express draws numeric colour values on a continuous colour scale, which would
# not match the per-class centroid colours. Plot string labels instead so every digit
# becomes a discrete category that uses exactly the colour stored in color_map.
digit_order = [str(digit) for digit in unique_digits]
df_points = df_pca.assign(Digit=df_pca['digit'].astype(str))

# Create a scatter plot of the PCA data, colored by digit
fig = px.scatter(
    df_points,
    x='PC1',
    y='PC2',
    color='Digit',
    color_discrete_map={str(digit): colour for digit, colour in color_map.items()},
    category_orders={'Digit': digit_order},
    title='PCA plot of the MNIST Dataset',
    width=1000,
    height=600,
)
fig.update_layout(xaxis_title='Principal Component 1', yaxis_title='Principal Component 2')

# Add centroids as larger markers, each colored according to its class.
# The black outline keeps a centroid visible on top of its own same-coloured cluster.
for digit, row in centroids.iterrows():
    fig.add_trace(
        go.Scatter(
            x=[row['PC1']],
            y=[row['PC2']],
            mode='markers',
            marker=dict(color=color_map[digit], size=15, symbol='diamond', line=dict(color='black', width=2)),
            name=f'Centroid {digit}'
        )
    )

fig.show()

In [None]:
# Scree Plot - Shows the percentage of variance explained by each principal component,
# with the cumulative percentage overlaid as a line

# Fit the leading 50 components. random_state keeps the fit reproducible if
# scikit-learn's 'auto' policy selects the randomized SVD solver.
pca_full = PCA(n_components=50, random_state=0)
pca_full.fit(X_train_flatened)
variance_ratios = pca_full.explained_variance_ratio_
components = np.arange(1, len(variance_ratios) + 1)

# Express the ratios as percentages and accumulate them component by component
df = pd.DataFrame({'Principal Component': components, 'Explained Variance': variance_ratios * 100})
df['Cumulative Variance'] = df['Explained Variance'].cumsum()

# Bars: variance explained per component; red line: running total
fig = px.bar(df, x='Principal Component', y='Explained Variance', title='Scree Plot & Cumulative Variance', labels={'Explained Variance': 'Percentage of Variance Explained'}, width=1000, height=500)
fig.add_scatter(x=df['Principal Component'], y=df['Cumulative Variance'], mode='lines+markers', name='Cumulative Variance', line=dict(color='red'))
fig.show()

In [6]:
# Questions / Notes

# Why is PCA a good option to visualise data?
# PCA is a good option to visualise data because it projects high-dimensional data onto a small number of axes (here 2) that retain as much of the variance as possible, which makes the data easy to plot and understand.

# Observations
# Clustering of Classes - Projecting from the 784D space down to 2D lets us visually see the differences between the classes, which is not possible in the original 784D space.
# Separation of Certain Classes - Some digits form more isolated clusters, e.g. digit 1 forms a tight cluster because its shape is relatively simple and unique compared to other digits.
# Overlap Among Other Classes - Digit classes, such as 3, 5, and 8, have clusters that overlap considerably. This suggests that their differences may not be well captured by a linear projection onto the first two principal components.

# Q - Which classes can be linearly separated?
# A - 1 and 0 can be linearly separated, whereas 3, 5 and 8 cannot be linearly separated in this 2D projection.

# PCA Notes
# When you have high dimensional data, there are many directions in which the data can vary. The first principal component is the direction along which the data varies the most.
# By projecting your data onto the space defined by the top few principal components (often just two for visualisation), you reduce the dimensionality while retaining as much of the information (variance) in the original data as that number of components allows.

#### --- Task 2 --- ####

In [12]:
# Main Functions


def prepare_data(digit_1, digit_2, data=None):
    """Build a two-class dataset containing only ``digit_1`` and ``digit_2``.

    Images are flattened to one feature vector per sample and labels are
    re-encoded for the perceptron: ``digit_1`` becomes -1.0 and ``digit_2``
    becomes +1.0.

    Parameters
    ----------
    digit_1, digit_2 : int
        The two class labels to keep. They must differ.
    data : tuple, optional
        ``((train_images, train_labels), (test_images, test_labels))``.
        When omitted, the notebook-level ``X_train``, ``y_train``, ``X_test``
        and ``y_test`` arrays are used.

    Returns
    -------
    tuple
        ``(binary_x_train, binary_y_train, binary_x_test, binary_y_test)``.

    Raises
    ------
    ValueError
        If ``digit_1 == digit_2``; that would yield a single-class dataset.
    """
    if digit_1 == digit_2:
        raise ValueError(f'digit_1 and digit_2 must differ, got {digit_1} for both')

    if data is None:
        # Fall back to the arrays loaded earlier in the notebook
        data = ((X_train, y_train), (X_test, y_test))
    (train_images, train_labels), (test_images, test_labels) = data

    def _select_and_encode(images, labels):
        """Keep the two requested digits, flatten the images, encode labels as -1/+1."""
        images = np.asarray(images)
        labels = np.asarray(labels)

        # Convert each image into a single feature vector (e.g. 28x28 -> 784)
        feature_count = int(np.prod(images.shape[1:]))
        flattened = images.reshape(images.shape[0], feature_count)

        # Boolean mask of the samples belonging to either requested class
        keep = (labels == digit_1) | (labels == digit_2)

        # digit_1 -> -1.0, digit_2 -> +1.0 (float labels, as the perceptron update expects)
        encoded = np.where(labels[keep] == digit_1, -1.0, 1.0)
        return flattened[keep], encoded

    binary_x_train, binary_y_train = _select_and_encode(train_images, train_labels)
    binary_x_test, binary_y_test = _select_and_encode(test_images, test_labels)

    return binary_x_train, binary_y_train, binary_x_test, binary_y_test



def predict(x, w, b):
    """Classify samples with the linear decision rule sign(x . w + b).

    Parameters
    ----------
    x : array-like
        Sample(s) to classify; combined with ``w`` via ``np.dot``.
    w : array-like
        Weight vector.
    b : float
        Bias term.

    Returns
    -------
    numpy.ndarray
        +1 where the score is greater than or equal to zero, -1 elsewhere.
    """
    # Signed score of each sample relative to the decision boundary
    scores = np.dot(x, w) + b

    # Scores on or above the boundary map to +1, everything else to -1
    return np.where(scores >= 0, 1, -1)



def run_epoch_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test, num_epochs=100, learning_rate=0.01):
    """Train a perceptron with per-sample updates and report its accuracy.

    The model is trained for ``num_epochs`` full passes over the training set,
    a training-accuracy-per-epoch line chart is shown, and the final training
    and test accuracies are printed.

    Parameters
    ----------
    binary_x_train, binary_x_test : numpy.ndarray
        2-D arrays of samples (one row per sample).
    binary_y_train, binary_y_test : numpy.ndarray
        Labels compared against the -1/+1 output of ``predict``.
    num_epochs : int
        Number of passes over the training data.
    learning_rate : float
        Scale applied to every weight/bias correction.

    Returns
    -------
    float
        Accuracy on the test set.
    """

    def fit_online(samples, labels, epoch_count, step_size):
        """Run the perceptron rule sample by sample; return the learned (weights, bias)."""
        sample_count, feature_count = samples.shape

        # Start from an all-zero model
        weights = np.zeros(feature_count)
        bias = 0.0

        # Training accuracy measured after each completed pass
        accuracy_history = []

        # Online (sample-by-sample) perceptron updates, repeated for every epoch
        for _ in range(epoch_count):
            for row in range(sample_count):
                sample, label = samples[row], labels[row]

                # A non-positive margin means the sample is misclassified (or on the boundary)
                if label * (np.dot(sample, weights) + bias) <= 0:
                    correction = step_size * label
                    weights += correction * sample
                    bias += correction

            # Record how well the current model fits the training data
            accuracy_history.append(np.mean(predict(samples, weights, bias) == labels))

        # Plot accuracy against the 1-based epoch number
        epoch_numbers = list(range(1, epoch_count + 1))
        fig = px.line(x=epoch_numbers, y=accuracy_history, title='Training Accuracy vs Epochs', labels={'x': 'Epoch', 'y': 'Accuracy'}, width=1000, height=500)
        fig.show()

        return weights, bias

    # Fit on the binary training data
    weights, bias = fit_online(binary_x_train, binary_y_train, num_epochs, learning_rate)

    # Accuracy on the data the model was trained on
    train_accuracy = np.mean(predict(binary_x_train, weights, bias) == binary_y_train)
    print('Final Training Accuracy:', train_accuracy)

    # Accuracy on the held-out test data
    test_accuracy = np.mean(predict(binary_x_test, weights, bias) == binary_y_test)
    print('Test Accuracy:', test_accuracy)

    return test_accuracy



def run_optimisation_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test, max_iters=1000, learning_rate=0.01, tolerance=1e-3, seed=None):
    """Train a perceptron with batch updates and report its accuracy.

    Every iteration classifies the whole training set, then applies the
    perceptron correction for all misclassified samples at once. Training
    stops when the misclassification rate is at or below ``tolerance``, when
    nothing is misclassified, or after ``max_iters`` iterations. The final
    training and test accuracies are printed, and the error curve and the
    learned weights (as an image) are plotted.

    Parameters
    ----------
    binary_x_train, binary_x_test : numpy.ndarray
        2-D arrays of samples (one row per sample).
    binary_y_train, binary_y_test : numpy.ndarray
        Labels compared against the -1/+1 output of ``predict``.
    max_iters : int
        Maximum number of batch iterations.
    learning_rate : float
        Scale applied to every weight/bias correction.
    tolerance : float
        Misclassification rate at or below which training stops.
    seed : int, optional
        Seed for the random initial weights and bias. ``None`` (the default)
        draws a fresh, non-reproducible initialisation on every call.

    Returns
    -------
    float
        Accuracy on the test set.
    """

    def optimise_perceptron(x, y, max_iters, learning_rate, tolerance):
        """Return (w, b, error_list), where error_list holds the error measured each iteration."""
        n, m = x.shape

        # Random initial model; seeded only when the caller asks for reproducibility
        rng = np.random.default_rng(seed)
        w = rng.random(m)
        b = rng.random()
        error_list = []

        # At most max_iters passes (a bounded for-loop avoids an off-by-one extra pass)
        for _ in range(max_iters):

            # Boolean mask of the samples the current model gets wrong
            misclassified = predict(x, w, b) != y

            # Compute current error (fraction of misclassified samples)
            error = np.count_nonzero(misclassified) / n
            error_list.append(error)

            # Stop BEFORE updating so the returned w, b are the ones that achieved this error
            if error <= tolerance or not misclassified.any():
                break

            # Batch perceptron update: sum of y_i * x_i (and of y_i) over misclassified samples
            w += learning_rate * (y[misclassified] @ x[misclassified])
            b += learning_rate * y[misclassified].sum()

        return w, b, error_list

    # Optimise on the training set
    w_opt, b_opt, error_list = optimise_perceptron(binary_x_train, binary_y_train, max_iters, learning_rate, tolerance)

    # Evaluate on training
    train_pred = predict(binary_x_train, w_opt, b_opt)
    train_accuracy = np.mean(train_pred == binary_y_train)
    print('Final Training Accuracy:', train_accuracy)

    # Evaluate on test
    test_pred = predict(binary_x_test, w_opt, b_opt)
    test_accuracy = np.mean(test_pred == binary_y_test)
    print('Test Accuracy:', test_accuracy)

    # Error Curve
    df_error = pd.DataFrame({'Iteration': list(range(1, len(error_list) + 1)), 'Misclassification Error': error_list})
    fig_error = px.line(df_error, x='Iteration', y='Misclassification Error', title='Perceptron Training Error', markers=True, width=1000, height=500)
    fig_error.show()

    # Visualise the learned weights as an image (requires exactly 28 * 28 = 784 features)
    w_image = w_opt.reshape(28, 28)
    fig_weights = px.imshow(w_image, color_continuous_scale='RdBu', title='Learned Weight Image', width=1000, height=500)
    fig_weights.show()

    return test_accuracy



In [None]:
# Run

# Digit pairs to compare; only the (digit_1, digit_2) values are used below
digits = {'sample_1': (1, 0), 'sample_2': (8, 3), 'sample_3': (5, 3), 'sample_4': (8, 7), 'sample_5': (2, 9)}
results = {}

for run, (digit_1, digit_2) in enumerate(digits.values(), start=1):
    print(f'\n\nRun: {run}: -- Training for digits {digit_1} and {digit_2} --\n\n')
    print(' -- Epoch Perceptron Training --\n')
    binary_x_train, binary_y_train, binary_x_test, binary_y_test = prepare_data(digit_1, digit_2)
    epoch_test_accuracy = run_epoch_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test)
    print('\n -- Optimisation Perceptron Training --\n')
    optimisation_test_accuracy = run_optimisation_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test)
    results[f'run_{run}'] = {
        'digit_1': digit_1,
        'digit_2': digit_2,
        'epoch_test_accuracy': round(epoch_test_accuracy, 2),
        'optimisation_test_accuracy': round(optimisation_test_accuracy, 2),
    }

# One row per run: each column then holds a single kind of value, so the digits stay
# integers instead of being upcast to floats alongside the accuracies.
df = pd.DataFrame.from_dict(results, orient='index')
display(df)