# **Task 1**

In [3]:
# Imports

import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import PCA
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load MNIST, rescale it, and build flattened copies of the images

# Fetch the train / test splits and remember how many samples each holds
(X_train, y_train), (X_test, y_test) = mnist.load_data()
n_train, n_test = X_train.shape[0], X_test.shape[0]

# Map pixel intensities from [0, 255] to [-1, 1]
X_train, X_test = X_train / 127.5 - 1, X_test / 127.5 - 1

# Flatten every image into one feature vector (all non-batch axes multiplied together)
features_count = np.prod(X_train.shape[1:])
X_train_flatened = X_train.reshape((n_train, features_count))
X_test_flatened = X_test.reshape((n_test, features_count))

In [None]:
# PCA projection of the training set, plus one centroid per digit

# Project the flattened training images onto two principal components
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_flatened)

# Tabulate the projection next to the digit labels
df_pca = pd.DataFrame(X_train_pca, columns=['PC1', 'PC2'])
df_pca['digit'] = y_train

# A centroid is the mean (PC1, PC2) position of all samples of one digit
centroids = df_pca.groupby('digit')[['PC1', 'PC2']].mean()

# Figure 1: every projected sample, coloured by its digit label
pca_fig = px.scatter(
    X_train_pca,
    x=0,
    y=1,
    color=y_train,
    title='PCA plot of the MNIST Dataset',
    width=1000,
    height=600,
)
pca_fig.update_layout(xaxis_title='PC1', yaxis_title='PC2')

# Figure 2: the per-digit centroids, drawn with enlarged markers
centroids_fig = px.scatter(
    centroids,
    x='PC1',
    y='PC2',
    color=centroids.index,
    title='PCA plot of the MNIST Dataset',
    width=1000,
    height=600,
)
centroids_fig.update_traces(marker=dict(size=20))

# Render both figures
pca_fig.show()
centroids_fig.show()

# **Task 2**

In [5]:
# Prepare data for binary classification

def prepare_data(digit_1, digit_2):
    """
    Build a two-class (-1 / +1) dataset from the module-level MNIST arrays.

    Reads the notebook globals X_train, y_train, X_test and y_test. Sample
    counts are taken from the arrays themselves instead of the globals
    n_train / n_test, because later cells rebind those names.

    Parameters:
    digit_1: digit whose samples are labelled -1
    digit_2: digit whose samples are labelled +1

    Returns:
    tuple: (binary_x_train, binary_y_train, binary_x_test, binary_y_test)
        with images flattened to one row per sample and float labels.
    """

    def select_pair(images, labels):
        # Flatten each image into one row; -1 lets NumPy infer the feature count
        flattened = images.reshape(images.shape[0], -1)

        # Keep only the samples that belong to one of the two digits
        keep = (labels == digit_1) | (labels == digit_2)
        kept_labels = labels[keep]

        # digit_1 -> -1.0, everything else that was kept (digit_2) -> +1.0
        return flattened[keep, :], np.where(kept_labels == digit_1, -1.0, 1.0)

    binary_x_train, binary_y_train = select_pair(X_train, y_train)
    binary_x_test, binary_y_test = select_pair(X_test, y_test)

    return binary_x_train, binary_y_train, binary_x_test, binary_y_test

In [6]:
# Predict function for the perceptron

def predict(x, w, b):
    """
    Classify samples with a linear threshold unit.

    Computes np.dot(x, w) + b and returns +1 wherever that score is
    greater than or equal to zero, and -1 everywhere else.
    """
    scores = np.dot(x, w) + b
    on_positive_side = scores >= 0
    return np.where(on_positive_side, 1, -1)

In [7]:
# Run epoch perceptron

def run_epoch_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test, num_epochs=100, learning_rate=0.01):
    """
    Train a perceptron with per-sample updates for a fixed number of epochs.

    Plots the training accuracy after every epoch, prints the final training
    and test accuracy, and returns the test accuracy.
    """

    def train_perceptron(x_train, y_train, num_epochs, learning_rate):
        # Unpacking also checks that the training matrix is two-dimensional
        n_samples, n_features = x_train.shape

        # Start from an all-zero model
        weights = np.zeros(n_features)
        bias = 0.0

        # Accuracy on the training set, recorded once per epoch
        accuracy_per_epoch = []
        epoch_numbers = []

        for epoch_index in range(num_epochs):
            # One pass over the samples, updating as soon as a mistake is seen
            for sample, label in zip(x_train[:n_samples], y_train):
                margin = label * (np.dot(sample, weights) + bias)

                # A non-positive margin means the sample is on the wrong side (or on the boundary)
                if margin <= 0:
                    weights += learning_rate * label * sample
                    bias += learning_rate * label

            # Measure how well the current model fits the training set
            epoch_accuracy = np.mean(predict(x_train, weights, bias) == y_train)
            accuracy_per_epoch.append(epoch_accuracy)
            epoch_numbers.append(epoch_index + 1)

        # Accuracy-vs-epoch curve
        fig = px.line(
            x=epoch_numbers,
            y=accuracy_per_epoch,
            title='Training Accuracy vs Epochs',
            labels={'x': 'Epoch', 'y': 'Accuracy'},
            width=1000,
            height=500,
        )
        fig.show()

        return weights, bias

    def accuracy(features, labels, weights, bias):
        # Fraction of samples whose predicted label matches the true label
        return np.mean(predict(features, weights, bias) == labels)

    # Fit on the binary training data
    w, b = train_perceptron(binary_x_train, binary_y_train, num_epochs, learning_rate)

    # Report accuracy on the training data
    train_accuracy = accuracy(binary_x_train, binary_y_train, w, b)
    print('Final Training Accuracy:', train_accuracy)

    # Report accuracy on the held-out test data
    test_accuracy = accuracy(binary_x_test, binary_y_test, w, b)
    print('Test Accuracy:', test_accuracy)

    return test_accuracy


In [8]:
# Run optimisation perceptron

def run_optimisation_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test, max_iters=1000, learning_rate=0.01, tolerance=1e-3):
    """
    Train a perceptron with batch updates until the error tolerance is met.

    Each iteration predicts the whole training set, then applies the
    perceptron update for every sample that was misclassified by that
    prediction. Training stops after at most `max_iters` iterations, or as
    soon as the misclassification rate is no larger than `tolerance`.

    Prints training / test accuracy, shows the error curve and the learned
    weights reshaped to 28x28, and returns the test accuracy.
    """

    def optimise_perceptron(x, y, max_iters, learning_rate, tolerance):
        n_samples, n_features = x.shape

        # Random initial model drawn uniformly from [0, 1)
        rng = np.random.default_rng()
        w = rng.random(n_features)
        b = rng.random()

        # Misclassification rate measured at the start of every iteration
        error_list = []

        # `range(max_iters)` performs exactly max_iters passes
        for _ in range(max_iters):

            # Predict all samples with the current model
            predictions = predict(x, w, b)

            # Identify misclassified samples
            misclassified_indices = np.where(predictions != y)[0]

            # Compute current error (fraction of misclassified samples)
            error = len(misclassified_indices) / n_samples
            error_list.append(error)

            # Stop before updating, so the returned (w, b) are the ones that
            # produced the error that was just recorded
            if error <= tolerance:
                break

            # Update w, b for each misclassified sample
            for i in misclassified_indices:
                w += learning_rate * y[i] * x[i]
                b += learning_rate * y[i]

        return w, b, error_list

    # Optimise on the training set
    w_opt, b_opt, error_list = optimise_perceptron(binary_x_train, binary_y_train, max_iters, learning_rate, tolerance)

    # Evaluate on training
    train_pred = predict(binary_x_train, w_opt, b_opt)
    train_accuracy = np.mean(train_pred == binary_y_train)
    print('Final Training Accuracy:', train_accuracy)

    # Evaluate on test
    test_pred = predict(binary_x_test, w_opt, b_opt)
    test_accuracy = np.mean(test_pred == binary_y_test)
    print('Test Accuracy:', test_accuracy)

    # Error Curve
    df_error = pd.DataFrame({'Iteration': list(range(1, len(error_list) + 1)), 'Misclassification Error': error_list})
    fig_error = px.line(df_error, x='Iteration', y='Misclassification Error', title='Perceptron Training Error', markers=True, width=1000, height=500)
    fig_error.show()

    # Visualise the learned weights as an image (assumes 784 features, i.e. 28x28 inputs)
    w_image = w_opt.reshape(28, 28)
    fig_weights = px.imshow(w_image, color_continuous_scale='RdBu', title='Learned Weight Image', width=1000, height=500)
    fig_weights.show()

    return test_accuracy

In [None]:
# Train both perceptron variants on five digit pairs and tabulate the test accuracies

digits = {
    'sample_1': (1, 0),
    'sample_2': (8, 3),
    'sample_3': (4, 9),
    'sample_4': (8, 7),
    'sample_5': (2, 9),
}
results = {}

for run, (digit_1, digit_2) in enumerate(digits.values()):
    print(f'\n\nRun: {run + 1 }: -- Training for digits {digit_1} and {digit_2} --\n\n')

    # Same binary dataset is used for both training schemes
    print(' -- Epoch Perceptron Training --\n')
    binary_x_train, binary_y_train, binary_x_test, binary_y_test = prepare_data(digit_1, digit_2)
    epoch_test_accuracy = run_epoch_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test)

    print('\n -- Optimisation Perceptron Training --\n')
    optimisation_test_accuracy = run_optimisation_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test)

    # One column per run: the digit pair and both accuracies rounded to two decimals
    results[f'run_{run + 1}'] = {
        'digit_1': round(digit_1, 0),
        'digit_2': round(digit_2, 0),
        'epoch_test_accuracy': round(epoch_test_accuracy, 2),
        'optimisation_test_accuracy': round(optimisation_test_accuracy, 2),
    }

df = pd.DataFrame(results)
display(df)

# **Task 3**

In [10]:
# Reload MNIST for the MLP: pixels scaled to [0, 1], labels one-hot encoded

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Scale pixel intensities by the maximum 8-bit value
X_train, X_test = X_train / 255, X_test / 255

# Indexing a 10x10 identity matrix by label yields one one-hot row per sample
y_train, y_test = np.eye(10)[y_train], np.eye(10)[y_test]

In [11]:
# Plot Training and Testing Accuracy Curves

def plot(history):
    """
    Print the final-epoch accuracies and plot the accuracy curves.

    Reads the 'accuracy' and 'val_accuracy' series from `history.history`.
    """
    curves = history.history
    train_curve = curves['accuracy']
    test_curve = curves['val_accuracy']

    # Last entry of each series is the accuracy after the final epoch
    print(f"Training accuracy: {train_curve[-1] * 100:.2f}%")
    print(f"Test accuracy: {test_curve[-1] * 100:.2f}%")

    for series, legend_label in ((train_curve, 'Train Accuracy'), (test_curve, 'Test Accuracy')):
        plt.plot(series, label=legend_label)

    plt.title('Training vs. Testing Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [None]:
# Baseline MLP: flattened 28x28 input -> 1000 -> 1000 -> 10

model = Sequential()
model.add(Flatten(input_shape=(28, 28)))
model.add(Dense(1000, activation='relu'))
model.add(Dense(1000, activation='relu'))
model.add(Dense(10, activation='softmax'))

# Adam + categorical cross-entropy, tracking accuracy
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Fit for 10 epochs; the test split is passed as validation data
history = model.fit(X_train, y_train, batch_size=50, epochs=10, validation_data=(X_test, y_test))

plot(history)

In [None]:
# Deeper MLP example: five hidden layers of 500 units each

model = Sequential(
    [Flatten(input_shape=(28, 28))]
    + [Dense(500, activation='relu') for _ in range(5)]
    + [Dense(10, activation='softmax')]
)

# Same optimiser, loss and metric as the baseline model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Fit for 10 epochs; the test split is passed as validation data
history = model.fit(X_train, y_train, batch_size=50, epochs=10, validation_data=(X_test, y_test))

plot(history)

# **Task 4**

In [13]:
# Plot

def plot(history):
    """
    Print the final-epoch accuracies and plot the accuracy curves.

    Reads the 'accuracy' and 'val_accuracy' series from `history.history`.
    NOTE(review): this redefines the `plot` helper already defined in the
    Task 3 section with the same behaviour.
    """
    metrics = history.history

    # The last element of each series belongs to the final epoch
    final_train = metrics['accuracy'][-1] * 100
    final_test = metrics['val_accuracy'][-1] * 100
    print(f'Training accuracy: {final_train:.2f}%')
    print(f'Test accuracy: {final_test:.2f}%')

    legend_labels = {'accuracy': 'Train Accuracy', 'val_accuracy': 'Test Accuracy'}
    for key, legend_label in legend_labels.items():
        plt.plot(metrics[key], label=legend_label)

    plt.title('Training vs. Testing Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()


In [14]:
# Reload MNIST for the CNN: add a channel axis, scale to [0, 1], one-hot the labels

(x_train, y_train), (x_test, y_test) = mnist.load_data()
n_train, n_test = x_train.shape[0], x_test.shape[0]

# Shape (n, 28, 28) -> (n, 28, 28, 1), then divide by the maximum 8-bit value
x_train = x_train.reshape((n_train, 28, 28, 1)) / 255.0
x_test = x_test.reshape((n_test, 28, 28, 1)) / 255.0

# Ten-class one-hot targets
y_train, y_test = (
    tf.keras.utils.to_categorical(y_train, 10),
    tf.keras.utils.to_categorical(y_test, 10),
)

In [None]:
# Baseline CNN: three 4x4 convolutions (32, 64, 128 filters) followed by a softmax classifier

model = Sequential()
model.add(Conv2D(32, kernel_size=(4, 4), strides=(1, 1), activation='relu', input_shape=(28, 28, 1)))
model.add(Conv2D(64, kernel_size=(4, 4), strides=(2, 2), activation='relu'))
model.add(Conv2D(128, kernel_size=(4, 4), strides=(2, 2), activation='relu'))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))

# Adam + categorical cross-entropy, tracking accuracy
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs; the test split is passed as validation data
history = model.fit(
    x_train,
    y_train,
    batch_size=50,
    epochs=10,
    validation_data=(x_test, y_test),
)

plot(history)

# **NAJIB**

In [16]:
# Importing libraries

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import pandas as pd
import math
import seaborn as sns
import time
from tensorflow import keras
from tensorflow.keras import layers, models, Model

### TASK 1

In [None]:
# Define MNIST rainbow colors for consistency (index = digit)
DIGIT_COLORS = [
    'red',         # 0
    'orange',      # 1
    'yellow',      # 2
    'green',       # 3
    'cyan',        # 4
    'blue',        # 5
    'indigo',      # 6
    'violet',      # 7
    'magenta',     # 8
    'brown'        # 9
]

# Fixed seed so the sampled subset, and every figure / accuracy derived from it,
# is the same on each Restart & Run All
RANDOM_SEED = 42

# Load the MNIST Dataset
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Prepare data (using a smaller subset to reduce memory usage)
sample_size = 10000  # Use only 10,000 samples to reduce computation
subset_rng = np.random.default_rng(RANDOM_SEED)
indices = subset_rng.choice(len(x_train), sample_size, replace=False)
x_train_subset = x_train[indices]
y_train_subset = y_train[indices]

# Rescale pixels to [-1, 1] and flatten each image into one row
x_train_subset = x_train_subset / 127.5 - 1
n_train = x_train_subset.shape[0]
nb_features = np.prod(x_train_subset.shape[1:])
x_train_reshaped = x_train_subset.reshape((n_train, nb_features))

# Standardize each feature, then project onto two principal components.
# random_state pins the result if scikit-learn selects a randomized SVD solver.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_reshaped)
pca = PCA(n_components=2, random_state=RANDOM_SEED)
x_train_pca = pca.fit_transform(x_train_scaled)

# Step 1: Plot centroids
def plot_digit_centroids():
    """
    Plot the mean PCA position of every digit as a labelled marker.

    Reads the module-level x_train_pca, y_train_subset and DIGIT_COLORS,
    saves the figure to 'digit_centroids.png' and shows it.
    """
    plt.figure(figsize=(10, 8))

    for digit in range(10):
        digit_points = x_train_pca[y_train_subset == digit]

        # Digits without any sample in the subset have no centroid to draw
        if len(digit_points) == 0:
            continue

        centre_x, centre_y = np.mean(digit_points, axis=0)[:2]
        plt.scatter(centre_x, centre_y,
                    s=100, color=DIGIT_COLORS[digit], edgecolor='black', linewidth=1)

        # Write the digit on top of its marker
        plt.text(centre_x, centre_y, str(digit),
                 ha='center', va='center', color='black', fontweight='bold')

    plt.title("MNIST PCA - Digit Centroids")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.savefig('digit_centroids.png', dpi=300, bbox_inches='tight')
    plt.show()

# Step 2: Calculate regression for all digits
def calculate_all_regressions():
    """
    Fit a straight line (PC2 as a function of PC1) to each digit's PCA points.

    Reads the module-level x_train_pca and y_train_subset.

    Returns:
    tuple: (slopes, vectors)
        - slopes: dict digit -> (slope, intercept) of the fitted line
        - vectors: dict digit -> unit direction vector [1, slope] / norm
    Digits with no samples are absent from both dicts.
    """
    slopes, vectors = {}, {}

    for digit in range(10):
        digit_points = x_train_pca[y_train_subset == digit]
        if len(digit_points) == 0:
            continue

        # LinearRegression expects a 2-D feature matrix, hence the column reshape
        pc1_column = digit_points[:, 0].reshape(-1, 1)
        pc2_values = digit_points[:, 1]
        fitted = LinearRegression()
        fitted.fit(pc1_column, pc2_values)

        slope = fitted.coef_[0]
        slopes[digit] = (slope, fitted.intercept_)

        # Direction of the line, scaled to unit length
        direction = np.array([1, slope])
        vectors[digit] = direction / np.linalg.norm(direction)

    return slopes, vectors

# Step 3: Create angle matrix
def create_angle_matrix(vectors):
    """
    Build a 10x10 table of angles (degrees) between per-digit direction vectors.

    Parameters:
    vectors (dict): digit -> direction vector; the dot product of two entries
        is used directly as the cosine of the angle between them

    Returns:
    pandas.DataFrame: rows / columns labelled 'Digit 0' .. 'Digit 9', values
        folded into [0, 90] and rounded to one decimal. Cells involving a
        digit missing from `vectors` stay at 0.
    """

    def acute_angle_degrees(first, second):
        # Clamp the cosine so rounding noise cannot push acos out of its domain
        cos_sim = np.dot(first, second)
        angle = math.degrees(math.acos(min(max(cos_sim, -1.0), 1.0)))

        # Lines have no orientation: report the smaller of the two angles
        return 180 - angle if angle > 90 else angle

    angle_matrix = np.zeros((10, 10))
    available = [digit for digit in range(10) if digit in vectors]

    for row in available:
        for col in available:
            angle_matrix[row, col] = acute_angle_degrees(vectors[row], vectors[col])

    axis_labels = [f'Digit {i}' for i in range(10)]
    angle_df = pd.DataFrame(angle_matrix, index=axis_labels, columns=axis_labels).round(1)

    return angle_df

# Step 4: Plot angle heatmap
def plot_angle_heatmap(angle_df):
    """
    Draw the angle table as an annotated heatmap on a fixed 0-90 degree scale.

    Saves the figure to 'angle_heatmap.png' and shows it.
    """
    heatmap_options = dict(
        annot=True,
        cmap='YlOrRd',
        vmin=0,
        vmax=90,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .8, "label": "Angle (degrees)"},
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(angle_df, **heatmap_options)
    plt.title('Angle Between Digit Regression Lines (degrees)', fontsize=16)
    plt.tight_layout()
    plt.savefig('angle_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

# Step 5: Plot example pairs (only a few to avoid overload)
def plot_example_pairs(slopes, selected_pairs=[(0,1), (3,8), (4,9)]):
    """
    For each digit pair, plot both point clouds with their regression lines.

    Parameters:
    slopes (dict): digit -> (slope, intercept) of that digit's fitted line
    selected_pairs (list): (digit1, digit2) tuples to plot; pairs where either
        digit has no points are skipped

    Reads the module-level x_train_pca, y_train_subset and DIGIT_COLORS. Each
    figure is annotated with the acute angle between the two lines, saved as
    'comparison_<digit1>_<digit2>.png' and shown.
    """
    for digit1, digit2 in selected_pairs:
        pair = (digit1, digit2)

        # PCA points of each digit in the pair
        clouds = [x_train_pca[y_train_subset == digit] for digit in pair]
        if any(len(cloud) == 0 for cloud in clouds):
            continue

        # (slope, intercept) of each digit's regression line
        line_params = [slopes[digit] for digit in pair]

        plt.figure(figsize=(10, 8))

        # Both scatters first, then both lines
        for digit, cloud in zip(pair, clouds):
            plt.scatter(cloud[:, 0], cloud[:, 1], color=DIGIT_COLORS[digit],
                        alpha=0.3, s=10, label=f'Digit {digit}')

        for cloud, (slope, intercept) in zip(clouds, line_params):
            # Draw each line only across the PC1 range of its own points
            pc1_range = np.linspace(cloud[:, 0].min(), cloud[:, 0].max(), 100)
            plt.plot(pc1_range, slope * pc1_range + intercept, color='black', linewidth=2)

        # Unit direction vectors of the two lines
        unit_vectors = []
        for slope, _intercept in line_params:
            direction = np.array([1, slope])
            unit_vectors.append(direction / np.linalg.norm(direction))

        # Angle from the clamped cosine, folded into [0, 90] degrees
        cos_sim = np.dot(unit_vectors[0], unit_vectors[1])
        angle = math.degrees(math.acos(min(max(cos_sim, -1.0), 1.0)))
        if angle > 90:
            angle = 180 - angle

        plt.annotate(f'Angle between lines: {angle:.1f}°',
                     xy=(0.05, 0.95), xycoords='axes fraction',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="black", alpha=0.8))

        plt.title(f'Regression Comparison: Digit {digit1} vs Digit {digit2}')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.legend()
        plt.savefig(f'comparison_{digit1}_{digit2}.png', dpi=300, bbox_inches='tight')
        plt.show()

# Run all steps; a failure is reported instead of being silently discarded,
# so missing figures can be traced back to their cause
try:
    plot_digit_centroids()
    slopes, vectors = calculate_all_regressions()
    angle_df = create_angle_matrix(vectors)
    plot_angle_heatmap(angle_df)
    plot_example_pairs(slopes)
except Exception as e:
    # Best-effort: keep the notebook running, but say what went wrong
    print(f"Task 1 analysis failed: {type(e).__name__}: {e}")

### TASK 2

In [None]:
# Task 2.1: Implement the predict function for the perceptron
def predict(x, w, b):
    """
    Predict class labels for samples in x using the perceptron model.

    Parameters:
    x (numpy.ndarray): Input data with shape (n_samples, n_features), or a
        single sample with shape (n_features,)
    w (numpy.ndarray): Weight vector with shape (n_features,)
    b (float): Bias term

    Returns:
    numpy.ndarray: Predicted class labels (-1 or 1); a zero-dimensional array
        when x is a single sample
    """
    # Calculate the dot product of inputs and weights
    z = np.dot(x, w) + b

    # Apply the sign function to get predictions
    # (> 0 maps to 1, < 0 maps to -1, exactly 0 maps to 0)
    prediction = np.sign(z)

    # Map a zero score to +1. np.where is used instead of item assignment
    # because np.sign returns an immutable NumPy scalar for a single sample.
    return np.where(prediction == 0, 1, prediction)

# Task 2.2: Implement the optimize function for training the perceptron
def optimize(x, y, w=None, b=None, max_iter=1000, tol=1e-3, learning_rate=0.01):
    """
    Train a perceptron model using the perceptron learning algorithm.

    Every iteration predicts all samples with the current model and then
    applies the perceptron update for all samples misclassified by that
    prediction (a batch update). The caller's `w` array is never modified
    in place.

    Parameters:
    x (numpy.ndarray): Training data with shape (n_samples, n_features)
    y (numpy.ndarray): Target values with shape (n_samples,)
    w (numpy.ndarray, optional): Initial weight vector. If None, initialized randomly.
    b (float, optional): Initial bias term. If None, initialized randomly.
    max_iter (int): Maximum number of iterations
    tol (float): Training stops once the misclassification rate is <= tol
    learning_rate (float): Learning rate for weight updates (to control weight magnitude)

    Returns:
    tuple: (w, b, error_history)
        - w: Learned weight vector
        - b: Learned bias term
        - error_history: List of errors at each iteration
    """
    # Get the shape of the input data
    n, m = x.shape
    y = np.asarray(y)

    # Initialize weights and bias if not provided
    if w is None:
        w = np.random.randn(m) * 0.01  # Scale down initial weights
    if b is None:
        b = np.random.randn() * 0.01  # Scale down initial bias

    # Initialize variables
    iter_count = 0
    error_history = []
    error = float('inf')

    # Main training loop
    while iter_count < max_iter and error > tol:
        # Get current predictions
        y_pred = predict(x, w, b)

        # Calculate misclassification error
        misclassified = y_pred != y
        error = np.mean(misclassified)
        error_history.append(error)

        # If error is below tolerance, break the loop
        if error <= tol:
            break

        # Batch perceptron update: sum of y_i * x_i over the misclassified
        # samples, computed with one matrix product instead of a Python loop
        w = w + learning_rate * (y[misclassified] @ x[misclassified])
        b = b + learning_rate * np.sum(y[misclassified])

        # Progress report every 200 iterations (not on the very first one)
        if iter_count > 0 and iter_count % 200 == 0:
            print(f"Iteration {iter_count}, Error: {error:.4f}, Weight norm: {np.linalg.norm(w):.4f}")

        iter_count += 1

    print(f"Training completed after {iter_count} iterations")
    print(f"Final error: {error:.4f}")
    print(f"Weight norm: {np.linalg.norm(w):.4f}")

    return w, b, error_history

# Function to evaluate perceptron on test data
def evaluate_perceptron(x_train, y_train, x_test, y_test):
    """
    Fit a perceptron on the training split and score it on both splits.

    Parameters:
    x_train, y_train: Training data and labels
    x_test, y_test: Test data and labels

    Returns:
    dict: 'train_accuracy', 'test_accuracy', 'weights', 'bias' and
        'error_history' of the trained model
    """
    # Fit with optimize()'s default hyper-parameters
    w, b, error_history = optimize(x_train, y_train)

    def accuracy(features, labels):
        # Share of samples whose predicted label equals the true label
        return np.mean(predict(features, w, b) == labels)

    # Score the training split first, then the test split
    train_accuracy = accuracy(x_train, y_train)
    test_accuracy = accuracy(x_test, y_test)

    return dict(
        train_accuracy=train_accuracy,
        test_accuracy=test_accuracy,
        weights=w,
        bias=b,
        error_history=error_history,
    )

# Function to visualize learned weights as an image
def visualize_weights(w, shape=(28, 28), digit_pair=None):
    """
    Visualize the learned weights as an image.

    Shows the raw weight image next to a red/blue composite (red = positive
    weights, blue = negative weights, each normalised by its own maximum).
    When `digit_pair` is given, a second figure shows up to five example
    images per digit, taken from the module-level x_train / y_train.

    Parameters:
    w (numpy.ndarray): Weight vector
    shape (tuple): Shape to reshape the weight vector to
    digit_pair (tuple, optional): The pair of digits being classified. If
        None, generic titles are used and the example figure is skipped.
    """
    plt.figure(figsize=(12, 5))

    # Reshape weights to original image dimensions
    weight_img = w.reshape(shape)

    # Print statistics about the weights
    print(f"Weight statistics:")
    print(f"  Mean: {w.mean():.4f}")
    print(f"  Min: {w.min():.4f}")
    print(f"  Max: {w.max():.4f}")
    print(f"  Standard deviation: {w.std():.4f}")

    # Titles mention the digits only when the caller supplied them
    if digit_pair is None:
        weights_title = 'Learned Weights'
        composite_title = 'Composite: Blue=negative, Red=positive'
    else:
        weights_title = f'Learned Weights for Digits {digit_pair[0]} vs {digit_pair[1]}'
        composite_title = f'Composite: Blue={digit_pair[0]}, Red={digit_pair[1]}'

    # Main subplot for combined weights
    plt.subplot(1, 2, 1)
    im = plt.imshow(weight_img, cmap='viridis')
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.title(weights_title)

    # Plot composite image of both positive and negative weights
    plt.subplot(1, 2, 2)
    # Create an RGB image showing both positive and negative weights
    composite = np.zeros((*shape, 3))
    # Get positive and negative weights
    pos_weights = weight_img.copy()
    neg_weights = weight_img.copy()
    pos_weights[pos_weights < 0] = 0
    neg_weights[neg_weights > 0] = 0
    neg_weights = abs(neg_weights)  # Make negative weights positive for visualization

    # Normalize for visualization (skip a channel that is all zero to avoid dividing by zero)
    if pos_weights.max() > 0:
        composite[:,:,0] = pos_weights / pos_weights.max()  # Red channel for positive
    if neg_weights.max() > 0:
        composite[:,:,2] = neg_weights / neg_weights.max()  # Blue channel for negative

    plt.imshow(composite)
    plt.title(composite_title)

    plt.tight_layout()
    plt.show()

    # Without a digit pair there is nothing to look up in the dataset
    if digit_pair is None:
        return

    # Also show some examples of both digits for reference
    plt.figure(figsize=(10, 4))
    plt.suptitle(f'Example Images of Digits {digit_pair[0]} and {digit_pair[1]}')

    # Row 0: first digit in blue tones; row 1: second digit in red tones
    for row, (digit, cmap) in enumerate(zip(digit_pair, ('Blues', 'Reds'))):
        # At most five examples; fewer if the dataset holds fewer of this digit
        example_indices = np.where(y_train == digit)[0][:5]
        for col, idx in enumerate(example_indices):
            plt.subplot(2, 5, row * 5 + col + 1)
            plt.imshow(x_train[idx], cmap=cmap)
            plt.title(f'Digit {digit}')
            plt.axis('off')

    plt.tight_layout()
    plt.show()

# Function to run experiments on digit pairs
def run_digit_pair_experiments(digit_pairs):
    """
    Run perceptron experiments on multiple digit pairs.

    Training data comes from the module-level subset (x_train_reshaped,
    y_train_subset); test data is built from the module-level x_test / y_test.
    The test images are flattened and rescaled here with the same
    `/ 127.5 - 1` mapping used for the training subset, so this function does
    not depend on `x_test_reshaped`, which is only created by a later cell
    and with a different scaling.

    Parameters:
    digit_pairs (list): List of tuples containing digit pairs to classify

    Returns:
    dict: Results for each digit pair, keyed '<digit1>_vs_<digit2>'
    """
    results = {}

    # Flatten the test images and map pixels to [-1, 1], like the training features
    x_test_flat = x_test.reshape(len(x_test), -1) / 127.5 - 1

    def to_binary_labels(labels, negative_digit):
        # negative_digit -> -1.0, the other digit of the pair -> +1.0
        return np.where(labels == negative_digit, -1.0, 1.0)

    for digit1, digit2 in digit_pairs:
        print(f"Training perceptron for digit pair ({digit1}, {digit2})...")

        # Create binary classification dataset - USE SUBSET DATA FOR CONSISTENCY
        cond = (y_train_subset == digit1) | (y_train_subset == digit2)
        binary_x_train = x_train_reshaped[cond]
        binary_y_train = to_binary_labels(y_train_subset[cond], digit1)

        # Create test dataset
        cond_test = (y_test == digit1) | (y_test == digit2)
        binary_x_test = x_test_flat[cond_test]
        binary_y_test = to_binary_labels(y_test[cond_test], digit1)

        # Train and evaluate
        result = evaluate_perceptron(binary_x_train, binary_y_train, binary_x_test, binary_y_test)
        results[f"{digit1}_vs_{digit2}"] = result

        print(f"Training accuracy: {result['train_accuracy']:.4f}")
        print(f"Test accuracy: {result['test_accuracy']:.4f}")
        print("---")

        # Plot training error curve
        plt.figure(figsize=(10, 6))
        plt.plot(result['error_history'])
        plt.title(f'Training Error Curve for Digits {digit1} vs {digit2}')
        plt.xlabel('Iterations')
        plt.ylabel('Classification Error')
        plt.grid(True)
        plt.show()

        # Visualize learned weights
        visualize_weights(result['weights'], digit_pair=(digit1, digit2))

    return results
# Digit pairs to classify with the perceptron
digit_pairs = [(0, 1), (3, 8), (4, 9), (5, 6), (1, 7)]

# Train and evaluate one perceptron per pair
results = run_digit_pair_experiments(digit_pairs)

# Pull the summary metrics out of the per-pair result dicts
pairs = list(results.keys())
train_accs = [outcome['train_accuracy'] for outcome in results.values()]
test_accs = [outcome['test_accuracy'] for outcome in results.values()]
iterations = [len(outcome['error_history']) for outcome in results.values()]

# Summary table, one row per digit pair
results_df = pd.DataFrame({
    'Digit Pair': pairs,
    'Training Accuracy': train_accs,
    'Test Accuracy': test_accs,
    'Iterations': iterations
})
print("\nPerceptron Classification Results for Different Digit Pairs:")
print(results_df)

# Grouped bar chart: training vs test accuracy for each pair
plt.figure(figsize=(10, 6))
bar_width = 0.35
index = np.arange(len(pairs))

plt.bar(index, train_accs, bar_width, label='Training Accuracy', color='skyblue')
plt.bar(index + bar_width, test_accs, bar_width, label='Test Accuracy', color='orange')

plt.xlabel('Digit Pairs')
plt.ylabel('Accuracy')
plt.title('Perceptron Performance Across Different Digit Pairs')
plt.xticks(index + bar_width/2, pairs)
plt.legend()
plt.ylim(0.5, 1.05)  # Setting a reasonable y-axis range
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Value labels above the bars: training bars first, then the shifted test bars
for x_shift, accuracies, text_color in ((0, train_accs, 'blue'), (bar_width, test_accs, 'darkred')):
    for position, value in enumerate(accuracies):
        plt.text(position + x_shift - 0.1, value + 0.02, f'{value:.3f}', color=text_color, fontweight='bold')

plt.tight_layout()
plt.show()

### TASK 3

In [21]:
# Task 3.1: Create and train MLP with architecture [784,1000,1000,10]
def create_mlp(input_shape=784, hidden_units=(1000, 1000), output_units=10):
    """
    Create a Multi-Layer Perceptron with the specified architecture.

    Parameters:
    input_shape (int): Number of input features
    hidden_units (sequence of int): Units of each hidden layer, in order
    output_units (int): Number of output units

    Returns:
    keras.Model: The compiled MLP model
    """
    model = models.Sequential()

    # Input specification. keras.Input(shape=...) replaces the deprecated
    # InputLayer(input_shape=...); int() accepts NumPy integers such as the
    # result of np.prod.
    model.add(keras.Input(shape=(int(input_shape),)))

    # Hidden layers with ReLU activation
    for units in hidden_units:
        model.add(layers.Dense(units, activation='relu'))

    # Output layer with softmax activation
    model.add(layers.Dense(output_units, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

# Build the [784,1000,1000,10] MLP and show its layer summary
mlp_model = create_mlp(input_shape=nb_features, hidden_units=[1000, 1000], output_units=10)
mlp_model.summary()

# Training hyper-parameters
batch_size = 50
epochs = 10

# Reload MNIST: pixels scaled to [0, 1], labels one-hot encoded via identity-matrix rows
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train, X_test = X_train / 255, X_test / 255
y_train, y_test = np.eye(10)[y_train], np.eye(10)[y_test]

# Flatten each image into one row for the dense network
x_train_reshaped = X_train.reshape(X_train.shape[0], -1)
x_test_reshaped = X_test.reshape(X_test.shape[0], -1)

# Fit, passing the test split as validation data
history = mlp_model.fit(
    x_train_reshaped,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test_reshaped, y_test),
    verbose=1,
)

# Final loss / accuracy on the test split, then on the training split
test_loss, test_accuracy = mlp_model.evaluate(x_test_reshaped, y_test, verbose=0)
train_loss, train_accuracy = mlp_model.evaluate(x_train_reshaped, y_train, verbose=0)

print(f"\nTraining accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

# Side-by-side curves: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))

for panel, (metric, metric_title) in enumerate([('accuracy', 'Accuracy'), ('loss', 'Loss')], start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric], label=f'Training {metric}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_title)
    plt.title(f'Training and Validation {metric_title}')
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

# Task 3.2: Implementing multiple MLPs with different depths
def create_and_train_mlp(name, hidden_layers, epochs=10, batch_size=50):
    """
    Build, fit and score a fully connected classifier.

    The network is: input of `nb_features` values -> one ReLU Dense layer per
    entry of `hidden_layers` -> 10-unit softmax. Training and evaluation use the
    notebook-level arrays `x_train_reshaped` / `y_train_one_hot` and
    `x_test_reshaped` / `y_test_one_hot`.

    Parameters:
    name (str): Name given to the Keras model
    hidden_layers (list): Width of each hidden layer, in order
    epochs (int): Number of training epochs
    batch_size (int): Mini-batch size

    Returns:
    dict: 'name', 'train_accuracy', 'test_accuracy', 'parameters'
          (trainable parameter count), 'model' and 'history'
    """
    # Lay out the layer stack first, then hand it to the Sequential container
    stack = [layers.InputLayer(input_shape=(nb_features,))]
    for width in hidden_layers:
        stack.append(layers.Dense(width, activation='relu'))
    stack.append(layers.Dense(10, activation='softmax'))

    net = models.Sequential(name=name)
    for layer in stack:
        net.add(layer)

    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    train_split = (x_train_reshaped, y_train_one_hot)
    test_split = (x_test_reshaped, y_test_one_hot)

    # Silent training; the test split is used as validation data
    fit_history = net.fit(
        *train_split,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=test_split,
        verbose=0
    )

    # evaluate() returns (loss, accuracy); only the accuracy is reported
    _, test_accuracy = net.evaluate(*test_split, verbose=0)
    _, train_accuracy = net.evaluate(*train_split, verbose=0)

    # Total number of scalar weights across all trainable variables
    n_trainable = np.sum([np.prod(weight.shape) for weight in net.trainable_variables])

    return dict(
        name=name,
        train_accuracy=train_accuracy,
        test_accuracy=test_accuracy,
        parameters=n_trainable,
        model=net,
        history=fit_history,
    )

# Define MLP architectures with different depths
# Original MLP: [784, 1000, 1000, 10] (2 hidden layers)
# Additional MLPs with 3, 4, 5, and 7 hidden layers
mlp_architectures = {
    'MLP-2': [1000, 1000],  # 2 hidden layers (original)
    'MLP-3': [800, 800, 800],  # 3 hidden layers
    'MLP-4': [700, 700, 700, 700],  # 4 hidden layers
    'MLP-5': [600, 600, 600, 600, 600],  # 5 hidden layers
    'MLP-7': [500, 500, 500, 500, 500, 500, 500]  # 7 hidden layers
}

# Train all MLPs.
# The outcomes are kept under the dedicated name `mlp_results` because the CNN
# experiment later rebinds the generic name `results`, while the final
# CNN-vs-MLP comparison reads `mlp_results['MLP-2']`. `results` stays bound to
# the same dict so code written against the old name keeps working.
mlp_results = {}
results = mlp_results
for name, hidden_layers in mlp_architectures.items():
    print(f"Training {name} with architecture {hidden_layers}...")
    mlp_results[name] = create_and_train_mlp(name, hidden_layers, epochs=10, batch_size=50)
    print(f"  Train accuracy: {mlp_results[name]['train_accuracy']:.4f}")
    print(f"  Test accuracy: {mlp_results[name]['test_accuracy']:.4f}")
    print(f"  Parameters: {mlp_results[name]['parameters']:,}")
    print()

# Create a comparison table (column-oriented, one entry per trained MLP)
results_table = {
    'MLP': [],
    'Hidden Layers': [],
    'Parameters': [],
    'Train Accuracy': [],
    'Test Accuracy': []
}

for name, result in mlp_results.items():
    results_table['MLP'].append(name)
    results_table['Hidden Layers'].append(len(mlp_architectures[name]))
    results_table['Parameters'].append(result['parameters'])
    results_table['Train Accuracy'].append(result['train_accuracy'])
    results_table['Test Accuracy'].append(result['test_accuracy'])

# A line plot joins points in the order given, so each series is sorted by its
# x value first; otherwise the curve doubles back whenever x is not monotonic
# in dictionary order (the parameter counts are not).
train_accuracies = np.asarray(results_table['Train Accuracy'])
test_accuracies = np.asarray(results_table['Test Accuracy'])
hidden_layer_counts = np.asarray(results_table['Hidden Layers'])
parameter_counts = np.asarray(results_table['Parameters'])
layer_order = np.argsort(hidden_layer_counts, kind='stable')
param_order = np.argsort(parameter_counts, kind='stable')

# Plot accuracy vs depth vs parameters
plt.figure(figsize=(15, 5))

# Plot 1: Accuracy vs. Number of Hidden Layers
plt.subplot(1, 2, 1)
plt.plot(hidden_layer_counts[layer_order], train_accuracies[layer_order], 'o-', label='Train Accuracy')
plt.plot(hidden_layer_counts[layer_order], test_accuracies[layer_order], 's-', label='Test Accuracy')
plt.xlabel('Number of Hidden Layers')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Number of Hidden Layers')
plt.legend()
plt.grid(True)
plt.xticks(results_table['Hidden Layers'])

# Plot 2: Accuracy vs. Number of Parameters (log-scaled x axis)
plt.subplot(1, 2, 2)
plt.plot(parameter_counts[param_order], train_accuracies[param_order], 'o-', label='Train Accuracy')
plt.plot(parameter_counts[param_order], test_accuracies[param_order], 's-', label='Test Accuracy')
plt.xlabel('Number of Parameters')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Number of Parameters')
plt.legend()
plt.grid(True)
plt.xscale('log')

plt.tight_layout()
plt.show()

# Print the final results (in training order)
for i in range(len(results_table['MLP'])):
    print(f"{results_table['MLP'][i]}: Layers={results_table['Hidden Layers'][i]}, "
          f"Params={results_table['Parameters'][i]:,}, "
          f"Train Acc={results_table['Train Accuracy'][i]:.4f}, "
          f"Test Acc={results_table['Test Accuracy'][i]:.4f}")

Epoch 1/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.9058 - loss: 0.2993 - val_accuracy: 0.9630 - val_loss: 0.1134
Epoch 2/10
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.9757 - loss: 0.0754 - val_accuracy: 0.9751 - val_loss: 0.0814
Epoch 3/10
[1m 900/1200[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m2s[0m 7ms/step - accuracy: 0.9833 - loss: 0.0521

### TASK 4

In [48]:
# Task 4.1: Create a CNN with architecture [32, 64, 128]

def create_cnn(input_shape=(28, 28, 1), filters=[32, 64, 128], output_units=10):
    """
    Create a Convolutional Neural Network with the specified architecture.

    One 4x4 'same'-padded ReLU convolution is added per entry of `filters`:
    the first uses stride 1, every following one uses stride 2. The feature
    maps are then flattened and fed to a softmax Dense layer. With the default
    three filters this is exactly the [32, 64, 128] network; other lengths are
    supported as well, matching create_and_train_cnn().

    Parameters:
    input_shape (tuple): Shape of input images
    filters (list): Number of filters for each convolutional layer, in order.
        Must contain at least one entry. The default list is never mutated here.
    output_units (int): Number of output units

    Returns:
    keras.Model: The compiled CNN model (Adam, lr=0.001, categorical cross-entropy)
    """
    model = models.Sequential()

    # First convolutional layer with stride 1; it also declares the input shape
    model.add(layers.Conv2D(filters[0], kernel_size=(4, 4), strides=(1, 1), padding='same',
                           activation='relu', input_shape=input_shape))

    # Remaining convolutional layers with stride 2
    for n_filters in filters[1:]:
        model.add(layers.Conv2D(n_filters, kernel_size=(4, 4), strides=(2, 2), padding='same',
                               activation='relu'))

    # Flatten the feature maps
    model.add(layers.Flatten())

    # Fully connected output layer with softmax activation
    model.add(layers.Dense(output_units, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

# Reshape the data to have the correct format for CNN
def prepare_data_for_cnn(x_train, x_test):
    """
    Reshape flattened square images into 4D tensors for a CNN.

    Parameters:
    x_train (numpy.ndarray): Training images, shape (samples, features)
    x_test (numpy.ndarray): Test images, shape (samples, features)

    Returns:
    tuple: (x_train_cnn, x_test_cnn), each shaped (samples, edge, edge, 1)
           where edge * edge equals the feature count of x_train

    Raises:
    ValueError: If the feature count of x_train is not a perfect square
    """
    n_features = x_train.shape[1]

    # Round before converting so a float sqrt that lands just below the true
    # integer cannot truncate to edge - 1, then verify the result exactly.
    edge = int(round(np.sqrt(n_features)))
    if edge * edge != n_features:
        raise ValueError(
            f"Cannot reshape {n_features} features into a square image: "
            f"{n_features} is not a perfect square"
        )

    # Reshape the data to 4D tensors [samples, height, width, channels]
    x_train_cnn = x_train.reshape(x_train.shape[0], edge, edge, 1)
    x_test_cnn = x_test.reshape(x_test.shape[0], edge, edge, 1)

    return x_train_cnn, x_test_cnn

# Convert labels to one-hot encoding.
# The Task 3 cell rebinds y_train / y_test to one-hot matrices; running
# to_categorical on those again would add a third axis to the targets.
# Only encode when the labels are still a 1-D vector of class ids.
if np.ndim(y_train) == 2:
    y_train_one_hot = y_train
else:
    y_train_one_hot = tf.keras.utils.to_categorical(y_train, 10)

if np.ndim(y_test) == 2:
    y_test_one_hot = y_test
else:
    y_test_one_hot = tf.keras.utils.to_categorical(y_test, 10)

# Reshape the data for CNN (x_train_reshaped and x_test_reshaped are the
# flattened images produced by the Task 3 cell, which must have been run first)
x_train_cnn, x_test_cnn = prepare_data_for_cnn(x_train_reshaped, x_test_reshaped)

def train_cnn_model(x_train_cnn, y_train_one_hot, x_test_cnn, y_test_one_hot,
                   filters=[32, 64, 128], batch_size=50, epochs=10):
    """
    Build a CNN with create_cnn(), fit it, and report train/test accuracy.

    Parameters:
    x_train_cnn (numpy.ndarray): Training images, shaped for CNN
    y_train_one_hot (numpy.ndarray): One-hot encoded training labels
    x_test_cnn (numpy.ndarray): Test images, shaped for CNN
    y_test_one_hot (numpy.ndarray): One-hot encoded test labels
    filters (list): Number of filters for each convolutional layer
    batch_size (int): Mini-batch size
    epochs (int): Number of training epochs

    Returns:
    dict: 'model', 'history', 'train_accuracy' and 'test_accuracy'
    """
    train_split = (x_train_cnn, y_train_one_hot)
    test_split = (x_test_cnn, y_test_one_hot)

    # The input shape is taken from the data (everything after the sample axis)
    cnn = create_cnn(input_shape=x_train_cnn.shape[1:], filters=filters)
    cnn.summary()

    # Fit with per-epoch progress output; the test split is the validation set
    fit_history = cnn.fit(
        *train_split,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=test_split,
        verbose=1
    )

    # evaluate() yields (loss, accuracy); the loss is not needed here
    _, train_accuracy = cnn.evaluate(*train_split, verbose=0)
    _, test_accuracy = cnn.evaluate(*test_split, verbose=0)

    print(f"\nTraining accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")

    return dict(
        model=cnn,
        history=fit_history,
        train_accuracy=train_accuracy,
        test_accuracy=test_accuracy,
    )

def plot_learning_curves(history):
    """
    Draw accuracy and loss curves side by side for one training run.

    Parameters:
    history: Object returned by model.fit(); its `.history` dict must hold the
        keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
    """
    # (metric key in the history dict, capitalised axis/title text)
    panels = (
        ('accuracy', 'Accuracy'),
        ('loss', 'Loss'),
    )

    plt.figure(figsize=(12, 5))

    for position, (metric, pretty) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'Training {metric}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric}')
        plt.xlabel('Epoch')
        plt.ylabel(pretty)
        plt.title(f'Training and Validation {pretty}')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Task 4.2: Compare CNNs with different depths and widths

def create_and_train_cnn(name, filters, batch_size=50, epochs=10):
    """
    Build, fit and score a CNN whose depth and width follow `filters`.

    Every entry of `filters` becomes one 4x4 'same'-padded ReLU convolution:
    stride 1 for the first, stride 2 for the rest. The result is flattened
    into a 10-unit softmax layer. Training and evaluation use the
    notebook-level arrays `x_train_cnn` / `y_train_one_hot` and
    `x_test_cnn` / `y_test_one_hot`.

    Parameters:
    name (str): Name given to the Keras model
    filters (list): Number of filters for each convolutional layer, in order
    batch_size (int): Mini-batch size
    epochs (int): Number of training epochs

    Returns:
    dict: 'name', 'train_accuracy', 'test_accuracy', 'parameters'
          (trainable parameter count), 'model' and 'history'
    """
    # Settings shared by every convolution in the stack
    conv_settings = dict(kernel_size=(4, 4), padding='same', activation='relu')

    net = models.Sequential(name=name)

    # Stem: full-resolution convolution that also fixes the 28x28x1 input
    net.add(layers.Conv2D(filters[0], strides=(1, 1), input_shape=(28, 28, 1), **conv_settings))

    # Each further convolution halves the spatial resolution
    for width in filters[1:]:
        net.add(layers.Conv2D(width, strides=(2, 2), **conv_settings))

    # Classifier head
    net.add(layers.Flatten())
    net.add(layers.Dense(10, activation='softmax'))

    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    train_split = (x_train_cnn, y_train_one_hot)
    test_split = (x_test_cnn, y_test_one_hot)

    # Silent training; the test split is used as validation data
    fit_history = net.fit(
        *train_split,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=test_split,
        verbose=0
    )

    # evaluate() returns (loss, accuracy); only the accuracy is reported
    _, train_accuracy = net.evaluate(*train_split, verbose=0)
    _, test_accuracy = net.evaluate(*test_split, verbose=0)

    # Total number of scalar weights across all trainable weights
    n_trainable = np.sum([np.prod(weight.shape) for weight in net.trainable_weights])

    return dict(
        name=name,
        train_accuracy=train_accuracy,
        test_accuracy=test_accuracy,
        parameters=n_trainable,
        model=net,
        history=fit_history,
    )

# Define CNN architectures with different depths and widths
cnn_architectures = {
    'CNN-3': [32, 64, 128],  # 3 layers (original)
    'CNN-4': [32, 64, 96, 128],  # 4 layers
    'CNN-5': [32, 48, 64, 96, 128],  # 5 layers
    'CNN-2': [64, 128],  # 2 layers
    'CNN-6': [16, 32, 48, 64, 96, 128]  # 6 layers
}

# Code to execute the CNN training and comparison
# Run this section to train all CNNs with different architectures
# (Make sure you've defined x_train_cnn, y_train_one_hot, x_test_cnn, y_test_one_hot)

# Train base CNN model
cnn_result = train_cnn_model(x_train_cnn, y_train_one_hot, x_test_cnn, y_test_one_hot)
plot_learning_curves(cnn_result['history'])

# Train all CNN architectures.
# `cnn_results` is the dedicated name for this sweep; `results` is kept bound
# to the same dict for code that still uses the generic name.
cnn_results = {}
results = cnn_results
for name, filters in cnn_architectures.items():
    print(f"Training {name} with filters {filters}...")
    cnn_results[name] = create_and_train_cnn(name, filters, epochs=10, batch_size=50)
    print(f"  Train accuracy: {cnn_results[name]['train_accuracy']:.4f}")
    print(f"  Test accuracy: {cnn_results[name]['test_accuracy']:.4f}")
    print(f"  Parameters: {cnn_results[name]['parameters']:,}")
    print()

# Create a comparison table (column-oriented, one entry per trained CNN)
results_table = {
    'CNN': [],
    'Layers': [],
    'Parameters': [],
    'Train Accuracy': [],
    'Test Accuracy': []
}

for name, result in cnn_results.items():
    results_table['CNN'].append(name)
    results_table['Layers'].append(len(cnn_architectures[name]))
    results_table['Parameters'].append(result['parameters'])
    results_table['Train Accuracy'].append(result['train_accuracy'])
    results_table['Test Accuracy'].append(result['test_accuracy'])

# A line plot joins points in the order given. The architectures are listed as
# 3, 4, 5, 2, 6 layers, so each series is sorted by its x value first to keep
# the curves from doubling back on themselves.
train_accuracies = np.asarray(results_table['Train Accuracy'])
test_accuracies = np.asarray(results_table['Test Accuracy'])
layer_counts = np.asarray(results_table['Layers'])
parameter_counts = np.asarray(results_table['Parameters'])
layer_order = np.argsort(layer_counts, kind='stable')
param_order = np.argsort(parameter_counts, kind='stable')

# Plot accuracy vs depth vs parameters
plt.figure(figsize=(15, 5))

# Plot 1: Accuracy vs. Number of Layers
plt.subplot(1, 2, 1)
plt.plot(layer_counts[layer_order], train_accuracies[layer_order], 'o-', label='Train Accuracy')
plt.plot(layer_counts[layer_order], test_accuracies[layer_order], 's-', label='Test Accuracy')
plt.xlabel('Number of Convolutional Layers')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Number of Convolutional Layers')
plt.legend()
plt.grid(True)
plt.xticks(results_table['Layers'])

# Plot 2: Accuracy vs. Number of Parameters (log-scaled x axis)
plt.subplot(1, 2, 2)
plt.plot(parameter_counts[param_order], train_accuracies[param_order], 'o-', label='Train Accuracy')
plt.plot(parameter_counts[param_order], test_accuracies[param_order], 's-', label='Test Accuracy')
plt.xlabel('Number of Parameters')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Number of Parameters')
plt.legend()
plt.grid(True)
plt.xscale('log')

plt.tight_layout()
plt.show()

# Print the final results (in training order)
for i in range(len(results_table['CNN'])):
    print(f"{results_table['CNN'][i]}: Layers={results_table['Layers'][i]}, "
          f"Params={results_table['Parameters'][i]:,}, "
          f"Train Acc={results_table['Train Accuracy'][i]:.4f}, "
          f"Test Acc={results_table['Test Accuracy'][i]:.4f}")

# Compare CNN to MLP
print("\nComparison between CNN and MLP:")
print(f"CNN-3 (Original): Test Accuracy={cnn_results['CNN-3']['test_accuracy']:.4f}, Parameters={cnn_results['CNN-3']['parameters']:,}")
try:
    # NOTE(review): this relies on the MLP sweep having stored its outcomes in a
    # notebook-level dict called `mlp_results` - confirm that cell does so.
    mlp_baseline = mlp_results['MLP-2']
except (NameError, KeyError):
    # Report the missing baseline instead of aborting the cell after all CNNs
    # have already been trained.
    print("MLP-2 (Original): results unavailable (run the Task 3.2 MLP cell and keep its results in `mlp_results`)")
else:
    print(f"MLP-2 (Original): Test Accuracy={mlp_baseline['test_accuracy']:.4f}, Parameters={mlp_baseline['parameters']:,}")

NameError: name 'x_test_reshaped' is not defined

### TASK 5

In [51]:
# Task 5: Visualizing CNN outcomes

def plot_filters(model, layer_idx, cols=8):
    """
    Plot the filters (kernels) of a specific convolutional layer in a grid.

    A Keras Conv2D kernel is stored as
    (kernel_height, kernel_width, input_channels, output_filters), so the
    filters are enumerated along the LAST axis. Each filter is shown as the
    mean of its per-input-channel kernels, min-max scaled to [0, 1].

    Parameters:
    model: Trained CNN model
    layer_idx: Index of the convolutional layer to visualize
    cols: Number of columns in the grid plot

    Returns:
    None. Prints a message and returns early if the layer is not a Conv2D.
    """
    # Get the layer
    layer = model.layers[layer_idx]

    # Check if it's a convolutional layer
    if not isinstance(layer, layers.Conv2D):
        print(f"Layer {layer_idx} is not a convolutional layer.")
        return

    # The kernel is always the first weight; the bias (if any) follows it and
    # is not needed for the plot.
    kernel = layer.get_weights()[0]

    # Output filters live on the last axis of the kernel tensor
    n_filters = kernel.shape[-1]

    # Calculate rows needed
    rows = int(np.ceil(n_filters / cols))

    # Create a figure
    plt.figure(figsize=(cols * 2, rows * 2))

    # Plot each filter
    for i in range(n_filters):
        # Create subplot for this filter
        plt.subplot(rows, cols, i + 1)

        # kernel[:, :, :, i] is (height, width, input_channels); average over
        # the input channels to get one 2-D image per filter
        filter_img = np.mean(kernel[:, :, :, i], axis=2)

        # Normalize for better visualization (epsilon guards a constant filter)
        filter_img = (filter_img - filter_img.min()) / (filter_img.max() - filter_img.min() + 1e-7)

        # Display the filter
        plt.imshow(filter_img, cmap='viridis')
        plt.axis('off')
        plt.title(f'Filter {i+1}')

    plt.suptitle(f'Filters from layer {layer.name}')
    plt.tight_layout()
    plt.show()

def plot_activation_maps(model, image, layer_indices, digit_class, cols=8):
    """
    Plot the activation maps for a specific image for each filter in specified layers.

    For every requested layer a sub-model from the model input to that layer's
    output is built, the image is pushed through it, and each channel of the
    result is drawn (min-max scaled) in a grid. The input image itself is
    shown first.

    Parameters:
    model: Trained CNN model
    image: Input image to visualize activations for (should be shaped for CNN);
        a 3-D image gets a leading batch axis added
    layer_indices: List of layer indices to visualize; the indexed layers are
        expected to produce 4-D outputs (batch, height, width, channels)
    digit_class: The class of the digit for display
    cols: Number of columns in each grid plot
    """
    # Add batch dimension if needed
    if len(image.shape) == 3:
        image = np.expand_dims(image, axis=0)

    # Create models that output the activations for each specified layer.
    # keras.Model is used (as elsewhere in this notebook) so this function does
    # not depend on a separate bare `Model` import.
    activation_models = []
    for layer_idx in layer_indices:
        layer = model.layers[layer_idx]
        activation_model = keras.Model(inputs=model.input, outputs=layer.output)
        activation_models.append((layer_idx, layer.name, activation_model))

    # Display the input image
    plt.figure(figsize=(5, 5))
    plt.imshow(np.squeeze(image), cmap='gray')
    plt.title(f'Input Image: Digit {digit_class}')
    plt.axis('off')
    plt.show()

    # Get activations for each layer and plot them
    for layer_idx, layer_name, activation_model in activation_models:
        # Get activations for this layer
        activations = activation_model.predict(image)

        # Number of filters (channels) in this layer's output
        n_filters = activations.shape[-1]

        # Calculate number of rows needed
        rows = int(np.ceil(n_filters / cols))

        # Create a figure
        plt.figure(figsize=(cols * 2, rows * 2))

        # Plot each activation map
        for i in range(n_filters):
            # Create subplot for this activation map
            plt.subplot(rows, cols, i + 1)

            # Activation map of filter i for the single image in the batch
            activation = activations[0, :, :, i]

            # Normalize for better visualization (epsilon guards an all-equal map)
            activation = (activation - activation.min()) / (activation.max() - activation.min() + 1e-7)

            # Display the activation map
            plt.imshow(activation, cmap='viridis')
            plt.axis('off')
            plt.title(f'Filter {i+1}')

        plt.suptitle(f'Activation maps from layer {layer_name} for Digit {digit_class}')
        plt.tight_layout()
        plt.show()

def generate_deep_dream(model, class_idx, iterations=20, step_size=1.0, octave_scale=1.4, num_octaves=5):
    """
    Generate a deep dream image for a specific class.

    Starting from low-amplitude Gaussian noise, the image is repeatedly resized
    to an "octave" resolution and pushed by normalised gradient ascent towards
    a higher model output for `class_idx`. The image is always resized to
    28x28 before it is fed to the model, so models that only accept that input
    size can be used at every octave.

    Parameters:
    model: Trained CNN model taking (batch, 28, 28, 1) inputs
    class_idx: Index of the class to generate a deep dream for
    iterations: Number of gradient ascent steps per octave
    step_size: Size of the gradient ascent step
    octave_scale: Scale factor between octaves
    num_octaves: Number of octave iterations

    Returns:
    numpy.ndarray: Deep dream image of shape (28, 28, 1), min-max scaled to [0, 1]
    """
    # Spatial size of the dream image and of the model input
    base_shape = (28, 28)

    # Create a random noise image
    img = np.random.normal(size=base_shape + (1,)) * 0.1

    # Define loss function to maximize the class output
    @tf.function
    def calc_loss(image, class_idx):
        # Ensure the image has the correct data type
        image = tf.cast(image, tf.float32)

        # The CNNs built in this notebook end in Flatten + Dense, which fixes
        # their input size. Resize the (possibly larger) octave image to that
        # size; the resize is differentiable, so gradients still reach `image`.
        model_input = tf.image.resize(image, base_shape)

        # Get the model's prediction
        pred = model(model_input)

        # Return the activation of the target class
        return pred[:, class_idx]

    @tf.function
    def gradient_ascent_step(image, class_idx, step_size):
        with tf.GradientTape() as tape:
            # `image` is a plain tensor, so it must be watched explicitly
            tape.watch(image)
            loss = calc_loss(image, class_idx)

        # Calculate the gradient of the loss with respect to the image
        gradient = tape.gradient(loss, image)

        # Normalize the gradient so step_size controls the update magnitude
        gradient = tf.math.l2_normalize(gradient)

        # Apply gradient ascent
        image = image + gradient * step_size

        return image

    img = np.expand_dims(img, axis=0)  # Add batch dimension

    # Process octaves from the largest to the smallest scale
    for octave in range(num_octaves):
        # Side lengths for this octave (truncated to whole pixels)
        factor = octave_scale ** (num_octaves - octave - 1)
        octave_size = [int(side * factor) for side in base_shape]

        # Resize the image for this octave
        resized_img = tf.image.resize(img, octave_size)

        # Perform gradient ascent on this octave
        for _ in range(iterations):
            resized_img = gradient_ascent_step(resized_img, class_idx, step_size)

        # Resize back to the original shape and update the image
        img = tf.image.resize(resized_img, base_shape)

    # Convert to numpy (works for both a tensor and, when num_octaves is 0,
    # the untouched numpy noise) and normalize for visualization. The epsilon
    # avoids a division by zero on a constant image.
    dream_img = np.asarray(img)[0]
    dream_img = (dream_img - dream_img.min()) / (dream_img.max() - dream_img.min() + 1e-7)

    return dream_img

def visualize_cnn_outcomes(model, x_test, y_test):
    """
    Run the full Task 5 visualisation for digits 2 and 9.

    Prints the model summary, plots the filters of every Conv2D layer, plots
    the activation maps of the first test image of each digit, generates a
    deep dream image per digit, and prints a short textual analysis.

    Parameters:
    model: Trained CNN model
    x_test: Test images
    y_test: One-hot test labels (the class is recovered with argmax over axis 1)
    """
    digits = (2, 9)

    # Display model summary
    model.summary()

    # 1. Filters of every convolutional layer; remember where those layers sit
    print("\nVisualizing Filters:")
    conv_layer_indices = []
    for position, candidate in enumerate(model.layers):
        if not isinstance(candidate, layers.Conv2D):
            continue
        conv_layer_indices.append(position)
        print(f"Layer {position}: {candidate.name}")
        plot_filters(model, position)

    # 2. First test-set example of each digit
    true_classes = np.argmax(y_test, axis=1)
    matches = {digit: np.where(true_classes == digit)[0] for digit in digits}
    first_hit = {digit: matches[digit][0] for digit in digits}
    samples = {digit: x_test[first_hit[digit]] for digit in digits}

    # 3. Activation maps for each digit
    for digit in digits:
        print(f"\nVisualizing Activation Maps for Digit '{digit}':")
        plot_activation_maps(model, samples[digit], conv_layer_indices, digit)

    # 4. Deep dream image for each class
    dreams = {}
    for digit in digits:
        print(f"\nGenerating Deep Dream Image for Digit '{digit}':")
        dreams[digit] = generate_deep_dream(model, digit)

    # Display deep dream images side by side
    plt.figure(figsize=(10, 5))
    for slot, digit in enumerate(digits, start=1):
        plt.subplot(1, 2, slot)
        plt.imshow(np.squeeze(dreams[digit]), cmap='viridis')
        plt.title(f"Deep Dream: Digit '{digit}'")
        plt.axis('off')

    plt.tight_layout()
    plt.show()

    # Analyzing deep dream images
    for line in (
        "\nDeep Dream Analysis:",
        "The deep dream images show the patterns that the model is sensitive to for each digit class.",
        "For digit '2', we can observe curved patterns that represent the typical shape of the digit.",
        "For digit '9', we can observe patterns with a circle at the top and a vertical line, ",
        "which are characteristic features of the digit '9'.",
    ):
        print(line)

# Code to use the visualization functions
# The triple-quoted block below is a bare string expression, so nothing inside
# it is executed; it only documents how visualize_cnn_outcomes() is meant to be called.
# NOTE(review): no variable named `cnn_model` is assigned in the visible cells.
# The Task 4 cell keeps its trained network in `cnn_result['model']` - confirm
# which model should be passed before enabling this call.
"""
# Make sure you have a trained CNN model from Task 4
# Assuming 'cnn_model' is your trained model and x_test_cnn, y_test_one_hot are your test data

# Run the visualization functions
visualize_cnn_outcomes(cnn_model, x_test_cnn, y_test_one_hot)
"""

# Alternative implementation for deep dream visualization
def visualize_deep_dream_simpler(model, class_indices=[2, 9], input_shape=(28, 28, 1)):
    """
    Plot one class-maximising image per requested class.

    For each class a small-noise image is optimised for 100 plain gradient
    steps (step 0.1) on the negative log of the model's probability for that
    class. After every 10th step (including the first) the image is min-max
    rescaled to [0, 1]. The results are shown side by side.

    Parameters:
    model: Trained CNN model
    class_indices: List of class indices to generate deep dreams for
    input_shape: Shape of one input image as a tuple (without the batch axis)
    """
    learning_rate = 0.1
    steps = 100
    n_classes = len(class_indices)

    plt.figure(figsize=(n_classes * 5, 5))

    for slot, class_idx in enumerate(class_indices, start=1):
        # Start with random noise, held in a Variable so it can be updated in place
        canvas = tf.Variable(tf.random.normal((1,) + input_shape) * 0.1)

        for step in range(steps):
            with tf.GradientTape() as tape:
                probabilities = model(canvas)
                # Minimising the negative log probability maximises the class score
                loss = -tf.math.log(probabilities[0, class_idx] + 1e-7)

            # Gradient descent step on the image itself
            canvas.assign_sub(tape.gradient(loss, canvas) * learning_rate)

            # Periodic min-max rescale to maintain contrast
            if step % 10 != 0:
                continue
            snapshot = canvas.numpy()
            snapshot = (snapshot - snapshot.min()) / (snapshot.max() - snapshot.min() + 1e-7)
            canvas.assign(tf.convert_to_tensor(snapshot, dtype=tf.float32))

        # Display the result
        plt.subplot(1, n_classes, slot)
        plt.imshow(np.squeeze(canvas.numpy()), cmap='viridis')
        plt.title(f"Deep Dream: Digit '{class_idx}'")
        plt.axis('off')

    plt.tight_layout()
    plt.show()

### TASK 6

In [None]:
# Task 6: Multi-task Learning

# Define a function to load and prepare the Fashion MNIST dataset
def load_fashion_mnist_data():
    """
    Load Fashion MNIST and derive the labels for both learning tasks.

    Task 1 uses the original 10 item classes. Task 2 maps each item class to
    one of 3 groups: 0 = shoes (classes 5, 7, 9), 1 = gendered (3, 6, 8),
    2 = uni-sex (0, 1, 2, 4). Images are divided by 255 and get a trailing
    channel axis; all labels are one-hot encoded.

    Returns:
    tuple: (train_X, train_y_1_onehot, train_y_2_onehot,
            test_X, test_y_1_onehot, test_y_2_onehot)
    """
    # (group id, item classes belonging to it)
    group_members = (
        (0, [5, 7, 9]),     # Shoes: Sandal, Sneaker, Ankle Boot
        (1, [3, 6, 8]),     # Gendered: Dress, Shirt, Bag
        (2, [0, 1, 2, 4]),  # Uni-Sex: T-shirt, Trouser, Pullover, Coat
    )

    def create_group_label(y):
        # Start from zeros with y's shape and dtype, then stamp each group id
        grouped = np.zeros_like(y)
        for group_id, members in group_members:
            grouped[np.isin(y, members)] = group_id
        return grouped

    (train_X, train_y_1), (test_X, test_y_1) = keras.datasets.fashion_mnist.load_data()

    # Scale pixels, then append the channel axis expected by Conv2D
    train_X = np.expand_dims(train_X / 255.0, axis=-1)
    test_X = np.expand_dims(test_X / 255.0, axis=-1)

    train_y_2 = create_group_label(train_y_1)
    test_y_2 = create_group_label(test_y_1)

    one_hot = keras.utils.to_categorical
    return (
        train_X,
        one_hot(train_y_1, 10),
        one_hot(train_y_2, 3),
        test_X,
        one_hot(test_y_1, 10),
        one_hot(test_y_2, 3),
    )

# Task 6.1: Create individual CNN models for each task
def create_single_task_cnn(input_shape, num_classes, task_name):
    """
    Build and compile the CNN used for one task on its own.

    Layout: Conv(32) -> MaxPool -> Conv(64) -> MaxPool -> Conv(128) -> Flatten
    -> Dense(3136) -> Dense(1024) -> Dense(100) -> softmax(num_classes).
    All convolutions are 3x3, stride 1, 'same' padding, ReLU; pooling is 2x2
    with stride 2.

    Parameters:
    input_shape: Shape of input images
    num_classes: Number of output classes
    task_name: Task label; the model is named "Single_<task_name>"

    Returns:
    keras.Model: Compiled CNN model (Adam, lr=0.001, categorical cross-entropy)
    """
    conv_settings = dict(kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu')
    pool_settings = dict(pool_size=(2, 2), strides=(2, 2))

    # Layers in forward order; the first convolution also declares the input shape
    blueprint = [
        layers.Conv2D(32, input_shape=input_shape, **conv_settings),
        layers.MaxPooling2D(**pool_settings),
        layers.Conv2D(64, **conv_settings),
        layers.MaxPooling2D(**pool_settings),
        layers.Conv2D(128, **conv_settings),
        layers.Flatten(),
        layers.Dense(3136, activation='relu'),
        layers.Dense(1024, activation='relu'),
        layers.Dense(100, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]

    model = models.Sequential(name=f"Single_{task_name}")
    for layer in blueprint:
        model.add(layer)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

def train_single_task_models(train_X, train_y_1, train_y_2, test_X, test_y_1, test_y_2, batch_size=10, epochs=5):
    """
    Train one independent CNN per task and collect their metrics.

    Both models are created and summarised first; then the Task 1 model is
    trained and evaluated, followed by the Task 2 model.

    Parameters:
    train_X, test_X: Training and test images shared by both tasks
    train_y_1, test_y_1: One-hot labels for Task 1 (10 item classes)
    train_y_2, test_y_2: One-hot labels for Task 2 (3 groups)
    batch_size: Batch size for training
    epochs: Number of training epochs

    Returns:
    dict: {'task1': {...}, 'task2': {...}}, each holding 'model', 'history',
          'accuracy' (test accuracy), 'params' (model.count_params()) and
          'train_time' (wall-clock seconds spent in fit())
    """
    image_shape = train_X.shape[1:]
    model_task1 = create_single_task_cnn(image_shape, 10, "Task1_Item")
    model_task2 = create_single_task_cnn(image_shape, 3, "Task2_Group")

    # Print model summaries
    print("Task 1 (Item Classification) Model Summary:")
    model_task1.summary()

    print("\nTask 2 (Group Classification) Model Summary:")
    model_task2.summary()

    def run_task(label, description, net, train_y, test_y):
        # Fit one model, time the fit() call only, then score it on the test split
        print(f"\nTraining {label} ({description}) Model...")
        began = time.time()
        fit_history = net.fit(
            train_X, train_y,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(test_X, test_y),
            verbose=1
        )
        elapsed = time.time() - began

        _, accuracy = net.evaluate(test_X, test_y, verbose=0)
        print(f"{label} Test Accuracy: {accuracy:.4f}")

        return {
            'model': net,
            'history': fit_history,
            'accuracy': accuracy,
            'params': net.count_params(),
            'train_time': elapsed
        }

    # Dict values are evaluated in order, so Task 1 finishes before Task 2 starts
    return {
        'task1': run_task("Task 1", "Item Classification", model_task1, train_y_1, test_y_1),
        'task2': run_task("Task 2", "Group Classification", model_task2, train_y_2, test_y_2)
    }

# Task 6.2: Create a multi-task learning model
def create_multitask_model(input_shape, lambda_value=0.5):
    """
    Create a multi-task learning model with a shared backbone.

    Architecture:
    - Shared trunk: three 3x3 'same'-padded Conv2D layers (32, 64, 128
      filters); the first two are each followed by 2x2 max-pooling. The
      result is flattened and fed to one shared Dense(3136) layer.
    - Task 1 head (item classification): Dense(1024) -> Dense(100) ->
      10-way softmax named 'task1_output'.
    - Task 2 head (group classification): Dense(1024) -> Dense(100) ->
      3-way softmax named 'task2_output'.

    The model is compiled with Adam (learning rate 0.001) and categorical
    cross-entropy on both outputs. The total loss is
    lambda_value * loss_task1 + (1 - lambda_value) * loss_task2.

    Parameters:
    input_shape: Shape of input images
    lambda_value: Weight for balancing between tasks (0-1). 0 trains only on
        the Task 2 loss, 1 trains only on the Task 1 loss.

    Returns:
    keras.Model: Compiled multi-task model

    Raises:
    ValueError: If lambda_value is not within [0, 1].
    """
    # A value outside [0, 1] would give one task a negative loss weight,
    # i.e. the optimizer would be rewarded for making that task worse.
    if not 0 <= lambda_value <= 1:
        raise ValueError(
            f"lambda_value must be in the range [0, 1], got {lambda_value!r}"
        )

    # Input layer
    inputs = keras.Input(shape=input_shape)

    # Shared convolutional layers
    x = layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu')(inputs)
    x = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    x = layers.Conv2D(64, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    x = layers.Conv2D(128, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu')(x)

    # Flatten
    x = layers.Flatten()(x)

    # Shared dense layer
    shared_dense = layers.Dense(3136, activation='relu')(x)

    # Task-specific layers for Task 1 (Item Classification)
    task1 = layers.Dense(1024, activation='relu')(shared_dense)
    task1 = layers.Dense(100, activation='relu')(task1)
    task1_output = layers.Dense(10, activation='softmax', name='task1_output')(task1)

    # Task-specific layers for Task 2 (Group Classification)
    task2 = layers.Dense(1024, activation='relu')(shared_dense)
    task2 = layers.Dense(100, activation='relu')(task2)
    task2_output = layers.Dense(3, activation='softmax', name='task2_output')(task2)

    # Create model with multiple outputs
    model = keras.Model(inputs=inputs, outputs=[task1_output, task2_output])

    # Compile with weighted losses.
    # Metrics are given per output (keyed by output layer name) rather than as
    # a single flat list: a one-element list is rejected for multi-output
    # models by Keras 3, while the dict form is accepted by both tf.keras 2.x
    # and Keras 3 and reports 'task1_output_accuracy'/'task2_output_accuracy'.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss={
            'task1_output': 'categorical_crossentropy',
            'task2_output': 'categorical_crossentropy'
        },
        loss_weights={
            'task1_output': lambda_value,
            'task2_output': 1 - lambda_value
        },
        metrics={
            'task1_output': ['accuracy'],
            'task2_output': ['accuracy']
        }
    )

    return model

def _output_accuracy(eval_results, output_name):
    """Return the accuracy reported for `output_name` in an evaluate() result dict."""
    # 'acc' is the short metric name used by some older Keras releases.
    for key in (f"{output_name}_accuracy", f"{output_name}_acc"):
        if key in eval_results:
            return eval_results[key]
    raise KeyError(
        f"No accuracy metric found for output '{output_name}'; "
        f"available metrics: {sorted(eval_results)}"
    )


def train_multitask_models(train_X, train_y_1, train_y_2, test_X, test_y_1, test_y_2,
                          lambda_values=[0, 0.25, 0.5, 0.75, 1.0],
                          batch_size=10, epochs=5):
    """
    Train multiple multi-task models with different lambda values.

    For every lambda a fresh model is built with create_multitask_model,
    trained on the training split (the test split is passed to fit() as
    validation data) and then evaluated on the test split.

    Parameters:
    train_X, train_y_1, train_y_2, test_X, test_y_1, test_y_2: Training and testing data
        (*_y_1 feeds the 'task1_output' head, *_y_2 feeds 'task2_output')
    lambda_values: List of lambda values to try (only read, never mutated)
    batch_size: Batch size for training
    epochs: Number of training epochs

    Returns:
    dict: Results for all MTL models, keyed by lambda value. Each entry holds
        'model', 'history', 'task1_accuracy', 'task2_accuracy', 'params'
        and 'train_time' (wall-clock seconds spent in fit()).

    Raises:
    KeyError: If evaluate() does not report an accuracy metric for one of the
        two outputs.
    """
    mtl_results = {}

    # The target dicts do not depend on lambda, so build them once.
    train_targets = {'task1_output': train_y_1, 'task2_output': train_y_2}
    test_targets = {'task1_output': test_y_1, 'task2_output': test_y_2}

    for index, lambda_val in enumerate(lambda_values):
        print(f"\nTraining Multi-Task Model with λ = {lambda_val}")

        # Create and compile model
        mtl_model = create_multitask_model(train_X.shape[1:], lambda_val)

        # Only print summary for the first model (the architecture is the
        # same for every lambda). Checked by position so that a repeated
        # first value does not print the summary more than once.
        if index == 0:
            mtl_model.summary()

        # Train model
        start_time = time.time()
        history = mtl_model.fit(
            train_X,
            train_targets,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(test_X, test_targets),
            verbose=1
        )
        train_time = time.time() - start_time

        # Evaluate model. Results are requested as a dict and read by metric
        # name: the positional layout of evaluate()'s list output depends on
        # the Keras version and on which losses/metrics it reports, so fixed
        # indices such as [3] and [4] are not reliable.
        test_results = mtl_model.evaluate(
            test_X,
            test_targets,
            verbose=0,
            return_dict=True
        )

        # Extract test accuracies
        task1_accuracy = _output_accuracy(test_results, 'task1_output')
        task2_accuracy = _output_accuracy(test_results, 'task2_output')

        print(f"λ = {lambda_val}:")
        print(f"  Task 1 (Item) Test Accuracy: {task1_accuracy:.4f}")
        print(f"  Task 2 (Group) Test Accuracy: {task2_accuracy:.4f}")

        # Count parameters
        param_count = mtl_model.count_params()

        # Store results
        mtl_results[lambda_val] = {
            'model': mtl_model,
            'history': history,
            'task1_accuracy': task1_accuracy,
            'task2_accuracy': task2_accuracy,
            'params': param_count,
            'train_time': train_time
        }

    return mtl_results

def plot_mtl_results(single_task_results, mtl_results, lambda_values):
    """
    Plot and compare the results of single-task and multi-task models.

    Draws test accuracy of both MTL heads against lambda (with the
    single-task accuracies as dashed reference lines), then prints a
    comparison table, the parameter savings of one MTL model versus the two
    single-task models combined, and a short textual analysis.

    Parameters:
    single_task_results: Results from single-task models; a dict with 'task1'
        and 'task2' entries, each providing 'accuracy' and 'params'
    mtl_results: Results from multi-task models with different lambda values;
        a dict keyed by lambda, each entry providing 'task1_accuracy',
        'task2_accuracy' and 'params'
    lambda_values: Lambda values used for MTL models (every value must be a
        key of mtl_results)

    Returns:
    None

    Raises:
    ValueError: If lambda_values is empty.
    KeyError: If a value in lambda_values is missing from mtl_results.
    """
    if len(lambda_values) == 0:
        raise ValueError("lambda_values must contain at least one value")

    # Create a table for comparison
    task1_accuracies = [mtl_results[lam]['task1_accuracy'] for lam in lambda_values]
    task2_accuracies = [mtl_results[lam]['task2_accuracy'] for lam in lambda_values]

    # Add single task accuracies for reference
    single_task1_acc = single_task_results['task1']['accuracy']
    single_task2_acc = single_task_results['task2']['accuracy']
    single_task1_params = single_task_results['task1']['params']
    single_task2_params = single_task_results['task2']['params']

    # Plot the results
    plt.figure(figsize=(12, 6))

    plt.plot(lambda_values, task1_accuracies, 'o-', label='MTL Task 1 (Item)')
    plt.plot(lambda_values, task2_accuracies, 's-', label='MTL Task 2 (Group)')

    # Add horizontal lines for single task accuracies
    plt.axhline(y=single_task1_acc, color='r', linestyle='--',
                label=f'Single Task 1 (Item): {single_task1_acc:.4f}')
    plt.axhline(y=single_task2_acc, color='g', linestyle='--',
                label=f'Single Task 2 (Group): {single_task2_acc:.4f}')

    plt.xlabel('Lambda Value (λ)')
    plt.ylabel('Test Accuracy')
    plt.title('Multi-Task Learning Performance vs Lambda Value')
    plt.grid(True)
    plt.legend()
    plt.xticks(lambda_values)

    plt.tight_layout()
    plt.show()

    # Create a comparison table
    print("\nResults Comparison Table:")
    print("-" * 80)
    print(f"{'Model':<20} | {'Task 1 Accuracy':<20} | {'Task 2 Accuracy':<20} | {'Parameters':<15}")
    print("-" * 80)

    # Single task models
    print(f"{'Single Task 1':<20} | {single_task1_acc:<20.4f} | {'-':<20} | {single_task1_params:<15,}")
    print(f"{'Single Task 2':<20} | {'-':<20} | {single_task2_acc:<20.4f} | {single_task2_params:<15,}")

    # Total parameters for both single task models
    total_single_params = single_task1_params + single_task2_params
    print(f"{'Single Tasks Total':<20} | {'-':<20} | {'-':<20} | {total_single_params:<15,}")
    print("-" * 80)

    # MTL models
    for lam in lambda_values:
        model_name = f"MTL (λ={lam})"
        task1_acc = mtl_results[lam]['task1_accuracy']
        task2_acc = mtl_results[lam]['task2_accuracy']
        params = mtl_results[lam]['params']
        print(f"{model_name:<20} | {task1_acc:<20.4f} | {task2_acc:<20.4f} | {params:<15,}")

    print("-" * 80)

    # Calculate parameter savings.
    # Lambda only changes the loss weights, not the architecture, so any MTL
    # entry gives the parameter count. Keep using λ=0.5 when it was trained,
    # otherwise fall back to the first lambda of the sweep instead of failing
    # with a KeyError.
    reference_lambda = 0.5 if 0.5 in mtl_results else lambda_values[0]
    param_savings = total_single_params - mtl_results[reference_lambda]['params']
    # Guard the percentage against a zero parameter total.
    param_savings_percent = (
        (param_savings / total_single_params) * 100 if total_single_params else 0.0
    )
    print(f"Parameter savings with MTL: {param_savings:,} ({param_savings_percent:.2f}%)")

    # Analysis for λ=0 and λ=1
    print("\nAnalysis of Special Lambda Values:")
    print("λ=0: The model focuses entirely on Task 2 (Group Classification), ignoring Task 1")
    print("λ=1: The model focuses entirely on Task 1 (Item Classification), ignoring Task 2")

    # Overall analysis
    print("\nMulti-Task Learning Analysis:")

    def _average_accuracy(lam):
        """Mean of the two task accuracies for the MTL model trained with `lam`."""
        entry = mtl_results[lam]
        return (entry['task1_accuracy'] + entry['task2_accuracy']) / 2

    # Find best lambda value (first one wins on ties, as max() does)
    best_avg_lambda = max(lambda_values, key=_average_accuracy)
    best_avg_accuracy = _average_accuracy(best_avg_lambda)

    print(f"Best average performance at λ={best_avg_lambda} with average accuracy: {best_avg_accuracy:.4f}")

    # Compare MTL vs Single Task
    avg_single_acc = (single_task1_acc + single_task2_acc) / 2
    print(f"Average single task accuracy: {avg_single_acc:.4f}")

    if best_avg_accuracy > avg_single_acc:
        print("MTL outperforms the average of single task models!")
    else:
        print("Single task models outperform MTL on average.")

# Main execution to run the tasks: train the two single-task baselines, sweep
# lambda for the multi-task model, then compare everything in one plot/table.

# Load and prepare Fashion MNIST data.
# load_fashion_mnist_data is defined earlier in the notebook; it returns six
# values which are unpacked here as inputs plus two label sets per split
# (presumably *_y_1 = item labels for Task 1 and *_y_2 = group labels for
# Task 2 -- confirm against the loader).
train_X, train_y_1, train_y_2, test_X, test_y_1, test_y_2 = load_fashion_mnist_data()

# Task 6.1: Train individual CNN models.
# The result is a dict with 'task1' and 'task2' entries, each holding the
# model, its training history, test accuracy, parameter count and train time.
single_task_results = train_single_task_models(
    train_X, train_y_1, train_y_2,
    test_X, test_y_1, test_y_2,
    batch_size=10, epochs=5
)

# Task 6.2: Train MTL models with different lambda values.
# lambda weights the Task 1 loss and (1 - lambda) the Task 2 loss, so the
# endpoints 0 and 1 each train on only one of the two losses.
# Uses the same batch_size/epochs as the single-task run above.
lambda_values = [0, 0.25, 0.5, 0.75, 1.0]
mtl_results = train_multitask_models(
    train_X, train_y_1, train_y_2,
    test_X, test_y_1, test_y_2,
    lambda_values=lambda_values,
    batch_size=10, epochs=5
)

# Plot and analyze results (accuracy-vs-lambda figure, comparison table,
# parameter savings and a short printed analysis).
plot_mtl_results(single_task_results, mtl_results, lambda_values)