In [1]:
import pickle
import numpy as np
import pandas as pd
# Specify the full path to mnist.pkl
file_path = './data/mnist.pkl'

# Load the MNIST dataset from the saved file
with open(file_path, 'rb') as f:
    mnist_data = pickle.load(f, encoding='latin1')

In [8]:
def normalize_data(inp):
    """
    Normalizes image pixels here to have 0 mean and unit variance.

    args:
        inp : N X d 2D array where N is the number of examples and d is the number of dimensions

    returns:
        normalized inp: N X d 2D array

    """

def normalize_data(inp):
    mean = np.mean(inp, axis=0)
    std = np.std(inp, axis=0)
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        normalized_inp = np.where(std == 0, 0, (inp - mean) / std)
    return normalized_inp


def one_hot_encoding(labels, num_classes=10):
    """
    Encodes labels using one hot encoding.

    args:
        labels : N dimensional 1D array where N is the number of examples
        num_classes: Number of distinct labels that we have (10 for MNIST)

    returns:
        oneHot : N X num_classes 2D array
    """
    return np.eye(num_classes)[labels.flatten()]

def load_data(data):
    """
    Loads, splits our dataset - MNIST into train, val and test sets and normalizes them

    args:
        path: Path to MNIST dataset
    returns:
        train_normalized_images, train_one_hot_labels, val_normalized_images, val_one_hot_labels,  test_normalized_images, test_one_hot_labels

    """
 

    train_images, train_labels, test_images, test_labels = data


    # Reformat the images and labels
    train_images, test_images = train_images.reshape(train_images.shape[0], -1), test_images.reshape(test_images.shape[0], -1)
    train_labels, test_labels = np.expand_dims(train_labels, axis=1), np.expand_dims(test_labels, axis=1)
    print('Reformatting done.')
    print(f'Shape:  train {train_images.shape}, test {test_images.shape}')
    print(f'Shape:  train {train_labels.shape}, test {test_labels.shape}')

    # Create 80-20 train-validation split
    train_images, train_labels, val_images, val_labels = createTrainValSplit(train_images, train_labels)
    print('Splitting train and validation done.')

    # Preprocess data
    train_normalized_images = normalize_data(train_images)
    train_one_hot_labels = one_hot_encoding(train_labels, num_classes=10)  # (n, 10)

    val_normalized_images = normalize_data(val_images)
    val_one_hot_labels = one_hot_encoding(val_labels, num_classes=10)  # (n, 10)

    test_normalized_images = normalize_data(test_images)
    test_one_hot_labels = one_hot_encoding(test_labels, num_classes=10)  # (n, 10)

    return train_normalized_images, train_one_hot_labels, val_normalized_images, val_one_hot_labels, test_normalized_images, test_one_hot_labels

def createTrainValSplit(x_train,y_train):

    """
    Creates the train-validation split (80-20 split for train-val). Please shuffle the data before creating the train-val split.
    """
    assert len(x_train) == len(y_train)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train_shuffled = x_train[indices]
    y_train_shuffled = y_train[indices]
    split_index = int(0.8 * len(x_train))

    x_train_split = x_train_shuffled[:split_index]
    y_train_split = y_train_shuffled[:split_index]
    x_val_split = x_train_shuffled[split_index:]
    y_val_split = y_train_shuffled[split_index:]

    return x_train_split, y_train_split, x_val_split, y_val_split

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Ensure the preprocessing functions and load_data are defined in your notebook

# Load and preprocess the data
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data(mnist_data)

# One-hot encoding is not necessary for Decision Tree classifier with scikit-learn
# Scikit-learn's Decision Tree can handle multi-class labels without one-hot encoding
# We use the labels directly
y_train = np.argmax(y_train, axis=1)
y_valid = np.argmax(y_valid, axis=1)
y_test = np.argmax(y_test, axis=1)

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)

# Fit the classifier to the training data
clf.fit(x_train, y_train)

# Make predictions on the validation data
y_valid_pred = clf.predict(x_valid)

# Calculate accuracy on validation data
valid_accuracy = accuracy_score(y_valid, y_valid_pred)

# Print the classification report for precision, recall, f1-score on validation data
valid_report = classification_report(y_valid, y_valid_pred)

# Print the validation results
print("Validation Accuracy:", valid_accuracy)
print("Validation Classification Report:")
print(valid_report)

# After tuning on validation data, evaluate on test data
y_test_pred = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

# Print the test results
print("Test Accuracy:", test_accuracy)
print("Test Classification Report:")
print(test_report)




Reformatting done.
Shape:  train (60000, 784), test (10000, 784)
Shape:  train (60000, 1), test (10000, 1)
Splitting train and validation done.
Validation Accuracy: 0.66925
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.91      0.83      1193
           1       0.80      0.24      0.37      1346
           2       0.63      0.79      0.70      1189
           3       0.74      0.83      0.79      1189
           4       0.50      0.86      0.64      1136
           5       0.71      0.76      0.74      1086
           6       0.81      0.87      0.84      1248
           7       0.53      0.36      0.43      1288
           8       0.65      0.70      0.68      1163
           9       0.66      0.44      0.53      1162

    accuracy                           0.67     12000
   macro avg       0.68      0.68      0.65     12000
weighted avg       0.68      0.67      0.65     12000

Test Accuracy: 0.6123
Test Classif

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf_clf = RandomForestClassifier(random_state=0)

# Fit the classifier to the training data
rf_clf.fit(x_train, y_train)

# Make predictions on the validation data
y_valid_pred_rf = rf_clf.predict(x_valid)

# Calculate accuracy on validation data
valid_accuracy_rf = accuracy_score(y_valid, y_valid_pred_rf)

# Print the classification report for precision, recall, f1-score on validation data
valid_report_rf = classification_report(y_valid, y_valid_pred_rf)

# Print the validation results for Random Forest
print("Random Forest Validation Accuracy:", valid_accuracy_rf)
print("Random Forest Validation Classification Report:")
print(valid_report_rf)

# After tuning on validation data, evaluate on test data
y_test_pred_rf = rf_clf.predict(x_test)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
test_report_rf = classification_report(y_test, y_test_pred_rf)

# Print the test results for Random Forest
print("Random Forest Test Accuracy:", test_accuracy_rf)
print("Random Forest Test Classification Report:")
print(test_report_rf)


Random Forest Validation Accuracy: 0.9641666666666666
Random Forest Validation Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1193
           1       0.99      0.99      0.99      1346
           2       0.95      0.96      0.96      1189
           3       0.95      0.96      0.95      1189
           4       0.95      0.97      0.96      1136
           5       0.96      0.95      0.96      1086
           6       0.98      0.98      0.98      1248
           7       0.98      0.94      0.96      1288
           8       0.95      0.95      0.95      1163
           9       0.94      0.94      0.94      1162

    accuracy                           0.96     12000
   macro avg       0.96      0.96      0.96     12000
weighted avg       0.96      0.96      0.96     12000

Random Forest Test Accuracy: 0.9549
Random Forest Test Classification Report:
              precision    recall  f1-score   support

      

In [14]:
def expected_label_distribution_accuracy(train_labels):
    """
    Calculate the expected accuracy based on the label frequency distribution in the training set.
    This is essentially the accuracy we would expect if we always predicted the most frequent label.
    """
    _, counts = np.unique(train_labels, return_counts=True)
    expected_accuracy = np.max(counts) / np.sum(counts)
    return expected_accuracy

# Calculate the expected accuracy based on training label distribution
expected_accuracy = expected_label_distribution_accuracy(y_train)

# Print the expected accuracy
print(f"Expected Accuracy based on Label Frequency Distribution: {expected_accuracy}")


Expected Accuracy based on Label Frequency Distribution: 0.11241666666666666


In [28]:
from sklearn.neural_network import MLPClassifier

# Initialize a simple Multi-Layer Perceptron (neural network) classifier
# We'll start with a single hidden layer with 50 neurons, which is a good starting point.
# 'relu' activation function and 'adam' optimizer are commonly used and good defaults.
mlp_clf = MLPClassifier(hidden_layer_sizes=(1,), activation='relu', solver='adam', random_state=0)

# Fit the classifier to the training data
mlp_clf.fit(x_train, y_train)

# Make predictions on the validation data
y_valid_pred_mlp = mlp_clf.predict(x_valid)

# Calculate accuracy on validation data
valid_accuracy_mlp = accuracy_score(y_valid, y_valid_pred_mlp)

# Print the classification report for precision, recall, f1-score on validation data
valid_report_mlp = classification_report(y_valid, y_valid_pred_mlp)

# Print the validation results for the neural network
print("Neural Network Validation Accuracy:", valid_accuracy_mlp)
print("Neural Network Validation Classification Report:")
print(valid_report_mlp)

# After tuning on validation data, evaluate on test data
y_test_pred_mlp = mlp_clf.predict(x_test)
test_accuracy_mlp = accuracy_score(y_test, y_test_pred_mlp)
test_report_mlp = classification_report(y_test, y_test_pred_mlp)

# Print the test results for the neural network
print("Neural Network Test Accuracy:", test_accuracy_mlp)
print("Neural Network Test Classification Report:")
print(test_report_mlp)


Neural Network Validation Accuracy: 0.4136666666666667
Neural Network Validation Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.08      0.13      1193
           1       0.44      0.89      0.59      1346
           2       0.22      0.14      0.17      1189
           3       0.30      0.45      0.36      1189
           4       0.44      0.60      0.50      1136
           5       0.00      0.00      0.00      1086
           6       0.38      0.89      0.54      1248
           7       0.71      0.62      0.66      1288
           8       0.00      0.00      0.00      1163
           9       0.46      0.32      0.38      1162

    accuracy                           0.41     12000
   macro avg       0.33      0.40      0.33     12000
weighted avg       0.34      0.41      0.34     12000

Neural Network Test Accuracy: 0.4041
Neural Network Test Classification Report:
              precision    recall  f1-score   support

  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
from sklearn.neural_network import MLPClassifier

# Initialize a simple Multi-Layer Perceptron (neural network) classifier
# We'll start with a single hidden layer with 50 neurons, which is a good starting point.
# 'relu' activation function and 'adam' optimizer are commonly used and good defaults.
mlp_clf = MLPClassifier(hidden_layer_sizes=(512,), activation='relu', solver='adam', random_state=0)

# Fit the classifier to the training data
mlp_clf.fit(x_train, y_train)

# Make predictions on the validation data
y_valid_pred_mlp = mlp_clf.predict(x_valid)

# Calculate accuracy on validation data
valid_accuracy_mlp = accuracy_score(y_valid, y_valid_pred_mlp)

# Print the classification report for precision, recall, f1-score on validation data
valid_report_mlp = classification_report(y_valid, y_valid_pred_mlp)

# Print the validation results for the neural network
print("Neural Network Validation Accuracy:", valid_accuracy_mlp)
print("Neural Network Validation Classification Report:")
print(valid_report_mlp)

# After tuning on validation data, evaluate on test data
y_test_pred_mlp = mlp_clf.predict(x_test)
test_accuracy_mlp = accuracy_score(y_test, y_test_pred_mlp)
test_report_mlp = classification_report(y_test, y_test_pred_mlp)

# Print the test results for the neural network
print("Neural Network Test Accuracy:", test_accuracy_mlp)
print("Neural Network Test Classification Report:")
print(test_report_mlp)



Neural Network Validation Accuracy: 0.9764166666666667
Neural Network Validation Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1193
           1       0.99      0.99      0.99      1346
           2       0.97      0.97      0.97      1189
           3       0.97      0.97      0.97      1189
           4       0.97      0.98      0.98      1136
           5       0.97      0.97      0.97      1086
           6       0.98      0.98      0.98      1248
           7       0.98      0.97      0.98      1288
           8       0.97      0.97      0.97      1163
           9       0.97      0.97      0.97      1162

    accuracy                           0.98     12000
   macro avg       0.98      0.98      0.98     12000
weighted avg       0.98      0.98      0.98     12000

Neural Network Test Accuracy: 0.9772
Neural Network Test Classification Report:
              precision    recall  f1-score   support

  