In [None]:
import pickle
import os
import pandas as pd
import numpy as np


In [None]:
# Locations of the pickled train / test splits (relative to the working directory).
train_file = "./kaggle/input/fii-nn-2025-homework-2/extended_mnist_train.pkl"
test_file = "./kaggle/input/fii-nn-2025-homework-2/extended_mnist_test.pkl"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load these datasets from a source you trust.
with open(train_file, mode="rb") as train_handle:
    train = pickle.load(train_handle)

with open(test_file, mode="rb") as test_handle:
    test = pickle.load(test_handle)

In [None]:
# Walk the training set once, flattening each image and keeping its label,
# then split the (pixels, label) pairs into two parallel lists.
flattened_train_pairs = [(picture.flatten(), digit) for picture, digit in train]
train_data = [pixels for pixels, _ in flattened_train_pairs]
train_labels = [digit for _, digit in flattened_train_pairs]


In [None]:
# Flatten every test image; the second element of each pair is not used here.
test_data = [picture.flatten() for picture, _ in test]


In [None]:
# convert to numpy arrays (one row per sample)
X_train = np.array(train_data)
y_train = np.array(train_labels)
X_test = np.array(test_data)

# scale pixel intensities by 255 (assumes 0-255 grayscale images — confirm against the dataset)
X_train = X_train / 255.0
X_test = X_test / 255.0

# standardization (z-score normalization); statistics come from the training set only
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
# Features that are constant over the training set have std == 0. Dividing by a
# tiny epsilon instead would multiply any non-zero value of that feature in the
# test set by ~1e9 and let it dominate the logits, so such features are only
# centred and keep a unit scale.
std = np.where(std > 0, std, 1.0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

def one_hot(y, num_classes):
    """Encode integer class labels as one-hot rows.

    For label ``n`` the corresponding row is all zeros except position ``n``,
    which is set to 1.

    Args:
        y: 1-D array-like of integer class labels (a NumPy array or a plain
            sequence such as a list).
        num_classes: number of columns of the encoded matrix.

    Returns:
        A float array of shape ``(len(y), num_classes)``.

    Raises:
        IndexError: if a label does not fit in ``num_classes`` columns or the
            labels are not of an integer type.
    """
    labels = np.asarray(y)
    num_samples = labels.shape[0]
    one_hot_y = np.zeros((num_samples, num_classes))
    # Nothing to mark for an empty label vector (also avoids indexing with an
    # empty non-integer array).
    if num_samples:
        # row = sample, column = label; one vectorised assignment instead of a Python loop
        one_hot_y[np.arange(num_samples), labels] = 1
    return one_hot_y

class Perceptron:
    """Single-layer softmax classifier trained with full-batch gradient steps.

    Attributes:
        W: weight matrix of shape ``(n_inputs, n_outputs)``.
        b: bias vector of shape ``(n_outputs,)``.
        lr: learning rate applied to every update.
    """

    def __init__(self, n_inputs, n_outputs, learning_rate=0.1):
        """Create the model with small random weights and biases.

        Args:
            n_inputs: number of features per sample.
            n_outputs: number of classes (columns of ``W``).
            learning_rate: step size used by ``fit``.
        """
        # weights initialized with small random values
        self.W = np.random.randn(n_inputs, n_outputs) * 0.01
        # bias initialized with small random values
        self.b = np.random.randn(n_outputs) * 0.01
        # learning rate
        self.lr = learning_rate

    def _softmax(self, z):
        """Row-wise softmax of the logits ``z`` (2-D, one row per sample)."""
        # subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits
        z_stable = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z_stable)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def _cross_entropy(self, probabilities, y_one_hot):
        """Mean cross-entropy between predicted probabilities and one-hot targets.

        Probabilities are clipped away from zero before the logarithm: an
        entry that underflowed to exactly 0 would otherwise give
        ``0 * log(0) = NaN`` even when the prediction is correct.
        """
        clipped = np.clip(probabilities, 1e-12, 1.0)
        return -np.sum(y_one_hot * np.log(clipped)) / probabilities.shape[0]

    def forward(self, X):
        """Return class probabilities for ``X``.

        Args:
            X: array of shape ``(num_samples, n_inputs)``.

        Returns:
            Array of shape ``(num_samples, n_outputs)`` whose rows sum to 1.
        """
        # z = XW + b  (@ is matrix multiplication)
        z = X @ self.W + self.b
        # apply softmax activation
        return self._softmax(z)

    def fit(self, X, y, epochs=150):
        """Train on the whole dataset for ``epochs`` full-batch updates.

        Every 10th epoch the loss and accuracy of the predictions made at the
        start of that epoch (before its update) are printed.

        Args:
            X: array of shape ``(num_samples, n_inputs)``.
            y: integer class labels, one per row of ``X``.
            epochs: number of passes over the dataset.

        Raises:
            ValueError: if ``X`` contains no samples.
        """
        # number of samples
        num_samples = X.shape[0]
        if num_samples == 0:
            # dividing the gradients by zero would silently fill W and b with NaN
            raise ValueError("fit() requires at least one training sample")
        # The targets must have as many columns as the model has outputs.
        # Counting the distinct labels in y instead breaks as soon as one
        # class is missing from the training labels.
        num_classes = self.W.shape[1]
        y_one_hot = one_hot(y, num_classes)
        print("TRAINING: START!")
        # pass through the dataset EPOCHS times
        for epoch in range(epochs):
            # forward pass, predicted probabilities
            y_predicted_probabilities = self.forward(X)
            # y_one_hot is the expected output (Target)
            # y_predicted_probabilities is the predicted output (y)
            error_term = y_one_hot - y_predicted_probabilities  # Target - y
            grad_W = X.T @ error_term  # Transpose(X) @ (Target - y)
            grad_b = np.sum(error_term, axis=0)  # sum of errors (Target - y)

            # learning rate is the fraction of the "correction" applied to W and b
            self.W += self.lr * (grad_W / num_samples)  # W = W + lr * gradient_W
            self.b += self.lr * (grad_b / num_samples)  # b = b + lr * gradient_b
            if (epoch + 1) % 10 == 0:
                # cross-entropy loss
                loss = self._cross_entropy(y_predicted_probabilities, y_one_hot)
                # calculate accuracy: compare predicted classes to true classes
                y_pred_classes = np.argmax(y_predicted_probabilities, axis=1)
                y_true_classes = np.argmax(y_one_hot, axis=1)
                acc = np.mean(y_pred_classes == y_true_classes)
                print(f"EPOCH {epoch + 1}/{epochs} WITH LOSS: {loss:.5f}, ACCURACY: {acc:.5f}")
        print("TRAINING: COMPLETE!")

    def predict(self, X):
        """Return the most probable class index for every row of ``X``."""
        # predicted probabilities: one row per sample, one column per class
        y_probabilities = self.forward(X)
        # np.argmax gives the index of the maximum value, i.e. the predicted label
        return np.argmax(y_probabilities, axis=1)


In [None]:
# number of features per sample, taken from the data itself
# (784 for 28x28 images flattened)
n_inputs = X_train.shape[1]
n_outputs = 10 # digits 0-9
learning_rate = 0.25
epochs = 250
seed = 42

# Perceptron draws its initial weights from NumPy's global random generator;
# seeding it makes a Restart & Run All reproduce the same model.
np.random.seed(seed)
model = Perceptron(n_inputs=n_inputs, n_outputs=n_outputs, learning_rate=learning_rate)
# data, labels, no. epochs
model.fit(X_train, y_train, epochs=epochs)

In [None]:
# Predicted class index for every standardized test sample
# (predict() takes the argmax of the model's forward-pass probabilities).
predictions = model.predict(X_test)

In [None]:
# Submission table: a running sample index next to the predicted class.
sample_ids = list(range(len(predictions)))
predictions_csv = dict(ID=sample_ids, target=predictions)

# Written without the DataFrame index so the file holds only the two columns.
df = pd.DataFrame(data=predictions_csv)
df.to_csv("submission.csv", index=False)