In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# Read the Fashion MNIST training set from its CSV export
df = pd.read_csv("/kaggle/input/fashion-mnist-train-csv/fashion-mnist_train.csv")

# Manual stratification: partition the rows by label so every class is
# split on its own and keeps the same 80/20 proportion.
grouped = df.groupby("label")
split_options = dict(
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=None,  # stratification is done by hand via the groupby above
)

# One (train, test) pair per class, in groupby iteration order
per_class_splits = [
    train_test_split(group, **split_options) for _, group in grouped
]
train_list = [pair[0] for pair in per_class_splits]
test_list = [pair[1] for pair in per_class_splits]

# Stitch the per-class pieces back together, then shuffle the rows so the
# classes are interleaved, and renumber the index from 0.
train_df = pd.concat(train_list)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat(test_list)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Plot the first training sample of each class on a 2 x 5 grid
examples = train_df.groupby("label").first().reset_index()
plt.figure(figsize=(10, 4))
for i in range(10):
    ax = plt.subplot(2, 5, i + 1)
    # Everything except the label column is pixel data; rebuild the 28x28 image
    pixels = examples.loc[i].drop("label").values
    img = pixels.astype(np.uint8).reshape(28, 28)
    ax.imshow(img, cmap="gray")
    ax.set_title(f"Label: {examples.loc[i, 'label']}")
    ax.axis("off")
plt.tight_layout()
plt.show()

# Extract feature matrix and labels
X = train_df.drop("label", axis=1).values.astype(np.float32)
y = train_df["label"].values

# Center the data by subtracting the per-feature mean
x_mean = np.mean(X, axis=0)
X = X - x_mean

# Sample covariance matrix (features x features); symmetric by construction
covar_x = (1 / (X.shape[0] - 1)) * (X.T @ X)

# Eigen decomposition.
# eigh is the solver for symmetric matrices: its eigenvalues and
# eigenvectors are real, whereas the general-purpose eig may return complex
# output and makes no promise about ordering.
eigen_values, eigen_vector = np.linalg.eigh(covar_x)

# Sort eigenvalues and eigenvectors in descending order of explained variance
sort_idx = np.argsort(eigen_values)[::-1]
sort_vect = eigen_vector[:, sort_idx]
eigen_values = eigen_values[sort_idx]

# Select top k principal components.
# The columns must come from the *sorted* eigenvectors; slicing the unsorted
# matrix would pick arbitrary directions rather than the leading ones.
k = 64
W = sort_vect[:, :k]

# Project data onto reduced space
X_red = X @ W

# k-Nearest Neighbors prediction (leave-one-out over an evaluation subset)
k_neighbors = 5

# Evaluate on the first 1000 samples, or on all of them if fewer are
# available; neighbours are searched within this same subset.
n_eval = min(1000, X_red.shape[0])
X_eval = X_red[:n_eval]
y_eval = y[:n_eval]
y_pred = []

for i in range(n_eval):
    # Euclidean distance from sample i to every sample of the subset,
    # computed in one vectorised call instead of a Python inner loop.
    all_dist = np.linalg.norm(X_eval - X_eval[i], axis=1)
    # A sample must not vote for itself
    all_dist[i] = np.inf

    # Find k nearest neighbors
    min_x = np.argsort(all_dist)[:k_neighbors]

    # Majority vote among neighbour labels; argmax breaks ties in favour of
    # the smallest label value.
    y_all = y_eval[min_x]
    counts = np.bincount(y_all)
    y_pred.append(np.argmax(counts))

# Compute classification accuracy (percentage of correct predictions).
# The list is converted explicitly so the comparison is element-wise by
# construction rather than through reflected operator dispatch.
acc = np.mean(np.asarray(y_pred) == y_eval) * 100
print("Accuracy:", acc)
