### 5. Cleaning & Preprocessing Fashion-MNIST

In [9]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

# Locations of the raw Fashion-MNIST CSVs and of the processed output arrays.
# The defaults are the original author's local paths; set the
# FASHION_MNIST_DATA_DIR / FASHION_MNIST_OUT_DIR environment variables to run
# the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'FASHION_MNIST_DATA_DIR',
    '/Users/limengfeiyang/CMOR438-final-project/data/fashion_mnist',
)
OUT_DIR = os.environ.get(
    'FASHION_MNIST_OUT_DIR',
    '/Users/limengfeiyang/CMOR438-final-project/data/processed/fashion_mnist1',
)

# Create the output directory up front (no error if it already exists).
os.makedirs(OUT_DIR, exist_ok=True)


In [10]:
# Read the train and test CSVs from DATA_DIR.
train_csv = os.path.join(DATA_DIR, 'train.csv')
test_csv = os.path.join(DATA_DIR, 'test.csv')
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Stack both splits (train rows first) into one frame with a fresh 0..n-1 index.
df_all = pd.concat((df_train, df_test), ignore_index=True)
print("Combined shape:", df_all.shape)


Combined shape: (70000, 785)


In [11]:
# Ensure there are no missing values
missing = df_all.isna().sum()
print("Any missing values?\n", missing[missing > 0])

# Check pixel range
pixels = df_all.iloc[:, 1:].values
print("Pixel range:", pixels.min(), "→", pixels.max())


Any missing values?
 Series([], dtype: int64)
Pixel range: 0 → 255


In [12]:
# Labels (for later visualization)
y = df_all['label'].values

# Feature matrix of raw pixels
X_raw = df_all.drop(columns=['label']).values
print("Raw X shape:", X_raw.shape, "y shape:", y.shape)


Raw X shape: (70000, 784) y shape: (70000,)


In [13]:
# Scale from [0, 255] to [0.0, 1.0]
X_norm = X_raw.astype('float32') / 255

print("After normalization: min =", X_norm.min(), "max =", X_norm.max())


After normalization: min = 0.0 max = 1.0


In [14]:
# Zero mean, unit variance
scaler = StandardScaler()
X = scaler.fit_transform(X_norm)

print("After standardization: mean ~", X.mean().round(5), " std ~", X.std().round(5))


After standardization: mean ~ -0.0  std ~ 1.0


In [15]:
# Full set
np.save(os.path.join(OUT_DIR, 'X_fashion.npy'), X)
np.save(os.path.join(OUT_DIR, 'y_fashion.npy'), y)

print("Saved processed arrays to:", OUT_DIR)


Saved processed arrays to: /Users/limengfeiyang/CMOR438-final-project/data/processed/fashion_mnist1
