In [None]:
import os
import numpy as np
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from PIL import Image
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)

# Parameters
datasetLocation = './dataset'   # root folder; one sub-folder per class name is read below
num_samples_per_class = 1000    # number of images drawn at random from every class folder
image_size = (28, 28)           # (width, height) handed to PIL's resize
threshold = 127                 # grayscale cut-off: pixels strictly above it become white
random_seed = 42                # single seed shared by sampling, shuffling and splitting


def _load_binary_image(img_path, size, cutoff):
    """Load one image file as a flat vector of 0.0 / 1.0 pixel values.

    The image is converted to grayscale (PIL mode ``'L'``), resized to
    ``size`` with Lanczos resampling, binarized (pixels strictly greater
    than ``cutoff`` become 255, all others 0), divided by 255 and flattened.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.
    size : tuple of int
        Target ``(width, height)`` passed to ``Image.resize``.
    cutoff : int
        Binarization threshold applied to the grayscale value.

    Returns
    -------
    numpy.ndarray
        1-D float array whose entries are 0.0 or 1.0.
    """
    with Image.open(img_path) as img:
        gray = img.convert('L')
        resized = gray.resize(size, Image.LANCZOS)
        # Explicit conditional instead of relying on `bool and int` short-circuiting.
        binary = resized.point(lambda p: 255 if p > cutoff else 0)
        pixels = np.array(binary, dtype=np.uint8)
    # Normalization: 0/255 -> 0.0/1.0, then flatten to a feature vector.
    return (pixels / 255).flatten()


classes = [str(i) for i in range(10)]

# A dedicated, seeded generator makes the per-class sample identical on every
# run; it is re-created here so that re-running this cell is idempotent.
rng = random.Random(random_seed)

image_vectors = []
image_labels = []

for cls in classes:
    class_dir = os.path.join(datasetLocation, cls)
    # os.listdir returns entries in arbitrary order; sorting keeps the seeded
    # sample stable across machines and file systems.
    all_images = sorted(os.listdir(class_dir))

    if len(all_images) < num_samples_per_class:
        raise ValueError(f"Not enough images in class {cls}")

    sampled_images = rng.sample(all_images, num_samples_per_class)

    for img_name in sampled_images:
        img_path = os.path.join(class_dir, img_name)
        image_vectors.append(_load_binary_image(img_path, image_size, threshold))
        image_labels.append(int(cls))

# One row per image, one label per row.
data = np.array(image_vectors)
labels = np.array(image_labels)

data, labels = shuffle(data, labels, random_state=random_seed)

# (80% train, 20% test), stratified so both splits keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=random_seed, stratify=labels)

print("Training data shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Testing data shape:", X_test.shape)
print("Testing labels shape:", y_test.shape)

In [None]:
# Display images
def show_image(img_array, label, shape=None):
    """Render one flattened image in grayscale with its label as the title.

    Parameters
    ----------
    img_array : array-like with a ``reshape`` method
        Flattened pixel vector, e.g. one row of ``X_train``.
    label : object
        Value shown in the figure title after ``"Label: "``.
    shape : tuple of int, optional
        ``(rows, cols)`` to reshape ``img_array`` into. When omitted, the
        notebook-level ``image_size`` is used, reversed because PIL sizes are
        ``(width, height)`` while array shapes are ``(height, width)``.
    """
    if shape is None:
        # Follow the loading configuration instead of hard-coding 28x28.
        shape = tuple(image_size[::-1])
    plt.imshow(img_array.reshape(shape), cmap='gray')
    plt.title(f"Label: {label}")
    plt.axis('off')
    plt.show()

# Preview the first five training images, one figure per sample
preview_indices = range(5)
for i in preview_indices:
    show_image(img_array=X_train[i], label=y_train[i])