In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB

In [17]:
DEBUG = True


In [18]:
IMG_DIR = r'D:\Fourth Year\First Term\Machine\Lectures\project\10000'
img_paths = []
img_labels = []
processed_data = []

In [35]:
print("Checking directory structure...")
if not os.path.exists(IMG_DIR):
    raise ValueError(f"Directory {IMG_DIR} does not exist!")

Checking directory structure...


In [36]:
class_folders = [f for f in os.listdir(IMG_DIR) if os.path.isdir(os.path.join(IMG_DIR, f))]
if not class_folders:
    raise ValueError(f"No subdirectories found in {IMG_DIR}")

print(f"Found {len(class_folders)} class folders: {class_folders}")

Found 10 class folders: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [37]:
for label_idx, folder in enumerate(class_folders):
    folder_path = os.path.join(IMG_DIR, folder)
    files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    print(f"\nProcessing class {folder} (label {label_idx})")
    print(f"Found {len(files)} images in {folder}")
    
    for img_file in files:
        try:
            full_path = os.path.join(folder_path, img_file)
            
            # Process image
            with Image.open(full_path) as img:
                gray_img = img.convert('L')
                small_img = gray_img.resize((28, 28))
                flat_img = np.array(small_img).flatten() / 255.0
                
                processed_data.append(flat_img)
                img_paths.append(full_path)
                img_labels.append(label_idx)
                
        except Exception as e:
            print(f"Error processing {img_file}: {str(e)}")
            continue


Processing class 0 (label 0)
Found 1000 images in 0

Processing class 1 (label 1)
Found 1000 images in 1

Processing class 2 (label 2)
Found 1000 images in 2

Processing class 3 (label 3)
Found 1000 images in 3

Processing class 4 (label 4)
Found 1000 images in 4

Processing class 5 (label 5)
Found 1000 images in 5

Processing class 6 (label 6)
Found 1000 images in 6

Processing class 7 (label 7)
Found 1000 images in 7

Processing class 8 (label 8)
Found 1000 images in 8

Processing class 9 (label 9)
Found 1000 images in 9


In [38]:
    class_count = sum(1 for label in img_labels if label == label_idx)
    print(f"Successfully processed {class_count} images for class {label_idx}")

Successfully processed 1000 images for class 9


In [39]:
X = np.array(processed_data)
y = np.array(img_labels)

In [40]:
print("\nData Overview:")
print(f"Total samples: {len(X)}")
print(f"Feature dimension: {X.shape[1]}")
print("\nClass distribution:")
unique_classes, class_counts = np.unique(y, return_counts=True)
for cls, count in zip(unique_classes, class_counts):
    print(f"Class {cls}: {count} samples")

if len(unique_classes) < 2:
    raise ValueError(f"Found only {len(unique_classes)} classes. Need at least 2 classes for classification!")


Data Overview:
Total samples: 11000
Feature dimension: 784

Class distribution:
Class 0: 1000 samples
Class 1: 2000 samples
Class 2: 1000 samples
Class 3: 1000 samples
Class 4: 1000 samples
Class 5: 1000 samples
Class 6: 1000 samples
Class 7: 1000 samples
Class 8: 1000 samples
Class 9: 1000 samples


In [41]:
image_df = pd.DataFrame(X)
image_df['target'] = y

In [42]:
features = image_df.drop(columns=['target'])
labels = image_df['target']

In [43]:
print("\nVerifying class distribution in labels:")
print(labels.value_counts())


Verifying class distribution in labels:
target
1    2000
0    1000
2    1000
3    1000
4    1000
5    1000
6    1000
7    1000
8    1000
9    1000
Name: count, dtype: int64


In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

In [45]:
train_classes = np.unique(y_train)
print("\nClasses in training data:", train_classes)
if len(train_classes) < 2:
    raise ValueError("Training data must have at least 2 classes!")


Classes in training data: [0 1 2 3 4 5 6 7 8 9]


In [46]:
def check_model_performance(model, train_X, train_y, test_X, test_y, model_name=""):
    print(f"\nTraining {model_name}...")
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    
    if isinstance(model, LinearRegression):
        mse = mean_squared_error(test_y, preds)
        r2 = r2_score(test_y, preds)
        print(f"MSE: {mse:.4f}")
        print(f"R² Score: {r2:.4f}")
    else:
        acc = accuracy_score(test_y, preds)
        prec = precision_score(test_y, preds, average='weighted', zero_division=0)
        rec = recall_score(test_y, preds, average='weighted', zero_division=0)
        f1 = f1_score(test_y, preds, average='weighted', zero_division=0)
        conf_mat = confusion_matrix(test_y, preds)
        
        print(f"Accuracy: {acc:.4f}")
        print(f"Precision: {prec:.4f}")
        print(f"Recall: {rec:.4f}")
        print(f"F1: {f1:.4f}")
        print("\nConfusion Matrix:")
        print(conf_mat)
    
    print("-" * 40)

In [47]:
print("\nInitializing models...")
model_zoo = {
    "Logistic Regression": LogisticRegression(max_iter=500, solver='lbfgs', multi_class='multinomial'),
    "Gaussian NB": GaussianNB(),
    "Linear Regression": LinearRegression()
}



Initializing models...


In [48]:
print("\nTuning Logistic Regression hyperparameters...")
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs'],
    'multi_class': ['ovr'],
    'max_iter': [100, 200]
}


Tuning Logistic Regression hyperparameters...


In [49]:
log_reg = LogisticRegression()
grid = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [50]:
print(f"\nBest LogReg params found: {grid.best_params_}")
best_log_reg = grid.best_estimator_


Best LogReg params found: {'C': 0.1, 'max_iter': 200, 'multi_class': 'ovr', 'solver': 'lbfgs'}


In [51]:
print("\nTesting all models...")
for name, model in model_zoo.items():
    if name == "Logistic Regression":
        check_model_performance(best_log_reg, X_train, y_train, X_test, y_test, "Tuned " + name)
    else:
        check_model_performance(model, X_train, y_train, X_test, y_test, name)


Testing all models...

Training Tuned Logistic Regression...
Accuracy: 0.6618
Precision: 0.6706
Recall: 0.6618
F1: 0.6612

Confusion Matrix:
[[159  17   1   3   7   3   6   0   2   2]
 [  6 269   4   8  11   6   0  17   6  73]
 [  0  21 153   9   2   4   2   4   3   2]
 [  3  15   6 149   4   2   2   7   8   4]
 [  1  35   4   1 141   3   4   4   7   0]
 [  8   9   3   7   8 125  17   6  14   3]
 [  3   2   5   1   4  12 171   0   2   0]
 [  3  54   5   4   4   2   1 117   6   4]
 [  2  33   3   5   9  17  15   4 111   1]
 [  7  99   5   7   9   3   0   7   2  61]]
----------------------------------------

Training Gaussian NB...
Accuracy: 0.4382
Precision: 0.4516
Recall: 0.4382
F1: 0.4401

Confusion Matrix:
[[115  22   7   9  13  13  12   1   3   5]
 [ 12 134  41  42   9   4   7  57  11  83]
 [  6  27  85  50   3   0   0   7  19   3]
 [ 12  26  18 109   8   3   1   8   8   7]
 [  8  43  15  12  71   4   3   8  27   9]
 [ 11  28  18   5   7  76  15  10  27   3]
 [ 10  16  13   7   3  