In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [11]:
DEBUG = True


In [12]:
IMG_DIR = r'D:\Fourth Year\First Term\Machine\Lectures\project\10000'


In [13]:
def preprocess_image(img):
    """Enhanced image preprocessing"""
    # Convert to grayscale
    gray_img = img.convert('L')
    
    # Resize
    small_img = gray_img.resize((28, 28))
    
    # Convert to numpy array and normalize
    img_array = np.array(small_img).flatten() / 255.0
    
    # Apply contrast enhancement
    img_array = np.clip((img_array - img_array.mean()) * 1.5 + 0.5, 0, 1)
    
    return img_array


In [14]:
img_paths = []
img_labels = []
processed_data = []

In [15]:
print("Checking directory structure...")
if not os.path.exists(IMG_DIR):
    raise ValueError(f"Directory {IMG_DIR} does not exist!")

class_folders = [f for f in os.listdir(IMG_DIR) if os.path.isdir(os.path.join(IMG_DIR, f))]
if not class_folders:
    raise ValueError(f"No subdirectories found in {IMG_DIR}")

print(f"Found {len(class_folders)} class folders: {class_folders}")

Checking directory structure...
Found 10 class folders: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [16]:
for label_idx, folder in enumerate(class_folders):
    folder_path = os.path.join(IMG_DIR, folder)
    files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    print(f"\nProcessing class {folder} (label {label_idx})")
    print(f"Found {len(files)} images in {folder}")
    
    for img_file in files:
        try:
            full_path = os.path.join(folder_path, img_file)
            with Image.open(full_path) as img:
                processed_img = preprocess_image(img)
                processed_data.append(processed_img)
                img_paths.append(full_path)
                img_labels.append(label_idx)
                
        except Exception as e:
            print(f"Error processing {img_file}: {str(e)}")
            continue
    
    class_count = sum(1 for label in img_labels if label == label_idx)
    print(f"Successfully processed {class_count} images for class {label_idx}")


Processing class 0 (label 0)
Found 1000 images in 0
Successfully processed 1000 images for class 0

Processing class 1 (label 1)
Found 1000 images in 1
Successfully processed 1000 images for class 1

Processing class 2 (label 2)
Found 1000 images in 2
Successfully processed 1000 images for class 2

Processing class 3 (label 3)
Found 1000 images in 3
Successfully processed 1000 images for class 3

Processing class 4 (label 4)
Found 1000 images in 4
Successfully processed 1000 images for class 4

Processing class 5 (label 5)
Found 1000 images in 5
Successfully processed 1000 images for class 5

Processing class 6 (label 6)
Found 1000 images in 6
Successfully processed 1000 images for class 6

Processing class 7 (label 7)
Found 1000 images in 7
Successfully processed 1000 images for class 7

Processing class 8 (label 8)
Found 1000 images in 8
Successfully processed 1000 images for class 8

Processing class 9 (label 9)
Found 1000 images in 9
Successfully processed 1000 images for class 9


In [17]:
X = np.array(processed_data)
y = np.array(img_labels)

In [18]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [19]:
image_df = pd.DataFrame(X_scaled)
image_df['target'] = y

In [20]:
features = image_df.drop(columns=['target'])
labels = image_df['target']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

In [22]:
def check_model_performance(model, train_X, train_y, test_X, test_y, model_name=""):
    print(f"\nTraining {model_name}...")
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    
    if isinstance(model, LinearRegression):
        mse = mean_squared_error(test_y, preds)
        r2 = r2_score(test_y, preds)
        print(f"MSE: {mse:.4f}")
        print(f"R² Score: {r2:.4f}")
    else:
        acc = accuracy_score(test_y, preds)
        prec = precision_score(test_y, preds, average='weighted', zero_division=0)
        rec = recall_score(test_y, preds, average='weighted', zero_division=0)
        f1 = f1_score(test_y, preds, average='weighted', zero_division=0)
        conf_mat = confusion_matrix(test_y, preds)
        
        print(f"Accuracy: {acc:.4f}")
        print(f"Precision: {prec:.4f}")
        print(f"Recall: {rec:.4f}")
        print(f"F1: {f1:.4f}")
        if hasattr(model, 'penalty'):
            print(f"Regularization type: {model.penalty}")
            print(f"Regularization strength (C): {model.C}")
        print("\nConfusion Matrix:")
        print(conf_mat)
    
    print("-" * 40)

In [23]:
model_zoo = {
    "Logistic Regression": LogisticRegression(
        penalty='l2',  # L2 regularization
        C=0.1,        # Stronger regularization
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=1000
    ),
    "Linear Regression": LinearRegression(),
    "Gaussian NB": GaussianNB()
}


In [24]:
print("\nTuning Logistic Regression hyperparameters...")
param_dist = {
    'C': [0.001, 0.01, 0.1, 1.0, 10.0],       # Regularization strength
    'penalty': ['l1', 'l2'],                   # Regularization type
    'solver': ['liblinear', 'saga'],           # Solvers that support both l1 and l2
    'multi_class': ['ovr'],
    'max_iter': [1000],
    'class_weight': [None, 'balanced']
}


Tuning Logistic Regression hyperparameters...


In [5]:
random_search = RandomizedSearchCV(
    LogisticRegression(),
    param_distributions=param_dist,
    n_iter=10,  # Use 10 random combinations
    cv=5,       # Use 3-fold cross-validation for faster computation
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

In [25]:
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [20]:
print(f"\nBest LogReg params found: {grid.best_params_}")
best_log_reg = grid.best_estimator_


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'