In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
DEBUG = True


In [3]:
IMG_DIR = r'D:\Fourth Year\First Term\Machine\Lectures\project\10000'


In [4]:
def preprocess_image(img):
    """Enhanced image preprocessing"""
    # Convert to grayscale
    gray_img = img.convert('L')
    
    # Resize
    small_img = gray_img.resize((28, 28))
    
    # Convert to numpy array and normalize
    img_array = np.array(small_img).flatten() / 255.0
    
    # Apply contrast enhancement
    img_array = np.clip((img_array - img_array.mean()) * 1.5 + 0.5, 0, 1)
    
    return img_array

In [5]:
img_paths = []
img_labels = []
processed_data = []

In [7]:
print("Checking directory structure...")
if not os.path.exists(IMG_DIR):
    raise ValueError(f"Directory {IMG_DIR} does not exist!")

class_folders = [f for f in os.listdir(IMG_DIR) if os.path.isdir(os.path.join(IMG_DIR, f))]
if not class_folders:
    raise ValueError(f"No subdirectories found in {IMG_DIR}")

print(f"Found {len(class_folders)} class folders: {class_folders}")

Checking directory structure...
Found 10 class folders: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [8]:
for label_idx, folder in enumerate(class_folders):
    folder_path = os.path.join(IMG_DIR, folder)
    files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    print(f"\nProcessing class {folder} (label {label_idx})")
    print(f"Found {len(files)} images in {folder}")



Processing class 0 (label 0)
Found 1000 images in 0

Processing class 1 (label 1)
Found 1000 images in 1

Processing class 2 (label 2)
Found 1000 images in 2

Processing class 3 (label 3)
Found 1000 images in 3

Processing class 4 (label 4)
Found 1000 images in 4

Processing class 5 (label 5)
Found 1000 images in 5

Processing class 6 (label 6)
Found 1000 images in 6

Processing class 7 (label 7)
Found 1000 images in 7

Processing class 8 (label 8)
Found 1000 images in 8

Processing class 9 (label 9)
Found 1000 images in 9


In [9]:
 
    for img_file in files:
        try:
            full_path = os.path.join(folder_path, img_file)
            with Image.open(full_path) as img:
                processed_img = preprocess_image(img)
                processed_data.append(processed_img)
                img_paths.append(full_path)
                img_labels.append(label_idx)
                
        except Exception as e:
            print(f"Error processing {img_file}: {str(e)}")
            continue

In [11]:
    class_count = sum(1 for label in img_labels if label == label_idx)
    print(f"Successfully processed {class_count} images for class {label_idx}")

Successfully processed 1000 images for class 9


In [12]:
X = np.array(processed_data)
y = np.array(img_labels)


In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
image_df = pd.DataFrame(X_scaled)
image_df['target'] = y

In [15]:
features = image_df.drop(columns=['target'])
labels = image_df['target']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)


In [17]:
def check_model_performance(model, train_X, train_y, test_X, test_y, model_name=""):
    print(f"\nTraining {model_name}...")
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    
    acc = accuracy_score(test_y, preds)
    prec = precision_score(test_y, preds, average='weighted', zero_division=0)
    rec = recall_score(test_y, preds, average='weighted', zero_division=0)
    f1 = f1_score(test_y, preds, average='weighted', zero_division=0)
    conf_mat = confusion_matrix(test_y, preds)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(conf_mat)
    print("-" * 40)
    
    return acc

In [18]:
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        multi_class='multinomial',
        solver='lbfgs',
        C=1.0,
        penalty='l2'  # Adding L2 regularization penalty
    ),
    "Linear Regression": LinearRegression(),
    "Gaussian NB": GaussianNB()
}

In [19]:
print("\nPerforming enhanced grid search for Logistic Regression...")
log_param_grid = {
    'C': [0.1, 1.0, 10.0],  # Reduced options for speed
    'solver': ['lbfgs'],     # Limited to one solver to avoid long grid search time
    'multi_class': ['multinomial'],  # Limited to multinomial to speed up the grid search
    'max_iter': [1000],
    'class_weight': [None],  # Removed balanced class weights to make it quicker
    'penalty': ['l2']  # Keeping only L2 penalty for simplicity
}


Performing enhanced grid search for Logistic Regression...


In [24]:
log_reg = LogisticRegression()
grid = GridSearchCV(
    log_reg,
    log_param_grid,
    cv=5,  
    scoring='accuracy',
    verbose=1,
    n_jobs=-1  # Use all CPU cores
)

In [25]:
grid.fit(X_train, y_train)


Fitting 5 folds for each of 3 candidates, totalling 15 fits


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Anacoda\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Anacoda\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anacoda\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1252, in fit
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 9
