In [23]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [15]:
sdir = r"D:\Fourth Year\First Term\Machine\Lectures\project\10000"
filepaths = []
labels = []
data = []  # Reset data
filepaths = [] 
class_index = 0 
classlist = os.listdir(sdir)


In [16]:
for klass in classlist:
    classpath = os.path.join(sdir, klass)
    if not os.path.isdir(classpath):
        print(f"Skipping {klass}: Not a directory")
        continue

    flist = os.listdir(classpath)
    print(f"Processing class '{klass}' with {len(flist)} files")

    for f in flist:
        fpath = os.path.join(classpath, f)
        try:
            with Image.open(fpath) as img:
                img = img.convert('L')  # Convert to grayscale
                img_resized = img.resize((28, 28))  # Resize
                img_array = np.array(img_resized).flatten() / 255.0  # Normalize
                data.append(img_array)  # Add to data
                labels.append(class_index)  # Add label
                filepaths.append(fpath)  # Track processed files
        except Exception as e:
            print(f"Error processing file {fpath}: {e}")

    class_index += 1

Processing class '0' with 1000 files
Processing class '1' with 1000 files
Processing class '2' with 1000 files
Processing class '3' with 1000 files
Processing class '4' with 1000 files
Processing class '5' with 1000 files
Processing class '6' with 1000 files
Processing class '7' with 1000 files
Processing class '8' with 1000 files
Processing class '9' with 1000 files


In [17]:
print(f"Total processed images: {len(data)}")
print(f"Total labels: {len(labels)}")

Total processed images: 10000
Total labels: 10000


In [18]:
if len(data) != len(labels):
    raise ValueError("Mismatch between the number of images and labels!")

In [19]:
data_df = pd.DataFrame(data)
data_df["labels"] = labels
print(data_df.head())
print('DataFrame length:', len(data_df))

          0         1         2         3         4         5         6  \
0  0.627451  0.658824  0.654902  0.627451  0.635294  0.647059  0.639216   
1  0.549020  0.470588  0.243137  0.101961  0.211765  0.415686  0.533333   
2  0.686275  0.682353  0.674510  0.674510  0.674510  0.674510  0.682353   
3  0.741176  0.717647  0.741176  0.764706  0.725490  0.658824  0.650980   
4  0.435294  0.262745  0.113725  0.286275  0.443137  0.454902  0.454902   

          7         8         9  ...       775       776       777       778  \
0  0.635294  0.647059  0.654902  ...  0.647059  0.615686  0.607843  0.658824   
1  0.600000  0.709804  0.678431  ...  0.678431  0.690196  0.698039  0.682353   
2  0.682353  0.678431  0.678431  ...  0.486275  0.419608  0.517647  0.666667   
3  0.682353  0.662745  0.690196  ...  0.498039  0.549020  0.623529  0.709804   
4  0.450980  0.454902  0.443137  ...  0.458824  0.462745  0.466667  0.470588   

        779       780       781       782       783  labels  
0  0.6

In [20]:
balance = data_df['labels'].value_counts()
print("Class balance:\n", balance)

Class balance:
 labels
0    1000
1    1000
2    1000
3    1000
4    1000
5    1000
6    1000
7    1000
8    1000
9    1000
Name: count, dtype: int64


In [21]:
X = data_df.drop(columns=['labels'])
y = data_df['labels']

In [24]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [26]:
logistic_model = LogisticRegression(max_iter=500, solver='lbfgs', multi_class='multinomial')
naive_bayes_model = GaussianNB()
linear_model = LinearRegression()

In [27]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    if isinstance(model, LinearRegression):
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"R-squared: {r2:.4f}")
    else:
        accuracy = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
        recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
        f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
        conf_matrix = confusion_matrix(y_test, predictions)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")

    print('-' * 50)

In [28]:
print("\nLogistic Regression:")
evaluate_model(logistic_model, X_train, y_train, X_test, y_test)


Logistic Regression:
Accuracy: 0.6740
Precision: 0.6736
Recall: 0.6740
F1-Score: 0.6729
Confusion Matrix:
[[144   9   3   3   6  11   8   4   7   5]
 [  2 154   4   6   8   4   2  10   4   6]
 [  7  12 134  10   4   7   2   7   6  11]
 [  8   5  10 139   5   6   0  12   8   7]
 [ 12  10   3   2 135   5   7   4  13   9]
 [ 10   3   9   5   7 130  18   3  10   5]
 [  3   6   2   0   5  21 156   0   7   0]
 [  1  14   6   9  11   1   1 121   8  28]
 [ 10   8   8   8  20  12   6  14 101  13]
 [  9   3  10   8   9   2   0  21   4 134]]
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'sag'],
    'C': [0.1, 1, 10],
    'max_iter': [200, 500, 1000]
}

In [30]:
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("\nBest parameters for Logistic Regression:", grid_search.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Best parameters for Logistic Regression: {'C': 0.1, 'max_iter': 200, 'solver': 'sag'}




In [31]:
print("\nEvaluating the best Logistic Regression model:")
evaluate_model(best_model, X_train, y_train, X_test, y_test)


Evaluating the best Logistic Regression model:
Accuracy: 0.7275
Precision: 0.7271
Recall: 0.7275
F1-Score: 0.7262
Confusion Matrix:
[[154   9   1   2   4   6  10   2   5   7]
 [  3 161   4   6   2   3   1  11   7   2]
 [  4   7 141  14   4   8   2   8   4   8]
 [  8   5  13 141   6   8   0   6   5   8]
 [  7   6   4   4 145   5   7   4   9   9]
 [  9   3   4   2   4 150  17   4   6   1]
 [  1   8   1   0   2  20 163   0   5   0]
 [  1   9   9   2   5   1   1 143   7  22]
 [ 11   4   6   9  18  10   8  12 111  11]
 [  8   2   5   7  11   1   0  17   3 146]]
--------------------------------------------------


