In [20]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB

In [21]:
# Dataset root: one sub-directory per class, each holding that class's images.
# Set the DATASET_DIR environment variable to use a different copy of the data;
# the fallback is the location this notebook was originally run against.
sdir = os.environ.get(
    'DATASET_DIR',
    r'D:\Fourth Year\First Term\Machine\Lectures\project\10000',
)

# Accumulators filled by the image-loading cell.
filepaths = []
labels = []
data = []

# os.listdir returns entries in arbitrary order, so sort them to keep the
# class-name -> integer-label mapping stable between runs and machines.
# Non-directory entries (stray files in the root) are dropped because the
# loader lists the contents of every entry as if it were a class folder.
classlist = sorted(
    entry for entry in os.listdir(sdir)
    if os.path.isdir(os.path.join(sdir, entry))
)
class_index = 0

In [22]:
# Start from a clean slate so that re-running this cell does not append a
# second copy of every sample or continue numbering classes where the
# previous run stopped.
filepaths = []
labels = []
data = []
class_index = 0

for klass in classlist:
    classpath = os.path.join(sdir, klass)
    # Sorted so the row order (and therefore the seeded train/test split)
    # does not depend on the arbitrary order os.listdir returns.
    flist = sorted(os.listdir(classpath))

    for f in flist:
        fpath = os.path.join(classpath, f)
        filepaths.append(fpath)
        labels.append(class_index)

        # Turn the image into a flat, normalised feature vector.
        with Image.open(fpath) as img:
            img = img.convert('L')  # Convert to grayscale
            img_resized = img.resize((28, 28))  # Ensure size is 28x28
            img_array = np.array(img_resized).flatten()  # Flatten to 1D vector (784 values)
            img_array = img_array / 255.0  # Normalize to [0, 1]
            data.append(img_array)  # Append the processed data

    # Every image in this folder shares one label; move on to the next class id.
    class_index += 1

In [23]:
# One row per image: the flattened pixel columns followed by a trailing
# 'labels' column holding the integer class id.
data_df = pd.DataFrame(data).assign(labels=labels)

In [24]:
# Quick sanity check: show the first five rows, then the total row count.
print(data_df.head(5))
print('DataFrame length:', data_df.shape[0])

          0         1         2         3         4         5         6  \
0  0.627451  0.658824  0.654902  0.627451  0.635294  0.647059  0.639216   
1  0.549020  0.470588  0.243137  0.101961  0.211765  0.415686  0.533333   
2  0.686275  0.682353  0.674510  0.674510  0.674510  0.674510  0.682353   
3  0.741176  0.717647  0.741176  0.764706  0.725490  0.658824  0.650980   
4  0.435294  0.262745  0.113725  0.286275  0.443137  0.454902  0.454902   

          7         8         9  ...       775       776       777       778  \
0  0.635294  0.647059  0.654902  ...  0.647059  0.615686  0.607843  0.658824   
1  0.600000  0.709804  0.678431  ...  0.678431  0.690196  0.698039  0.682353   
2  0.682353  0.678431  0.678431  ...  0.486275  0.419608  0.517647  0.666667   
3  0.682353  0.662745  0.690196  ...  0.498039  0.549020  0.623529  0.709804   
4  0.450980  0.454902  0.443137  ...  0.458824  0.462745  0.466667  0.470588   

        779       780       781       782       783  labels  
0  0.6

In [25]:
# Count how many samples carry each class label to check the dataset is balanced.
balance = data_df['labels'].value_counts()
print(balance)

labels
0    1000
1    1000
2    1000
3    1000
4    1000
5    1000
6    1000
7    1000
8    1000
9    1000
Name: count, dtype: int64


In [26]:
# Split the frame into the target vector and the feature matrix
# (every column except 'labels').
y = data_df['labels']
X = data_df.drop(columns='labels')


In [27]:
# Hold out 20% of the rows for testing. The split is seeded (random_state=42)
# and stratified on y so it is driven by the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [28]:
# Seeded, shuffled 5-fold splitter.
# NOTE(review): `kf` is not referenced by any of the cells visible here (the
# grid search below passes cv=3) - confirm whether it is used further down.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [29]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    The estimator is (re)fitted in place on ``X_train``/``y_train`` and then
    used to predict ``X_test``. A ``LinearRegression`` instance is reported
    with mean squared error and R-squared; any other estimator is reported
    with accuracy, weighted precision/recall/F1 and the confusion matrix.
    A row of 50 asterisks is printed last as a separator.

    Parameters:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets used for scoring.

    Returns:
        None. All results are written to stdout.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if not isinstance(model, LinearRegression):
        # Classification report. Per-class scores are combined with the
        # 'weighted' average; zero_division=0 is passed to every score call.
        averaging = {'average': 'weighted', 'zero_division': 0}
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, **averaging)
        recall = recall_score(y_test, y_pred, **averaging)
        f1 = f1_score(y_test, y_pred, **averaging)
        conf_matrix = confusion_matrix(y_test, y_pred)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")
    else:
        # Regression report for the linear-regression baseline.
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"R-squared: {r2:.4f}")

    print('*' * 50)

In [37]:
# Candidate estimators keyed by the display name used when reporting results.
models = {
    # `multi_class` is deprecated in recent scikit-learn releases. With the
    # lbfgs solver and more than two classes the default already fits a
    # multinomial model, so the argument is dropped without changing the fit.
    "Logistic Regression": LogisticRegression(max_iter=500, solver='lbfgs'),
    "Naive Bayes": GaussianNB(),
    # Regression baseline; evaluate_model reports MSE / R-squared for it.
    "Linear Regression": LinearRegression()
}

In [38]:
# Hyper-parameter grid for the logistic-regression search:
# 3 values of C x 2 values of max_iter = 6 candidates.
param_grid = dict(
    C=[0.1, 1, 10],        # Regularization strength
    solver=['lbfgs'],
    multi_class=['ovr'],   # One-vs-rest strategy
    max_iter=[100, 200],
)

In [39]:
# Exhaustive search over param_grid, scored by accuracy with 3-fold
# cross-validation on the training split only; n_jobs=-1 uses all cores.
logistic_model = LogisticRegression()
grid_search = GridSearchCV(
    estimator=logistic_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [40]:
# Keep the winning estimator from the search and report its settings.
best_logistic_model = grid_search.best_estimator_
print("Best Parameters for Logistic Regression:", grid_search.best_params_)


Best Parameters for Logistic Regression: {'C': 0.1, 'max_iter': 200, 'multi_class': 'ovr', 'solver': 'lbfgs'}


In [41]:
# Score the tuned logistic regression on the held-out test split.
# evaluate_model fits the estimator on X_train again before predicting.
print("\nBest Logistic Regression Model:")
evaluate_model(best_logistic_model, X_train, y_train, X_test, y_test)


Best Logistic Regression Model:
Accuracy: 0.7620
Precision: 0.7619
Recall: 0.7620
F1-Score: 0.7610
Confusion Matrix:
[[162   8   0   2   7   2   7   2   4   6]
 [  5 163   5   5   2   3   1  10   6   0]
 [  4   5 150  10   6   3   1   9   3   9]
 [  8   3   7 155   9   3   0   4   7   4]
 [  5   6   3   1 154   3   8   7   6   7]
 [  7   5   3   5   2 153  11   5   8   1]
 [  0  10   1   1   1  11 172   1   3   0]
 [  1  10   9   2   6   1   3 140   5  23]
 [  9   5   4   8  13  15   7  11 123   5]
 [  7   3   5   9  10   0   0  11   3 152]]
**************************************************


In [42]:
# Report every candidate on the same train/test split. The logistic-regression
# entry of `models` is replaced by the tuned estimator from the grid search.
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    evaluate_model(
        best_logistic_model if model_name == "Logistic Regression" else model,
        X_train, y_train, X_test, y_test,
    )

Evaluating Logistic Regression...
Accuracy: 0.7620
Precision: 0.7619
Recall: 0.7620
F1-Score: 0.7610
Confusion Matrix:
[[162   8   0   2   7   2   7   2   4   6]
 [  5 163   5   5   2   3   1  10   6   0]
 [  4   5 150  10   6   3   1   9   3   9]
 [  8   3   7 155   9   3   0   4   7   4]
 [  5   6   3   1 154   3   8   7   6   7]
 [  7   5   3   5   2 153  11   5   8   1]
 [  0  10   1   1   1  11 172   1   3   0]
 [  1  10   9   2   6   1   3 140   5  23]
 [  9   5   4   8  13  15   7  11 123   5]
 [  7   3   5   9  10   0   0  11   3 152]]
**************************************************
Evaluating Naive Bayes...
Accuracy: 0.5115
Precision: 0.5433
Recall: 0.5115
F1-Score: 0.5144
Confusion Matrix:
[[118  26   3   8  15   7  11   2   2   8]
 [  2 133   2  19   5   5   7  20   4   3]
 [  4  40  70  42   6   1   1  11  16   9]
 [ 11  25  11 122   7   2   0   5   6  11]
 [ 18  28   8   3  86   1  10   7  23  16]
 [ 12  45   9   3   6  89  14   5  12   5]
 [ 14  38   2   2   3  13 117 

labels
0    1000
Name: count, dtype: int64
