In [27]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
import joblib

In [30]:
base_dir = r'D:\Fourth Year\First Term\Machine\Lectures\project\10000'
file_paths = []
class_labels = []
image_data = []

In [31]:
categories = os.listdir(base_dir)
class_id = 0
for category in categories:
    category_path = os.path.join(base_dir, category)
    image_files = os.listdir(category_path)
    for image_file in image_files:
        image_path = os.path.join(category_path, image_file)
        file_paths.append(image_path)
        class_labels.append(class_id)
        with Image.open(image_path) as img:
            img = img.convert('L')  
            resized_img = img.resize((28, 28))  
            img_array = np.array(resized_img).flatten() / 255.0  
            image_data.append(img_array)
    class_id += 1


In [32]:
class_labels = np.array(class_labels).reshape(-1, 1)
label_encoder = OneHotEncoder(sparse=False)
one_hot_encoded_labels = label_encoder.fit_transform(class_labels)



In [33]:
data_frame = pd.DataFrame(image_data)  
label_frame = pd.DataFrame(one_hot_encoded_labels) 
data_frame = pd.concat([data_frame, label_frame], axis=1)

In [34]:
X_features = data_frame.iloc[:, :-len(categories)]
y_labels = data_frame.iloc[:, -len(categories):]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=42)


In [36]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)


In [37]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train.idxmax(axis=1))
    predictions = model.predict(X_test)

    if isinstance(model, LinearRegression):
        mse = mean_squared_error(y_test.idxmax(axis=1), predictions)
        r2 = r2_score(y_test.idxmax(axis=1), predictions)
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"R-squared: {r2:.4f}")
    else:
        accuracy = accuracy_score(y_test.idxmax(axis=1), predictions)
        precision = precision_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        recall = recall_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        f1 = f1_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        conf_matrix = confusion_matrix(y_test.idxmax(axis=1), predictions)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")

    print('*_' * 50)

In [38]:
model_dict = {
    "Logistic Regression": LogisticRegression(max_iter=100, solver='lbfgs', multi_class='multinomial', penalty='l2'),
    "Naive Bayes": GaussianNB(),
    "Linear Regression": LinearRegression()
}

In [39]:
hyperparameter_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs'],
    'multi_class': ['multinomial'],
    'max_iter': [200]
}

In [18]:
grid_search = GridSearchCV(
        LogisticRegression(),
        log_reg_params,
        cv=5,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1
    )

In [40]:
logistic_reg = LogisticRegression()
grid_search = GridSearchCV(logistic_reg, hyperparameter_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train.idxmax(axis=1))

print("Best Parameters for Logistic Regression:", grid_search.best_params_)
optimized_logistic_model = grid_search.best_estimator_


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters for Logistic Regression: {'C': 0.1, 'max_iter': 200, 'multi_class': 'multinomial', 'solver': 'lbfgs'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
for model_name, model_instance in model_dict.items():
    print(f"Evaluating {model_name}...")
    current_model = optimized_logistic_model if model_name == "Logistic Regression" else model_instance

    evaluate_model(current_model, X_train, y_train, X_test, y_test)

    model_file = f"{model_name.replace(' ', '_').lower()}.pkl"
    joblib.dump(current_model, model_file)
    print(f"Saved {model_name} model to {model_file}")

Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7575
Precision: 0.7577
Recall: 0.7575
F1-Score: 0.7569
Confusion Matrix:
[[168   1   4   4   9   9   3   1   2   7]
 [  1 165   7   3   7   3   2   8   5   1]
 [  1   5 156   6   3   8   1   6   3   3]
 [  4   2   6 161   1   3   2   9   5   8]
 [  4   5   5   1 152   3   3   5  18  13]
 [  9   6   4   6   2 131  11   5   9   3]
 [  3   6   1   0   3  15 181   0   2   0]
 [  3   9   8   6   3   1   1 142   4  27]
 [  6   3   5   8  17  16  12   9 130   6]
 [  7   3   5   5   7   2   0  12   5 129]]
*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_
Saved Logistic Regression model to logistic_regression.pkl
Evaluating Naive Bayes...
Accuracy: 0.4995
Precision: 0.5312
Recall: 0.4995
F1-Score: 0.5028
Confusion Matrix:
[[112  26   3  10  25  17   6   1   3   5]
 [  4 136   4  13   9   4   3  15  12   2]
 [  7  38  91  17   8   1   2   5  22   1]
 [ 10  23   7 135   6   1   0   8   6   5]
 [ 27  33   9   5  84   2   5   9  28   7

In [42]:
def cross_validate_model(model, X, y):
    accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

    for train_idx, test_idx in k_folds.split(X):
        X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
        y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train_fold, y_train_fold.idxmax(axis=1))

        if isinstance(model, LinearRegression):
            predictions = np.round(model.predict(X_test_fold)).astype(int)
        else:
            predictions = model.predict(X_test_fold)

        predictions = np.clip(predictions, 0, y_train_fold.shape[1] - 1)
        accuracy_list.append(accuracy_score(y_test_fold.idxmax(axis=1), predictions))
        precision_list.append(precision_score(y_test_fold.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        recall_list.append(recall_score(y_test_fold.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        f1_list.append(f1_score(y_test_fold.idxmax(axis=1), predictions, average='weighted', zero_division=0))

    print(f"Mean Accuracy: {np.mean(accuracy_list):.4f}, Std: {np.std(accuracy_list):.4f}")
    print(f"Mean Precision: {np.mean(precision_list):.4f}, Std: {np.std(precision_list):.4f}")
    print(f"Mean Recall: {np.mean(recall_list):.4f}, Std: {np.std(recall_list):.4f}")
    print(f"Mean F1-Score: {np.mean(f1_list):.4f}, Std: {np.std(f1_list):.4f}")
    print("-*-" * 50)

In [43]:
print("Cross-validation Results:")
for model_name, model_instance in model_dict.items():
    print(f"Cross-validating {model_name}...")
    cross_validate_model(model_instance, X_features, y_labels)

Cross-validation Results:
Cross-validating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy: 0.7613, Std: 0.0057
Mean Precision: 0.7626, Std: 0.0051
Mean Recall: 0.7613, Std: 0.0057
Mean F1-Score: 0.7609, Std: 0.0056
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Cross-validating Naive Bayes...
Mean Accuracy: 0.5085, Std: 0.0098
Mean Precision: 0.5434, Std: 0.0104
Mean Recall: 0.5085, Std: 0.0098
Mean F1-Score: 0.5113, Std: 0.0095
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Cross-validating Linear Regression...
Mean Accuracy: 0.1358, Std: 0.0063
Mean Precision: 0.1911, Std: 0.0154
Mean Recall: 0.1358, Std: 0.0063
Mean F1-Score: 0.1253, Std: 0.0073
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
