In [13]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
import joblib

In [2]:
sdir = r'D:\Fourth Year\First Term\Machine\Lectures\project\10000'
filepaths = []
labels = []
data = []

In [3]:
classlist = os.listdir(sdir)
class_index = 0
for klass in classlist:
    classpath = os.path.join(sdir, klass)
    flist = os.listdir(classpath)
    for f in flist:
        fpath = os.path.join(classpath, f)
        filepaths.append(fpath)
        labels.append(class_index)
        with Image.open(fpath) as img:
            img = img.convert('L')  # Convert to grayscale
            img_resized = img.resize((28, 28))  # Ensure size is 28x28
            img_array = np.array(img_resized).flatten()  # Flatten to 1D vector
            img_array = img_array / 255.0  # Normalize to [0, 1]
            data.append(img_array)  # Append the processed data
    class_index += 1

In [4]:
labels = np.array(labels).reshape(-1, 1)  
encoder = OneHotEncoder(sparse_output=False)  
one_hot_labels = encoder.fit_transform(labels) 

In [5]:
data_df = pd.DataFrame(data)  
data_df = pd.concat([data_df, pd.DataFrame(one_hot_labels)], axis=1) 

In [6]:
X = data_df.iloc[:, :-len(classlist)]  
y = data_df.iloc[:, -len(classlist):] 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [31]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train.idxmax(axis=1))
    predictions = model.predict(X_test)

    if isinstance(model, LinearRegression):
        mse = mean_squared_error(y_test.idxmax(axis=1), predictions)
        r2 = r2_score(y_test.idxmax(axis=1), predictions)
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"R-squared: {r2:.4f}")
    else:
        accuracy = accuracy_score(y_test.idxmax(axis=1), predictions)
        precision = precision_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        recall = recall_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        f1 = f1_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        conf_matrix = confusion_matrix(y_test.idxmax(axis=1), predictions)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")

    print('-#-' * 50)

In [23]:
linear_pipeline = Pipeline([
    ('linear_regression_ova', OneVsRestClassifier(LinearRegression()))
])



In [24]:
logistic_pipeline = Pipeline([
    ('logistic_step', LogisticRegression(multi_class='ovr', max_iter=1000))
])



In [25]:
naive_bayes_pipeline = Pipeline([
    ('naive_bayes_step', GaussianNB())
])

In [26]:
models = {
    "Linear Regression": linear_pipeline,
    "Logistic Regression": logistic_pipeline,
    "Naive Bayes": naive_bayes_pipeline
}

In [33]:
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs'],
    'multi_class': ['multinomial'],
    'max_iter': [200]
}

In [34]:
logistic_model = LogisticRegression()
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train.idxmax(axis=1))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
print("Best Parameters for Logistic Regression:", grid_search.best_params_)
best_logistic_model = grid_search.best_estimator_

Best Parameters for Logistic Regression: {'C': 0.1, 'max_iter': 200, 'multi_class': 'multinomial', 'solver': 'lbfgs'}


In [38]:
for model_name, pipeline in models.items():
    print(f"Evaluating {model_name}...")
    if model_name == "Logistic Regression":
        pipeline.set_params(logistic_step=best_logistic_model)

    evaluate_model(pipeline, X_train, y_train, X_test, y_test)

    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
    joblib.dump(pipeline, model_filename)
    print(f"Saved {model_name} model to {model_filename}")

Evaluating Linear Regression...
Accuracy: 0.6590
Precision: 0.6602
Recall: 0.6590
F1-Score: 0.6571
Confusion Matrix:
[[160   3   4  11  10   9   3   1   1   6]
 [  3 153  12   2  10   4   7   9   1   1]
 [  2   6 138  13   5   5   4   4   5  10]
 [  8   6   9 149   5   7   0   7   3   7]
 [  2  16   3   1 128  10  15   4  16  14]
 [ 10   8   3   7   5 110  14   9  10  10]
 [  8   6   3   1   8  19 159   2   4   1]
 [  3  15   7   9   5   4   2 116   7  36]
 [  5   8   6  23  17  17  14  11 101  10]
 [  4   4  10  10  10   6   3  20   4 104]]
-#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#-
Saved Linear Regression model to linear_regression.pkl
Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7575
Precision: 0.7577
Recall: 0.7575
F1-Score: 0.7569
Confusion Matrix:
[[168   1   4   4   9   9   3   1   2   7]
 [  1 165   7   3   7   3   2   8   5   1]
 [  1   5 156   6   3   8   1   6   3   3]
 [  4   2   6 161   1   3   2   9   5   8]
 [  4   5   5   1 152   3   3   5  18  13]
 [  9   6   4   6   2 131  11   5   9   3]
 [  3   6   1   0   3  15 181   0   2   0]
 [  3   9   8   6   3   1   1 142   4  27]
 [  6   3   5   8  17  16  12   9 130   6]
 [  7   3   5   5   7   2   0  12   5 129]]
-#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#-
Saved Logistic Regression model to logistic_regression.pkl
Evaluating Naive Bayes...
Accuracy: 0.4995
Precision: 0.5312
Recall: 0.4995
F1-Score: 0.5028
Confusion Matrix:
[[112  26   3  10  25  17   6   1   3   5]
 [  4 136   4  13   9   4   3  15  12   2]
 [  7  38  91  17   8   1   2   5  22   1]
 [ 10  23   7 135   6   1   0   8 

In [39]:
def cross_validate_model(model, X, y):
    accuracies, precisions, recalls, f1_scores = [], [], [], []
    conf_matrices = []

    for train_idx, test_idx in kf.split(X):
        X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
        y_train_cv, y_test_cv = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train_cv, y_train_cv.idxmax(axis=1))

        if isinstance(model, LinearRegression):
            predictions = np.round(model.predict(X_test_cv)).astype(int)
        else:
            predictions = model.predict(X_test_cv)
        predictions = np.clip(predictions, 0, y_train_cv.shape[1] - 1)

        accuracies.append(accuracy_score(y_test_cv.idxmax(axis=1), predictions))
        precisions.append(precision_score(y_test_cv.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        recalls.append(recall_score(y_test_cv.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        f1_scores.append(f1_score(y_test_cv.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        conf_matrices.append(confusion_matrix(y_test_cv.idxmax(axis=1), predictions))

    print(f"Mean Accuracy: {np.mean(accuracies):.4f}, Std: {np.std(accuracies):.4f}")
    print(f"Mean Precision: {np.mean(precisions):.4f}, Std: {np.std(precisions):.4f}")
    print(f"Mean Recall: {np.mean(recalls):.4f}, Std: {np.std(recalls):.4f}")
    print(f"Mean F1-Score: {np.mean(f1_scores):.4f}, Std: {np.std(f1_scores):.4f}")
    print("-*-*-" * 20)

In [40]:
print("Cross-validation Results:")
for model_name, pipeline in models.items():
    print(f"Cross-validating {model_name}...")
    cross_validate_model(pipeline, X, y)
    print(f"Completed Cross-validation for {model_name}.")

Cross-validation Results:
Cross-validating Linear Regression...
Mean Accuracy: 0.6577, Std: 0.0022
Mean Precision: 0.6585, Std: 0.0033
Mean Recall: 0.6577, Std: 0.0022
Mean F1-Score: 0.6556, Std: 0.0027
-*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*-
Completed Cross-validation for Linear Regression.
Cross-validating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy: 0.7609, Std: 0.0064
Mean Precision: 0.7616, Std: 0.0062
Mean Recall: 0.7609, Std: 0.0064
Mean F1-Score: 0.7604, Std: 0.0063
-*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*-
Completed Cross-validation for Logistic Regression.
Cross-validating Naive Bayes...
Mean Accuracy: 0.5085, Std: 0.0098
Mean Precision: 0.5434, Std: 0.0104
Mean Recall: 0.5085, Std: 0.0098
Mean F1-Score: 0.5113, Std: 0.0095
-*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*--*-*-
Completed Cross-validation for Naive Bayes.
