In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder  # Import OneHotEncoder
import joblib  # Import for saving models

In [2]:
sdir = r'D:\Fourth Year\First Term\Machine\Lectures\project\10000'
filepaths = []
labels = []
data = []

In [3]:
classlist = os.listdir(sdir)
class_index = 0
for klass in classlist:
    classpath = os.path.join(sdir, klass)
    flist = os.listdir(classpath)
    for f in flist:
        fpath = os.path.join(classpath, f)
        filepaths.append(fpath)
        labels.append(class_index)
        with Image.open(fpath) as img:
            img = img.convert('L')  # Convert to grayscale
            img_resized = img.resize((28, 28))  # Ensure size is 28x28
            img_array = np.array(img_resized).flatten()  # Flatten to 1D vector
            img_array = img_array / 255.0  # Normalize to [0, 1]
            data.append(img_array)  # Append the processed data
    class_index += 1

In [4]:
labels = np.array(labels).reshape(-1, 1)  # Convert labels to a column vector
encoder = OneHotEncoder(sparse=False)  # Initialize the encoder
one_hot_labels = encoder.fit_transform(labels)  # Apply one-hot encoding



In [5]:
data_df = pd.DataFrame(data)  # Image data as 784 columns
data_df = pd.concat([data_df, pd.DataFrame(one_hot_labels)], axis=1) 

In [6]:
X = data_df.iloc[:, :-len(classlist)]  # Features (exclude one-hot encoded labels)
y = data_df.iloc[:, -len(classlist):] 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [9]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train.idxmax(axis=1))  # Use index of the max value in one-hot encoded labels
    predictions = model.predict(X_test)

    if isinstance(model, LinearRegression):
        mse = mean_squared_error(y_test.idxmax(axis=1), predictions)
        r2 = r2_score(y_test.idxmax(axis=1), predictions)
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"R-squared: {r2:.4f}")
    else:
        accuracy = accuracy_score(y_test.idxmax(axis=1), predictions)
        precision = precision_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        recall = recall_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        f1 = f1_score(y_test.idxmax(axis=1), predictions, average='weighted', zero_division=0)
        conf_matrix = confusion_matrix(y_test.idxmax(axis=1), predictions)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")

    print('-' * 50)

In [10]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=100, solver='lbfgs', multi_class='multinomial', penalty='l2'),
    "Naive Bayes": GaussianNB(),
    "Linear Regression": LinearRegression()
}

In [11]:
param_grid = {
    'C': [0.1, 1, 10],  # Reduced regularization range for efficiency
    'solver': ['lbfgs'],
    'multi_class': ['multinomial'],  # Multi-class classification setup
    'max_iter': [200],  # Limited to 200 iterations for efficiency
}

In [12]:
logistic_model = LogisticRegression()
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train.idxmax(axis=1))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
print("Best Parameters for Logistic Regression:", grid_search.best_params_)
best_logistic_model = grid_search.best_estimator_

Best Parameters for Logistic Regression: {'C': 0.1, 'max_iter': 200, 'multi_class': 'multinomial', 'solver': 'lbfgs'}


In [14]:
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    if model_name == "Logistic Regression":
        current_model = best_logistic_model
    else:
        current_model = model

Evaluating Logistic Regression...
Evaluating Naive Bayes...
Evaluating Linear Regression...


In [19]:
    evaluate_model(current_model, X_train, y_train, X_test, y_test)


Mean Squared Error: 6.1759
R-squared: 0.2434
--------------------------------------------------


In [20]:
    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
    joblib.dump(current_model, model_filename)
    print(f"Saved {model_name} model to {model_filename}")

Saved Logistic Regression model to logistic_regression.pkl


In [36]:
def cross_validate_model(model, X, y):
    accuracies, precisions, recalls, f1_scores = [], [], [], []
    conf_matrices = []
    
    for train_idx, test_idx in kf.split(X):
        X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
        y_train_cv, y_test_cv = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train_cv, y_train_cv.idxmax(axis=1))

        if isinstance(model, LinearRegression):
            # Linear regression outputs continuous values; round to nearest integer
            predictions = np.round(model.predict(X_test_cv)).astype(int)
        else:
            predictions = model.predict(X_test_cv)
        predictions = np.clip(predictions, 0, y_train_cv.shape[1] - 1)
        accuracies.append(accuracy_score(y_test_cv.idxmax(axis=1), predictions))
        precisions.append(precision_score(y_test_cv.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        recalls.append(recall_score(y_test_cv.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        f1_scores.append(f1_score(y_test_cv.idxmax(axis=1), predictions, average='weighted', zero_division=0))
        conf_matrices.append(confusion_matrix(y_test_cv.idxmax(axis=1), predictions))
        print(f"Mean Accuracy: {np.mean(accuracies):.4f}, Std: {np.std(accuracies):.4f}")
        print(f"Mean Precision: {np.mean(precisions):.4f}, Std: {np.std(precisions):.4f}")
        print(f"Mean Recall: {np.mean(recalls):.4f}, Std: {np.std(recalls):.4f}")
        print(f"Mean F1-Score: {np.mean(f1_scores):.4f}, Std: {np.std(f1_scores):.4f}")
        print("-" * 50)

In [37]:
print("Cross-validation Results:")
for model_name, model in models.items():
    print(f"Cross-validating {model_name}...")
    cross_validate_model(model, X, y)

Cross-validation Results:
Cross-validating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean Accuracy: 0.7660, Std: 0.0000
Mean Precision: 0.7680, Std: 0.0000
Mean Recall: 0.7660, Std: 0.0000
Mean F1-Score: 0.7656, Std: 0.0000
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean Accuracy: 0.7595, Std: 0.0065
Mean Precision: 0.7614, Std: 0.0066
Mean Recall: 0.7595, Std: 0.0065
Mean F1-Score: 0.7591, Std: 0.0065
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean Accuracy: 0.7583, Std: 0.0056
Mean Precision: 0.7604, Std: 0.0055
Mean Recall: 0.7583, Std: 0.0056
Mean F1-Score: 0.7581, Std: 0.0055
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean Accuracy: 0.7598, Std: 0.0054
Mean Precision: 0.7616, Std: 0.0053
Mean Recall: 0.7598, Std: 0.0054
Mean F1-Score: 0.7595, Std: 0.0054
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean Accuracy: 0.7613, Std: 0.0057
Mean Precision: 0.7626, Std: 0.0051
Mean Recall: 0.7613, Std: 0.0057
Mean F1-Score: 0.7609, Std: 0.0056
--------------------------------------------------
Cross-validating Naive Bayes...
Mean Accuracy: 0.4995, Std: 0.0000
Mean Precision: 0.5312, Std: 0.0000
Mean Recall: 0.4995, Std: 0.0000
Mean F1-Score: 0.5028, Std: 0.0000
--------------------------------------------------
Mean Accuracy: 0.5045, Std: 0.0050
Mean Precision: 0.5322, Std: 0.0011
Mean Recall: 0.5045, Std: 0.0050
Mean F1-Score: 0.5058, Std: 0.0030
--------------------------------------------------
Mean Accuracy: 0.5037, Std: 0.0042
Mean Precision: 0.5387, Std: 0.0092
Mean Recall: 0.5037, Std: 0.0042
Mean F1-Score: 0.5066, Std: 0.0027
--------------------------------------------------
Mean Accuracy: 0.5095, Std: 0.0108
Mean Precision: 0.5436, Std: 0.0117
Mean Recall: 0.5095, Std: 0.0108
Mean F1-Score: 0.5124, Std: 0.0103
--------------------------------------------------
Mean Accuracy: 0.5