# Model Exploration
The objective of this project is to evaluate three approaches to accurately analyzing real-world data: a naive approach, a non-deep-learning (classical machine learning) approach, and a neural network-based deep learning approach.

In [1]:
# Imports
import os
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

In [2]:
# Locations of the processed dataset splits (train / validation / test)
train_dir, val_dir, test_dir = (
    Path("data/processed/train"),
    Path("data/processed/val"),
    Path("data/processed/test"),
)

### Naive Approach
Predict the majority class

In [None]:
def count_images_per_class(directory, classes=("0", "1"), pattern="*.png"):
    '''Count the number of images per class.

    Args:
        directory: ``pathlib.Path`` of a dataset split containing one
            sub-folder per class label.
        classes: Names of the class sub-folders to inspect. Defaults to the
            binary labels ``"0"`` and ``"1"``.
        pattern: Glob pattern selecting the image files inside each class
            folder. Defaults to ``"*.png"``.

    Returns:
        dict mapping each class name to the number of files matching
        ``pattern`` in its folder, in the order given by ``classes``.
        A folder with no matching files is reported as 0.
    '''
    counts = {}
    for cls in classes:
        cls_path = directory / cls  # Build path to class folder
        # Consume the glob lazily: only the count is needed, so the full
        # list of paths is never held in memory.
        # (The original version of this counting line was written with GPT-5.)
        counts[cls] = sum(1 for _ in cls_path.glob(pattern))
    return counts

# Report the per-class image counts for each dataset split, in split order
for split_label, split_dir in (
    ("Train set:", train_dir),
    ("Validation set:", val_dir),
    ("Test set:", test_dir),
):
    print(split_label, count_images_per_class(split_dir))

Train set: {'0': 155946, '1': 62302}
Validation set: {'0': 17561, '1': 7029}
Test set: {'0': 25231, '1': 9455}


In [None]:
# Tally the training images per class, then keep the label with the largest tally
train_counts = count_images_per_class(train_dir)
majority_class = max(train_counts, key=lambda label: train_counts[label])
print(f"Majority class in training set: {majority_class}")

Majority class in training set: 0


In [None]:
def naive_predict(directory, predicted_class, classes=("0", "1"), pattern="*.png"):
    """
    Predicts one fixed class (e.g. the majority class) for all images in a
    given directory.

    Args:
        directory: ``pathlib.Path`` of a dataset split containing one
            sub-folder per class label.
        predicted_class: Label assigned to every image; anything accepted by
            ``int()`` (e.g. ``"0"`` or ``0``).
        classes: Names of the class sub-folders to read. Each name must be
            convertible with ``int()`` because it is used as the true label.
            Defaults to ``("0", "1")``.
        pattern: Glob pattern selecting the image files inside each class
            folder. Defaults to ``"*.png"``.

    Returns:
        Tuple ``(y_true, y_pred)`` of equal-length lists of ints. ``y_true``
        holds the folder-derived labels grouped in the order of ``classes``;
        ``y_pred`` repeats ``predicted_class`` once per image.
    """
    # The prediction never changes, so convert it a single time up front.
    constant_label = int(predicted_class)

    y_true = []
    for cls in classes:
        cls_path = directory / cls
        # Only the number of files matters, so count lazily instead of
        # building a list of every matching path.
        n_files = sum(1 for _ in cls_path.glob(pattern))
        y_true.extend([int(cls)] * n_files)  # Add true labels to y_true

    # One identical prediction per image
    y_pred = [constant_label] * len(y_true)
    return y_true, y_pred

In [None]:
def _evaluate_naive_split(split_name, directory, predicted_class):
    """Score the constant-class baseline on one dataset split and print a report.

    Args:
        split_name: Human-readable split name used in the printed header
            (e.g. ``"Validation"`` prints ``"Validation Set:"``).
        directory: Split directory handed to ``naive_predict``.
        predicted_class: Label predicted for every image in the split.

    Returns:
        Tuple ``(y_true, y_pred, accuracy, f1, roc_auc)`` so callers can keep
        the labels and the unrounded metric values.
    """
    y_true, y_pred = naive_predict(directory, predicted_class)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Hard class labels are passed as the scores here because the naive
    # baseline produces no probabilities.
    roc_auc = roc_auc_score(y_true, y_pred)

    print(f"{split_name} Set:")
    print("Accuracy:", round(accuracy, 4))
    print("F1-score:", round(f1, 4))
    print("ROC-AUC:", round(roc_auc, 4))
    # zero_division=0 reports 0 (without a warning) for a class that is never predicted
    print(classification_report(y_true, y_pred, zero_division=0))
    return y_true, y_pred, accuracy, f1, roc_auc


# Validation set evaluation
(
    y_val_true,
    y_val_pred,
    val_accuracy,
    val_f1_score,
    val_roc_auc,
) = _evaluate_naive_split("Validation", val_dir, majority_class)

# Test set evaluation
(
    y_test_true,
    y_test_pred,
    test_accuracy,
    test_f1_score,
    test_roc_auc,
) = _evaluate_naive_split("Test", test_dir, majority_class)

Validation Set:
Accuracy: 0.7142
F1-score: 0.0
ROC-AUC: 0.5
              precision    recall  f1-score   support

           0       0.71      1.00      0.83     17561
           1       0.00      0.00      0.00      7029

    accuracy                           0.71     24590
   macro avg       0.36      0.50      0.42     24590
weighted avg       0.51      0.71      0.60     24590

Test Set:
Accuracy: 0.7274
F1-score: 0.0
ROC-AUC: 0.5
              precision    recall  f1-score   support

           0       0.73      1.00      0.84     25231
           1       0.00      0.00      0.00      9455

    accuracy                           0.73     34686
   macro avg       0.36      0.50      0.42     34686
weighted avg       0.53      0.73      0.61     34686



### Classical Machine Learning Approach

### Neural Network-based Deep Learning Approach