# New Section

In [8]:
import os
import cv2  # For image processing
import numpy as np
import zipfile
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA  # Dimensionality reduction for image data

# --- Dataset configuration -------------------------------------------------

# Side length pair handed to cv2.resize for every loaded image.
size = (128, 128)

# Absolute locations of the two class folders on the author's machine.
stop_dir = r"H:\Fall 25\cse445\project again\GIT-445-Group-08-ML-Project-main\Images"
nonstop_dir = r"H:\Fall 25\cse445\project again\GIT-445-Group-08-ML-Project-main\images without street sign"

# Legacy layout: a single `Images` folder under the current working directory.
single_folder_fallback = os.path.join(os.getcwd(), 'Images')

# Accumulators filled by load_dir_as_label: one flattened image per entry in
# `images`, with the matching class in `labels` (1 = stop sign, 0 = non-stop sign).
images, labels = [], []

def load_dir_as_label(directory, label):
    """Load every image in *directory* into the global dataset under *label*.

    Files whose names end in ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive)
    are read with OpenCV, converted to grayscale, resized to the module-level
    ``size``, divided by 255.0 and flattened. Each resulting vector is appended
    to the module-level ``images`` list and ``label`` is appended to ``labels``
    in lockstep. Sub-folders are not scanned.

    Args:
        directory: Path of the folder to scan.
        label: Class label recorded for every image loaded from the folder.

    Returns:
        The number of images successfully loaded. 0 is returned when the path
        does not exist or cannot be listed.
    """
    if not os.path.exists(directory):
        return 0

    try:
        # os.listdir yields entries in arbitrary order; sorting keeps the
        # dataset order (and therefore any seeded split) reproducible.
        filenames = sorted(os.listdir(directory))
    except PermissionError:
        print(f"PermissionError: cannot access {directory}")
        return 0
    except OSError as e:
        # Covers e.g. a path that exists but is not a directory.
        print(f"Error loading {directory}: {e}")
        return 0

    count = 0
    for fname in filenames:
        if not fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        p = os.path.join(directory, fname)
        try:
            img = cv2.imread(p)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Warning: Failed to read {p}")
                continue
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            gray = cv2.resize(gray, size)
            sample = (gray / 255.0).flatten()
        except Exception as e:
            # Best-effort loading: report the bad file and keep going instead
            # of abandoning the rest of the folder.
            print(f"Error loading {p}: {e}")
            continue
        # Append only after all processing succeeded so images/labels stay aligned.
        images.append(sample)
        labels.append(label)
        count += 1
    return count

# Load the two configured class folders first (1 = stop sign, 0 = non-stop sign).
n_stop = load_dir_as_label(stop_dir, 1)
n_nonstop = load_dir_as_label(nonstop_dir, 0)

# If the stop-sign folder yielded nothing, fall back to the original
# single-folder layout, where every image is labeled as class 1.
if n_stop == 0 and os.path.exists(single_folder_fallback):
    # Report the folder that was actually tried rather than a hard-coded name.
    print(f"No stop-sign images loaded from `{stop_dir}`; falling back to `{single_folder_fallback}` labeling images as class 1.")
    # Keep n_stop in sync with what was really loaded.
    n_stop = load_dir_as_label(single_folder_fallback, 1)

# Summary of loaded data
import collections
label_counts = collections.Counter(labels)
print("Loaded images per label:", dict(label_counts))

# Convert to numpy arrays: X holds one flattened image per row, y the labels.
X = np.array(images)
y = np.array(labels)

# Train and evaluate only when at least one image was loaded.
if len(X) == 0:
    print("Error: No images were loaded. Please check the image folder paths and image files.")
else:
    # Stratify only when both classes are present; a stratified split is
    # meaningless with a single class.
    unique_labels = np.unique(y)
    if len(unique_labels) > 1:
        stratify_y = y
        print("Both classes detected — using stratified split.")
    else:
        stratify_y = None
        print("Only one class detected — performing regular split (no stratify).")

    # Split the data into training and test sets. stratify=None is
    # train_test_split's default, so a single call covers both cases.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=stratify_y, random_state=42
    )

    # Dimensionality reduction with PCA.
    # n_components may be at most min(n_samples, n_features) of the training
    # set, so cap the preferred 38 components at that bound.
    max_components = min(X_train.shape[0], X_train.shape[1])
    n_components = min(38, max_components)

    print(f"Training set shape: {X_train.shape}")
    print(f"Using {n_components} PCA components")

    # Seed PCA as well: its solver may be the randomized SVD, which would
    # otherwise make results differ between runs despite the seeded split
    # and classifier.
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train)
    # Project the test set with the basis learned from the training set only.
    X_test_pca = pca.transform(X_test)

    # Train a Random Forest classifier on the PCA features
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train_pca, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test_pca)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

Loaded images per label: {1: 50, 0: 50}
Both classes detected — using stratified split.
Training set shape: (80, 16384)
Using 38 PCA components
Accuracy: 65.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.90      0.72        10
           1       0.80      0.40      0.53        10

    accuracy                           0.65        20
   macro avg       0.70      0.65      0.63        20
weighted avg       0.70      0.65      0.63        20

