## Import Library

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from PIL import Image
import os
import pathlib
import random 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, array_to_img, load_img
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout
from sklearn.metrics import accuracy_score
from tensorflow.keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import itertools
from tensorflow.keras.utils import to_categorical
from PIL import Image
# Report how many GPUs TensorFlow can see on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Query the visible GPUs, then print every local device TensorFlow reports.
from tensorflow.python.client import device_lib

tf.config.list_physical_devices('GPU')
local_devices = device_lib.list_local_devices()
print(local_devices)

## Assign Path

In [None]:
# Root folder of the dataset; the Train and Test folders sit directly below it.
data_dir = 'C:/Users/hburg/OneDrive/Desktop/Data Science MSc/Machine Learning/ML Assignment/Data Set'
train_path = f'{data_dir}/Train'
test_path = f'{data_dir}/Test'

## Data Preparation


### Load Data


In [None]:
# Read the CSV files named after the Train/Test folders (folder path + '.csv').
train = pd.read_csv(f'{train_path}.csv')
test = pd.read_csv(f'{test_path}.csv')

## Exploratory Data Analysis

In [None]:
# Every entry of the training directory is treated as one class.
train_classes = os.listdir(train_path)
print(f"Train Classes: {train_classes}")

In [None]:
# Keep only the test-directory entries whose (lowercased) name ends with an image extension.
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
test_files = os.listdir(test_path)
test_images = list(filter(lambda name: name.lower().endswith(image_extensions), test_files))

print(f"Total number of images in the test set: {len(test_images)}")
# Only the first 10 names are shown as a sample
print(f"Test Images: {test_images[:10]}...")

In [None]:
# Report the shape of the DataFrames loaded from the train and test CSV files.
train_shape, test_shape = train.shape, test.shape
print(f"Train dataset shape: {train_shape}")
print(f"Test dataset shape: {test_shape}")

In [None]:
# Classes are the entries (sub-folders) of the training directory
train_classes = os.listdir(train_path)

# File-name endings that are counted as images
image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']

# Tally per class: {class name: {extension: number of files}}
image_counts_by_class = {}
for cls in train_classes:
    image_counts_by_class[cls] = dict.fromkeys(image_extensions, 0)

for cls in train_classes:
    class_path = os.path.join(train_path, cls)
    counts = image_counts_by_class[cls]
    for file in os.listdir(class_path):
        lowered = file.lower()  # match extensions case-insensitively
        for ext in image_extensions:
            if lowered.endswith(ext):
                counts[ext] += 1

# Per-class totals, then the grand total over all classes
total_images_by_class = {}
for cls, counts in image_counts_by_class.items():
    total_images_by_class[cls] = sum(counts.values())
total_images = sum(total_images_by_class.values())

# Per-class report
print("Image Counts by Type for Each Class:")
for cls, counts in image_counts_by_class.items():
    print(f"Class: {cls}")
    for ext, count in counts.items():
        print(f"  {ext}: {count} images")
    print(f"  Total images in class '{cls}': {total_images_by_class[cls]}")
    print()

# Overall report
print(f"Total number of images in the training set: {total_images}")

In [None]:
# Entries of the test directory
test_files = os.listdir(test_path)

image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']

# One counter per extension, all starting at zero
image_counts = dict.fromkeys(image_extensions, 0)

for file in test_files:
    lowered = file.lower()  # match extensions case-insensitively
    for ext in image_extensions:
        if lowered.endswith(ext):
            image_counts[ext] += 1

# Sum over all extensions
total_images = sum(image_counts.values())

# Report
print("Image Counts by Type:")
for ext, count in image_counts.items():
    print(f"{ext}: {count} images")

print(f"Total number of images in the test set: {total_images}")

In [None]:
# Number of classes = number of entries in the training directory
class_entries = os.listdir(train_path)
NUM_CATEGORIES = len(class_entries)
NUM_CATEGORIES

In [None]:
#Visualise Sample Images in Training Data
def plot_sample_images(directory, classes, n_samples=2):
    """Show a row of randomly chosen images for every class folder.

    Args:
        directory: Folder that contains one sub-folder per class.
        classes: Iterable of sub-folder names inside ``directory``.
        n_samples: Maximum number of images to draw per class.

    One figure is displayed per class, titled with the class name. A class
    holding fewer than ``n_samples`` files shows every file it has, and a
    class with nothing to show is skipped.
    """
    for cls in classes:
        image_path = os.path.join(directory, cls)
        candidates = os.listdir(image_path)
        # random.sample raises ValueError when asked for more items than exist
        n_shown = min(n_samples, len(candidates))
        if n_shown < 1:
            continue
        image_files = random.sample(candidates, n_shown)
        # squeeze=False keeps `axes` an array even when there is a single column
        fig, axes = plt.subplots(1, n_shown, figsize=(15, 5), squeeze=False)
        fig.suptitle(cls)
        for ax, image_file in zip(axes.ravel(), image_files):
            # Context manager releases the file handle once the image is drawn
            with Image.open(os.path.join(image_path, image_file)) as img:
                ax.imshow(img)
            ax.axis('off')
        plt.show()

# Draw the default number of random samples (n_samples=2) for every training class
plot_sample_images(train_path, train_classes)

In [None]:
# Recursively gather all image file paths below the training directory
image_suffixes = ('.png', '.jpg', '.jpeg', '.ppm')
all_image_paths = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(train_path)
    for file in files
    if file.lower().endswith(image_suffixes)
]

# Randomly sample up to 24 image paths; capping at the population size avoids
# the ValueError random.sample raises when fewer than 24 images are available
sample_paths = random.sample(all_image_paths, min(24, len(all_image_paths)))

# Create 6×4 grid
fig, axes = plt.subplots(6, 4, figsize=(14, 12))
fig.suptitle("Random Sample of GTSRB Images", fontsize=16)

# Switch every axis off up front so cells without an image stay blank
for ax in axes.ravel():
    ax.axis('off')

for ax, img_path in zip(axes.ravel(), sample_paths):
    # Context manager releases the file handle once the image is drawn
    with Image.open(img_path) as img:
        ax.imshow(img)
    class_id = os.path.basename(os.path.dirname(img_path))  # use folder name as class ID
    ax.set_title(f"Class {class_id}", fontsize=8)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# Classes are the entries of the training directory
train_classes = os.listdir(train_path)

# Number of directory entries found in each class folder
class_counts = {}
for cls in train_classes:
    class_counts[cls] = len(os.listdir(os.path.join(train_path, cls)))

# Order the classes by the integer value of their folder name
ordered_names = sorted(class_counts, key=int)
sorted_class_counts = {name: class_counts[name] for name in ordered_names}

# Bar chart of the class distribution
class_counts_series = pd.Series(sorted_class_counts)
plt.figure(figsize=(10, 6))
class_counts_series.plot(kind='bar', color='blue')
plt.title('Class Distribution in Training Set')
plt.xlabel('Class')
plt.ylabel('Number of Images')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Class IDs whose pixel-intensity distribution is examined
classes_to_plot = [0, 1, 11, 14, 22, 38]

# Plot pixel intensity histograms for selected classes
for class_id in classes_to_plot:
    class_dir = os.path.join(train_path, str(class_id))
    if not os.path.isdir(class_dir):
        continue

    # One flat array per image; they are joined once below instead of growing
    # a Python list one pixel at a time
    pixel_chunks = []

    # Load and process images in the class directory
    for img_name in os.listdir(class_dir)[:50]:  # Limit to 50 images per class for speed
        img_path = os.path.join(class_dir, img_name)
        try:
            # Context manager closes the file; convert('L') gives grayscale
            with Image.open(img_path) as img:
                pixel_chunks.append(np.asarray(img.convert('L')).ravel())
        except Exception as e:
            # Best effort: report the unreadable file and carry on
            print(f"Error loading {img_name}: {e}")

    # np.concatenate rejects an empty list, so fall back to an empty array
    pixel_values = np.concatenate(pixel_chunks) if pixel_chunks else np.array([])

    # Plot histogram
    plt.figure(figsize=(8, 4))
    plt.hist(pixel_values, bins=30, color='gray', alpha=0.8, edgecolor='black')
    plt.title(f"Pixel Intensity Distribution - Class {class_id}")
    plt.xlabel("Pixel Intensity (0 = Black, 255 = White)")
    plt.ylabel("Frequency")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

In [None]:
# Show one random example image for each class used in the histograms
train_path = r'C:\Users\hburg\OneDrive\Desktop\Data Science MSc\Machine Learning\ML Assignment\Data Set\Train'
selected_classes = [0, 1, 14, 22, 11, 38]

# Short display names for the selected class IDs
class_labels = {
    0: "Speed Limit 20",
    1: "Speed Limit 30",
    11: "Right-of-Way",
    14: "Stop",
    22: "Bumpy Road",
    38: "Keep Right",
}

# Create a 3×2 grid plot (3 rows, 2 columns)
fig, axes = plt.subplots(3, 2, figsize=(12, 8))
fig.suptitle("Example Images from Selected Traffic Sign Classes", fontsize=16)

for ax, class_id in zip(axes.flat, selected_classes):
    class_dir = os.path.join(train_path, str(class_id))
    # Pick any one entry of the class folder at random
    image_file = random.choice(os.listdir(class_dir))
    image_path = os.path.join(class_dir, image_file)

    img = Image.open(image_path)
    ax.imshow(img)
    ax.axis('off')
    ax.set_title(f"Class {class_id}: {class_labels[class_id]}", fontsize=10)

# Leave room at the top for the figure title
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


## Label Overview

In [None]:
# Map each class ID (0-42) to its traffic sign label; the position in the
# list below is the class ID.
classes_labels = dict(enumerate([
    'Speed limit (20km/h)',
    'Speed limit (30km/h)',
    'Speed limit (50km/h)',
    'Speed limit (60km/h)',
    'Speed limit (70km/h)',
    'Speed limit (80km/h)',
    'End of speed limit (80km/h)',
    'Speed limit (100km/h)',
    'Speed limit (120km/h)',
    'No passing',
    'No passing veh over 3.5 tons',
    'Right-of-way at intersection',
    'Priority road',
    'Yield',
    'Stop',
    'No vehicles',
    'Veh > 3.5 tons prohibited',
    'No entry',
    'General caution',
    'Dangerous curve left',
    'Dangerous curve right',
    'Double curve',
    'Bumpy road',
    'Slippery road',
    'Road narrows on the right',
    'Road work',
    'Traffic signals',
    'Pedestrians',
    'Children crossing',
    'Bicycles crossing',
    'Beware of ice/snow',
    'Wild animals crossing',
    'End speed + passing limits',
    'Turn right ahead',
    'Turn left ahead',
    'Ahead only',
    'Go straight or right',
    'Go straight or left',
    'Keep right',
    'Keep left',
    'Roundabout mandatory',
    'End of no passing',
    'End no passing veh > 3.5 tons',
]))

## Data Pre-Processing

### Check For Missing Values

In [None]:
# Per column: does the train CSV contain any missing value? Then summarise those flags.
train_has_missing = train.isna().any()
train_has_missing.describe()

In [None]:
# Per column: does the test CSV contain any missing value? Then summarise those flags.
test_has_missing = test.isna().any()
test_has_missing.describe()

### Check for Duplicates

In [None]:
# Count rows that repeat an earlier row in each CSV
train_duplicates = train.duplicated().sum()
test_duplicates = test.duplicated().sum()
print('Duplicate data in Train Set: ', train_duplicates)
print('Duplicate data in Test Set: ', test_duplicates)

## Re-size Images

In [None]:
import os
import tensorflow as tf
import numpy as np

# Set image size
IMG_HEIGHT = 96
IMG_WIDTH = 96
NUM_CATEGORIES = 43

# Your train path
train_path = 'C:/Users/hburg/OneDrive/Desktop/Data Science MSc/Machine Learning/ML Assignment/Data Set/Train'

# Collect image paths and labels (the label is the numeric folder name)
image_paths = []
image_labels = []

for class_id in range(NUM_CATEGORIES):
    class_folder = os.path.join(train_path, str(class_id))
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order, and therefore the seeded train/validation split, reproducible
    for img_file in sorted(os.listdir(class_folder)):
        # Lowercase first so files such as "X.PNG" are not silently skipped
        if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_paths.append(os.path.join(class_folder, img_file))
            image_labels.append(class_id)


# Dataset of (file path, class id) pairs, one element per collected image
path_ds = tf.data.Dataset.from_tensor_slices((image_paths, image_labels))


def process_image(file_path, label):
    """Load one image file and return it, resized and scaled, with its label.

    The file is read, decoded to 3 channels, resized to
    (IMG_HEIGHT, IMG_WIDTH), cast to float32 and divided by 255.0.
    ``label`` is returned unchanged.
    """
    raw_bytes = tf.io.read_file(file_path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [IMG_HEIGHT, IMG_WIDTH])
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled, label


# Apply process_image to every (path, label) pair
dataset = path_ds.map(process_image, num_parallel_calls=tf.data.AUTOTUNE)

# Pull every element out of the dataset into Python lists
image_data = []
image_labels_np = []

counter = 0  # stays 0 when the dataset is empty
for counter, (image, label) in enumerate(dataset, start=1):
    image_data.append(image.numpy())
    image_labels_np.append(label.numpy())
    # Progress message every 100 images
    if counter % 100 == 0:
        print(f"✅ Processed {counter} images...")

# Convert to NumPy arrays
image_data = np.array(image_data)
image_labels_np = np.array(image_labels_np)

print(f"\n🎉 Done! Processed a total of {counter} images.")
print("image_data.shape:", image_data.shape)
print("image_labels.shape:", image_labels_np.shape)


## Split Data into Training and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split  
# Split the in-memory arrays built by the resize step into train and
# validation sets (70% / 30%, shuffled with a fixed seed).
# The former np.load("image_data") / np.load("image_labels.np") calls were
# dropped: no cell in this notebook writes those files, so they raised
# FileNotFoundError, and their results were never used by the split anyway.
random_seed = 10
X_train, X_val, y_train, y_val = train_test_split(
    image_data, image_labels_np, test_size=0.3, random_state=random_seed, shuffle=True
)

print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)


In [None]:
import numpy as np
# Persist each split array to its own .npy file in the working directory
arrays_by_file = {
    'X_train.npy': X_train,
    'X_val.npy': X_val,
    'y_train.npy': y_train,
    'y_val.npy': y_val,
}
for file_name, array in arrays_by_file.items():
    np.save(file_name, array)

print("✅ Saved preprocessed training and validation data!")

In [None]:
# Print the shape of every split array, one per line
shape_report = (
    ("X_train.shape", X_train),
    ("X_valid.shape", X_val),
    ("y_train.shape", y_train),
    ("y_valid.shape", y_val),
)
for caption, array in shape_report:
    print(caption, array.shape)

## Normalise Train and Validation Data

In [None]:
# process_image already divides every pixel by 255.0, so dividing again would
# squash the data into [0, 1/255]. Rescale only when an array still holds
# values above 1.0 (i.e. it has not been normalised yet).
if X_train.max() > 1.0:
    X_train = X_train/255
if X_val.max() > 1.0:
    X_val = X_val/255

X_train
X_val

## Label Encoding

In [None]:
# One-hot encode the labels into NUM_CATEGORIES columns
# NOTE(review): the MobileNetV2 cell reloads the labels from y_train.npy /
# y_val.npy and compiles with sparse_categorical_crossentropy, so these one-hot
# arrays are not what that model is fitted on — confirm they are still needed.
one_hot = keras.utils.to_categorical
y_train = one_hot(y_train, num_classes=NUM_CATEGORIES)
y_val = one_hot(y_val, num_classes=NUM_CATEGORIES)

for encoded in (y_train, y_val):
    print(encoded.shape)

## Data Augmentation

In [None]:
# Mini-batch size and number of epochs
batch_size, epochs = 32, 150

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Random geometric augmentations; flips are disabled.
# NOTE(review): `datagen` is not passed to any fit call visible in this
# notebook — confirm whether augmentation is meant to be used.
datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    horizontal_flip=False,
    vertical_flip=False,
    fill_mode="nearest"
)

# datagen.fit(X_train) was removed: fit() only computes the statistics used by
# featurewise_center / featurewise_std_normalization / zca_whitening, none of
# which are enabled above, so it was a costly pass over the whole array with
# no effect on the generator.


## Pre-Process Test Data

In [None]:
import os
import tensorflow as tf
import numpy as np

# Set image size
IMG_HEIGHT = 96
IMG_WIDTH = 96

test_path = 'C:/Users/hburg/OneDrive/Desktop/Data Science MSc/Machine Learning/ML Assignment/Data Set/Test'

# os.listdir returns entries in arbitrary order; sorting gives X_test a
# reproducible order. The extension test is done on the lowercased name so
# files such as "X.PNG" are not silently skipped.
# NOTE(review): the test labels are read from Test.csv in row order — confirm
# the CSV rows follow this sorted-filename order, otherwise build this list
# from the CSV so images and labels line up.
test_image_paths = [
    os.path.join(test_path, img_file)
    for img_file in sorted(os.listdir(test_path))
    if img_file.lower().endswith(('.png', '.jpg', '.jpeg'))
]

# Dataset of file paths, one element per test image
test_path_ds = tf.data.Dataset.from_tensor_slices(test_image_paths)


def process_test_image(file_path):
    """Load one test image file and return it resized and scaled.

    The file is read, decoded to 3 channels, resized to
    (IMG_HEIGHT, IMG_WIDTH), cast to float32 and divided by 255.0.
    """
    raw_bytes = tf.io.read_file(file_path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [IMG_HEIGHT, IMG_WIDTH])
    return tf.cast(resized, tf.float32) / 255.0


# Apply process_test_image to every test file path
test_dataset = test_path_ds.map(process_test_image, num_parallel_calls=tf.data.AUTOTUNE)

# Pull every processed image out of the dataset into a Python list
X_test = []

counter = 0  # stays 0 when the dataset is empty
for counter, image in enumerate(test_dataset, start=1):
    X_test.append(image.numpy())
    # Progress message every 100 images
    if counter % 100 == 0:
        print(f"✅ Processed {counter} test images...")

# Stack into one array and persist it for the model cells
X_test = np.array(X_test)
np.save('X_test.npy', X_test)

print(f"\n🎉 Done! Processed a total of {counter} test images.")
print("X_test.shape:", X_test.shape)

In [None]:
import pandas as pd
import numpy as np  # was "np3": a typo that bound numpy to an unused alias


test_csv_path = 'C:/Users/hburg/OneDrive/Desktop/Data Science MSc/Machine Learning/ML Assignment/Data Set/Test.csv'

test_df = pd.read_csv(test_csv_path)

# Extract labels into NumPy array (row order of the CSV is kept)
y_test = test_df['ClassId'].values

# Check shape
print("✅ Loaded test labels only")
print("y_test shape:", y_test.shape)
print("Example labels:", y_test[:10])


In [None]:
import random

# Seed Python, NumPy and TensorFlow random generators with the same value
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

## MobileNetV2

In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import numpy as np

from tensorflow.keras.layers import Rescaling

# Reload the arrays written by the preprocessing cells
X_train = np.load('X_train.npy')
X_val = np.load('X_val.npy')
y_train = np.load('y_train.npy')
y_val = np.load('y_val.npy')
X_test = np.load('X_test.npy')

IMG_SIZE = 96

# ImageNet-pretrained feature extractor without its classifier head; frozen
# so that only the new layers stacked on top are trained
mobilenet_base = MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet')
mobilenet_base.trainable = False

model_mobilenet = Sequential([
    # The stored arrays hold pixels in [0, 1] (the image-loading functions
    # divide by 255), while MobileNetV2's pretrained weights expect [-1, 1].
    # x * 2 - 1 maps one range onto the other. Doing it inside the model
    # means it is applied identically in fit, evaluate and predict, and is
    # saved together with the model.
    Rescaling(scale=2.0, offset=-1.0),
    mobilenet_base,
    GlobalAveragePooling2D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(43, activation='softmax')  # one output per traffic-sign class
])

# Sparse loss: labels are integer class ids, not one-hot vectors
model_mobilenet.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Stop after 5 epochs without val_accuracy improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

# Train for at most 30 epochs, validating on the held-out split each epoch
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=30,
    callbacks=[early_stopping],
)
history_mobilenet = model_mobilenet.fit(X_train, y_train, **fit_options)

model_mobilenet.save('mobilenet_model.h5')

# NOTE(review): y_test is not loaded in this cell; it must still be in memory
# from the cell that reads the test labels.
test_loss, test_accuracy = model_mobilenet.evaluate(X_test, y_test)

# Training figures are those of the last epoch that ran
mobilenet_log = history_mobilenet.history
print("MobileNetV2 Training Loss:", mobilenet_log['loss'][-1])
print("MobileNetV2 Training Accuracy:", mobilenet_log['accuracy'][-1])
print("MobileNetV2 Test Loss:", test_loss)
print("MobileNetV2 Test Accuracy:", test_accuracy)

# One figure per metric: training curve and validation curve together
mobilenet_curves = (
    ('MobileNetV2 Loss', 'loss', 'train_loss', 'val_loss'),
    ('MobileNetV2 Accuracy', 'accuracy', 'train_accuracy', 'val_accuracy'),
)
for figure_title, train_key, train_label, val_key in mobilenet_curves:
    plt.plot(history_mobilenet.history[train_key], label=train_label)
    plt.plot(history_mobilenet.history[val_key], label=val_key)
    plt.title(figure_title)
    plt.legend()
    plt.show()

In [None]:
import numpy as np
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

# Load test data and model if not already in memory.
# The images come from X_test.npy and the labels from the ClassId column of
# the test CSV. Loading X_train.npy / y_train.npy here, as before, reported
# training-set metrics under a "test" heading.
X_test = np.load('X_test.npy')
y_test = pd.read_csv(test_path + '.csv')['ClassId'].values
model_mobilenet = load_model('mobilenet_model.h5')

# No preprocess_input call: the model was fitted on the arrays exactly as
# they are stored in the .npy files, so the test images must be fed the same
# way. Rescaling them here would make them differ from the training inputs.

# Predict
y_pred_probs = model_mobilenet.predict(X_test)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

# Classification Accuracy
accuracy = np.mean(y_pred_classes == y_test)
print(f"Classification Accuracy: {accuracy:.4f}")

# Precision, Recall, F1-Score (true labels first, predictions second)
label_pair = (y_test, y_pred_classes)
precision = precision_score(*label_pair, average='weighted')
recall = recall_score(*label_pair, average='weighted')
f1_macro = f1_score(*label_pair, average='macro')
f1_weighted = f1_score(*label_pair, average='weighted')

print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")

# Confusion Matrix drawn as a heatmap without per-cell annotations
cm = confusion_matrix(*label_pair)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=False, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - MobileNetV2')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()

# Full classification report, three decimal places
print("\nDetailed Classification Report:\n")
report_text = classification_report(*label_pair, digits=3)
print(report_text)


## EfficientNetB0

In [None]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# The stored arrays hold pixels in [0, 1] (the image-loading functions divide
# by 255). Keras' EfficientNet expects raw 0-255 pixel values because it
# rescales internally, so the images are scaled back up here.
X_train = np.load('X_train.npy') * 255.0
X_val = np.load('X_val.npy') * 255.0
y_train = np.load('y_train.npy')
y_val = np.load('y_val.npy')
X_test = np.load('X_test.npy') * 255.0
# Re-read the test labels so this cell does not depend on whatever y_test
# currently holds in memory
y_test = pd.read_csv(test_path + '.csv')['ClassId'].values

IMG_SIZE = 96

# The model was previously never defined in this cell (NameError on
# model_efficientnet / history_efficientnet). It is built here with the same
# head and training settings as the MobileNetV2 model: frozen
# ImageNet-pretrained base plus a small classifier on top.
efficientnet_base = EfficientNetB0(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet')
efficientnet_base.trainable = False

model_efficientnet = Sequential([
    efficientnet_base,
    GlobalAveragePooling2D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(43, activation='softmax')  # one output per traffic-sign class
])

# Sparse loss: labels are integer class ids, not one-hot vectors
model_efficientnet.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

history_efficientnet = model_efficientnet.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    callbacks=[early_stopping]
)

# Save the EfficientNet model itself (the MobileNet model used to be written
# under this file name)
model_efficientnet.save('efficientNetb0_model.h5')

test_loss, test_accuracy = model_efficientnet.evaluate(X_test, y_test)

print("EfficientNetB0 Training Loss:", history_efficientnet.history['loss'][-1])
print("EfficientNetB0 Training Accuracy:", history_efficientnet.history['accuracy'][-1])
print("EfficientNetB0 Test Loss:", test_loss)
print("EfficientNetB0 Test Accuracy:", test_accuracy)

# One figure per metric: training curve and validation curve together
efficientnet_curves = (
    ('EfficientNetB0 Loss', 'loss', 'train_loss', 'val_loss'),
    ('EfficientNetB0 Accuracy', 'accuracy', 'train_accuracy', 'val_accuracy'),
)
for figure_title, train_key, train_label, val_key in efficientnet_curves:
    plt.plot(history_efficientnet.history[train_key], label=train_label)
    plt.plot(history_efficientnet.history[val_key], label=val_key)
    plt.title(figure_title)
    plt.legend()
    plt.show()