In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Dataset locations
BASE_DIR = "data"
TRAIN_CSV = os.path.join(BASE_DIR, "Train.csv")
TRAIN_IMG_DIR = os.path.join(BASE_DIR, "train")

# Read the training annotations
train_df = pd.read_csv(TRAIN_CSV)

# Strip the "Train/" fragment so every Path is relative to TRAIN_IMG_DIR
train_df['Path'] = train_df['Path'].str.replace("Train/", "", regex=False)

# Keep only the rows whose image file is actually present on disk.
# The mask is computed directly, so no temporary column has to be
# added to the frame and dropped again afterwards.
image_on_disk = train_df['Path'].map(
    lambda rel_path: os.path.exists(os.path.join(TRAIN_IMG_DIR, rel_path))
)
train_df = train_df[image_on_disk].reset_index(drop=True)
print(f"Valid training samples: {len(train_df)}")

# Hold out 20% for validation, preserving the per-class proportions;
# the fixed random_state keeps the split identical between runs.
train_df_split, val_df = train_test_split(
    train_df,
    stratify=train_df['ClassId'],
    test_size=0.2,
    random_state=42,
)

print(f"Train: {len(train_df_split)} | Val: {len(val_df)}")

Valid training samples: 39209
Train: 31367 | Val: 7842


In [2]:
from torch.utils.data import Dataset
from PIL import Image

class GTSRBDataset(Dataset):
    """Dataset serving (image, label) pairs described by a DataFrame.

    Every row supplies a relative image path in its 'Path' column and a
    class id in its 'ClassId' column.

    Args:
        dataframe: table with at least the 'Path' and 'ClassId' columns.
        root_dir: directory each 'Path' value is joined onto.
        transform: optional callable applied to the RGB PIL image before
            it is returned.
    """

    def __init__(self, dataframe, root_dir, transform=None):
        # Drop the incoming index so the stored frame is numbered 0..n-1
        # (the caller's frame is left untouched).
        self.data = dataframe.reset_index(drop=True)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample `idx` and return `(image, label)`.

        The image is converted to RGB and, if a transform was given,
        passed through it. The label is the row's 'ClassId' as an int.
        """
        # Fetch the row once instead of materialising it per column.
        row = self.data.iloc[idx]
        img_path = os.path.join(self.root_dir, row['Path'])

        # convert() yields an independent, fully loaded image, so the
        # underlying file can be closed deterministically right here.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        label = int(row['ClassId'])

        # Explicit None check: a transform object that happens to be
        # falsy (e.g. an empty container module) is still applied.
        if self.transform is not None:
            image = self.transform(image)

        return image, label



In [4]:
from torchvision import transforms
from torch.utils.data import DataLoader

# Preprocessing / loading settings shared by the pipelines below.
IMG_SIZE = (32, 32)    # TrafficSignCNN's first linear layer is sized for 32x32 inputs
NORM_MEAN = [0.5] * 3  # per-channel mean passed to Normalize
NORM_STD = [0.5] * 3   # per-channel std passed to Normalize
BATCH_SIZE = 64

# Augmented transform for training: small random rotations and
# brightness/contrast jitter on top of the common resize + normalise.
train_transform = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Clean transform for validation: no randomness, same resize + normalise.
val_transform = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Create datasets. Both splits come from the training image folder, so
# reuse TRAIN_IMG_DIR (defined with the other paths) rather than
# rebuilding the same path by hand.
train_dataset = GTSRBDataset(train_df_split, root_dir=TRAIN_IMG_DIR, transform=train_transform)
val_dataset = GTSRBDataset(val_df, root_dir=TRAIN_IMG_DIR, transform=val_transform)

# Create dataloaders: shuffle only the training data.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TrafficSignCNN(nn.Module):
    """Three-block convolutional classifier for 3-channel 32x32 images.

    Each block is conv(3x3, padding=1) -> ReLU -> 2x2 max-pool, halving
    the spatial size (32 -> 16 -> 8 -> 4). The resulting 128x4x4 feature
    map is flattened and passed through a 512-unit hidden layer with
    dropout, then projected to `num_classes` logits.

    Args:
        num_classes: number of output logits (default 43).
    """

    def __init__(self, num_classes=43):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        # 128 channels at 4x4 after three poolings of a 32x32 input.
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes).

        Raises:
            RuntimeError: if the input's spatial size does not reduce to
                4x4, because the flattened features no longer match fc1.
        """
        x = self.pool(F.relu(self.conv1(x)))  # 32 -> 16
        x = self.pool(F.relu(self.conv2(x)))  # 16 -> 8
        x = self.pool(F.relu(self.conv3(x)))  # 8 -> 4
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 2048), this can never fold extra spatial positions
        # into the batch axis for a wrongly sized input; such inputs
        # fail loudly at fc1 instead of returning extra rows.
        x = torch.flatten(x, start_dim=1)
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)

In [6]:
# Seed torch's global RNG before the model is built so weight
# initialisation (and any later torch-driven randomness when cells run
# in order) is repeatable; 42 matches the train/val split's random_state.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TrafficSignCNN().to(DEVICE)

# Cross-entropy on the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [7]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train `model` and print train/validation accuracy after each epoch.

    Args:
        model: module to optimise; batches are moved to the device its
            parameters live on.
        train_loader: loader yielding (images, labels) training batches.
        val_loader: loader yielding (images, labels) validation batches.
        criterion: loss callable taking (outputs, labels).
        optimizer: optimizer already bound to the model's parameters.
        epochs: number of full passes over `train_loader`.

    Returns:
        None. The model is updated in place and is left in eval mode.
    """
    # Follow the model's own device rather than a notebook-level global,
    # so the function also works when that global is missing or stale.
    device = next(model.parameters()).device

    for epoch in range(epochs):
        model.train()
        train_correct = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_correct += (outputs.argmax(dim=1) == labels).sum().item()

        # max(..., 1) keeps an empty dataset from dividing by zero.
        train_acc = 100 * train_correct / max(len(train_loader.dataset), 1)

        # Validation: no gradient tracking, dropout disabled via eval().
        model.eval()
        val_correct = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_correct += (outputs.argmax(dim=1) == labels).sum().item()

        val_acc = 100 * val_correct / max(len(val_loader.dataset), 1)
        print(f"[Epoch {epoch+1}] Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

In [8]:
# Run the training loop; arguments are passed by keyword so each role
# is explicit at the call site.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=10,
)

[Epoch 1] Train Acc: 43.80% | Val Acc: 80.96%
[Epoch 2] Train Acc: 85.52% | Val Acc: 95.43%
[Epoch 3] Train Acc: 93.61% | Val Acc: 98.11%
[Epoch 4] Train Acc: 96.31% | Val Acc: 97.84%
[Epoch 5] Train Acc: 97.05% | Val Acc: 98.84%
[Epoch 6] Train Acc: 97.77% | Val Acc: 99.06%
[Epoch 7] Train Acc: 98.24% | Val Acc: 99.41%
[Epoch 8] Train Acc: 98.14% | Val Acc: 99.31%
[Epoch 9] Train Acc: 98.51% | Val Acc: 99.30%
[Epoch 10] Train Acc: 98.74% | Val Acc: 99.20%


In [9]:
# Persist the trained weights (the state_dict only, not the module object)
MODEL_PATH = "traffic_sign_cnn.pth"
torch.save(obj=model.state_dict(), f=MODEL_PATH)

print(f"Model saved to {MODEL_PATH}")

Model saved to traffic_sign_cnn.pth


In [13]:
# Load test data. Paths are built from BASE_DIR with os.path.join, the
# same way the training paths are, instead of hard-coding "data/...".
TEST_CSV = os.path.join(BASE_DIR, "Test.csv")
test_df = pd.read_csv(TEST_CSV)
test_img_dir = os.path.join(BASE_DIR, "test")

# Clean test paths (remove "Test/" if present) so each Path is relative
# to test_img_dir.
test_df['Path'] = test_df['Path'].str.replace("Test/", "", regex=False)

# Check if ClassId column exists, i.e. whether ground-truth labels ship
# with the test set.
has_labels = 'ClassId' in test_df.columns
print("Test set labeled:", has_labels)

Test set labeled: True


In [14]:
# Test dataset: uses the deterministic validation preprocessing
test_dataset = GTSRBDataset(
    dataframe=test_df,
    root_dir=test_img_dir,
    transform=val_transform,
)

# Test loader: batches of 64, original row order preserved
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [15]:
# NOTE(review): no visible cell defines `get_predictions` (execution
# counts jump from 9 to 13), so on a fresh kernel this cell raised
# NameError. It is defined here so the notebook survives Restart & Run
# All. The (y_true, y_pred) return order is taken from how the result is
# unpacked below; returning plain lists is an assumption -- confirm
# against the original helper if it still exists elsewhere.
def get_predictions(model, loader):
    """Run `model` over `loader` and collect labels and predictions.

    Args:
        model: trained module; batches are moved to its parameters' device.
        loader: loader yielding (images, labels) batches.

    Returns:
        Tuple `(y_true, y_pred)` of equal-length lists of ints, in the
        order the loader yields samples.
    """
    device = next(model.parameters()).device
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in loader:
            outputs = model(images.to(device))
            y_pred.extend(outputs.argmax(dim=1).cpu().tolist())
            y_true.extend(labels.tolist())
    return y_true, y_pred


# Predict labels
y_true_test, y_pred_test = get_predictions(model, test_loader)

In [16]:
from sklearn.metrics import accuracy_score, classification_report

# Metrics need ground truth: bail out with a message when Test.csv has
# no labels, otherwise print overall accuracy and the per-class report.
if not has_labels:
    print("Test accuracy cannot be computed — no labels in Test.csv.")
else:
    test_acc = accuracy_score(y_true_test, y_pred_test)
    print(f"Final Test Accuracy: {test_acc * 100:.2f}%\n")
    print(classification_report(y_true_test, y_pred_test, digits=4))

Final Test Accuracy: 94.62%

              precision    recall  f1-score   support

           0     0.9811    0.8667    0.9204        60
           1     0.9788    0.9639    0.9713       720
           2     0.9494    0.9760    0.9625       750
           3     0.9718    0.9200    0.9452       450
           4     0.9968    0.9530    0.9744       660
           5     0.8154    0.9603    0.8819       630
           6     0.9752    0.7867    0.8708       150
           7     0.9973    0.8200    0.9000       450
           8     0.9018    0.9800    0.9393       450
           9     0.9958    0.9917    0.9937       480
          10     0.9732    0.9909    0.9820       660
          11     0.9597    0.9071    0.9327       420
          12     0.9510    0.9841    0.9672       690
          13     0.9986    0.9931    0.9958       720
          14     1.0000    0.9741    0.9869       270
          15     0.9952    0.9905    0.9928       210
          16     1.0000    1.0000    1.0000       15