Link to current Kaggle dataset: https://www.kaggle.com/datasets/enesbayturk/vehicle-and-pedestrian-detection-dataset

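The dataset has 8 classes. The images include big trucks, small trucks, buses, cars, motorcycles, vans, and pedestrians (only seven categories are named here; see the dataset page for the complete class list).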

In [1]:
import sys
import os

import datetime
import copy

import numpy as np
import sklearn

import scipy as sp
import pandas as pd

import torch
import torchvision

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import datasets, transforms
from torch.utils.data import ConcatDataset

from torch.nn import Flatten, Linear, Conv2d, MaxPool2d, Dropout, Sequential
from torch.nn import ReLU, Sigmoid, Softmax

In [None]:
# Every image is resized to one fixed size so batches can be stacked.
IMAGE_SIZE = (128, 128)

# Training-time augmentation: horizontal flip, small rotation, colour jitter.
# ImageFolder loads images as 3-channel RGB, while the CNN in this file starts
# with Conv2d(1, ...), so both pipelines convert to a single grayscale channel.
# Grayscale comes after ColorJitter so hue/saturation jitter still has an effect.
aug_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
])

# Deterministic pipeline used for the un-augmented train/validation/test images.
regular_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
])

# Two views of the same image folder: identical files and ordering, different transforms.
dataset_unaug = datasets.ImageFolder(root='project_data', transform=regular_transform)
dataset_aug = datasets.ImageFolder(root='project_data', transform=aug_transform)

# Split the data 50% train / 50% valtest.
n_train = int(len(dataset_unaug) * 0.5)
n_valtest = len(dataset_unaug) - n_train
train_set, valtest = random_split(dataset_unaug, [n_train, n_valtest])

# Split valtest in half again to get the validation and test sets.
n_val = len(valtest) // 2
n_test = len(valtest) - n_val
validation_set, test_set = random_split(valtest, [n_val, n_test])

# Only augment images that belong to the training split. Using the whole
# augmented folder would put augmented copies of validation/test images into
# training (data leakage). Both ImageFolder objects index the same files in the
# same order, so the training indices can be reused on the augmented view.
train_aug_set = torch.utils.data.Subset(dataset_aug, train_set.indices)

# Training data = original training images + their augmented counterparts.
train_combined_set = ConcatDataset([train_set, train_aug_set])
print(len(train_combined_set))
print(len(validation_set))
print(len(test_set))

# Create DataLoaders for each split; the training loader must use the combined
# set, otherwise the augmentation above is never seen by the model.
batch = 32
train_loader = DataLoader(train_combined_set, batch_size=batch, shuffle=True)
val_loader = DataLoader(validation_set, batch_size=batch, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch, shuffle=False)

In [None]:

class CNN(nn.Module):
    """Small image classifier: three conv + max-pool stages, then three fully connected layers.

    Args:
        in_channels: Number of channels in the input images (1 for grayscale).
        num_classes: Number of output logits (8 classes in the Kaggle dataset).
        input_size: (height, width) of the images the model will receive. It is
            used to size the first fully connected layer, so it must match the
            size the images are resized to.
    """

    def __init__(self, in_channels=1, num_classes=8, input_size=(128, 128)):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 6, 5)  # 6 filters of size 5x5
        self.pool = nn.MaxPool2d(2, 2)  # the same pooling layer is reused after every conv
        self.conv2 = nn.Conv2d(6, 64, 3)
        self.conv3 = nn.Conv2d(64, 128, 2)

        # Work out the flattened feature size by pushing a dummy image through
        # the conv stack. A hard-coded 128 * 2 * 2 is only right for 32x32
        # inputs; 128x128 inputs produce 128 * 14 * 14 features.
        with torch.no_grad():
            probe = torch.zeros(1, in_channels, *input_size)
            self.flat_features = self._extract_features(probe).flatten(1).size(1)

        self.fc1 = nn.Linear(self.flat_features, 150)  # fully connected layers
        self.fc2 = nn.Linear(150, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def _extract_features(self, x):
        """Apply the three conv -> ReLU -> max-pool stages."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        return x

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        x = self._extract_features(x)
        # Flatten everything except the batch dimension. Unlike view(-1, N),
        # this can never fold extra features into the batch dimension; a size
        # mismatch raises at fc1 instead of producing too many rows.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
        
# Build the network, then hang its loss function and optimizer on it as attributes
model = CNN()
model.loss_func = nn.CrossEntropyLoss()

opt = optim.Adam(model.parameters(), lr=0.001)
model.optimizer = opt

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU, and move the model there
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over `loader`; update the weights only when `optimizer` is given.

    Returns:
        (mean loss per sample, accuracy) for the pass, or (nan, nan) when the
        loader yields no samples.
    """
    training = optimizer is not None
    running_loss = 0.0
    correct = 0
    total = 0

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            n_samples = labels.size(0)
            # Weight the batch loss by the batch size so a smaller final batch
            # does not skew the per-sample average.
            running_loss += loss.item() * n_samples
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += n_samples

    if total == 0:
        return float('nan'), float('nan')
    return running_loss / total, correct / total


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train `model` and report training/validation loss and accuracy after each epoch.

    Args:
        model: The network to train; batches are moved to the device its
            parameters live on.
        train_loader: Iterable of (images, labels) batches used for training.
        val_loader: Iterable of (images, labels) batches used for validation,
            or None to skip validation.
        criterion: Loss function called as criterion(outputs, labels).
        optimizer: Optimizer that updates the model's parameters.
        num_epochs: Number of passes over the training data.

    Returns:
        A list with one dict per epoch, with keys 'epoch', 'train_loss',
        'train_acc', 'val_loss' and 'val_acc' (the validation entries are None
        when val_loader is None).
    """
    # Follow the model's own device instead of relying on a global variable.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    history = []
    for epoch in range(num_epochs):
        model.train()
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)

        val_loss = val_acc = None
        if val_loader is not None:
            model.eval()
            val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        summary = (f"Epoch {epoch + 1}/{num_epochs} - "
                   f"train loss: {train_loss:.4f}, train acc: {train_acc:.4f}")
        if val_loader is not None:
            summary += f", val loss: {val_loss:.4f}, val acc: {val_acc:.4f}"
        print(summary)

        history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
        })

    return history
# Pass the loss function and optimizer that were attached to the model when it
# was set up; there are no top-level variables named `criterion` or `optimizer`.
train_model(model, train_loader, val_loader, model.loss_func, model.optimizer, num_epochs=10)