In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random

# Reproducibility: seed Python, NumPy and PyTorch (CPU and every CUDA device) with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

_cuda_ok = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_ok else torch.device('cpu')

if not _cuda_ok:
    print('Setup OK. No GPU found, using CPU.')
else:
    # NOTE(review): cuDNN benchmark mode lets cuDNN pick convolution kernels by timing
    # them; per the PyTorch reproducibility notes this can vary between runs, so it
    # may weaken the determinism the seeding above aims for -- confirm this trade-off.
    torch.backends.cudnn.benchmark = True
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9  # bytes -> GB
    print(f'Setup OK. GPU: {gpu_name} ({gpu_mem:.1f} GB)')
    print(f'CUDA version: {torch.version.cuda}')
    print(f'PyTorch version: {torch.__version__}')
    print('cuDNN benchmark: enabled')

print(f'Device: {device}')

## Section 1: Loading and Inspecting CSV File

In [None]:
import pandas as pd

# Update these paths for your environment.
# NOTE(review): PATH_TO_CSV and PATH_TO_IMAGES are not defined in any cell shown here;
# they look like placeholders that must be replaced with (or defined as) real path
# strings before this cell runs -- otherwise the assignments below raise NameError.
CSV = PATH_TO_CSV
IMG = PATH_TO_IMAGES

# Expand a leading "~" so home-relative paths can be used.
CSV_FILE = os.path.expanduser(CSV)
IMG_DIR = os.path.expanduser(IMG)

In [None]:
def _localize_image_path(recorded_path):
    """Re-root a recorded image path under IMG_DIR, keeping only its file name."""
    return os.path.join(IMG_DIR, os.path.basename(recorded_path))


df = pd.read_csv(CSV_FILE)
df['image_path'] = df['image_path'].apply(_localize_image_path)

# Last expression of the cell: displays the first rows together with summary statistics.
df.head(), df.describe()

In [None]:
import matplotlib.pyplot as plt
# Histogram of the raw steering pulses, to eyeball how the labels are distributed.
_hist_fig, _hist_ax = plt.subplots()
_hist_ax.hist(df["steering_pulse"], bins=50)
_hist_ax.set_title("Steering Distribution")
plt.show()

## Section 2: Normalize Labels

In [None]:
# Steering pulse limits and the pulse halfway between them.
Steer_MIN, Steer_MAX = 150, 600
Steer_Center = (Steer_MIN + Steer_MAX) / 2
# Throttle reference pulses: normalize_throttle maps TH_REV -> -1, TH_STOP -> 0, TH_MAX -> +1.
TH_REV, TH_STOP, TH_MAX = 205, 307, 410


def normalize_throttle(p):
    """Scale a throttle pulse piecewise-linearly around TH_STOP.

    Pulses above TH_STOP are divided by the TH_STOP..TH_MAX span and pulses below it
    by the TH_REV..TH_STOP span, so TH_MAX -> 1.0, TH_STOP -> 0.0 and TH_REV -> -1.0.
    Pulses outside [TH_REV, TH_MAX] are not clipped and map beyond [-1, 1].
    """
    offset = p - TH_STOP
    if offset == 0:
        return 0.0
    # The forward and reverse halves have different widths, hence the two spans.
    span = (TH_MAX - TH_STOP) if offset > 0 else (TH_STOP - TH_REV)
    return offset / span

# Steering label: offset from Steer_Center divided by the full pulse span, so pulses
# inside [Steer_MIN, Steer_MAX] land in [-0.5, 0.5].
# NOTE(review): throttle_norm below targets [-1, 1]; confirm the half-width steering
# range is intentional and matches whatever consumes the model's predictions.
df["steer_norm"] = (df.steering_pulse - Steer_Center) / (Steer_MAX - Steer_MIN)
# Throttle label: normalize_throttle applied to each pulse.
df["throttle_norm"] = df.throttle_pulse.apply(normalize_throttle)

## Section 3: Train and Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and hold out 20% for validation; the fixed random_state keeps the
# split identical from run to run.
_split = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
train_df, val_df = _split
print(f'Training samples: {len(train_df)}')
print(f'Validation samples: {len(val_df)}')

## Section 4: Data Pipeline - PyTorch (Preloaded into RAM)

In [None]:
import cv2
from tqdm.auto import tqdm

class DrivingDatasetPreloaded(Dataset):
    """Dataset that reads and resizes every image once, up front, and keeps it in RAM.

    Args:
        df: Table with 'image_path', 'steer_norm' and 'throttle_norm' columns.
        input_size: Target image size as (H, W).
        augment: When True, __getitem__ applies a random horizontal flip (negating
            the steering label) and a random brightness scale to each sample.

    Rows whose image cannot be read (cv2.imread returns None) are skipped and counted
    in a warning. If no image loads at all the dataset is simply empty (len == 0)
    rather than failing while reporting its memory footprint.

    Each item is a tuple (image, steer, throttle): a float32 tensor of shape
    (3, H, W) in RGB order scaled to [-1, 1], and two float32 scalar tensors.
    """

    def __init__(self, df, input_size=(66, 200), augment=False):
        self.input_size = input_size  # (H, W)
        self.augment = augment
        self.images = []
        self.steers = []
        self.throttles = []

        print(f"Preloading {len(df)} images into RAM...")
        failed = 0
        # Walk the three needed columns in lockstep; this avoids building a Series
        # per row the way DataFrame.iterrows does.
        records = zip(df['image_path'], df['steer_norm'], df['throttle_norm'])
        for path, steer, throttle in tqdm(records, total=len(df), desc="Loading"):
            img = cv2.imread(path)
            if img is None:
                failed += 1
                continue
            img = cv2.resize(img, (input_size[1], input_size[0]))  # cv2 uses (W, H)
            self.images.append(img)
            self.steers.append(float(steer))
            self.throttles.append(float(throttle))

        # Summing per-image sizes also works when nothing was loaded, so a wrong
        # image directory surfaces as the warning below instead of an IndexError.
        mem_mb = sum(img.nbytes for img in self.images) / 1e6
        print(f"Loaded {len(self.images)} images ({mem_mb:.1f} MB in RAM)")
        if failed > 0:
            print(f"Warning: {failed} images failed to load")

    def __len__(self):
        """Number of images that were loaded successfully."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return (image, steer, throttle) tensors for sample `idx`."""
        img = self.images[idx]
        steer_val = self.steers[idx]

        if self.augment:
            img = img.copy()  # Don't modify the cached image
            # Random horizontal flip: mirroring the scene mirrors the steering sign.
            if np.random.rand() < 0.5:
                img = cv2.flip(img, 1)
                steer_val = -steer_val
            # Brightness jitter: scale the HSV value channel by a factor in [0.8, 1.2).
            img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            v = img_hsv[:, :, 2].astype(np.float32)
            v = np.clip(v * (0.8 + 0.4 * np.random.rand()), 0, 255)
            img_hsv[:, :, 2] = v.astype(np.uint8)
            img = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR)

        # Normalize to [-1, 1] and convert BGR to RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 127.5 - 1.0

        # PyTorch expects (C, H, W)
        img = np.transpose(img, (2, 0, 1))

        return (
            torch.from_numpy(img),
            torch.tensor(steer_val, dtype=torch.float32),
            torch.tensor(self.throttles[idx], dtype=torch.float32)
        )

### Instantiate Datasets and DataLoaders

In [None]:
BATCH_SIZE = 128

# Only the training set is augmented; validation sees the images as recorded.
train_dataset = DrivingDatasetPreloaded(train_df, input_size=(66, 200), augment=True)
val_dataset = DrivingDatasetPreloaded(val_df, input_size=(66, 200), augment=False)

# Settings shared by both loaders.
# num_workers=0 is fine since data is already in RAM.
# Pinned host memory is only requested when CUDA is present: it exists to speed up
# host-to-GPU copies, and asking for it on a CPU-only machine just produces a warning.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

print(f'Training batches: {len(train_loader)}')
print(f'Validation batches: {len(val_loader)}')

## Section 5: Define Model (NVIDIA PilotNet Architecture)

In [None]:
class PilotNet(nn.Module):
    """Five-convolution, three-dense-layer network with separate steering and throttle heads.

    fc1 expects 64 * 1 * 18 features, which is what the convolution stack produces
    for a (N, 3, 66, 200) input. forward returns two (N, 1) tensors:
    (steering, throttle).
    """

    # (in_channels, out_channels, kernel_size, stride) for conv1..conv5, in order.
    _CONV_SPECS = (
        (3, 24, 5, 2),
        (24, 36, 5, 2),
        (36, 48, 5, 2),
        (48, 64, 3, 1),
        (64, 64, 3, 1),
    )

    def __init__(self):
        super().__init__()

        # Convolutional layers, registered as conv1..conv5
        for index, (c_in, c_out, kernel, stride) in enumerate(self._CONV_SPECS, start=1):
            setattr(self, f'conv{index}', nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=stride))

        # Fully connected layers
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 1 * 18, 100)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(100, 50)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(50, 10)

        # Output heads, both fed by the shared 10-unit feature vector
        self.steering_out = nn.Linear(10, 1)
        self.throttle_out = nn.Linear(10, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of images to (steering, throttle) predictions."""
        # Convolutional layers, each followed by ReLU
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = self.relu(conv(x))

        # Flatten, then dense layers with dropout after the first two
        x = self.flatten(x)
        x = self.dropout1(self.relu(self.fc1(x)))
        x = self.dropout2(self.relu(self.fc2(x)))
        x = self.relu(self.fc3(x))

        # Dual output heads
        return self.steering_out(x), self.throttle_out(x)

# Build the network on the selected device, then show its layers and parameter count.
model = PilotNet().to(device)
print(model)
total_params = sum(p.numel() for p in model.parameters())
print(f'\nTotal parameters: {total_params:,}')

## Section 6: Training Loop with Callbacks

In [None]:
# GPU monitoring utility
import csv
import time
import subprocess

class GPUMonitor:
    """Append periodic nvidia-smi snapshots (utilisation and memory) to a CSV file.

    Typical use: start() opens the file and writes the header, log_batch() is called
    once per batch and records a row on every `every_n`-th call, stop() closes the file.

    Args:
        path: Destination CSV file; it is overwritten by start().
        every_n: Record one row per this many log_batch() calls. A falsy value
            (0 or None) records on every call.
    """

    HEADER = ['ts', 'epoch', 'batch', 'gpu_util', 'mem_used_mb', 'mem_total_mb']
    # Passed as an argument list so no shell is involved in launching nvidia-smi.
    QUERY_CMD = [
        'nvidia-smi',
        '--query-gpu=utilization.gpu,memory.used,memory.total',
        '--format=csv,nounits,noheader',
    ]
    # Upper bound on a single nvidia-smi call so a hung driver cannot stall training.
    QUERY_TIMEOUT_S = 5

    def __init__(self, path='gpu_batch_log.csv', every_n=10):
        self.path = path
        self.every_n = every_n
        self._f = None
        self._w = None
        self.batch_count = 0

    def start(self):
        """Open (and truncate) the log file and write the header row."""
        self.stop()  # a repeated start() must not leak the previous handle
        self._f = open(self.path, 'w', newline='')
        self._w = csv.writer(self._f)
        self._w.writerow(self.HEADER)
        self._f.flush()

    def _snapshot(self):
        """Return (gpu_util, mem_used, mem_total) for the first GPU nvidia-smi lists.

        Monitoring is best-effort: if nvidia-smi is missing, fails, times out or
        prints something unparsable, three NaNs are returned instead of raising.
        """
        try:
            raw = subprocess.check_output(
                list(self.QUERY_CMD),
                stderr=subprocess.DEVNULL,
                timeout=self.QUERY_TIMEOUT_S,
            )
            first_gpu = raw.decode().strip().splitlines()[0]
            util, used, total = [float(x.strip()) for x in first_gpu.split(',')]
            return util, used, total
        except (OSError, subprocess.SubprocessError, ValueError, IndexError):
            return float('nan'), float('nan'), float('nan')

    def log_batch(self, epoch, batch):
        """Count one batch and, on every `every_n`-th call, append a snapshot row.

        Does nothing (beyond counting) when the monitor has not been started or
        has already been stopped.
        """
        self.batch_count += 1
        if self.every_n and (self.batch_count % self.every_n) != 0:
            return
        if self._w is None:
            return
        util, used, total = self._snapshot()
        self._w.writerow([time.time(), epoch, batch, util, used, total])
        self._f.flush()

    def stop(self):
        """Close the log file; safe to call repeatedly or without a prior start()."""
        if self._f:
            self._f.close()
        self._f = None
        self._w = None

print('GPUMonitor ready.')

In [None]:
import copy

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Halve the learning rate when the validation loss has not improved for 3 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Training settings
EPOCHS = 50
STEERING_WEIGHT = 1.0
THROTTLE_WEIGHT = 0.5
EARLY_STOP_PATIENCE = 5

# Mixed precision (autocast + gradient scaling) is only switched on when training on
# CUDA. On the CPU fallback both are created disabled, so the same loop runs in plain
# fp32 instead of requesting CUDA autocast on a machine that has no CUDA device.
USE_AMP = device.type == 'cuda'
scaler = torch.amp.GradScaler('cuda', enabled=USE_AMP)

# Metrics accumulated per epoch; history stores each one as 'train_<name>' / 'val_<name>'.
METRIC_NAMES = ('loss', 'steer_loss', 'throttle_loss', 'steer_mae', 'throttle_mae')

# Tracking
history = {
    'train_loss': [], 'val_loss': [],
    'train_steer_loss': [], 'val_steer_loss': [],
    'train_throttle_loss': [], 'val_throttle_loss': [],
    'train_steer_mae': [], 'val_steer_mae': [],
    'train_throttle_mae': [], 'val_throttle_mae': [],
    'lr': []
}

best_val_loss = float('inf')
best_model_state = None
epochs_no_improve = 0


def _run_epoch(loader, epoch, training):
    """Run one pass over `loader` and return the per-batch mean of every metric.

    With training=True the model is put in train mode, the optimizer is stepped after
    every batch and the GPU monitor is notified. With training=False the model is in
    eval mode, gradients are disabled and no weights change.
    """
    model.train(training)
    totals = dict.fromkeys(METRIC_NAMES, 0.0)

    with torch.set_grad_enabled(training):
        for batch_idx, (images, steers, throttles) in enumerate(loader):
            images = images.to(device, non_blocking=True)
            # Labels arrive as (N,); the heads output (N, 1).
            steers = steers.to(device, non_blocking=True).unsqueeze(1)
            throttles = throttles.to(device, non_blocking=True).unsqueeze(1)

            if training:
                optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast(device_type=device.type, enabled=USE_AMP):
                pred_steer, pred_throttle = model(images)
                steer_loss = criterion(pred_steer, steers)
                throttle_loss = criterion(pred_throttle, throttles)
                loss = STEERING_WEIGHT * steer_loss + THROTTLE_WEIGHT * throttle_loss

            if training:
                # Scaled backward pass (the scaler is a pass-through when AMP is off)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

            totals['loss'] += loss.item()
            totals['steer_loss'] += steer_loss.item()
            totals['throttle_loss'] += throttle_loss.item()
            totals['steer_mae'] += torch.mean(torch.abs(pred_steer - steers)).item()
            totals['throttle_mae'] += torch.mean(torch.abs(pred_throttle - throttles)).item()

            if training:
                gpu_monitor.log_batch(epoch, batch_idx)

    n_batches = len(loader)
    return {name: total / n_batches for name, total in totals.items()}


print(f'Starting training on {device}...')
print(f'Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}')
print(f'Batch size: {BATCH_SIZE}, Epochs: {EPOCHS}')
print(f"Mixed precision (AMP): {'enabled' if USE_AMP else 'disabled'}")
print('-' * 80)

train_start = time.time()

# GPU monitor
gpu_monitor = GPUMonitor(path='gpu_batch_log.csv', every_n=5)
gpu_monitor.start()

# The with/finally pair guarantees both log files are closed even when training is
# interrupted from the notebook or a batch raises.
try:
    # CSV logger
    with open('training.log', 'w', newline='') as csv_log:
        csv_writer = csv.writer(csv_log)
        csv_writer.writerow(['epoch', 'train_loss', 'val_loss', 'train_steer_mae', 'val_steer_mae',
                             'train_throttle_mae', 'val_throttle_mae', 'lr'])

        for epoch in range(EPOCHS):
            epoch_start = time.time()

            train_metrics = _run_epoch(train_loader, epoch, training=True)
            val_metrics = _run_epoch(val_loader, epoch, training=False)

            train_loss = train_metrics['loss']
            val_loss = val_metrics['loss']
            train_steer_mae = train_metrics['steer_mae']
            val_steer_mae = val_metrics['steer_mae']
            train_throttle_mae = train_metrics['throttle_mae']
            val_throttle_mae = val_metrics['throttle_mae']

            # Learning rate scheduler (read the rate after the step so a reduction shows up)
            scheduler.step(val_loss)
            current_lr = optimizer.param_groups[0]['lr']

            # Record history
            for name in METRIC_NAMES:
                history[f'train_{name}'].append(train_metrics[name])
                history[f'val_{name}'].append(val_metrics[name])
            history['lr'].append(current_lr)

            csv_writer.writerow([epoch+1, train_loss, val_loss, train_steer_mae, val_steer_mae,
                                 train_throttle_mae, val_throttle_mae, current_lr])
            csv_log.flush()

            epoch_time = time.time() - epoch_start

            # epoch summary
            print(f'Epoch {epoch+1}/{EPOCHS} ({epoch_time:.1f}s) - '
                  f'loss: {train_loss:.4f} - steer_mae: {train_steer_mae:.4f} - throttle_mae: {train_throttle_mae:.4f} - '
                  f'val_loss: {val_loss:.4f} - val_steer_mae: {val_steer_mae:.4f} - val_throttle_mae: {val_throttle_mae:.4f} - '
                  f'lr: {current_lr:.6f}')

            # Model checkpoint: keep an in-memory copy and a file of the best weights so far
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = copy.deepcopy(model.state_dict())
                torch.save(best_model_state, 'best.pt')
                print(f'  -> Saved best model (val_loss: {val_loss:.6f})')
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= EARLY_STOP_PATIENCE:
                print(f'Early stopping triggered after {epoch+1} epochs')
                break
finally:
    gpu_monitor.stop()

# Restore best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print('Restored best model weights')

total_time = time.time() - train_start
print(f'Training complete! Total time: {total_time:.1f}s')

## Section 7: Evaluate and Visualize

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: total loss per epoch (left) and the learning rate (right).
_hist_fig, (_loss_ax, _lr_ax) = plt.subplots(1, 2, figsize=(10, 4))

_loss_ax.plot(history['train_loss'], label='Train Loss')
_loss_ax.plot(history['val_loss'], label='Val Loss')
_loss_ax.set_xlabel('Epoch')
_loss_ax.set_ylabel('Loss')
_loss_ax.set_title('Total Loss')
_loss_ax.legend()

_lr_ax.plot(history['lr'])
_lr_ax.set_xlabel('Epoch')
_lr_ax.set_ylabel('Learning Rate')
_lr_ax.set_title('Learning Rate Schedule')

_hist_fig.tight_layout()
plt.show()

In [None]:
# Spot check: run the first validation batch through the model and print the first
# five ground-truth labels next to the corresponding predictions.
model.eval()
_first_batch = next(iter(val_loader))
images, steers_gt, throttles_gt = _first_batch
images = images.to(device)

with torch.no_grad():
    pred_steer, pred_throttle = model(images)

_preview = slice(None, 5)
print('Ground Truth Steering:', steers_gt[_preview].numpy())
print('Predicted Steering:   ', pred_steer[_preview].cpu().numpy().flatten())
print()
print('Ground Truth Throttle:', throttles_gt[_preview].numpy())
print('Predicted Throttle:   ', pred_throttle[_preview].cpu().numpy().flatten())

## Deployment

In [None]:
# Save the current weights as a PyTorch state dict.
torch.save(model.state_dict(), 'Jan26_pt.pt')
print('Saved PyTorch model')

# Export the same network to ONNX. The dummy input only fixes the traced
# (channels, height, width); the batch dimension is declared dynamic below.
model.eval()
dummy_input = torch.randn(1, 3, 66, 200).to(device)

_input_names = ['input']
_output_names = ['steering', 'throttle']
_dynamic_axes = {name: {0: 'batch_size'} for name in _input_names + _output_names}

torch.onnx.export(
    model,
    dummy_input,
    '_pt.onnx',
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=_input_names,
    output_names=_output_names,
    dynamic_axes=_dynamic_axes,
)
print('Exported ONNX model')

In [None]:
# Optional sanity check of the exported file; skipped with a hint when the ONNX
# packages are not installed.
try:
    import onnx
    import onnxruntime as ort

    # Structural validation of the exported graph.
    onnx_model = onnx.load('_pt.onnx')
    onnx.checker.check_model(onnx_model)
    print('ONNX model is valid!')

    # One inference on random input to confirm the runtime can execute the graph.
    ort_session = ort.InferenceSession('_pt.onnx')
    test_input = np.random.randn(1, 3, 66, 200).astype(np.float32)
    feeds = {'input': test_input}
    outputs = ort_session.run(None, feeds)
    print(f'ONNX inference test - Steering: {outputs[0][0]}, Throttle: {outputs[1][0]}')
except ImportError:
    print('Install onnx and onnxruntime to verify ONNX model: pip install onnx onnxruntime')

## CLEAR RAM

In [None]:
import gc

# Drop the notebook's references to the preloaded datasets and their loaders, then
# ask Python and the CUDA caching allocator to release what they can.
del train_dataset, val_dataset
del train_loader, val_loader
gc.collect()
torch.cuda.empty_cache()