## Build Model

In [16]:
import torch
import torch.nn as nn
import torchvision.models as models
import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first tensors.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold sines and
    odd feature columns hold cosines of ``position * frequency``.

    Args:
        d_model: Size of the feature dimension. Odd values are supported.
        max_len: Longest sequence length that can be encoded.
    """

    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # With an odd d_model there is one fewer odd column than even column,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model) so it broadcasts over batch
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional table to ``x``.

        Args:
            x: Tensor of shape ``(seq_len, batch, d_model)``.

        Returns:
            Tensor of the same shape as ``x`` with positions added.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the table was
                built with.
        """
        seq_len = x.size(0)
        max_len = self.pe.size(0)
        if seq_len > max_len:
            # Without this check the slice silently truncates and the addition
            # fails with an opaque broadcasting error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {max_len}"
            )
        return x + self.pe[:seq_len]

class FrameFeatureEncoder(nn.Module):
    """Per-frame image encoder built from a ResNet-18 with its ``fc`` layer removed.

    The backbone ends at the global average pool, so each frame is reduced to
    a flat feature vector. If ``output_dim`` differs from the backbone's native
    feature width, a linear projection maps to ``output_dim``; otherwise the
    features are passed through unchanged (no extra parameters are created, so
    existing checkpoints keep loading).

    Args:
        pretrained: Load ImageNet weights for the backbone when True.
        output_dim: Size of the feature vector returned per frame.
    """

    def __init__(self, pretrained=True, output_dim=512):
        super().__init__()
        # Newer torchvision replaces the boolean `pretrained` flag with a
        # `weights` enum; fall back to the legacy flag on older versions.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            resnet = models.resnet18(weights=weights_enum.IMAGENET1K_V1 if pretrained else None)
        else:
            resnet = models.resnet18(pretrained=pretrained)

        backbone_dim = resnet.fc.in_features  # width of the pooled features
        # Keep every child except the final fc layer -> output (B, backbone_dim, 1, 1)
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])

        # Honour `output_dim` instead of silently returning backbone_dim features.
        if output_dim == backbone_dim:
            self.proj = nn.Identity()
        else:
            self.proj = nn.Linear(backbone_dim, output_dim)
        self.output_dim = output_dim

    def forward(self, x):
        """Encode a batch of frames.

        Args:
            x: Tensor of shape ``(B, C, H, W)``.

        Returns:
            Tensor of shape ``(B, output_dim)``.
        """
        features = self.backbone(x)             # (B, backbone_dim, 1, 1)
        features = torch.flatten(features, 1)   # (B, backbone_dim)
        return self.proj(features)              # (B, output_dim)

class TransformerForVideo(nn.Module):
    """Clip classifier: per-frame CNN features -> transformer encoder -> mean pool -> MLP.

    Args:
        feature_dim: Transformer model width and per-frame feature size.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder layer's feed-forward block.
        dropout: Dropout probability used in the encoder and the classifier head.
        num_classes: Number of output logits.
        pretrained_backbone: Forwarded to ``FrameFeatureEncoder`` as ``pretrained``.
    """

    def __init__(self, feature_dim=512, nhead=8, num_layers=3, dim_feedforward=1024, dropout=0.1, num_classes=2, pretrained_backbone=True):
        super().__init__()
        self.frame_encoder = FrameFeatureEncoder(pretrained=pretrained_backbone, output_dim=feature_dim)
        self.feature_dim = feature_dim

        # Learned linear map applied to every frame feature before the transformer
        self.input_proj = nn.Linear(feature_dim, feature_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=feature_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pos_enc = PositionalEncoding(feature_dim, max_len=512)

        # Classification head applied to the temporally mean-pooled encoder output
        self.classifier = nn.Sequential(
            nn.Linear(feature_dim, feature_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(feature_dim//2, num_classes)
        )

    def forward(self, seq):
        """
        seq: (batch, seq_len, C, H, W)
        Returns logits: (batch, num_classes)
        """
        B, S, C, H, W = seq.shape
        # Merge batch and time so all frames go through the frame encoder in one
        # call; reshape (not view) also accepts non-contiguous input.
        frames = seq.reshape(B * S, C, H, W)
        feats = self.frame_encoder(frames)  # (B*S, feature_dim)
        feats = self.input_proj(feats)      # (B*S, feature_dim)
        # Rows are batch-major (row b*S + s is frame s of sample b). Restore
        # (B, S, D) first and then swap axes; viewing directly as (S, B, D)
        # would interleave frames from different samples.
        feats = feats.reshape(B, S, -1).transpose(0, 1).contiguous()  # (S, B, D)
        feats = self.pos_enc(feats)         # add positional encoding
        out = self.transformer(feats)       # (S, B, D)
        out = out.mean(dim=0)               # mean over time -> (B, D)
        logits = self.classifier(out)       # (B, num_classes)
        return logits

## Training code

In [17]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import random_split
from tqdm.auto import tqdm
import numpy as np

def train_one_epoch(model, dataloader, optimizer, criterion, device, scaler=None):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module mapping a batch of sequences to logits ``(B, num_classes)``.
        dataloader: Iterable yielding ``(seqs, labels)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``; expected to
            return the batch-mean loss.
        device: Device the batches are moved to.
        scaler: Optional gradient scaler. When given, the forward pass runs
            under CUDA autocast and the step goes through the scaler.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen, or ``(nan, nan)`` when
        the dataloader yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for seqs, labels in tqdm(dataloader, desc="Train"):
        seqs = seqs.to(device)          # (B, S, C, H, W)
        labels = labels.to(device)
        batch_n = seqs.size(0)

        optimizer.zero_grad()
        if scaler is not None:
            # Mixed-precision path: scale the loss to avoid fp16 gradient underflow
            with torch.cuda.amp.autocast():
                logits = model(seqs)
                loss = criterion(logits, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(seqs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

        # Weight the batch-mean loss by batch size so a short last batch is
        # not over-counted in the epoch average.
        running_loss += loss.item() * batch_n
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += batch_n

    if total == 0:
        # Empty loader: report "undefined" rather than dividing by zero.
        return float("nan"), float("nan")
    return running_loss / total, correct / total

@torch.no_grad()
def eval_one_epoch(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    for seqs, labels in tqdm(dataloader, desc="Val"):
        seqs = seqs.to(device)
        labels = labels.to(device)
        logits = model(seqs)
        loss = criterion(logits, labels)
        running_loss += loss.item() * seqs.size(0)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += seqs.size(0)
    return running_loss / total, correct / total

def fit(model, train_loader, val_loader, epochs=20, lr=1e-4, weight_decay=1e-4, device='cuda', checkpoint_path="best_model.pth"):
    """Train ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: Module to train; moved to ``device`` in place.
        train_loader: Batches passed to ``train_one_epoch``.
        val_loader: Batches passed to ``eval_one_epoch``.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        device: Device string (e.g. ``'cuda'``, ``'cuda:0'``, ``'cpu'``) or a
            ``torch.device``.
        checkpoint_path: File the best ``state_dict`` is written to.

    Returns:
        The highest validation accuracy observed (``0.0`` if it never improved).
    """
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Use CrossEntropyLoss for multiclass (binary as 2-class)
    criterion = nn.CrossEntropyLoss()
    # Normalise through torch.device so both strings and torch.device objects work
    use_cuda = torch.device(device).type == 'cuda'
    scaler = torch.cuda.amp.GradScaler() if use_cuda else None

    best_val_acc = 0.0

    for epoch in range(1, epochs+1):
        print(f"Epoch {epoch}/{epochs}")
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device, scaler)
        val_loss, val_acc = eval_one_epoch(model, val_loader, criterion, device)
        print(f"Train loss {train_loss:.4f} acc {train_acc:.4f} | Val loss {val_loss:.4f} acc {val_acc:.4f}")

        # Persist immediately on improvement; the file on disk is the snapshot,
        # so no in-memory copy of the (still-changing) parameters is kept.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            print("Saved best model (val acc improved).")
    return best_val_acc

## Run Training Loop

In [None]:
from dataset_loader import VideoFolderDataset
from torch.utils.data import DataLoader, random_split

DATA_ROOT = "./output"
SEED = 42  # fixes the train/val split, weight init and shuffling for reproducible runs
class_to_label = {"non-drowsiness_faces":0, "drowsiness_faces":1}

torch.manual_seed(SEED)

# full dataset
dataset = VideoFolderDataset(DATA_ROOT, class_to_label=class_to_label, seq_len=30)

# train/val split (seeded so the same clips land in each split on every run)
val_ratio = 0.2
n_val = int(len(dataset) * val_ratio)
n_train = len(dataset) - n_val
assert n_train > 0 and n_val > 0, f"Dataset too small to split: {len(dataset)} samples"
train_ds, val_ds = random_split(
    dataset, [n_train, n_val], generator=torch.Generator().manual_seed(SEED)
)

device = "cuda" if torch.cuda.is_available() else "cpu"

batch_size = 8  # try 4-16 depending on GPU mem
pin_memory = device == "cuda"  # pinned host memory only helps host-to-GPU copies
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=pin_memory)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=pin_memory)

model = TransformerForVideo(feature_dim=512, nhead=8, num_layers=3, dim_feedforward=1024, pretrained_backbone=True)

best_acc = fit(model, train_loader, val_loader, epochs=15, lr=1e-4, weight_decay=1e-4, device=device)
print("Best val acc:", best_acc)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\kenne/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:01<00:00, 40.1MB/s]


Epoch 1/15


Train: 100%|██████████| 12/12 [01:58<00:00,  9.90s/it]
Val: 100%|██████████| 3/3 [00:15<00:00,  5.23s/it]


Train loss 0.7225 acc 0.4583 | Val loss 0.7368 acc 0.4583
Saved best model (val acc improved).
Epoch 2/15


Train: 100%|██████████| 12/12 [01:53<00:00,  9.42s/it]
Val: 100%|██████████| 3/3 [00:13<00:00,  4.64s/it]


Train loss 0.7417 acc 0.4375 | Val loss 0.7397 acc 0.4583
Epoch 3/15


Train: 100%|██████████| 12/12 [01:54<00:00,  9.52s/it]
Val: 100%|██████████| 3/3 [00:14<00:00,  4.81s/it]


Train loss 0.7275 acc 0.4792 | Val loss 0.6939 acc 0.5417
Saved best model (val acc improved).
Epoch 4/15


Train: 100%|██████████| 12/12 [01:53<00:00,  9.46s/it]
Val:   0%|          | 0/3 [00:00<?, ?it/s]