In [13]:
import os

import torch
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader, random_split
# from torchvision.transforms import functional as F
from torch.nn import functional as F
import torchvision.transforms.functional as VF
# from pycocotools.coco import COCO
import torchvision.transforms.v2 as T
import torch.nn as nn
from torchvision.models import resnet18
from torchvision import transforms

import matplotlib.pyplot as plt
from itertools import cycle

from tqdm.notebook import tqdm

import random
import numpy as np
import torchvision.models as models

import wandb


import import_ipynb

import math

In [14]:
# Authenticate this session with Weights & Biases before any run is created or logged to.
wandb.login()



True

In [15]:
# Select the compute device: CUDA if present, otherwise CPU; Apple-silicon MPS
# takes over whenever it is available (same precedence as before).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# torch.backends.mps.is_available() is the documented availability check and
# exists on every PyTorch build that knows about MPS; torch.mps.is_available()
# is only present in recent releases and raises AttributeError on older ones.
if torch.backends.mps.is_available():
    device = "mps"
print(device)

# reduce cpu contention
torch.set_num_threads(1)
# NOTE(review): NUM_WORKERS is not passed to any DataLoader in the visible cells.
NUM_WORKERS = 6  # adjust based on CPU cores

mps


In [16]:
# --- Task definition ---------------------------------------------------------
NUM_KEYPOINTS = 17   # default keypoint count for the model's keypoint branch
NUM_CLASSES = 5      # number of action classes the classifier head predicts

# --- Input pipeline ----------------------------------------------------------
BATCH_SIZE = 8
CROP_SIZE = (256, 256)
DATA_AUGMENTATION = False

# --- Optimisation ------------------------------------------------------------
EPOCHS = 20
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-4

In [17]:
def train_one_epoch(model, dataloader, optimizer, criterion, verbose_tqdm=False, device=None):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: module invoked as ``model(imgs, keypoints)``; switched to train mode.
        dataloader: iterable yielding ``(imgs, keypoints, vals)`` tensor batches.
        optimizer: optimizer whose gradients are zeroed and stepped once per batch.
        criterion: callable ``criterion(preds, vals)`` returning a scalar loss tensor.
        verbose_tqdm: when True, wrap the loader in a ``tqdm`` progress bar.
        device: device every batch is moved to. ``None`` (default) follows the
            device of the model's first parameter, falling back to CPU for a
            parameter-less model, so the function no longer depends on a
            notebook-global ``device`` variable.

    Returns:
        float: unweighted mean of the per-batch losses, or ``nan`` when the
        loader yields no batches (previously a ``ZeroDivisionError``).
    """
    model.train()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Local name deliberately differs from the `dl` data-module alias used elsewhere.
    batches = tqdm(dataloader, desc="Training") if verbose_tqdm else dataloader

    total_loss = 0.0
    num_batches = 0
    for imgs, keypoints, vals in batches:
        imgs = imgs.to(device)
        keypoints = keypoints.to(device)
        vals = vals.to(device)

        optimizer.zero_grad()

        preds = model(imgs, keypoints)
        loss = criterion(preds, vals)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # An empty loader (e.g. drop_last=True on a tiny dataset) has no defined mean.
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

def evaluate(model, dataloader, criterion, verbose_tqdm=False, device=None):
    """Compute the mean batch loss of ``model`` over ``dataloader`` without gradients.

    Args:
        model: module invoked as ``model(imgs, keypoints)``; switched to eval mode.
        dataloader: iterable yielding ``(imgs, keypoints, vals)`` tensor batches.
        criterion: callable ``criterion(preds, vals)`` returning a scalar loss tensor.
        verbose_tqdm: when True, wrap the loader in a ``tqdm`` progress bar.
        device: device every batch is moved to. ``None`` (default) follows the
            device of the model's first parameter, falling back to CPU for a
            parameter-less model, so the function no longer depends on a
            notebook-global ``device`` variable.

    Returns:
        float: unweighted mean of the per-batch losses, or ``nan`` when the
        loader yields no batches (previously a ``ZeroDivisionError``).
    """
    model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Local name deliberately differs from the `dl` data-module alias used elsewhere.
    batches = tqdm(dataloader, desc="Evaluating") if verbose_tqdm else dataloader

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for imgs, keypoints, vals in batches:
            imgs = imgs.to(device)
            keypoints = keypoints.to(device)
            vals = vals.to(device)

            preds = model(imgs, keypoints)
            total_loss += criterion(preds, vals).item()
            num_batches += 1

    # An empty loader has no defined mean; nan also never counts as an improvement
    # in a `val_loss < best` comparison.
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

In [18]:

class ActionsFusionModel(nn.Module):
    """Action classifier fusing frozen ResNet-18 image features with keypoint features.

    Both inputs carry a frame axis: image features and keypoint features are
    computed per frame, averaged over frames, concatenated and classified.

    Args:
        num_keypoints: keypoints per frame; each contributes an (x, y) pair.
        num_actions: number of output classes (logits).
    """

    def __init__(self, num_keypoints=NUM_KEYPOINTS, num_actions=10):
        super().__init__()

        # === Visual branch (CNN) ===
        # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
        # weight set it resolved to, so the initial weights are unchanged.
        base_model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        # Everything except the final fully-connected layer.
        self.cnn_backbone = nn.Sequential(*list(base_model.children())[:-1])
        self.feature_dim_img = base_model.fc.in_features  # 512

        # The backbone is a fixed feature extractor: no gradients, and eval mode
        # so its BatchNorm layers keep the pretrained running statistics.
        for param in self.cnn_backbone.parameters():
            param.requires_grad = False
        self.cnn_backbone.eval()

        # === Keypoint branch ===
        self.keypoint_dim = num_keypoints * 2  # (x, y) for every keypoint

        self.keypoint_mlp = nn.Sequential(
            nn.Linear(self.keypoint_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 128),
            nn.ReLU()
        )

        # === Classifier on the fused features ===
        self.classifier = nn.Sequential(
            nn.Linear(self.feature_dim_img + 128, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(512, 256),
            nn.ReLU(),

            nn.Linear(256, num_actions)
        )

    def train(self, mode=True):
        """Set train/eval mode on the trainable heads while keeping the backbone in eval.

        ``torch.no_grad()`` and ``requires_grad=False`` do not stop BatchNorm from
        updating its running statistics in train mode, so without this override
        every ``model.train()`` pass would silently drift the frozen backbone.
        """
        super().train(mode)
        self.cnn_backbone.eval()
        return self

    def forward(self, x_img, x_kp):
        """Return class logits of shape (B, num_actions).

        Args:
            x_img: image tensor of shape (B, T, C, H, W).
            x_kp: keypoint tensor of shape (B, T, N, D); the last two axes are
                flattened and must total ``keypoint_dim`` values per frame.
        """
        # === Images ===
        batch, frames, channels, height, width = x_img.shape
        # reshape (not view) so non-contiguous inputs, e.g. after a permute, also work.
        x_img = x_img.reshape(batch * frames, channels, height, width)

        with torch.no_grad():
            feat_img = self.cnn_backbone(x_img)  # (B*T, 512, 1, 1)
        feat_img = feat_img.reshape(batch, frames, self.feature_dim_img)
        feat_img = feat_img.mean(dim=1)  # (B, 512)

        # === Keypoints ===
        kp_batch, kp_frames, _, _ = x_kp.shape  # unpacking enforces a 4-D input
        x_kp = x_kp.reshape(kp_batch, kp_frames, -1)  # (B, T, N*2)
        feat_kp = self.keypoint_mlp(x_kp)             # (B, T, 128)
        feat_kp = feat_kp.mean(dim=1)                 # (B, 128)

        # === Fusion ===
        fused = torch.cat([feat_img, feat_kp], dim=1)  # (B, 640)

        out = self.classifier(fused)  # (B, num_actions)
        return out


In [19]:
import sys
# Put the parent directory on the import path so the `actions` package below resolves
# (presumably this notebook sits one level below the project root — confirm).
sys.path.append("..")

# Data module aliased as `dl`; later cells read dataset_train / dataset_valid / dataset_test from it.
import actions.data_loader_keypoints_simple_aug as dl



In [20]:


# Training: shuffle, and drop the ragged final batch so every step sees a full batch.
train_loader_fused = DataLoader(dl.dataset_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
# Validation / test: keep every sample. With drop_last=True up to BATCH_SIZE-1
# samples were silently excluded from the reported metrics. The final batch of
# these loaders may therefore be smaller than BATCH_SIZE.
val_loader_fused   = DataLoader(dl.dataset_valid, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_loader_fused  = DataLoader(dl.dataset_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

# Model (keypoint count comes from the shared constant instead of a repeated literal 17)
model = ActionsFusionModel(num_keypoints=NUM_KEYPOINTS, num_actions=NUM_CLASSES).to(device)



In [21]:
# Loss for the multi-class action logits.
criterion = nn.CrossEntropyLoss()

# Adam over the model's parameters, with learning rate and weight decay taken
# from the config constants.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

In [22]:
# Run metadata uploaded to Weights & Biases alongside the metrics.
wandb_config = {
    "epochs": EPOCHS,
    "learning_rate": LEARNING_RATE,
    "weight_decay": WEIGHT_DECAY,
    "batch_size": BATCH_SIZE,
    "train_size": len(dl.dataset_train),
    "val_size": len(dl.dataset_valid),
    "test_size": len(dl.dataset_test),
    # Derived from the live model object: the previous hard-coded
    # "ActionsBaselineModel" mislabelled runs of ActionsFusionModel.
    "model": type(model).__name__,
    "criterion": "Cross entropy loss",
    "optimizer": "Adam",
    "crop_size": CROP_SIZE,
    "device": device,
    "data_augmentation": DATA_AUGMENTATION
}

# NOTE(review): the group is still "ActionsBaseline" although this notebook trains
# the fusion model — confirm whether these runs belong in that group.
wandb.init(
    entity="fejowo5522-",
    project="NN_Project",
    config=wandb_config,
    group="ActionsBaseline"
)

0,1
epoch,▁▃▆█
train_loss,█▆▃▁
val_loss,█▅▃▁

0,1
epoch,4.0
train_loss,1.27584
val_loss,1.34004


In [23]:
# Progress reporting for the per-epoch train / validation passes.
verbose_tqdm = True

# Early-stopping switch and budget.
# NOTE(review): patience equals EPOCHS (20), so the no-improvement counter cannot
# reach it before the final epoch — early stopping never shortens a run as configured.
early_stopping = True
patience = 20

# Mutable bookkeeping updated by the training loop.
epochs_no_improve = 0
best_val_loss = float("inf")
train_losses, val_losses = [], []

In [24]:
from tqdm.auto import tqdm

for epoch in tqdm(range(EPOCHS)):
    # One optimisation pass; its mean loss is recorded before validation starts.
    train_loss = train_one_epoch(
        model, train_loader_fused, optimizer, criterion, verbose_tqdm=verbose_tqdm
    )
    train_losses.append(train_loss)

    # One validation pass.
    val_loss = evaluate(
        model, val_loader_fused, criterion, verbose_tqdm=verbose_tqdm
    )
    val_losses.append(val_loss)

    # Report to wandb with a 1-based epoch number.
    wandb.log({'epoch': epoch + 1, 'train_loss': train_loss, 'val_loss': val_loss})

    # Checkpointing and early stopping only happen when the switch is on.
    if not early_stopping:
        continue

    if val_loss < best_val_loss:
        # New best validation loss: reset the counter and checkpoint the weights.
        best_val_loss, epochs_no_improve = val_loss, 0
        torch.save(model.state_dict(), "best_model.pth")
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

  0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Training:   0%|          | 0/43 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
def test(model, dataloader, criterion, verbose_tqdm=False):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    dl = tqdm(dataloader, desc="Testing") if verbose_tqdm else dataloader
    with torch.no_grad():
        for imgs, keypoints, vals in dl:
            imgs = imgs.to(device)
            keypoints = keypoints.to(device)
            vals = vals.to(device)
            
            preds = model(imgs, keypoints)
            loss = criterion(preds, vals)
            total_loss += loss.item()

            # Oblicz przewidywaną i prawdziwą klasę
            predicted_classes = torch.argmax(preds, dim=1)
            true_classes = torch.argmax(vals, dim=1)

            # Licz poprawne
            correct += (predicted_classes == true_classes).sum().item()
            total += vals.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total if total > 0 else 0.0
    return avg_loss, accuracy

# The training loop checkpoints the lowest-validation-loss weights to
# "best_model.pth" when early stopping is enabled, but nothing ever read them
# back, so the test metrics described whatever the last epoch produced.
# Restore that checkpoint before testing. Guarded by the flag so a stale file
# from an unrelated run is not picked up when no checkpoint was written.
best_checkpoint_path = "best_model.pth"
if early_stopping and os.path.isfile(best_checkpoint_path):
    # weights_only=True: the file holds a plain state_dict, so refuse arbitrary pickles.
    best_state = torch.load(best_checkpoint_path, map_location=device, weights_only=True)
    model.load_state_dict(best_state)

test_avg_loss, test_acc = test(model, test_loader_fused, criterion)
wandb.log({
    'test_loss': test_avg_loss,
    'test_accuracy': test_acc
})