In [2]:
import os

import torch
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader, random_split
# from torchvision.transforms import functional as F
from torch.nn import functional as F
import torchvision.transforms.functional as VF
# from pycocotools.coco import COCO
import torchvision.transforms.v2 as T
import torch.nn as nn
from torchvision.models import resnet18
from torchvision import transforms

import matplotlib.pyplot as plt
from itertools import cycle

from tqdm.notebook import tqdm

import random
import numpy as np
import torchvision.models as models

import wandb


import import_ipynb

import math

In [3]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmaciejdengusiak[0m ([33mfejowo5522-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if torch.mps.is_available():
    device = "mps"
print(device)

# reduce cpu contention
torch.set_num_threads(1)
NUM_WORKERS = 6  # adjust based on CPU cores

mps


In [5]:
NUM_KEYPOINTS = 17

NUM_CLASSES = 51
EPOCHS = 100
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-4
BATCH_SIZE = 16

CROP_SIZE = (256, 256)
DATA_AUGMENTATION = False

In [6]:
def train_one_epoch(model, dataloader, optimizer, criterion, verbose_tqdm=False):
    model.train()
    total_loss = 0.0

    dl = tqdm(dataloader, desc="Training") if verbose_tqdm else dataloader    
    for imgs, keypoints in dl:
        imgs = imgs.to(device)
        keypoints = keypoints.to(device)
        
        optimizer.zero_grad()
        
        preds = model(imgs)
        loss = criterion(preds, keypoints)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, verbose_tqdm=False):
    model.eval()
    total_loss = 0.0

    dl = tqdm(dataloader, desc="Evaluating") if verbose_tqdm else dataloader
    with torch.no_grad():
        for imgs, keypoints in dl:
            imgs = imgs.to(device)
            keypoints = keypoints.to(device)
            
            preds = model(imgs)
            loss = criterion(preds, keypoints)
            
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [7]:
class FusionDataset(torch.utils.data.Dataset):
    def __init__(self, base_dataset, keypoint_model, device):
        self.base_dataset = base_dataset
        self.keypoint_model = keypoint_model.eval().to(device)
        self.device = device

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        frames, label = self.base_dataset[idx]  # frames: (16, 3, 224, 224)
        frames = frames.to(self.device)

        keypoints_seq = []
        with torch.no_grad():
            for frame in frames:
                keypoints = self.keypoint_model(frame.unsqueeze(0))  # (1, 3, 224, 224)
                keypoints_seq.append(keypoints.squeeze(0))  # (17, 2)

        keypoints_tensor = torch.stack(keypoints_seq)  # shape: (16, 17, 2)

        return (frames.cpu(), keypoints_tensor.cpu()), label


In [8]:


class ActionsFusionModel(nn.Module):
    def __init__(self, num_keypoints=NUM_KEYPOINTS, num_actions=10):
        super().__init__()

        # === Wizualny tor (CNN) ===
        base_model = models.resnet18(pretrained=True)
        self.cnn_backbone = nn.Sequential(*list(base_model.children())[:-1])
        self.feature_dim_img = base_model.fc.in_features  # 512

        for param in self.cnn_backbone.parameters():
            param.requires_grad = False

        # === Tor dla keypointów === 
        self.keypoint_dim = num_keypoints * 2  # (x,y) dla każdego punktu

        self.keypoint_mlp = nn.Sequential(
            nn.Linear(self.keypoint_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 128),
            nn.ReLU()
        )

        # === Klasyfikator na podstawie fuzji ===
        self.classifier = nn.Sequential(
            nn.Linear(self.feature_dim_img + 128, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(512, 256),
            nn.ReLU(),

            nn.Linear(256, num_actions)
        )

    def forward(self, x_img, x_kp):
        # === Obrazy ===
        B, T, C, H, W = x_img.shape
        x_img = x_img.view(B * T, C, H, W)

        with torch.no_grad():
            feat_img = self.cnn_backbone(x_img)  # (B*T, 512, 1, 1)
        feat_img = feat_img.view(B, T, self.feature_dim_img)
        feat_img = feat_img.mean(dim=1)  # (B, 512)

        # === Keypointy ===
        B, T, N, _ = x_kp.shape
        x_kp = x_kp.view(B, T, -1)           # (B, T, N*2)
        feat_kp = self.keypoint_mlp(x_kp)    # (B, T, 128)
        feat_kp = feat_kp.mean(dim=1)        # (B, 128)

        # === Fuzja ===
        fused = torch.cat([feat_img, feat_kp], dim=1)  # (B, 640)

        out = self.classifier(fused)  # (B, num_actions)
        return out


In [9]:

class KeypointCropModel(nn.Module):
    def __init__(self):
        super().__init__()
        resnet = resnet18(pretrained=True)
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])  # Remove avgpool and fc
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, NUM_KEYPOINTS * 3)  # Predict x, y, confidence for each keypoint
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = x.view(-1, NUM_KEYPOINTS, 3)
        return x

class PersonKeypointPipeline:
    def __init__(self, detector, keypoint_model, crop_transform, device='cpu', detection_threshold=0.8, crop_size=256):
        self.detector = detector.to(device)
        self.detector.eval()
        self.keypoint_model = keypoint_model.to(device)
        self.keypoint_model.eval()
        self.crop_transform = crop_transform
        self.device = device
        self.detection_threshold = detection_threshold
        self.crop_size = crop_size

    def predict(self, pil_img):
        """
        Args:
            pil_img: PIL.Image.Image, RGB image

        Returns:
            List of dicts, each with:
                - 'box': [x1, y1, x2, y2]
                - 'keypoints': np.ndarray of shape (NUM_KEYPOINTS, 3)
        """
        # Detect people
        img_tensor = T.Compose([T.ToTensor()])(pil_img).unsqueeze(0).to(self.device)
        with torch.no_grad():
            detections = self.detector(img_tensor)[0]

        person_mask = (detections['labels'] == 1) & (detections['scores'] > self.detection_threshold)
        boxes = detections['boxes'][person_mask].cpu().numpy()

        results = []
        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            crop = pil_img.crop((x1, y1, x2, y2)).resize((self.crop_size, self.crop_size))
            crop_tensor = self.crop_transform(crop).unsqueeze(0).to(self.device)
            with torch.no_grad():
                pred_kps = self.keypoint_model(crop_tensor)[0].cpu()
            pred_kps[:, 2] = torch.sigmoid(pred_kps[:, 2])

            # Map keypoints back to original image coordinates
            box_w, box_h = x2 - x1, y2 - y1
            mapped_kps = []
            for kp in pred_kps:
                orig_x = kp[0] * (box_w / self.crop_size) + x1
                orig_y = kp[1] * (box_h / self.crop_size) + y1
                visible = 1 if kp[2] > 0.5 else 0
                mapped_kps.append([orig_x.item(), orig_y.item(), visible])
            mapped_kps = np.array(mapped_kps)

            results.append({'box': [x1, y1, x2, y2], 'keypoints': mapped_kps})

        return results
    
class CropTransform:
    def __init__(self, augmentation=False):
        if augmentation:
            self.transform = transforms.Compose([
                transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0)),
                transforms.RandomApply([transforms.RandomGrayscale(p=1.0)], p=0.2),
                transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.3),
                transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
        else:
            self.transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
    
    def __call__(self, img):
        return self.transform(img)

In [11]:
import sys
sys.path.append("..")

import actions.data_loader as dl
import keypoints.keypoints_boundingbox_approach as kp



loaded_pipeline = KeypointCropModel().to(device)
loaded_pipeline = torch.load("../../models/keypoints_model_pipeline.pth", map_location=device, weights_only=False)

In [None]:

model = KeypointCropModel().to(device)
model.load_state_dict(torch.load("../../models/bb_23loss_keypoint_crop_model.pth", map_location=device))
model.eval()


img_tensor = dl.dataset_train[0][0][0]


# preds = loa


In [12]:
# x = dl.dataset_train[0][0][0]


# predictions = loaded_pipeline.predict(x)



In [None]:
def extract_keypoints_batch(model, imgs):
    """
    imgs: (B, T, 3, H, W)
    Zwraca: (B, T, N, 2) – keypointy
    """
    B, T, C, H, W = imgs.shape
    imgs = imgs.to(device)

    keypoints_list = []

    for b in range(B):
        sample_keypoints = []
        for t in range(T):
            frame = imgs[b, t].unsqueeze(0)  # (1, 3, H, W)
            kp = model(frame)  # np. (1, N, 2)
            sample_keypoints.append(kp.squeeze(0).cpu())  # (N, 2)
        sample_keypoints = torch.stack(sample_keypoints, dim=0)  # (T, N, 2)
        keypoints_list.append(sample_keypoints)

    keypoints_tensor = torch.stack(keypoints_list, dim=0)  # (B, T, N, 2)
    return keypoints_tensor

In [None]:
class FusionWrapperDataset(torch.utils.data.Dataset):
    def __init__(self, base_dataset, keypoint_model):
        self.base = base_dataset
        self.keypoint_model = keypoint_model

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        imgs, label = self.base[idx]  # imgs: (T, 3, H, W)
        imgs = imgs.unsqueeze(0)  # (1, T, 3, H, W)

        keypoints = extract_keypoints_batch(self.keypoint_model, imgs)  # (1, T, N, 2)
        keypoints = keypoints.squeeze(0)  # (T, N, 2)

        return imgs.squeeze(0), keypoints, label

In [None]:
fusion_train = FusionWrapperDataset(dl.dataset_train, keypoint_model)
fusion_val   = FusionWrapperDataset(dl.dataset_valid, keypoint_model)
fusion_test  = FusionWrapperDataset(dl.dataset_test, keypoint_model)

train_loader_fused = DataLoader(fusion_train, batch_size=BATCH_SIZE, shuffle=True)
val_loader_fused   = DataLoader(fusion_val, batch_size=BATCH_SIZE, shuffle=False)
test_loader_fused  = DataLoader(fusion_test, batch_size=BATCH_SIZE, shuffle=False)

# Model
model = ActionsFusionModel(num_keypoints=17, num_classes=51).to(device)

TypeError: ActionsFusionModel.__init__() got an unexpected keyword argument 'num_classes'

In [None]:
criterion = nn.CrossEntropyLoss();
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [None]:
wandb_config = {
    "epochs": EPOCHS,
    "learning_rate": LEARNING_RATE,
    "weight_decay": WEIGHT_DECAY,
    "batch_size": BATCH_SIZE,
    "train_size": len(fusion_train),
    "val_size": len(fusion_val),
    "test_size": len(fusion_test),
    "model": "ActionsBaselineModel",
    "criterion": "Cross entropy loss",
    "optimizer": "Adam",
    "crop_size": CROP_SIZE,
    "device": device,
    "data_augmentation": DATA_AUGMENTATION
}

wandb.init(
    entity="fejowo5522-",
    project="NN_Project",
    config=wandb_config,
    group="ActionsBaseline"
)

In [None]:
verbose_tqdm = True
early_stopping = True
patience = 20
best_val_loss = float('inf')
epochs_no_improve = 0

train_losses = []
val_losses = []

In [None]:
from tqdm.auto import tqdm

for epoch in tqdm(range(EPOCHS)):
    # print(f"Epoch {epoch+1}/{EPOCHS}")
    
    # Train
    train_loss = train_one_epoch(model, dl.train_loader, optimizer, criterion, verbose_tqdm=verbose_tqdm)
    train_losses.append(train_loss)
    
    # Validate
    val_loss = evaluate(model, dl.val_loader, criterion, verbose_tqdm=verbose_tqdm)
    val_losses.append(val_loss)
    
    # print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    # Log to wandb
    wandb.log({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'val_loss': val_loss
    })
    
    # Early stopping
    if early_stopping:
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            epochs_no_improve += 1
        
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break