In [1]:
import sys
import os

# Make the parent directory importable so the project-local `utils` package resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
from utils import get_cuda_info

# Print the PyTorch / CUDA / cuDNN versions and the visible GPU devices (see the cell output).
get_cuda_info()

PyTorch version: 2.5.1+cu118
**********
_CUDA version: 
CUDA version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Oct_30_01:18:48_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0

**********
CUDNN version: 90100
Available GPU devices: 1
Device Name: NVIDIA GeForce RTX 4070 Ti SUPER


## Zdobycie danych

In [3]:
from utils import load_data

# Load the 'nemo_smile' dataset through the project helper; it returns a (data, labels) pair.
all_data, all_labels = load_data('nemo_smile')

## Preprocessing danych

In [15]:
from utils import preprocess_data

# Split the full dataset into train / validation / test features and labels.
# Labels are kept as returned by the loader (binarize_labels=False).
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = preprocess_data(all_data, all_labels, binarize_labels=False)

In [16]:
# Show the shapes of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep='\n')

torch.Size([868, 609, 478, 2])
torch.Size([868])


In [18]:
from utils import get_class_distribution

# Print how many samples carry each label in the full (not yet split) label set.
get_class_distribution(all_labels)

===> Class distribution <===
0: 597
1: 643


# MODEL TORCH

### W podejściu wykorzystane zostaną 2 modele - pierwszy z nich będzie siecią konwolucyjną 1D (`Conv1d`), która będzie miała za zadanie nauczyć się rozpoznawać cechy charakterystyczne dla wybranej klatki (zbioru współrzędnych punktów charakterystycznych). Do klasyfikacji szeregu czasowego zostanie wykorzystana rekurencyjna sieć neuronowa LSTM.

## Zbudowanie modelu ekstrakcji cech

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [25]:
class SmileClassifier(nn.Module):
    """Frame-wise Conv1d feature extractor followed by a bidirectional LSTM with attention pooling.

    Each frame's landmarks are treated as a 1-D sequence with one input channel per
    coordinate. Two Conv1d + MaxPool1d stages extract per-frame features, a bidirectional
    LSTM models the frame sequence, and a learned attention pooling over frames feeds a
    small fully connected head that emits a single raw logit per sample.

    Args:
        num_landmarks: Number of landmarks per frame. The default (478) reproduces the
            previously hard-coded LSTM input size of ``64 * 119``.
        num_coords: Number of coordinates per landmark, used as the Conv1d input
            channel count. Defaults to 2.

    Raises:
        ValueError: If ``num_landmarks`` is too small to survive the two pooling stages.
    """

    def __init__(self, num_landmarks=478, num_coords=2):
        super().__init__()

        # Spatial feature extraction
        self.conv1 = nn.Conv1d(in_channels=num_coords, out_channels=32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=2)

        # The convolutions (kernel 3, padding 1) keep the landmark length unchanged;
        # each MaxPool1d(kernel_size=2) halves it, rounding down.
        # With the default 478 landmarks: 478 -> 239 -> 119.
        pooled_landmarks = (num_landmarks // 2) // 2
        if pooled_landmarks < 1:
            raise ValueError(
                f"num_landmarks must be at least 4 to survive two pooling stages, got {num_landmarks}"
            )
        self.conv_output_size = 64 * pooled_landmarks  # 64 channels * pooled landmarks

        # Temporal feature extraction
        self.lstm = nn.LSTM(input_size=self.conv_output_size, hidden_size=128,
                            batch_first=True, bidirectional=True)

        # Classification head (256 = 128 hidden units * 2 directions)
        self.attn = nn.Linear(256, 1)
        self.fc1 = nn.Linear(256, 64)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, 1)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Apply Xavier-normal init to every weight tensor and set every bias to 0.1."""
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0.1)

    def forward(self, x):
        """Compute one raw logit per sample.

        Args:
            x: Tensor of shape (batch_size, frames, landmarks, coordinates).

        Returns:
            Tensor of shape (batch_size, 1) holding raw (pre-sigmoid) logits.
        """
        batch_size, frames, landmarks, coordinates = x.shape

        # Fold batch and frames together for Conv1d. `reshape` (rather than `view`)
        # also accepts non-contiguous inputs, e.g. tensors produced by transpose/slicing.
        x = x.reshape(-1, landmarks, coordinates)  # (batch*frames, landmarks, coords)
        x = x.permute(0, 2, 1)  # (batch*frames, coords, landmarks)

        # Spatial features
        x = F.relu(self.conv1(x))
        x = self.pool1(x)  # (batch*frames, 32, landmarks//2)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)  # (batch*frames, 64, landmarks//4)

        # Flatten spatial features per frame
        x = x.reshape(batch_size, frames, -1)  # (batch, frames, 64*(landmarks//4))

        # Temporal features with attention pooling over the frame axis
        x, _ = self.lstm(x)
        attn_weights = torch.softmax(self.attn(x), dim=1)
        x = (x * attn_weights).sum(dim=1)

        # Classification head: no sigmoid here, the output is meant for a
        # logits-based loss such as BCEWithLogitsLoss.
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)  # Raw logits

In [26]:
from torch.optim import Adam

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SmileClassifier()

# Weight for the positive class in the loss: (len - sum) / sum of the training labels.
# With 0/1 labels this is the negatives-to-positives ratio.
num_positive = y_train.sum()
num_negative = len(y_train) - num_positive
pos_weight = torch.tensor([num_negative / num_positive]).to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = Adam(model.parameters(), lr=1e-4)

## Trening modelu

In [27]:
from torch.utils.tensorboard import SummaryWriter

# Root folder for TensorBoard runs, resolved against the current working directory.
RUNS_FOLDER_PATH = os.path.abspath('runs')
# Build the log directory from RUNS_FOLDER_PATH instead of repeating the 'runs' literal,
# so the constant is the single place that decides where runs are stored.
writer_path = os.path.join(RUNS_FOLDER_PATH, 'torch_lstm', 'fake_smile_classifier')
writer = SummaryWriter(writer_path)

In [28]:
from utils.model_functions import train_torch_model_binary

# Train on the training split, validating on the validation split after each epoch
# (per-epoch statistics are shown in the cell output); metrics go to the TensorBoard writer.
train_torch_model_binary(
    model,
    criterion,
    optimizer,
    X_train,
    y_train,
    X_val,
    y_val,
    writer=writer,
    batch_size=32,
    unbalanced=True,
    show_prediction_stats=False,
)


                                          EPOCH STATISTICS                                          
Epoch       : 1
----------------------------------------------------------------------------------------------------
                     TRAINING                                         VALIDATION                    
----------------------------------------------------------------------------------------------------
Loss        : 17.130543                                    Loss        : 3.422076
Accuracy    : 0.5876                                    Accuracy    : 0.5645
Precision   : 0.5896                                    Precision   : 0.6114
Recall      : 0.5876                                    Recall      : 0.5570
F1 Score    : 0.5852                                    F1 Score    : 0.4996
----------------------------------------------------------------------------------------------------
                                          VALIDATION EXTRA                              

## Ewaluacja modelu

In [29]:
from utils.model_functions import eval_torch_model_binary

# Evaluate the trained model on the held-out test split with the same loss criterion.
eval_torch_model_binary(model, criterion, X_test, y_test)


                                          EPOCH STATISTICS                                          
Epoch       : 1
----------------------------------------------------------------------------------------------------
                                             VALIDATION                                             
----------------------------------------------------------------------------------------------------
Loss        : 3.415825
Accuracy    : 0.6828
Precision   : 0.6912
Recall      : 0.6874
F1 Score    : 0.6821
----------------------------------------------------------------------------------------------------
                                          VALIDATION EXTRA                                          
TP Rate     : 0.7727                                    FP Rate     : 0.3980



# TODYNET

### Przygotowanie danych

In [30]:
# Convert every split from torch tensors to NumPy arrays (features first, then labels).
X_train_np, X_val_np, X_test_np = (
    features.numpy() for features in (X_train, X_val, X_test)
)
y_train_np, y_val_np, y_test_np = (
    labels.numpy() for labels in (y_train, y_val, y_test)
)

In [31]:
def flatten_landmarks(data):
    """Merge the landmark and coordinate axes of a 4-D array into one feature axis.

    Args:
        data: Array-like with a 4-element ``shape`` of
            (samples, timesteps, landmarks, coords) and a ``reshape`` method.

    Returns:
        The same data reshaped to (samples, timesteps, landmarks * coords).
    """
    samples, timesteps, landmarks, coords = data.shape
    features_per_step = landmarks * coords
    return data.reshape(samples, timesteps, features_per_step)

# Flatten the landmark/coordinate axes of every split in one pass (train, val, test).
X_train_flat, X_val_flat, X_test_flat = map(
    flatten_landmarks, (X_train_np, X_val_np, X_test_np)
)

In [39]:
# Sanity check: the last axis should now be landmarks * coords.
X_train_flat.shape

(868, 609, 956)

In [32]:
# Directory (relative to the current working directory) where the TodyNet inputs are written.
_todynet_path_parts = ("..", "..", "src", "external", "TodyNet", "data", "UCR", "NEMO_SMILE")
TodyNet_DATA_PATH = os.path.join(*_todynet_path_parts)

# Create the directory tree if it is missing; an already existing directory is fine.
os.makedirs(TodyNet_DATA_PATH, exist_ok=True)

In [37]:
import torch

# Convert the flattened splits to float32 tensors and insert a channel axis at position 1.
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(flat_split, dtype=torch.float32).unsqueeze(1)
    for flat_split in (X_train_flat, X_val_flat, X_test_flat)
)

# Save features and integer (long) labels in PyTorch (.pt) format.
# The test split is stored under the bare names 'X.pt' / 'y.pt'.
for _file_name, _payload in (
    ('X_train.pt', X_train_tensor),
    ('X_valid.pt', X_val_tensor),
    ('X.pt', X_test_tensor),
    ('y_train.pt', y_train.long()),
    ('y_valid.pt', y_val.long()),
    ('y.pt', y_test.long()),
):
    torch.save(_payload, os.path.join(TodyNet_DATA_PATH, _file_name))

In [38]:
# Sanity check: a singleton channel axis should now sit at position 1.
X_train_tensor.shape

torch.Size([868, 1, 609, 956])

### Trening modelu – zmniejszone parametry ze względu na wielkość modelu

In [None]:
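# Training is launched from a shell, not from this notebook (Windows-style paths, `&` chains the two commands):
# cd .\src\external\TodyNet\src\ & python train.py --dataset='NEMO_SMILE' --num_layers 1 --in_dim 16 --hidden_dim 16 --out_dim 16 --pool_ratio 0.0 --kern_size "3" --groups 1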

## Najlepszy wynik

TRAIN, epoch 16, loss 0.5146783623277866, acc tensor([75.4608], device='cuda:0')

VAL, loss 0.5493738343638759, acc tensor([73.1183], device='cuda:0')