In [1]:
import sys
import os

# Make the parent directory importable (presumably where the project-local `utils`
# package lives). Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from utils import get_cuda_info

# Print the PyTorch, CUDA and cuDNN versions plus the GPUs visible to this kernel
# (see the cell output below); a quick check that the environment is as expected.
get_cuda_info()

PyTorch version: 2.5.1+cu118
**********
_CUDA version: 
CUDA version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Oct_30_01:18:48_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0

**********
CUDNN version: 90100
Available GPU devices: 1
Device Name: NVIDIA GeForce RTX 4070 Ti SUPER


## Wybór optymalnych punktów charakterystycznych

In [3]:
import numpy as np

# Index of the landmark handed to the distance conversion as the reference point.
REFERENCE_POINT_IDX = 0

# Array of pre-selected landmark indexes, stored two directory levels above the notebook.
LANDMARK_INDEXES = np.load(
    os.path.join(
        '..', '..', 'data', 'landmarks',
        'combined_selected_points_emotions.npy',
    )
)

## Zdobycie danych

In [4]:
from utils import load_data, convert_landmarks_to_distances

# Load the 'ravdess' dataset through the project helper; it yields the samples and their labels.
all_data, all_labels = load_data('ravdess')
# Replace the loaded samples with the output of the distance conversion for the selected
# landmarks. NOTE(review): presumably distances to landmark REFERENCE_POINT_IDX — confirm in utils.
all_data = convert_landmarks_to_distances(all_data, LANDMARK_INDEXES, REFERENCE_POINT_IDX)

## Preprocessing danych

In [None]:
from utils import preprocess_data

# Produce the train / validation / test inputs and labels used by the rest of the notebook.
# NOTE(review): split ratios, shuffling/seed and any scaling are decided inside
# preprocess_data — confirm there; the shapes printed in the next cell show torch tensors.
X_train, X_val, X_test, y_train, y_val, y_test = preprocess_data(all_data, all_labels)

In [18]:
# Show the shapes of the training inputs and training labels, one per line.
print(X_train.shape, y_train.shape, sep='\n')

torch.Size([2012, 157, 154])
torch.Size([2012, 8])


# MODEL TORCH

### W podejściu wykorzystany zostanie jeden model złożony z dwóch części – pierwsza z nich to jednowymiarowa sieć konwolucyjna (Conv1d), która ma za zadanie nauczyć się rozpoznawać cechy charakterystyczne dla pojedynczej klatki (zbioru odległości wyznaczonych dla punktów charakterystycznych). Do klasyfikacji szeregu czasowego zostaną wykorzystane sekwencyjne warstwy rekurencyjne LSTM.

## Zbudowanie modelu ekstrakcji cech

In [19]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class EmotionClassifier(nn.Module):
    """Conv1d + stacked-LSTM classifier for sequences of per-frame landmark features.

    Every frame is passed through a shared 1-D convolution and max-pooling stage,
    the resulting per-frame feature vectors are fed to a bidirectional LSTM
    followed by a unidirectional LSTM, and the output at the last timestep is
    mapped to class logits.

    Args:
        number_landmarks: Number of features per frame. When ``None`` (default)
            it is read from the notebook-level ``LANDMARK_INDEXES.size`` at
            construction time, so re-running the landmark cell is picked up
            without having to re-define this class.
        num_classes: Number of output logits (8 emotion classes by default).

    Input:
        Tensor of shape ``(batch_size, frames, features_per_frame)``.

    Output:
        Tensor of raw logits with shape ``(batch_size, num_classes)``.
    """

    def __init__(self, number_landmarks=None, num_classes=8):
        super().__init__()

        # Resolve the default lazily instead of freezing it at class-definition time.
        if number_landmarks is None:
            number_landmarks = LANDMARK_INDEXES.size

        self.number_landmarks = number_landmarks
        self.num_classes = num_classes

        # Spatial feature extraction using Conv1D
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, padding=1)  # in_channels=1 for distances
        self.pool1 = nn.MaxPool1d(kernel_size=2)

        # Calculate the number of features after Conv1D and pooling
        # Conv1D output: (batch_size * frames, 32, number_landmarks)
        # Pool1D output: (batch_size * frames, 32, number_landmarks // 2)
        self.flattened_features = 32 * (number_landmarks // 2)

        # LSTM layers for temporal feature extraction
        self.lstm1 = nn.LSTM(input_size=self.flattened_features, hidden_size=128, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(input_size=128 * 2, hidden_size=64, batch_first=True)

        # Fully connected classification layer
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return class logits for a batch of frame sequences.

        Args:
            x: Tensor of shape ``(batch_size, frames, features_per_frame)``.

        Returns:
            Tensor of shape ``(batch_size, num_classes)`` with unnormalised logits.

        Raises:
            ValueError: If ``x`` is not a 3-D tensor.
        """
        if x.dim() != 3:
            raise ValueError(
                f"expected input of shape (batch_size, frames, landmarks), got {tuple(x.shape)}"
            )
        batch_size, frames, landmarks = x.shape

        # Fold the time axis into the batch axis so the convolution sees one frame per row.
        # reshape (unlike view) also accepts non-contiguous inputs such as transposed or
        # sliced tensors, copying only when it has to.
        x = x.reshape(batch_size * frames, 1, landmarks)  # Shape: (batch_size * frames, 1, landmarks)

        # Spatial feature extraction
        x = F.relu(self.conv1(x))  # Shape: (batch_size * frames, 32, landmarks)
        x = self.pool1(x)          # Shape: (batch_size * frames, 32, landmarks // 2)

        # Restore the time axis and flatten channels x positions into one feature vector per frame.
        x = x.reshape(batch_size, frames, -1)  # Shape: (batch_size, frames, 32 * (landmarks // 2))

        # Temporal feature extraction
        x, _ = self.lstm1(x)  # Shape: (batch_size, frames, 128 * 2)
        x, _ = self.lstm2(x)  # Shape: (batch_size, frames, 64)

        # Classification from the last timestep's output.
        # NOTE(review): if sequences are padded to a common length, the last timestep may be
        # padding — confirm against the preprocessing code.
        return self.fc(x[:, -1, :])  # Shape: (batch_size, num_classes)

In [21]:
import torch
from torch.optim import Adam

# Tunable settings for this cell.
SEED = 42
LEARNING_RATE = 1e-4

# Seed PyTorch's RNG before the model is built so the weight initialisation (and any
# later torch-based randomness) is reproducible across Restart & Run All.
torch.manual_seed(SEED)

model = EmotionClassifier()
# NOTE(review): y_train is printed as [N, 8], i.e. the labels look one-hot encoded;
# CrossEntropyLoss accepts class-probability targets only as floating-point tensors —
# confirm the label dtype in the training helper.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

## Trening modelu

In [None]:
from torch.utils.tensorboard import SummaryWriter

# Root directory for TensorBoard runs, resolved once against the current working directory.
RUNS_FOLDER_PATH = os.path.abspath('runs')
# Build the run directory from the constant above instead of repeating the 'runs' literal,
# so changing RUNS_FOLDER_PATH actually relocates the logs.
writer_path = os.path.join(RUNS_FOLDER_PATH, 'torch_lstm', 'emotion_classifier_landmark_distance')
writer = SummaryWriter(writer_path)

In [None]:
from utils.model_functions import train_torch_model_multiclass

# Train `model` on the training split; the validation split and the TensorBoard writer are
# handed to the project helper as well. NOTE(review): epoch count (the log shows .../300),
# batching and device placement are decided inside train_torch_model_multiclass — confirm there.
train_torch_model_multiclass(model, criterion, optimizer, X_train, y_train, X_val, y_val, writer)

Epoch 1/300, Train Loss: 130.3934, Train Acc: 0.1332, Val Loss: 28.8059, Val Acc: 0.1346
Epoch 2/300, Train Loss: 129.3320, Train Acc: 0.1645, Val Loss: 28.6593, Val Acc: 0.1694
Epoch 3/300, Train Loss: 127.0614, Train Acc: 0.1834, Val Loss: 27.8937, Val Acc: 0.1787
Epoch 4/300, Train Loss: 124.8661, Train Acc: 0.1899, Val Loss: 27.5323, Val Acc: 0.1787
Epoch 5/300, Train Loss: 124.3736, Train Acc: 0.2063, Val Loss: 27.9171, Val Acc: 0.1601
Epoch 6/300, Train Loss: 123.3646, Train Acc: 0.1958, Val Loss: 27.3314, Val Acc: 0.1624
Epoch 7/300, Train Loss: 123.6661, Train Acc: 0.1948, Val Loss: 27.2683, Val Acc: 0.1995
Epoch 8/300, Train Loss: 122.8866, Train Acc: 0.2232, Val Loss: 27.2743, Val Acc: 0.2019
Epoch 9/300, Train Loss: 123.2685, Train Acc: 0.2102, Val Loss: 27.6643, Val Acc: 0.1926
Epoch 10/300, Train Loss: 122.5705, Train Acc: 0.2247, Val Loss: 27.3693, Val Acc: 0.1926
Epoch 11/300, Train Loss: 122.7633, Train Acc: 0.2162, Val Loss: 27.3115, Val Acc: 0.1972
Epoch 12/300, Train

KeyboardInterrupt: 

## Ewaluacja modelu

In [None]:
from utils.model_functions import eval_torch_model_multiclass

# Evaluate the current in-memory `model` on the held-out test split; the helper reports the
# test loss and accuracy. NOTE(review): the recorded training run in this notebook ends with
# a KeyboardInterrupt, so re-run training to completion before trusting the reported number.
eval_torch_model_multiclass(model, criterion, X_test, y_test)

Test Loss: 0.0439, Test Accuracy: 0.4931
