In [1]:
import os
import sys
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

In [2]:
print(f'PyTorch version: {torch.__version__}')
print('*'*10)
print(f'_CUDA version: ')
!nvcc --version
print('*'*10)
print(f'CUDNN version: {torch.backends.cudnn.version()}')
print(f'Available GPU devices: {torch.cuda.device_count()}')
print(f'Device Name: {torch.cuda.get_device_name()}')

PyTorch version: 2.5.1+cu118
**********
_CUDA version: 
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Oct_30_01:18:48_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0
**********
CUDNN version: 90100
Available GPU devices: 1
Device Name: NVIDIA GeForce RTX 4070 Ti SUPER


## Zdobycie danych

In [3]:
DATA_PATH = os.path.abspath(os.path.join('..', 'data', 'processed', 'ravdess'))

In [4]:
LANDMARK_INDEXES = [
    76, 306,  # mouth corners
    74, 73, 72, 11, 302, 303, 304, # upper lip
    90, 180, 85, 16, 315, 404, 320, # lower lip
    33, 161, 159, 157, 133, 154, 145, 163,  # left eye
    70, 63, 105, 66, 107,  # left eyebrow
    362, 384, 386, 388, 263, 390, 374, 381,  # right eye
    300, 293, 334, 296, 336,  # right eyebrow
    1, 5, 197, 168  # nose
]

REFERENCE_LANDMARK_INDEX = 0  # Middle of face

In [5]:
def convert_coordinates_to_avg_distance(frames):
    out = []
    for frame in frames:
        frame_distance = 0
        for landmark_idx in LANDMARK_INDEXES:
            frame_distance += math.sqrt(
                (frame[REFERENCE_LANDMARK_INDEX][0] - frame[landmark_idx][0])**2 +
                (frame[REFERENCE_LANDMARK_INDEX][1] - frame[landmark_idx][1])**2
            )
        out.append(frame_distance / len(LANDMARK_INDEXES))
    return np.array(out)

In [6]:
def load_data(data_path):
    all_data = []
    all_labels = []

    for file in os.listdir(data_path):
        if file.endswith(".npy"):
            data = np.load(os.path.join(data_path, file), allow_pickle=True)
            data = np.array(data, dtype=np.float32)

            all_data.append(data)

            label = int(file.split("-")[2])
            all_labels.append(label)

    return np.array(all_data, dtype=object), np.array(all_labels)

In [7]:
all_data, all_labels = load_data(DATA_PATH)

## Preprocessing danych

In [8]:
def preprocess_data(data, labels):
    tensor_data = [torch.tensor(d, dtype=torch.float32) for d in data]
    padded_data = pad_sequence(tensor_data, batch_first=True)

    encoder = LabelBinarizer()
    encoded_labels = encoder.fit_transform(labels)
    encoded_labels = torch.tensor(encoded_labels, dtype=torch.float32)

    X_train, X_temp, y_train, y_temp = train_test_split(
        padded_data, encoded_labels, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [9]:
X_train, X_val, X_test, y_train, y_val, y_test = preprocess_data(all_data, all_labels)

In [10]:
print(X_train.shape)
print(y_train.shape)

torch.Size([2012, 157, 478, 2])
torch.Size([2012, 8])


In [11]:
# Class distribution
from collections import Counter

for key, val in sorted(Counter((label.argmax().item() for label in y_train)).items(), key=lambda i: i[0]):
    print(f"{key}:{val}")

0:139
1:271
2:258
3:266
4:262
5:278
6:273
7:265


# MODEL TORCH

### W podejściu wykorzystane zostaną 2 modele - pierwszy z nich będzie siecią konwolucyjną 2d, która będzie miała za zadanie nauczyć się rozpoznawać cechy charakterystyczne dla wybranej klatki (zbioru współrzędnych pkt charakterystycznych). Do klasyfikacji szeregu czasowego zostanie wykorzystana sekwencyjna sieć neuronowa LSTM.

## Zbudowanie modelu ekstrakcji cech

In [12]:
class EmotionClassifier(nn.Module):
    def __init__(self):
        super(EmotionClassifier, self).__init__()
        
        # Spatial feature extraction using Conv1D
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        
        # LSTM layers for temporal feature extraction
        self.lstm1 = nn.LSTM(input_size=32 * 239, hidden_size=128, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(input_size=128 * 2, hidden_size=64, batch_first=True)
        
        # Fully connected classification layer
        self.fc = nn.Linear(64, 8)  # 8 emotion classes

    def forward(self, x):
        # x shape: (batch_size, frames, landmarks, coordinates)
        batch_size, frames, landmarks, coordinates = x.shape
        
        # Reshape for Conv1D: (batch_size * frames, landmarks, coordinates)
        x = x.view(-1, landmarks, coordinates).permute(0, 2, 1)
        
        # Spatial feature extraction
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        
        # Flatten spatial features
        x = x.view(batch_size, frames, -1)  # (batch_size, frames, features)
        
        # Temporal feature extraction
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        
        # Classification
        x = self.fc(x[:, -1, :])  # Take the last timestep's output
        return x

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmotionClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4)

## Trening modelu

In [14]:
BATCH_SIZE = 32
EPOCHS = 200

In [15]:
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [16]:
writer = SummaryWriter("runs/torch-lstm/emotion_classifier")

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_batch = y_batch.argmax(dim=1)
        
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += predicted.eq(y_batch).sum().item()
        total += y_batch.size(0)
    
    train_acc = correct / total

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_batch = y_batch.argmax(dim=1)
            
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(y_batch).sum().item()
            total += y_batch.size(0)
    
    val_acc = correct / total

    writer.add_scalar("Loss/Train", train_loss, epoch)
    writer.add_scalar("Loss/Validation", val_loss, epoch)
    writer.add_scalar("Accuracy/Train", train_acc, epoch)
    writer.add_scalar("Accuracy/Validation", val_acc, epoch)

    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    
writer.close()


Epoch 1/200, Train Loss: 130.3706, Train Acc: 0.1302, Val Loss: 28.8681, Val Acc: 0.1369
Epoch 2/200, Train Loss: 130.2065, Train Acc: 0.1252, Val Loss: 28.8730, Val Acc: 0.1323
Epoch 3/200, Train Loss: 130.0949, Train Acc: 0.1332, Val Loss: 28.8543, Val Acc: 0.1462
Epoch 4/200, Train Loss: 130.1193, Train Acc: 0.1228, Val Loss: 28.8616, Val Acc: 0.1369
Epoch 5/200, Train Loss: 130.0554, Train Acc: 0.1223, Val Loss: 28.8600, Val Acc: 0.1276
Epoch 6/200, Train Loss: 130.0361, Train Acc: 0.1238, Val Loss: 28.8725, Val Acc: 0.1276
Epoch 7/200, Train Loss: 130.0179, Train Acc: 0.1431, Val Loss: 28.8632, Val Acc: 0.1369
Epoch 8/200, Train Loss: 130.0870, Train Acc: 0.1262, Val Loss: 28.8738, Val Acc: 0.1276
Epoch 9/200, Train Loss: 130.0385, Train Acc: 0.1357, Val Loss: 28.8598, Val Acc: 0.1276
Epoch 10/200, Train Loss: 130.0207, Train Acc: 0.1238, Val Loss: 28.8595, Val Acc: 0.1276
Epoch 11/200, Train Loss: 130.0245, Train Acc: 0.1302, Val Loss: 28.8622, Val Acc: 0.1276
Epoch 12/200, Train

## Ewaluacja modelu

In [17]:
model.eval()

test_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in DataLoader(TensorDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        y_batch = y_batch.argmax(dim=1)
        
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        test_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += predicted.eq(y_batch).sum().item()
        total += y_batch.size(0)

test_loss /= len(y_test)
test_acc = correct / total

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

Test Loss: 0.0617, Test Accuracy: 0.2338


# MODEL SEGLEARN

In [12]:
from sklearn.ensemble import RandomForestClassifier
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, Segment
from seglearn.split import temporal_split

### Przekształcenie danych na wektor płaski połączonych współrzędnych

In [13]:
X_train_np = X_train.numpy()
X_val_np = X_val.numpy()
X_test_np = X_test.numpy()
y_train_np = y_train.numpy()
y_val_np = y_val.numpy()
y_test_np = y_test.numpy()

In [14]:
def flatten_landmarks(data):
    n_samples, n_timesteps, n_landmarks, n_coords = data.shape
    return data.reshape(n_samples, n_timesteps, n_landmarks * n_coords)

X_train_flat = flatten_landmarks(X_train_np)
X_val_flat = flatten_landmarks(X_val_np)
X_test_flat = flatten_landmarks(X_test_np)

### Budowa modelu

In [25]:
pipe = Pype([
    ("segment", Segment(width=20, step=10)),  # Segmentacja sekwencji
    ("features", FeatureRep()),              # Ekstrakcja cech
    ("rf", RandomForestClassifier(n_estimators=100))  # RandomForest
])

### Trening modelu

In [26]:
pipe.fit(X_train_flat, y_train_np)

### Ewaluacja modelu

In [27]:
val_accuracy = pipe.score(X_val_flat, y_val_np)
test_accuracy = pipe.score(X_test_flat, y_test_np)

print(f"Dokładność na zbiorze walidacyjnym: {val_accuracy:.2f}")
print(f"Dokładność na zbiorze testowym: {test_accuracy:.2f}")

Dokładność na zbiorze walidacyjnym: 0.63
Dokładność na zbiorze testowym: 0.62


# TODYNET

### Przygotowanie danych

In [15]:
TodyNet_DATA_PATH = os.path.join("..", "src", "external", "TodyNet", "data", "UCR", "EMOTIONS")

os.makedirs(TodyNet_DATA_PATH, exist_ok=True)

In [59]:
X_train_tensor = torch.tensor(X_train_flat, dtype=torch.float32).unsqueeze(1)  # adding channel dimension
X_val_tensor = torch.tensor(X_val_flat, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_flat, dtype=torch.float32).unsqueeze(1)

y_train_class = torch.argmax(y_train, dim=1)
y_val_class = torch.argmax(y_val, dim=1)
y_test_class = torch.argmax(y_test, dim=1)

# Save the data in PyTorch (.pt) format
torch.save(X_train_tensor, os.path.join(TodyNet_DATA_PATH, 'X_train.pt'))
torch.save(X_val_tensor, os.path.join(TodyNet_DATA_PATH, 'X_valid.pt'))
torch.save(X_test_tensor, os.path.join(TodyNet_DATA_PATH, 'X.pt'))

# Save the labels in PyTorch (.pt) format
torch.save(y_train_class, os.path.join(TodyNet_DATA_PATH, 'y_train.pt'))
torch.save(y_val_class, os.path.join(TodyNet_DATA_PATH, 'y_valid.pt'))
torch.save(y_test_class, os.path.join(TodyNet_DATA_PATH, 'y.pt'))

In [60]:
X_train_tensor.shape

torch.Size([2012, 1, 157, 956])

### Trening modelu [pool_ratio 0.8, ponieważ rozmiar danych jest zbyt duży na 0.2]

In [None]:
# cd .\src\external\TodyNet\src\ & python train.py --dataset='EMOTIONS' --pool_ratio 0.8