In [1]:
import pickle
import torch
import torch.nn as nn
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

In [2]:
# ============ Load Annotations ============
def load_labels(pkl_path):
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f, encoding='latin1')

    #trait_keys = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness', 'interview']
    trait_keys = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']  # OCEAN order

    video_names = list(data['extraversion'].keys())

    records = []
    for name in video_names:
        name = name[:-4]  # strip extension
        record = {'video_name': name}
        for key in trait_keys:
            record[key] = data[key][name + '.mp4']
        records.append(record)

    return pd.DataFrame(records)

train_df = load_labels('Embeddings_emonext/fiv2_embeddings/annotation_training.pkl')
test_df = load_labels('Embeddings_emonext/fiv2_embeddings/annotation_test.pkl')

In [5]:
train_df.head()

Unnamed: 0,video_name,openness,conscientiousness,extraversion,agreeableness,neuroticism
0,J4GQm9j0JZ0.003,0.488889,0.601942,0.523364,0.626374,0.552083
1,zEyRyTnIw5I.005,0.366667,0.582524,0.345794,0.472527,0.375
2,nskJh7v6v1U.004,0.511111,0.485437,0.252336,0.406593,0.291667
3,6wHQsN5g2RM.000,0.377778,0.398058,0.457944,0.505495,0.489583
4,dQOeQYWIgm8.000,0.622222,0.621359,0.607477,0.406593,0.489583


In [4]:
# ============ Load Embeddings ============
def load_embeddings(df, emb_dir):
    X, y, lengths = [], [], []
    trait_keys = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
    for folder in Path(emb_dir).iterdir():
        for pt_file in folder.glob('*.pt'):
            video_id = pt_file.stem
            if video_id not in df['video_name'].values:
                continue
            try:
                emb = torch.load(pt_file, weights_only=True)
                if isinstance(emb, dict):
                    emb = emb['emb']
                if torch.isnan(emb).any() or emb.shape != (30, 1024):
                    continue
            except Exception:
                continue

            traits = df[df['video_name'] == video_id][trait_keys].values[0].astype('float32')
            X.append(emb)
            y.append(torch.tensor(traits))
            lengths.append(torch.tensor(emb.shape[0]))

    return torch.stack(X), torch.stack(y), torch.stack(lengths)

X_train, y_train, lengths_train = load_embeddings(train_df, 'Embeddings_emonext/fiv2_embeddings/train')
X_test, y_test, lengths_test = load_embeddings(test_df, 'Embeddings_emonext/fiv2_embeddings/test')

In [6]:
# ============ Data Loaders ============
train_loader = DataLoader(TensorDataset(X_train, y_train, lengths_train), batch_size=512, shuffle=True)
val_loader = DataLoader(TensorDataset(X_test, y_test, lengths_test), batch_size=512)

In [7]:
# ============ Model ============
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim=1024, d_model=512, num_layers=3, n_heads=8, dropout=0.2, num_classes=5, max_len=30):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        self.pos_embed = nn.Parameter(torch.randn(1, max_len, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=d_model * 4, dropout=dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x, lengths):
        x = self.input_proj(x) + self.pos_embed[:, :x.size(1)]
        x = self.encoder(x)
        mask = torch.arange(x.size(1), device=lengths.device)[None, :] < lengths[:, None]
        mask = mask.float().unsqueeze(2)
        summed = (x * mask).sum(dim=1)
        count = mask.sum(dim=1).clamp(min=1)
        pooled = summed / count
        return self.fc(self.dropout(pooled))


In [9]:
from sklearn.metrics import mean_absolute_error

# ============ Training ============
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerClassifier(input_dim=1024, num_classes=5).to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

def ccc(y_true, y_pred):
    """
    This function calculates loss based on concordance correlation coefficient of two series: 'ser1' and 'ser2'
    TensorFlow methods are used
    """

    y_true_mean = np.mean(y_true)
    y_pred_mean = np.mean(y_pred)

    y_true_var = np.mean(np.square(y_true-y_true_mean))
    y_pred_var = np.mean(np.square(y_pred-y_pred_mean))

    cov = np.mean((y_true-y_true_mean)*(y_pred-y_pred_mean))

    ccc = np.multiply(2., cov) / (y_true_var + y_pred_var + np.square(y_true_mean - y_pred_mean))
    ccc_loss=np.mean(ccc)
    return ccc_loss

def acc_func(trues, preds):
    # print('acc', trues, preds)
    acc = []
    for i in range(5):
        acc.append(np.abs(trues - preds))
    acc = 1 - np.asarray(acc)
    return np.mean(acc)

for epoch in range(1, 101):
    model.train()
    total_loss = 0
    for xb, yb, lb in train_loader:
        xb, yb, lb = xb.to(DEVICE), yb.to(DEVICE), lb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb, lb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_loss, preds, targets = 0, [], []
    with torch.no_grad():
        for xb, yb, lb in val_loader:
            xb, yb, lb = xb.to(DEVICE), yb.to(DEVICE), lb.to(DEVICE)
            logits = model(xb, lb)
            val_loss += criterion(logits, yb).item()
            preds.append(logits.cpu())
            targets.append(yb.cpu())

    preds = torch.cat(preds).numpy()
    targets = torch.cat(targets).numpy()
    
    ccc_scores = [ccc(targets[:, i], preds[:, i]) for i in range(preds.shape[1])]
    mean_ccc = np.mean(ccc_scores)
    
    acc_scores = [acc_func(targets[:, i], preds[:, i]) for i in range(preds.shape[1])]
    mean_acc = np.mean(acc_scores)

    print(f"[Epoch {epoch}] Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f} | Mean CCC: {mean_ccc:.4f} | Mean acc: {mean_acc:.4f}")

[Epoch 1] Train Loss: 1.1371 | Val Loss: 0.2122 | Mean CCC: 0.0295 | Mean acc: 0.8150
[Epoch 2] Train Loss: 0.5763 | Val Loss: 0.1112 | Mean CCC: 0.0705 | Mean acc: 0.8651
[Epoch 3] Train Loss: 0.4791 | Val Loss: 0.1005 | Mean CCC: 0.1101 | Mean acc: 0.8753
[Epoch 4] Train Loss: 0.4362 | Val Loss: 0.0839 | Mean CCC: 0.1480 | Mean acc: 0.8839
[Epoch 5] Train Loss: 0.4078 | Val Loss: 0.0843 | Mean CCC: 0.1649 | Mean acc: 0.8849
[Epoch 6] Train Loss: 0.3905 | Val Loss: 0.0810 | Mean CCC: 0.1842 | Mean acc: 0.8866
[Epoch 7] Train Loss: 0.3723 | Val Loss: 0.0801 | Mean CCC: 0.1962 | Mean acc: 0.8873
[Epoch 8] Train Loss: 0.3555 | Val Loss: 0.0804 | Mean CCC: 0.2048 | Mean acc: 0.8874
[Epoch 9] Train Loss: 0.3430 | Val Loss: 0.0779 | Mean CCC: 0.2177 | Mean acc: 0.8886
[Epoch 10] Train Loss: 0.3315 | Val Loss: 0.0782 | Mean CCC: 0.2229 | Mean acc: 0.8887
[Epoch 11] Train Loss: 0.3237 | Val Loss: 0.0777 | Mean CCC: 0.2289 | Mean acc: 0.8890
[Epoch 12] Train Loss: 0.3187 | Val Loss: 0.0772 | M

In [11]:
# Save weights
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, "fiv2_best_checkpoint.pth")
