```
conda init
conda create -n train-models python=3.10 -y
conda activate train-models
pip install torch==2.6.0+cu118 torchaudio==2.6.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
pip install pandas ipykernel tqdm
```

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torch.nn.functional as F

In [2]:
# Paths
DATA_DIR = Path("../VoxCeleb2")

JPG_EMB_PATH = DATA_DIR / "train/jpg_train_embeddings.pt"
JPG_VAL_EMB_PATH = DATA_DIR / "val/jpg_val_embeddings.pt"
WAV_EMB_PATH = DATA_DIR / "train/wav_train_embeddings.pt"
WAV_VAL_EMB_PATH = DATA_DIR / "val/wav_val_embeddings.pt"

In [3]:
# Load embeddings
jpg_train_emb = torch.load(JPG_EMB_PATH)
jpg_val_emb   = torch.load(JPG_VAL_EMB_PATH)

wav_train_emb = torch.load(WAV_EMB_PATH)
wav_val_emb   = torch.load(WAV_VAL_EMB_PATH)

print("Train JPG:", jpg_train_emb.shape, "Train WAV:", wav_train_emb.shape)
print("Val JPG  :", jpg_val_emb.shape, "Val WAV  :", wav_val_emb.shape)

Train JPG: torch.Size([29498, 512]) Train WAV: torch.Size([29498, 512])
Val JPG  : torch.Size([3381, 512]) Val WAV  : torch.Size([3381, 512])


In [4]:
# Define MLP model (BASE)
class FaceToSpeechMLP(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(1024, 512)
        )
        
    def forward(self, x):
        return self.model(x)

In [40]:
# Define MLP model (SMALL)
class FaceToSpeechMLP(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(512, 512)
        )
        
    def forward(self, x):
        return self.model(x)

In [46]:
# Define MLP model (XSMALL)
class FaceToSpeechMLP(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(512, 512)
        )
        
    def forward(self, x):
        return self.model(x)

In [49]:
# Define MLP model (LARGE)
class FaceToSpeechMLP(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(1024, 512)
        )
        
    def forward(self, x):
        return self.model(x)

In [5]:
# Dataset / DataLoader
BATCH_SIZE = 128

train_ds = TensorDataset(jpg_train_emb, wav_train_emb)
val_ds   = TensorDataset(jpg_val_emb,   wav_val_emb)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE)

In [6]:
# Training Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [7]:
# Define model
model = FaceToSpeechMLP(dropout_rate=0.0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [8]:
# Training parameters
NUM_EPOCHS = 50
best_val_loss = float('inf')
MODEL_PATH = DATA_DIR / "face2speech_best.pt"

In [9]:
# Define loss function
def cosine_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    pred_norm = F.normalize(pred, dim=1)
    target_norm = F.normalize(target, dim=1)
    loss = 1 - (pred_norm * target_norm).sum(dim=1).mean()
    return loss

In [10]:
# Training loop with validation
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)

        loss = cosine_loss(out, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * x.size(0)
    
    train_loss /= len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val, y_val = x_val.to(device), y_val.to(device)
            val_out = model(x_val)

            loss = cosine_loss(val_out, y_val)
            val_loss += loss.item() * x_val.size(0)
            
    val_loss /= len(val_loader.dataset)
    
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Train Loss: {train_loss:.6f} - Val Loss: {val_loss:.6f}")
    
    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), MODEL_PATH)
        print("Saved best model")


Epoch 1/50: 100%|██████████| 231/231 [00:00<00:00, 337.33it/s]


Epoch 1/50 - Train Loss: 0.305883 - Val Loss: 0.597950
Saved best model


Epoch 2/50: 100%|██████████| 231/231 [00:00<00:00, 446.31it/s]


Epoch 2/50 - Train Loss: 0.239807 - Val Loss: 0.607007


Epoch 3/50: 100%|██████████| 231/231 [00:00<00:00, 448.42it/s]


Epoch 3/50 - Train Loss: 0.226435 - Val Loss: 0.608987


Epoch 4/50: 100%|██████████| 231/231 [00:00<00:00, 420.29it/s]


Epoch 4/50 - Train Loss: 0.217409 - Val Loss: 0.619328


Epoch 5/50: 100%|██████████| 231/231 [00:00<00:00, 446.76it/s]


Epoch 5/50 - Train Loss: 0.210671 - Val Loss: 0.626410


Epoch 6/50: 100%|██████████| 231/231 [00:00<00:00, 377.33it/s]


Epoch 6/50 - Train Loss: 0.205185 - Val Loss: 0.626807


Epoch 7/50: 100%|██████████| 231/231 [00:00<00:00, 347.02it/s]


Epoch 7/50 - Train Loss: 0.200321 - Val Loss: 0.631989


Epoch 8/50: 100%|██████████| 231/231 [00:00<00:00, 390.96it/s]


Epoch 8/50 - Train Loss: 0.196252 - Val Loss: 0.637680


Epoch 9/50: 100%|██████████| 231/231 [00:00<00:00, 383.33it/s]


Epoch 9/50 - Train Loss: 0.192470 - Val Loss: 0.635140


Epoch 10/50: 100%|██████████| 231/231 [00:00<00:00, 395.98it/s]


Epoch 10/50 - Train Loss: 0.189045 - Val Loss: 0.642780


Epoch 11/50: 100%|██████████| 231/231 [00:00<00:00, 389.38it/s]


Epoch 11/50 - Train Loss: 0.186001 - Val Loss: 0.658841


Epoch 12/50: 100%|██████████| 231/231 [00:00<00:00, 366.97it/s]


Epoch 12/50 - Train Loss: 0.183264 - Val Loss: 0.650177


Epoch 13/50: 100%|██████████| 231/231 [00:00<00:00, 379.59it/s]


Epoch 13/50 - Train Loss: 0.180698 - Val Loss: 0.652023


Epoch 14/50: 100%|██████████| 231/231 [00:00<00:00, 384.91it/s]


Epoch 14/50 - Train Loss: 0.178445 - Val Loss: 0.654292


Epoch 15/50: 100%|██████████| 231/231 [00:00<00:00, 359.92it/s]


Epoch 15/50 - Train Loss: 0.176187 - Val Loss: 0.660103


Epoch 16/50: 100%|██████████| 231/231 [00:00<00:00, 395.44it/s]


Epoch 16/50 - Train Loss: 0.174306 - Val Loss: 0.665396


Epoch 17/50: 100%|██████████| 231/231 [00:00<00:00, 419.06it/s]


Epoch 17/50 - Train Loss: 0.172398 - Val Loss: 0.660414


Epoch 18/50: 100%|██████████| 231/231 [00:00<00:00, 416.87it/s]


Epoch 18/50 - Train Loss: 0.170584 - Val Loss: 0.673635


Epoch 19/50: 100%|██████████| 231/231 [00:00<00:00, 429.31it/s]


Epoch 19/50 - Train Loss: 0.169280 - Val Loss: 0.673914


Epoch 20/50: 100%|██████████| 231/231 [00:00<00:00, 307.91it/s]


Epoch 20/50 - Train Loss: 0.167617 - Val Loss: 0.669516


Epoch 21/50: 100%|██████████| 231/231 [00:01<00:00, 203.48it/s]


Epoch 21/50 - Train Loss: 0.166217 - Val Loss: 0.673509


Epoch 22/50: 100%|██████████| 231/231 [00:01<00:00, 202.93it/s]


Epoch 22/50 - Train Loss: 0.164973 - Val Loss: 0.679049


Epoch 23/50: 100%|██████████| 231/231 [00:00<00:00, 290.63it/s]


Epoch 23/50 - Train Loss: 0.163806 - Val Loss: 0.680781


Epoch 24/50: 100%|██████████| 231/231 [00:00<00:00, 406.65it/s]


Epoch 24/50 - Train Loss: 0.162492 - Val Loss: 0.679282


Epoch 25/50: 100%|██████████| 231/231 [00:00<00:00, 425.81it/s]


Epoch 25/50 - Train Loss: 0.161497 - Val Loss: 0.687396


Epoch 26/50: 100%|██████████| 231/231 [00:00<00:00, 437.10it/s]


Epoch 26/50 - Train Loss: 0.160385 - Val Loss: 0.688635


Epoch 27/50: 100%|██████████| 231/231 [00:00<00:00, 288.10it/s]


Epoch 27/50 - Train Loss: 0.159158 - Val Loss: 0.681864


Epoch 28/50: 100%|██████████| 231/231 [00:01<00:00, 207.83it/s]


Epoch 28/50 - Train Loss: 0.158317 - Val Loss: 0.684787


Epoch 29/50: 100%|██████████| 231/231 [00:01<00:00, 205.99it/s]


Epoch 29/50 - Train Loss: 0.157661 - Val Loss: 0.684187


Epoch 30/50: 100%|██████████| 231/231 [00:01<00:00, 211.43it/s]


Epoch 30/50 - Train Loss: 0.156660 - Val Loss: 0.684671


Epoch 31/50: 100%|██████████| 231/231 [00:00<00:00, 366.69it/s]


Epoch 31/50 - Train Loss: 0.155696 - Val Loss: 0.689643


Epoch 32/50: 100%|██████████| 231/231 [00:00<00:00, 395.16it/s]


Epoch 32/50 - Train Loss: 0.155154 - Val Loss: 0.690213


Epoch 33/50: 100%|██████████| 231/231 [00:00<00:00, 429.68it/s]


Epoch 33/50 - Train Loss: 0.154161 - Val Loss: 0.682428


Epoch 34/50: 100%|██████████| 231/231 [00:00<00:00, 433.35it/s]


Epoch 34/50 - Train Loss: 0.153627 - Val Loss: 0.695268


Epoch 35/50: 100%|██████████| 231/231 [00:00<00:00, 415.29it/s]


Epoch 35/50 - Train Loss: 0.152850 - Val Loss: 0.686594


Epoch 36/50: 100%|██████████| 231/231 [00:00<00:00, 253.43it/s]


Epoch 36/50 - Train Loss: 0.152088 - Val Loss: 0.688463


Epoch 37/50: 100%|██████████| 231/231 [00:01<00:00, 217.61it/s]


Epoch 37/50 - Train Loss: 0.151516 - Val Loss: 0.691870


Epoch 38/50: 100%|██████████| 231/231 [00:00<00:00, 403.33it/s]


Epoch 38/50 - Train Loss: 0.150818 - Val Loss: 0.689393


Epoch 39/50: 100%|██████████| 231/231 [00:00<00:00, 422.90it/s]


Epoch 39/50 - Train Loss: 0.150350 - Val Loss: 0.693029


Epoch 40/50: 100%|██████████| 231/231 [00:01<00:00, 200.48it/s]


Epoch 40/50 - Train Loss: 0.149726 - Val Loss: 0.694824


Epoch 41/50: 100%|██████████| 231/231 [00:00<00:00, 253.59it/s]


Epoch 41/50 - Train Loss: 0.149177 - Val Loss: 0.698982


Epoch 42/50: 100%|██████████| 231/231 [00:01<00:00, 206.41it/s]


Epoch 42/50 - Train Loss: 0.148614 - Val Loss: 0.698427


Epoch 43/50: 100%|██████████| 231/231 [00:00<00:00, 365.53it/s]


Epoch 43/50 - Train Loss: 0.147955 - Val Loss: 0.700880


Epoch 44/50: 100%|██████████| 231/231 [00:00<00:00, 413.54it/s]


Epoch 44/50 - Train Loss: 0.147758 - Val Loss: 0.696912


Epoch 45/50: 100%|██████████| 231/231 [00:00<00:00, 412.66it/s]


Epoch 45/50 - Train Loss: 0.147083 - Val Loss: 0.697001


Epoch 46/50: 100%|██████████| 231/231 [00:00<00:00, 400.08it/s]


Epoch 46/50 - Train Loss: 0.146749 - Val Loss: 0.696765


Epoch 47/50: 100%|██████████| 231/231 [00:00<00:00, 398.46it/s]


Epoch 47/50 - Train Loss: 0.146414 - Val Loss: 0.700700


Epoch 48/50: 100%|██████████| 231/231 [00:00<00:00, 382.38it/s]


Epoch 48/50 - Train Loss: 0.145882 - Val Loss: 0.700231


Epoch 49/50: 100%|██████████| 231/231 [00:00<00:00, 416.34it/s]


Epoch 49/50 - Train Loss: 0.145463 - Val Loss: 0.698696


Epoch 50/50: 100%|██████████| 231/231 [00:00<00:00, 411.20it/s]


Epoch 50/50 - Train Loss: 0.144891 - Val Loss: 0.703431


In [11]:
# Paths for test embeddings
JPG_TEST_EMB_PATH = DATA_DIR / "test/jpg_test_embeddings.pt"
WAV_TEST_EMB_PATH = DATA_DIR / "test/wav_test_embeddings.pt"

In [12]:
# Load test embeddings
jpg_test_emb = torch.load(JPG_TEST_EMB_PATH)
wav_test_emb = torch.load(WAV_TEST_EMB_PATH)

In [13]:
# Create DataLoader
test_ds = TensorDataset(jpg_test_emb, wav_test_emb)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [14]:
# MODEL_PATH = DATA_DIR / "face2speech_best_LARGE.pt"
# model = FaceToSpeechMLP().to(device)
# model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

FaceToSpeechMLP(
  (model): Sequential(
    (0): Linear(in_features=512, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.0, inplace=False)
    (4): Linear(in_features=1024, out_features=1024, bias=True)
    (5): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.0, inplace=False)
    (8): Linear(in_features=1024, out_features=512, bias=True)
  )
)

In [15]:
def evaluate_model(model, dataloader):
    total_loss = 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            loss = cosine_loss(out, y)
            total_loss += loss.item() * x.size(0)
    avg_loss = total_loss / len(dataloader.dataset)
    return avg_loss

test_loss = evaluate_model(model, test_loader)
print(f"Test Cosine Loss: {test_loss:.6f}")

Test Cosine Loss: 0.696111
