NAME - TORSHA CHATTERJEE

Roll No. - M23CSA536

Speech Understanding

Assignment - 3


In [1]:
!pip install transformers peft accelerate --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
from transformers import AutoModelForCausalLM

# Synthetic Gesture Dataset
class SyntheticGestureDataset(Dataset):
    def __init__(self, num_samples=500, gesture_dim=43, seed=0):
        random.seed(seed)
        torch.manual_seed(seed)
        self.samples = []
        for _ in range(num_samples):
            speech_feat = torch.randn(1, 512)  # Dummy speech embedding
            gesture = torch.randn(34, gesture_dim)  # 34 frames
            self.samples.append({"speech": speech_feat, "gestures": gesture})

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


In [3]:
class GestureVQVAE(nn.Module):
    def __init__(self, num_embeddings=1024, embedding_dim=512, input_dim=43):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, embedding_dim), nn.ReLU())
        self.decoder = nn.Sequential(nn.Linear(embedding_dim, input_dim), nn.ReLU())
        self.codebook = nn.Embedding(num_embeddings, embedding_dim)

    def forward(self, x):
        # x: [batch, 34, 43]
        batch_size, seq_len, input_dim = x.shape
        x_flat = x.view(-1, input_dim)  # [batch*34, 43]
        z_e = self.encoder(x_flat)  # [batch*34, 512]
        codebook = self.codebook.weight  # [1024, 512]
        # Find closest codebook entries
        distances = ((z_e.unsqueeze(1) - codebook)**2).sum(-1)  # [batch*34, 1024]
        indices = torch.argmin(distances, dim=1)  # [batch*34]
        z_q = self.codebook(indices)  # [batch*34, 512]
        x_recon = self.decoder(z_q).view(batch_size, seq_len, input_dim)
        return x_recon, indices.view(batch_size, seq_len)  # (optional recon, tokens)


In [4]:
class GestureTranslator(nn.Module):
    def __init__(self, model_name, gesture_vocab_size):
        super().__init__()
        self.llm = AutoModelForCausalLM.from_pretrained(model_name)
        self.embedding_proj = nn.Embedding(gesture_vocab_size, self.llm.config.hidden_size)

    def forward(self, speech_embed, gesture_token_ids):
        # gesture_token_ids: [batch, seq_len]
        inputs_embeds = self.embedding_proj(gesture_token_ids)
        outputs = self.llm(inputs_embeds=inputs_embeds, labels=gesture_token_ids)
        return outputs.loss


In [5]:
def train(model, tokenizer, dataset, epochs=3, batch_size=4, lr=1e-4):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            speech = batch["speech"]  # [B, 1, 512]
            gestures = batch["gestures"]  # [B, 34, 43]
            _, gesture_tokens = tokenizer(gestures)  # [B, 34]
            gesture_tokens = gesture_tokens.to(torch.long)

            loss = model(speech, gesture_tokens)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} - Avg Loss: {total_loss / len(dataloader):.4f}")


In [6]:
tokenizer = GestureVQVAE()
model_name = "gpt2"

print(" Training on TED Gesture (Synthetic)")
dataset1 = SyntheticGestureDataset(seed=1)
model1 = GestureTranslator(model_name, gesture_vocab_size=1024)
train(model1, tokenizer, dataset1)

print("\n Training on TED Expressive (Synthetic)")
dataset2 = SyntheticGestureDataset(seed=2)
model2 = GestureTranslator(model_name, gesture_vocab_size=1024)
train(model2, tokenizer, dataset2)

print("\n Training on Custom Dataset (Synthetic)")
dataset3 = SyntheticGestureDataset(seed=3)
model3 = GestureTranslator(model_name, gesture_vocab_size=1024)
train(model3, tokenizer, dataset3)


 Training on TED Gesture (Synthetic)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 1 - Avg Loss: 3.5305
Epoch 2 - Avg Loss: 2.9824
Epoch 3 - Avg Loss: 2.9611

 Training on TED Expressive (Synthetic)
Epoch 1 - Avg Loss: 3.5339
Epoch 2 - Avg Loss: 2.9853
Epoch 3 - Avg Loss: 2.9687

 Training on Custom Dataset (Synthetic)
Epoch 1 - Avg Loss: 3.5470
Epoch 2 - Avg Loss: 2.9896
Epoch 3 - Avg Loss: 2.9601


In [7]:
def compute_diversity(dataset):
    gestures = [sample["gestures"].numpy() for sample in dataset]
    gestures = np.stack(gestures)
    shuffled = gestures[np.random.permutation(len(gestures))]
    return np.mean(np.abs(gestures - shuffled))

print("\n🧪 Diversity Scores:")
print("TED Gesture:", compute_diversity(dataset1))
print("TED Expressive:", compute_diversity(dataset2))
print("Custom:", compute_diversity(dataset3))



🧪 Diversity Scores:
TED Gesture: 1.1265469
TED Expressive: 1.1273643
Custom: 1.1287066
