# 🧠 TDNN + LSTM for Speech-to-Text using .flac Audio
This notebook implements a basic speech recognition model using TDNN + LSTM + CTC loss.
- Audio: `.flac` files
- Features: Log-Mel spectrograms
- Model: TDNN + LSTM
- Loss: CTC

Make sure you have the following files:
- `.flac` audio files in a folder
- A `transcripts.txt` with lines like: `filename transcription`

In [9]:
# 📦 Install Dependencies (if needed)
!pip install torch torchaudio librosa



In [10]:
# 📁 Imports and Settings
import os
import torch
import torchaudio
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import string

SAMPLE_RATE = 16000
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CHARS = " " + string.ascii_lowercase + "'"
char2idx = {c: i + 1 for i, c in enumerate(CHARS)}
char2idx["<pad>"] = 0
idx2char = {i: c for c, i in char2idx.items()}

In [11]:
# 🔤 Tokenizer
def text_to_indices(text):
    return torch.tensor([char2idx[c] for c in text.lower() if c in char2idx], dtype=torch.long)

# 🔊 Feature Extraction
def extract_features(waveform):
    mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE)(waveform)
    log_mel_spec = torchaudio.transforms.AmplitudeToDB()(mel_spec)
    return log_mel_spec.transpose(0, 1)

In [12]:
class SpeechDataset(Dataset):
    def __init__(self, audio_dir, tsv_file):
        # Read TSV file
        with open(tsv_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            
        self.data = []
        for line in lines:
            # Split by tab and take only 1st and 3rd columns
            parts = line.strip().split('\t')
            if len(parts) >= 3:  # Ensure we have all required columns
                utt_id = parts[0]  # First column - utterance ID
                transcription = parts[2]  # Third column - transcription
                
                # Construct full audio path 
                audio_path = os.path.join(audio_dir, f"{utt_id}.flac")
                
                if os.path.exists(audio_path):
                    self.data.append((audio_path, transcription))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        path, text = self.data[idx]
        waveform, sr = torchaudio.load(path)
        if sr != SAMPLE_RATE:
            waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)
        features = extract_features(waveform.squeeze(0))
        target = text_to_indices(text)
        return features, target

In [13]:
# 🧱 Collate Function
def collate_fn(batch):
    features, targets = zip(*batch)
    input_lengths = [f.shape[0] for f in features]
    target_lengths = [len(t) for t in targets]

    features = pad_sequence(features, batch_first=True)
    targets = pad_sequence(targets, batch_first=False)

    return features, targets, torch.tensor(input_lengths), torch.tensor(target_lengths)

In [14]:
# 🧠 Model Definition
class TDNN_LSTM_Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.tdnn = nn.Sequential(
            nn.Conv1d(input_dim, hidden_dim, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.ReLU()
        )
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.tdnn(x)
        x = x.transpose(1, 2)
        x, _ = self.lstm(x)
        return self.fc(x)

In [15]:
# 🔁 Training Loop
def train(model, dataloader, optimizer, epochs):
    model.train()
    ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
    for epoch in range(epochs):
        total_loss = 0
        for features, targets, input_lengths, target_lengths in dataloader:
            features, targets = features.to(DEVICE), targets.to(DEVICE)
            outputs = model(features)
            log_probs = nn.functional.log_softmax(outputs, dim=-1)
            loss = ctc_loss(log_probs.transpose(0, 1), targets, input_lengths, target_lengths)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")

In [16]:
# Example usage
audio_dir = "./00/"
tsv_file = "hello00.tsv"

print(f"Loading dataset from {audio_dir} using {tsv_file}")
dataset = SpeechDataset(audio_dir, tsv_file)
print(f"Found {len(dataset)} valid audio-transcript pairs")

dataloader = DataLoader(
    dataset,
    batch_size=16,  # Reduced batch size for stability
    shuffle=True,
    collate_fn=collate_fn
)

# Initialize model
input_dim = 128  # Number of mel frequency bins
hidden_dim = 256
output_dim = len(char2idx)  # Vocabulary size

print(f"Initializing model with:")
print(f"- Input dim: {input_dim}")
print(f"- Hidden dim: {hidden_dim}") 
print(f"- Output dim: {output_dim}")
print(f"- Device: {DEVICE}")

model = TDNN_LSTM_Model(input_dim, hidden_dim, output_dim).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train model
print("\nStarting training...")
try:
    train(model, dataloader, optimizer, epochs=10)
    
    # Save model
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'char2idx': char2idx,
        'idx2char': idx2char
    }, 'sinhala_asr_model.pth')
    print("\nModel saved to sinhala_asr_model.pth")
    
except KeyboardInterrupt:
    print("\nTraining interrupted. Saving current model state...")
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'char2idx': char2idx,
        'idx2char': idx2char
    }, 'sinhala_asr_model_interrupted.pth')
    print("Model saved to sinhala_asr_model_interrupted.pth")

Loading dataset from ./00/ using hello00.tsv
Found 715 valid audio-transcript pairs
Initializing model with:
- Input dim: 128
- Hidden dim: 256
- Output dim: 29
- Device: cpu

Starting training...




RuntimeError: Expected tensor to have size 16 at dimension 0, but got size 24 for argument #2 'targets' (while checking arguments for ctc_loss_allocate_outputs)