In [2]:
pip install torch torchaudio hmmlearn numpy tqdm

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (164 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.6/164.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
import torchaudio
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from hmmlearn import hmm
import numpy as np
from tqdm import tqdm
import string
import csv
from pathlib import Path
from jiwer import wer, cer  # For WER and CER calculation

# ======================
# Dataset Configuration
# ======================
# Location of the LibriSpeech copy mounted as a Kaggle input dataset.
root_path = '/kaggle/input/librispeech-clean'

# Both splits are read from root_path as-is; download=False means nothing is fetched.
train_dataset = torchaudio.datasets.LIBRISPEECH(
    root_path,
    url="train-clean-100",
    download=False,
)
test_dataset = torchaudio.datasets.LIBRISPEECH(
    root_path,
    url="test-clean",
    download=False,
)

# =====================
# Audio Preprocessing
# =====================
# Waveform -> mel spectrogram front-end shared by training and inference.
# n_mels must stay equal to the model's n_feats.
audio_transforms = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    n_fft=512,       # FFT size
    win_length=400,  # 25 ms window at 16 kHz
    hop_length=160,  # 10 ms frame shift at 16 kHz
    n_mels=80,       # number of mel bands per frame
)

# ====================
# Text Transformation
# ====================
class TextTransform:
    """Bidirectional mapping between characters and integer class labels.

    Label layout: 0 is the empty string (used as the CTC blank), 1 is the
    space character, and 2..27 are the lowercase letters 'a'..'z'.
    """

    def __init__(self):
        # The position of each symbol in this list is its integer label.
        symbols = ['', ' ', *string.ascii_lowercase]
        self.char_map = {symbol: label for label, symbol in enumerate(symbols)}
        self.index_map = dict(enumerate(symbols))

    def text_to_int(self, text):
        """Return the list of labels for ``text`` after lowercasing it.

        Any character that is not in the vocabulary is encoded as a space (1).
        """
        lookup = self.char_map.get
        return [lookup(ch, 1) for ch in text.lower()]

    def int_to_text(self, labels):
        """Return the string spelled by ``labels``.

        Unknown labels are rendered as a space, label 0 contributes nothing,
        and leading/trailing whitespace is stripped from the result.
        """
        pieces = [self.index_map.get(label, ' ') for label in labels]
        return ''.join(pieces).strip()


# Shared instance used by the collate function and the decoder.
text_transform = TextTransform()

# ==================
# Data Processing
# ==================
def data_processing(data):
    """Collate a list of LibriSpeech samples into one padded batch.

    Each sample is a 6-tuple whose first element is the waveform and whose
    third element is the transcript; the other fields are ignored.

    Returns a 4-tuple:
        spectrograms  -- zero-padded tensor with a singleton dimension inserted
                         at position 1, i.e. (batch, 1, max_frames, n_mels)
        labels        -- (batch, max_label_len) long tensor, padded with 0
        input_lengths -- list of unpadded frame counts, one per sample
        label_lengths -- list of unpadded label counts, one per sample
    """
    per_sample_specs = []
    per_sample_labels = []

    for waveform, _rate, utterance, _speaker, _chapter, _utt in data:
        # Drop the leading dimension and put time first: (frames, n_mels).
        mel = audio_transforms(waveform).squeeze(0).transpose(0, 1)
        per_sample_specs.append(mel)

        encoded = text_transform.text_to_int(utterance)
        per_sample_labels.append(torch.tensor(encoded, dtype=torch.long))

    # True (unpadded) lengths, captured before padding.
    input_lengths = [mel.shape[0] for mel in per_sample_specs]
    label_lengths = [len(encoded) for encoded in per_sample_labels]

    # Pad to the longest item in the batch; spectrograms gain a singleton dim at index 1.
    spectrograms = nn.utils.rnn.pad_sequence(per_sample_specs, batch_first=True)
    spectrograms = spectrograms.unsqueeze(1)
    labels = nn.utils.rnn.pad_sequence(
        per_sample_labels, batch_first=True, padding_value=0
    )

    return spectrograms, labels, input_lengths, label_lengths

# ================
# Model Architecture
# ================
class SpeechRecognitionModel(nn.Module):
    """Convolutional front-end, bidirectional LSTM, and per-frame classifier.

    Args:
        n_cnn_layers: total number of conv stages; the first is always present,
            each additional one is a Conv-BatchNorm-ReLU-Dropout block.
        rnn_dim: hidden size of each LSTM direction.
        n_class: number of output classes per frame.
        n_feats: size of the last input dimension (mel bands per frame).
        dropout: dropout probability used inside the extra conv blocks.

    Input:  (batch, 1, time, n_feats)
    Output: (batch, time, n_class) unnormalised scores.
    """

    def __init__(self, n_cnn_layers=2, rnn_dim=512, n_class=28, n_feats=80, dropout=0.1):
        super().__init__()

        # Stem conv, followed by (n_cnn_layers - 1) identical blocks.
        # All convs use 3x3 kernels, stride 1, padding 1, so time and feature
        # sizes are preserved.
        conv_stack = [
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        ]
        for _ in range(n_cnn_layers - 1):
            conv_stack.append(
                nn.Sequential(
                    nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
                    nn.BatchNorm2d(32),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                )
            )
        self.cnn = nn.Sequential(*conv_stack)

        # Each time step sees all 32 channels of all n_feats bins, flattened.
        self.rnn = nn.LSTM(
            input_size=32 * n_feats,
            hidden_size=rnn_dim,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )

        # Forward and backward LSTM states are concatenated, hence 2 * rnn_dim.
        self.classifier = nn.Linear(rnn_dim * 2, n_class)

    def forward(self, x):
        """Map (batch, 1, time, n_feats) spectrograms to per-frame class scores."""
        feature_maps = self.cnn(x)                     # (batch, 32, time, n_feats)

        # Bring time next to batch, then merge channels and features.
        frames = feature_maps.transpose(1, 2)          # (batch, time, 32, n_feats)
        frames = frames.flatten(start_dim=2)           # (batch, time, 32*n_feats)

        hidden, _ = self.rnn(frames)                   # (batch, time, rnn_dim*2)
        return self.classifier(hidden)

# ================
# Training Setup
# ================
# Hyper-parameters for the run. n_class and n_feats are passed to the model
# below so that changing them here actually takes effect.
params = {
    'batch_size': 10,
    'epochs': 10,
    'learning_rate': 5e-4,
    'n_class': 28,  # 26 letters + space + blank
    'n_feats': 80,  # Must match MelSpectrogram n_mels
    'seed': 42,     # Seed for PyTorch's global RNG
}

# Seed before the model is built so weight initialisation and the shuffling
# order start from a fixed RNG state.
torch.manual_seed(params['seed'])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model with the sizes declared in params rather than relying on
# the constructor defaults happening to agree with them.
model = SpeechRecognitionModel(
    n_class=params['n_class'],
    n_feats=params['n_feats'],
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=params['learning_rate'])
criterion = nn.CTCLoss(blank=0)  # label 0 is the blank symbol in TextTransform

# Data loaders: identical settings except that only the training set is shuffled.
loader_kwargs = dict(
    batch_size=params['batch_size'],
    collate_fn=data_processing,
    num_workers=4,
    pin_memory=True,
)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# ================
# Training Loop
# ================
for epoch in range(params['epochs']):
    model.train()
    epoch_loss = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{params['epochs']}")
    for spectrograms, labels, input_lengths, label_lengths in progress:
        # Only the padded tensors are moved to the device; the length lists
        # are turned into CPU tensors when the loss is computed.
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()

        # Scores (N, T, C) -> log-probabilities -> (T, N, C) as CTC loss expects.
        log_probs = F.log_softmax(model(spectrograms), dim=2).permute(1, 0, 2)

        loss = criterion(
            log_probs,
            labels,
            input_lengths=torch.tensor(input_lengths),
            target_lengths=torch.tensor(label_lengths),
        )

        # Backpropagate, update the weights, and accumulate the batch loss.
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} | Avg Loss: {epoch_loss/len(train_loader):.4f}")

# Persist only the weights (state_dict) once all epochs are done.
torch.save(model.state_dict(), '/kaggle/working/speech_model.pth')

# ================
# HMM Integration
# ================
def prepare_hmm_data():
    """Collect the model's per-frame argmax labels over the training loader.

    Returns:
        A pair ``(observations, lengths)``: ``observations`` is a 1-D array
        with every utterance's predicted frame labels concatenated, and
        ``lengths`` holds the number of frames contributed by each utterance,
        in the same order.
    """
    model.eval()
    frame_sequences, sequence_lengths = [], []

    with torch.no_grad():
        for spectrograms, _, input_lengths, _ in tqdm(train_loader, desc="Preparing HMM Data"):
            scores = model(spectrograms.to(device))
            predicted = torch.argmax(scores, dim=2).cpu().numpy()

            # Keep only the unpadded part of each row.
            for row, n_frames in zip(predicted, input_lengths):
                frame_sequences.append(row[:n_frames])
                sequence_lengths.append(n_frames)

    observations = np.concatenate(frame_sequences)
    return observations, np.array(sequence_lengths)

# Train HMM
# Gather the per-frame label predictions the HMM is fitted on.
hmm_observations, hmm_lengths = prepare_hmm_data()

# One hidden state per output class, each with a 1-D diagonal Gaussian over
# the predicted label index.
# NOTE(review): the observations are discrete class ids modelled with a
# Gaussian emission; confirm this is intended rather than a categorical HMM.
hmm_model = hmm.GaussianHMM(
    n_components=params['n_class'],
    covariance_type="diag",
    n_iter=100,
    random_state=42,  # fixed seed so the fitted states are reproducible across runs
)

# hmmlearn expects a 2-D (n_samples, n_features) array plus per-sequence lengths.
hmm_model.fit(hmm_observations.reshape(-1, 1), lengths=hmm_lengths)

# ====================
# Inference Function
# ====================
def speech_to_text(waveform):
    """Transcribe a single waveform with the acoustic model and the HMM.

    Steps:
      1. Compute the mel spectrogram in the same (1, 1, time, n_mels) layout
         the model was trained on.
      2. Take the model's per-frame argmax label sequence.
      3. Smooth that sequence with the HMM's most likely hidden-state path.
      4. Translate hidden states back to character labels, merge repeated
         frames, drop blanks, and convert the result to text.

    Args:
        waveform: audio tensor as yielded by the dataset (channel first).

    Returns:
        The decoded transcript as a string.
    """
    model.eval()
    with torch.no_grad():
        # Training (data_processing) feeds time-major spectrograms:
        # (1, n_mels, time) -> (time, n_mels) -> (1, 1, time, n_mels).
        # Without the transpose the LSTM receives 32*time features per step
        # instead of 32*n_mels and fails on a shape mismatch.
        spec = audio_transforms(waveform).squeeze(0).transpose(0, 1)
        spec = spec.unsqueeze(0).unsqueeze(0).to(device)

        # Model predictions: one label per frame
        outputs = model(spec)
        emissions = torch.argmax(outputs, dim=2).cpu().numpy()[0]

    # HMM decoding: best_path holds hidden-state ids, one per frame.
    _, best_path = hmm_model.decode(emissions.reshape(-1, 1))

    # Hidden-state ids are arbitrary; they are not character labels. Each
    # state has a 1-D Gaussian over label indices, so its rounded mean is used
    # as the label that state stands for (clipped to the valid label range).
    max_label = len(text_transform.index_map) - 1
    state_to_label = np.clip(
        np.rint(hmm_model.means_[:, 0]).astype(int), 0, max_label
    )
    frame_labels = state_to_label[best_path]

    # Greedy CTC decoding: merge runs of the same label, then drop blanks
    # (label 0, matching CTCLoss(blank=0)).
    collapsed = []
    previous = None
    for label in frame_labels.tolist():
        if label != previous and label != 0:
            collapsed.append(label)
        previous = label

    return text_transform.int_to_text(collapsed)

# ====================
# Evaluation & Output with WER and CER
# ====================
def generate_results():
    """Transcribe the whole test set, report WER/CER, and write a CSV.

    Every test utterance is decoded with ``speech_to_text`` and compared
    against its lowercased reference transcript. The two error rates are
    printed, one row per utterance is written to
    ``/kaggle/working/stt_results.csv``, and ``(wer_score, cer_score)`` is
    returned.
    """
    model.eval()
    ground_truths, predictions, results = [], [], []

    with torch.no_grad():
        for idx in tqdm(range(len(test_dataset)), desc="Evaluating"):
            waveform, _, utterance, sid, cid, uid = test_dataset[idx]
            prediction = speech_to_text(waveform)
            reference = utterance.lower()

            # Parallel lists feed the metrics; the row feeds the CSV.
            ground_truths.append(reference)
            predictions.append(prediction)
            results.append([f"{sid}-{cid}-{uid}", reference, prediction])

    # Corpus-level error rates over all utterances.
    wer_score = wer(ground_truths, predictions)
    cer_score = cer(ground_truths, predictions)

    print(f"Word Error Rate (WER): {wer_score:.4f}")
    print(f"Character Error Rate (CER): {cer_score:.4f}")

    # Header row followed by one row per utterance.
    header = ["ID", "Ground Truth", "Prediction"]
    with open('/kaggle/working/stt_results.csv', 'w', newline='') as f:
        csv.writer(f).writerows([header, *results])

    return wer_score, cer_score

# Run the full evaluation; the two scores remain available as globals.
wer_score, cer_score = generate_results()
print("Processing complete! Results saved to stt_results.csv")


Epoch 1/10: 100%|██████████| 2854/2854 [28:36<00:00,  1.66it/s]
Epoch 1 | Avg Loss: 1.6857
Epoch 2/10: 100%|██████████| 2854/2854 [28:34<00:00,  1.66it/s]
Epoch 2 | Avg Loss: 0.7422
Epoch 3/10: 100%|██████████| 2854/2854 [28:36<00:00,  1.66it/s]
Epoch 3 | Avg Loss: 0.5444
Epoch 4/10: 100%|██████████| 2854/2854 [28:37<00:00,  1.66it/s]
Epoch 4 | Avg Loss: 0.4360
Epoch 5/10: 100%|██████████| 2854/2854 [28:35<00:00,  1.66it/s]
Epoch 5 | Avg Loss: 0.3652
Epoch 6/10: 100%|██████████| 2854/2854 [28:35<00:00,  1.66it/s]
Epoch 6 | Avg Loss: 0.3112
Epoch 7/10: 100%|██████████| 2854/2854 [28:32<00:00,  1.67it/s]
Epoch 7 | Avg Loss: 0.2699
Epoch 8/10: 100%|██████████| 2854/2854 [28:31<00:00,  1.67it/s]
Epoch 8 | Avg Loss: 0.2359
Epoch 9/10: 100%|██████████| 2854/2854 [28:31<00:00,  1.67it/s]
Epoch 9 | Avg Loss: 0.2100
Epoch 10/10: 100%|██████████| 2854/2854 [28:31<00:00,  1.67it/s]
Epoch 10 | Avg Loss: 0.1861
Preparing HMM Data: 100%|██████████| 2854/2854 [09:37<00:00,  4.94it/s]
Evaluating
Word