In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import librosa
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
import os
import time

In [3]:
# Load the "babs/openslr-yoruba" dataset via the `datasets` library.
dataset = load_dataset("babs/openslr-yoruba")

# Hold out 20% of the published 'train' split for evaluation.
# The split is shuffled, so a fixed seed is passed to keep the partition (and
# therefore everything derived from train_ds) identical across kernel restarts.
data_split = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Access the train and test sets
train_ds = data_split['train']
test_ds = data_split['test']

# Print dataset column names
print(train_ds.column_names)


['audio', 'transcription']


In [6]:
# Parameters

n_mfcc = 13  # number of MFCC coefficients extracted per frame (LSTM input size)
num_epochs = 10  # number of training epochs (used by the training loop)
hidden_dim = 128  # LSTM hidden state dimensionality


# Collect every distinct transcription in the training split; each one is
# treated as its own class. Only the text column is read here, so whole samples
# (including their audio) do not have to be materialised just to get the labels.
unique_labels = set(train_ds['transcription'])

# Map each label to an integer class index. The labels are sorted first because
# the iteration order of a set of strings can differ between interpreter
# sessions, which would silently reassign class indices on every restart.
# NOTE(review): sorting assumes all transcriptions are strings - confirm there
# are no missing (None) transcriptions in the dataset.
label_dict = {label: idx for idx, label in enumerate(sorted(unique_labels))}

# The classifier needs one output unit per distinct label.
output_dim = len(label_dict)




In [7]:
# Label -> class-index encoder
def encode_label(label):
    """Return the integer class index of ``label`` as a 0-dim ``torch.long`` tensor.

    The index is looked up in the module-level ``label_dict``.

    Raises:
        ValueError: if ``label`` is not a key of ``label_dict``.
    """
    if label in label_dict:
        class_index = label_dict[label]
        # Scalar tensor on purpose; the caller adds the batch dimension.
        return torch.tensor(class_index, dtype=torch.long)
    raise ValueError(f"Label '{label}' not found in label_dict.")

# MFCC feature extraction
def extract_mfcc(audio):
    """Compute MFCC features for one dataset sample.

    Args:
        audio: a sample mapping whose ``'audio'`` entry holds the waveform
            under ``'array'`` and its sample rate under ``'sampling_rate'``.

    Returns:
        A ``torch.float32`` tensor holding the transposed MFCC matrix with a
        leading batch dimension of size 1. The number of coefficients is taken
        from the module-level ``n_mfcc``.
    """
    clip = audio['audio']
    waveform = np.array(clip['array'])
    coefficients = librosa.feature.mfcc(y=waveform, sr=clip['sampling_rate'], n_mfcc=n_mfcc)
    # Transpose so the coefficient axis comes last, then add the batch axis.
    frames = torch.tensor(coefficients.T, dtype=torch.float32)
    return frames[None, ...]

# LSTM-based sequence classifier
class AudioLSTM(nn.Module):
    """Single-layer LSTM followed by a linear read-out over the class labels.

    Args:
        input_dim: size of each feature vector in the input sequence.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of classes scored by the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.hidden2label = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, output_dim)."""
        sequence_states, _ = self.lstm(x)
        # Only the LSTM output at the last time step feeds the classifier.
        final_state = sequence_states[:, -1, :]
        class_scores = self.hidden2label(final_state)
        return class_scores.log_softmax(dim=1)

In [None]:
# Initialize model, loss function, and optimizer.
# Seed PyTorch first so the weight initialisation is repeatable across kernel restarts.
torch.manual_seed(42)
model = AudioLSTM(input_dim=n_mfcc, hidden_dim=hidden_dim, output_dim=output_dim)
# AudioLSTM.forward already returns log-probabilities (log_softmax), so the
# matching criterion is NLLLoss. CrossEntropyLoss would apply log_softmax a
# second time, which yields the same loss value but is redundant work.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: one optimizer step per sample (effective batch size of 1).
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for sample in train_ds:
        inputs = extract_mfcc(sample)
        target_label = sample['transcription']
        targets = encode_label(target_label)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        output = model(inputs)
        # targets is a scalar tensor; unsqueeze adds the batch dimension the loss expects.
        loss = loss_function(output, targets.unsqueeze(0))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-sample loss for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_ds)}")

# Test on one sample (optional)
# Switch to inference mode and build the inverse (index -> label) mapping once,
# instead of rebuilding key/value lists and scanning them for each lookup.
model.eval()
idx_to_label = {idx: label for label, idx in label_dict.items()}

with torch.no_grad():
    sample = test_ds[0]
    inputs = extract_mfcc(sample)
    output = model(inputs)
    # Most likely class for the single item in the batch.
    predicted_label_idx = output.argmax(dim=1).item()
    predicted_label = idx_to_label[predicted_label_idx]
    print(f"Predicted: {predicted_label}, True: {sample['transcription']}")