In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import librosa
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
import os
import time



In [3]:
# Load the "babs/openslr-yoruba" dataset via Hugging Face `datasets`.
dataset = load_dataset("babs/openslr-yoruba")

# Hold out 20% of the published 'train' split for evaluation. A fixed seed keeps
# the partition identical across kernel restarts, so every downstream result
# (label ids, losses, predictions) can be reproduced with Restart & Run All.
data_split = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Access the train and test sets.
train_ds = data_split['train']
test_ds = data_split['test']

# Print dataset column names.
print(train_ds.column_names)

# Preview up to the first 10 training samples: MFCC matrix shape + transcription.
for i in range(min(10, len(train_ds))):
    # Fetch the row once; it carries both the audio and its transcription.
    audio_sample = train_ds[i]

    # Raw waveform (as a NumPy array) and its sampling rate.
    audio_data = np.asarray(audio_sample['audio']['array'])
    sampling_rate = audio_sample['audio']['sampling_rate']

    # 13 MFCC coefficients per frame; printed shape is (13, num_frames).
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sampling_rate, n_mfcc=13)

    print(f"Sample {i+1}: MFCCs shape: {mfccs.shape}")
    # Use the row already in hand. Indexing the column first
    # (train_ds['transcription'][i]) may load every transcription on each
    # iteration, depending on the installed `datasets` version.
    print(audio_sample['transcription'])

['audio', 'transcription']
Sample 1: MFCCs shape: (13, 649)
 Kájọlà ni ibùsọ̀ táa ma kọ́kọ́ kàn ka tó dé Káṣọpẹ́.
Sample 2: MFCCs shape: (13, 289)
Orí ẹni ní ń mọ àtilà ẹni.
Sample 3: MFCCs shape: (13, 385)
Lọ gbá ilẹ̀ àárín ilé àti ti ẹ̀yìnkùlé.
Sample 4: MFCCs shape: (13, 209)
Ẹran ọ̀sìn ni ewúrẹ́ àti adìyẹ.
Sample 5: MFCCs shape: (13, 353)
Mo tiẹ̀ ríi gbọ́ pé osù tó ń bọ̀ ni ọdún ìgúnu.
Sample 6: MFCCs shape: (13, 273)
Mo nífẹ̀ẹ́ láti máa jẹ edé.
Sample 7: MFCCs shape: (13, 433)
Àwọn tí wọ́n darí iná fún eré orí ìtàgé náà gbìyànjú.
Sample 8: MFCCs shape: (13, 305)
 Alhaja fẹ́ràn láti máa jẹ búrẹ́dì Sẹ́nígà.
Sample 9: MFCCs shape: (13, 321)
Gèlè náà gbé ẹwà rẹ̀ jáde.
Sample 10: MFCCs shape: (13, 497)
 Ilé ẹ̀kọ́ girama tó wà ní Jíbówú ni Akíntádé ti bẹ̀rẹ̀ iṣẹ́.


In [4]:
# Parameters

n_mfcc = 13  # number of MFCC coefficients extracted per frame
num_epochs = 10  # number of training epochs (used by the training loop)
hidden_dim = 128  # LSTM hidden state dimensionality


# Create a label dictionary that maps each unique transcription to an integer.
# - Only the 'transcription' column is read, so rows are not fetched in full
#   just to obtain their text.
# - The labels are sorted before numbering: iteration order of a set of strings
#   is not stable between interpreter runs, which would otherwise give the same
#   transcription a different class id after every kernel restart.
unique_labels = sorted(set(train_ds['transcription']))
label_dict = {label: idx for idx, label in enumerate(unique_labels)}

# The classifier has one output unit per distinct training transcription.
output_dim = len(label_dict)



In [5]:

def encode_label(label):
    """Convert a label into its class-index tensor.

    The index is looked up in the module-level ``label_dict``.

    Args:
        label: Key expected to be present in ``label_dict``.

    Returns:
        A ``torch.long`` tensor of shape ``(1,)`` holding the mapped index.

    Raises:
        ValueError: If ``label`` is not a key of ``label_dict``.
    """
    if label in label_dict:
        class_index = label_dict[label]
        return torch.tensor([class_index], dtype=torch.long)
    raise ValueError(f"Label '{label}' not found in label_dict.")

# Turn one dataset row into an MFCC tensor ready for the LSTM
def extract_mfcc(audio):
    """Compute MFCC features for a single dataset row.

    Args:
        audio: A row whose ``'audio'`` entry exposes ``'array'`` (the waveform)
            and ``'sampling_rate'``.

    Returns:
        A ``torch.float32`` tensor with a leading batch dimension of 1, i.e.
        ``(1, num_frames, n_mfcc)``, where ``n_mfcc`` is the module-level
        setting.
    """
    clip = audio['audio']
    waveform = np.array(clip['array'])
    sample_rate = clip['sampling_rate']

    coeffs = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)

    # librosa returns (n_mfcc, num_frames); transpose so time is the first axis,
    # then prepend the batch axis.
    frames_first = torch.tensor(coeffs.T, dtype=torch.float32)
    return frames_first.unsqueeze(0)

# LSTM sequence classifier
class AudioLSTM(nn.Module):
    """Single-layer LSTM followed by a linear classification head.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of classes produced by the linear head.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.hidden2label = nn.Linear(
            in_features=hidden_dim,
            out_features=output_dim,
        )

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor laid out batch-first, ``(batch, num_frames, input_dim)``.

        Returns:
            Log-probabilities of shape ``(batch, output_dim)``.
        """
        sequence_states, _ = self.lstm(x)
        # Only the hidden state of the final time step feeds the classifier.
        final_state = sequence_states[:, -1]
        scores = self.hidden2label(final_state)
        return scores.log_softmax(dim=1)

In [None]:
# Initialize model, loss function, and optimizer
model = AudioLSTM(input_dim=n_mfcc, hidden_dim=hidden_dim, output_dim=output_dim)
# AudioLSTM.forward already returns log-probabilities (log_softmax), so the
# matching criterion is NLLLoss. CrossEntropyLoss expects raw scores and would
# apply log_softmax a second time.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Features and targets are identical in every epoch, so compute them once
# instead of re-running MFCC extraction num_epochs times. This keeps all
# training features in memory; revert to per-sample extraction inside the loop
# if the dataset does not fit.
train_examples = [
    (extract_mfcc(sample), encode_label(sample['transcription']))
    for sample in train_ds
]

# Training loop (one sample per optimisation step)
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for inputs, targets in train_examples:
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_function(output, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-sample loss for the epoch; max() guards an empty training set.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/max(len(train_examples), 1)}")

# Test on one sample (optional)
# Invert the label dictionary so the predicted class index can be shown as text
# and compared directly with the reference transcription.
index_to_label = {idx: label for label, idx in label_dict.items()}
model.eval()
with torch.no_grad():
    sample = test_ds[0]
    inputs = extract_mfcc(sample)
    output = model(inputs)
    predicted_idx = output.argmax(dim=1).item()
    print(f"Predicted: {index_to_label[predicted_idx]}, True: {sample['transcription']}")