In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import librosa
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
import os
import time


In [13]:
#load dataset
dataset = load_dataset("babs/openslr-yoruba")

#split dataset

data_split = dataset['train'].train_test_split(test_size=0.2)

#access the train and test sets
train_ds = data_split['train']
test_ds = data_split['test']

# Print dataset column names
print(train_ds.column_names)


for i in range(0,10):
    #process audio sample
    audio_sample = train_ds[i]

    #extract audio data and sampling rate
    audio_data = audio_sample['audio']['array']
    sampling_rate = audio_sample['audio']['sampling_rate']

    #ensure data is numpy array
    audio_data = np.array(audio_data)

    #extract mfcc features
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sampling_rate, n_mfcc=13)

    #display the mfcc features
    print(f"Sample {i+1}: MFCCs shape: {mfccs.shape}")
    print(train_ds['transcription'][i])

['audio', 'transcription']
Sample 1: MFCCs shape: (13, 401)
Rántí jẹ ohun tí ẹ gbé kalẹ̀.
Sample 2: MFCCs shape: (13, 481)
 Ìréde òru ti àwọn ọ̀dọ́mọgé ìsíyìín rawọ́ lé mà ti wá peléke.
Sample 3: MFCCs shape: (13, 697)
 Nínú ẹ̀wọ̀n ọdún méjìlá tí wọ́n dá fún Tájù, ọdún mẹ́wàá ló lò níbẹ̀.
Sample 4: MFCCs shape: (13, 265)
Gèlè náà gbé ẹwà rẹ̀ jáde.
Sample 5: MFCCs shape: (13, 305)
Àgbò ni Mojí ń mú kiri.
Sample 6: MFCCs shape: (13, 265)
Ọ̀gá akọrin wa féraǹ mi gidi.
Sample 7: MFCCs shape: (13, 393)
Ìjọba yóò fi kún owó àwọn òṣìṣẹ́.
Sample 8: MFCCs shape: (13, 273)
Mo fẹ́ mu àgbo jẹ̀díjẹ̀dí.
Sample 9: MFCCs shape: (13, 569)
 Mo rántí ìgbà èwe mi àti àwọn oun tí mo ṣe.
Sample 10: MFCCs shape: (13, 265)
Ilé ìgbọ̀nsẹ̀ náà rẹwà.


In [21]:
# function to vectorize text

for i in range(0,5):

    audio_sample = train_ds['transcription'][i]
    vocab = sorted(set(audio_sample)) #creates a set of unique characters from the audio_sample, removing any duplicates, and then sorts them
    print(f'{len(vocab)} unique characters')

    chars = tf.strings.unicode_split(vocab, input_encoding='UTF-8')
    ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
    ids = ids_from_chars
    print(ids)

19 unique characters
<StringLookup name=string_lookup_5, built=False>
28 unique characters
<StringLookup name=string_lookup_6, built=False>
27 unique characters
<StringLookup name=string_lookup_7, built=False>
18 unique characters
<StringLookup name=string_lookup_8, built=False>
17 unique characters
<StringLookup name=string_lookup_9, built=False>
