In [1]:
import torch
from torch import nn
from pathlib import Path
from tqdm import tqdm
from torchvision import transforms
import torchvision
from torch.utils.data import DataLoader, Dataset
import os
import numpy as np
import pandas as pd
from skimage import io
from PIL import Image

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCHS = 32

In [3]:
csv_path = Path("data/other.tsv")
audio_path = "data/train"
data = pd.read_csv(csv_path, sep='\t')

In [4]:
data.head(1)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,420bc8680e03882c43c80e6307ad2a869c892bd90b4388...,common_voice_en_38487408.mp3,His father was undersecretary of the Home Depa...,0,0,,,,,en,


In [5]:
data['path'].head(5)

0    common_voice_en_38487408.mp3
1    common_voice_en_38487409.mp3
2    common_voice_en_38487410.mp3
3    common_voice_en_38487411.mp3
4    common_voice_en_38487412.mp3
Name: path, dtype: object

In [6]:
data['sentence'].head(5)

0    His father was undersecretary of the Home Depa...
1    The Board's headquarters include a museum of b...
2              It was followed by a sequel, Hot Shots!
3    One round hit a mine inside, and the machine-g...
4    Diamond was educated at Leeds Grammar School a...
Name: sentence, dtype: object

In [7]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
all_tokens = []
for text in data['sentence'].astype(str).fillna(''):
    tokens = word_tokenize(text)
    all_tokens.extend(tokens)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mathews/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
tokens = list(set(all_tokens))
len(tokens)

53906

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(tokens)
integer_encoded

array([51685, 20597,  8212, ..., 29802,  8551,   924])

In [10]:
decoded_tokens = label_encoder.inverse_transform(integer_encoded)
decoded_tokens

array(['train', 'Rican', 'Eugenie', ..., 'boosters', 'Faulkes',
       'Alspaugh'], dtype='<U128')

In [11]:
words_to_num = {char: idx for idx, char in enumerate(set(tokens))}
num_to_words = {idx: char for idx, char in enumerate(set(tokens))}

encode = lambda s: [words_to_num[c] for c in word_tokenize(s)]
def encode2(l):

    l=str(l)
    value = [words_to_num[c] for c in word_tokenize(l)]
    while len(value) <82:
        value.append(0)
    return torch.tensor(value, dtype=torch.float32)

    
def decoder(l):
    return ' '.join([num_to_words.get(i,'<UNK>') for i in l])

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['path'], data['sentence'], test_size=0.1)

In [13]:
X_train[0],y_train[0]

('common_voice_en_38487408.mp3',
 'His father was undersecretary of the Home Department of the government of Maharashtra.')

In [15]:
import os
import torch
from torch.utils.data import Dataset
import torchaudio
import torchaudio.transforms as T

# Fallback for MP3
from pydub import AudioSegment
import numpy as np


class DatasetsCustom(Dataset):
    def __init__(self, X, y, path, sample_rate=16000, n_mfcc=40):
        """
        Args:
            X (list or pd.Series): list of audio file names
            y (list or pd.Series): list of labels (same length as X)
            path (str): directory containing audio files
        """
        self.X = X.reset_index(drop=True)
        self.y = y.reset_index(drop=True)
        self.audio_path = path
        self.sample_rate = sample_rate
        self.mfcc_transform = T.MFCC(
            sample_rate=sample_rate,
            n_mfcc=n_mfcc,
            melkwargs={"n_fft": 2048, "hop_length": 512}
        )

    def load_audio(self, file_name):
        audio_path = os.path.join(self.audio_path, file_name)

        try:
            # Try with torchaudio
            waveform, sr = torchaudio.load(audio_path)
        except Exception as e:
            # If torchaudio fails (e.g. MP3 not supported), fallback to pydub
            audio = AudioSegment.from_file(audio_path, format="mp3")
            samples = np.array(audio.get_array_of_samples()).astype(np.float32)

            # stereo -> mono
            if audio.channels > 1:
                samples = samples.reshape((-1, audio.channels))
                samples = samples.mean(axis=1)

            waveform = torch.tensor(samples).unsqueeze(0)  # shape (1, time)
            sr = audio.frame_rate

        # Resample if needed
        if sr != self.sample_rate:
            resampler = T.Resample(sr, self.sample_rate)
            waveform = resampler(waveform)

        # Convert to MFCC -> (n_mfcc, time)
        mfcc = self.mfcc_transform(waveform)

        # Change shape to (time, n_mfcc) for RNN input
        mfcc = mfcc.squeeze(0).transpose(0, 1)
        return mfcc

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        file_name = self.X.iloc[idx]
        label = self.y.iloc[idx]
        mfcc = self.load_audio(file_name)
        return mfcc, label


OSError: libtorch_cuda.so: cannot open shared object file: No such file or directory

In [None]:
train_datasets = DatasetsCustom(X=X_train, y=y_train, path=audio_path)
test_datasets  = DatasetsCustom(X=X_test, y=y_test, path=audio_path)

In [None]:
train_dataloader = DataLoader(dataset=train_datasets, batch_size=32, num_workers=0, shuffle=True)
test_dataloader = DataLoader(dataset=test_datasets, batch_size=32, num_workers=0, shuffle=False)

In [None]:
import torch
import torch.nn as nn

class AudioModel(nn.Module):
    def __init__(self, input_shape: int, hidden_units: int, output_shape: int, num_layers: int = 2, bidirectional: bool = True, dropout: float = 0.3):
        super(AudioModel, self).__init__()

        # LSTM backbone
        self.lstm = nn.LSTM(
            input_size=input_shape,   # n_mfcc from your dataset (e.g., 40)
            hidden_size=hidden_units,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0
        )

        # Fully connected classifier
        rnn_out = hidden_units * (2 if bidirectional else 1)
        self.fc = nn.Linear(rnn_out, output_shape)

    def forward(self, x: torch.Tensor):
        """
        x: [B, T, F] where
           B = batch size
           T = time steps
           F = n_mfcc (input_shape)n_mfcc = 40                 # must match dataset MFCC config
hidden_units = 128
num_classes = len(set(y_train))

model = AudioModel(input_shape=n_mfcc, hidden_units=hidden_units, output_shape=num_classes).to(device)

        """
        batch_size = x.size(0)

        # LSTM forward
        lstm_out, (h_n, c_n) = self.lstm(x)

        # Take the last timestep output
        lstm_out_last = lstm_out[:, -1, :]   # [B, hidden*dir]

        # Classification
        output = self.fc(lstm_out_last)
        return output


In [None]:
n_mfcc = 40                 # must match dataset MFCC config
hidden_units = 128
num_classes = len(set(y_train))

model = AudioModel(input_shape=n_mfcc, hidden_units=hidden_units, output_shape=num_classes).to(device)




In [None]:
EPOCHS = 10
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    train_loss = 0
    test_loss = 0
    model.train()
    for batch in tqdm(train_dataloader):
        inputs, labels = batch

        inputs, labels = inputs.to(device), labels.to(device)
        output = model(inputs)
        loss = loss_fn(output, labels)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.inference_mode():
        for batch in test_dataloader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs)
            loss = loss_fn(output, labels)
            test_loss += loss.item()

    print(f"Epoch {epoch}, train loss {train_loss / len(train_dataloader)}, test loss {test_loss / len(test_dataloader)}")
        
    MODEL_PATH = Path("models")
    MODEL_NAME = "model_train.pth"
    MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME
    
    
    MODEL_PATH.mkdir(parents=True, exist_ok=True)
    #Save
    torch.save(obj=model.state_dict(),f=MODEL_SAVE_PATH)