In [None]:
import os
import random
import torch
from torch.utils.data import Dataset, DataLoader


class NoiseVoiceClassificationDataset(Dataset):
    """Binary dataset over pre-saved ``.pt`` samples: noise (label 0) vs. voice (label 1).

    Noise files are read from the top level of ``folder_noise``; voice files are
    collected recursively from ``folder_voice``. The voice set is randomly
    subsampled so that it holds at most ``prop`` times as many items as the
    noise set, and the combined list is shuffled once at construction time.

    Attributes:
        samples: list of ``(path, label)`` tuples in the (fixed) shuffled order.
    """

    def __init__(self, folder_noise, folder_voice, seed=42, prop=1):
        """Index the files on disk and draw the voice subsample.

        Args:
            folder_noise: directory whose direct ``.pt`` children are noise samples.
            folder_voice: directory tree searched recursively for voice ``.pt`` files.
            seed: seed of the private ``random.Random`` used for sampling and shuffling.
            prop: voice-to-noise ratio; ``int(prop * num_noise)`` voice files are
                drawn (capped at the number available). May be fractional.

        Raises:
            ValueError: if ``prop`` is negative.
        """
        if prop < 0:
            raise ValueError(f"prop must be non-negative, got {prop!r}")

        # os.listdir / os.walk yield entries in arbitrary, filesystem-dependent
        # order. Sorting first makes the seeded sample and shuffle below
        # reproducible across machines and runs.
        noise_paths = sorted(
            os.path.join(folder_noise, name)
            for name in os.listdir(folder_noise)
            if name.endswith('.pt')
        )
        voice_paths = sorted(
            os.path.join(root, name)
            for root, _, files in os.walk(folder_voice)
            for name in files
            if name.endswith('.pt')
        )

        rnd = random.Random(seed)

        # random.sample requires an integer k; truncate so fractional ratios
        # (e.g. prop=0.5) work, and never ask for more files than exist.
        num_voice = min(int(prop * len(noise_paths)), len(voice_paths))
        voice_paths_sampled = rnd.sample(voice_paths, k=num_voice)

        self.samples = [(path, 0) for path in noise_paths]
        self.samples.extend((path, 1) for path in voice_paths_sampled)

        # Interleave the two classes so unshuffled loaders do not see them in blocks.
        rnd.shuffle(self.samples)

    def __len__(self):
        """Return the number of indexed samples (noise + sampled voice)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` from disk.

        Returns:
            ``(obj, label)`` where ``obj`` is whatever was stored in the ``.pt``
            file and ``label`` is a 0-dim ``torch.long`` tensor (0 = noise, 1 = voice).
        """
        path, label = self.samples[idx]
        # NOTE(security): weights_only=False unpickles arbitrary Python objects,
        # so only point this dataset at trusted files. If the files hold plain
        # tensors, weights_only=True is the safer setting.
        tensor = torch.load(path, weights_only=False)
        return tensor, torch.tensor(label, dtype=torch.long)


def load_dataloaders(noise_dir, voice_dir, seed=42, batchsize=16, prop=1):
    """Build the train / validation / test ``DataLoader``s for noise-vs-voice classification.

    Each split is read from the ``train``, ``validation`` and ``test``
    sub-folders of ``noise_dir`` and ``voice_dir``.

    Args:
        noise_dir: root directory containing one noise sub-folder per split.
        voice_dir: root directory containing one voice sub-folder per split.
        seed: base seed; split number ``i`` uses ``seed + i`` so the three
            splits draw different voice subsamples.
        batchsize: batch size shared by all three loaders.
        prop: voice-to-noise ratio forwarded to ``NoiseVoiceClassificationDataset``.

    Returns:
        Tuple ``(train_loader, validation_loader, test_loader)``. Only the
        training loader reshuffles between epochs.
    """
    loaders = {}

    for offset, split in enumerate(('train', 'validation', 'test')):
        split_seed = seed + offset

        dataset = NoiseVoiceClassificationDataset(
            os.path.join(noise_dir, split),
            os.path.join(voice_dir, split),
            seed=split_seed,
            prop=prop,
        )

        # A dedicated, seeded generator makes the training shuffle order depend
        # on `seed` rather than on the state of the global torch RNG.
        generator = torch.Generator()
        generator.manual_seed(split_seed)

        loaders[split] = DataLoader(
            dataset,
            batch_size=batchsize,
            shuffle=(split == 'train'),
            generator=generator,
        )

    return loaders['train'], loaders['validation'], loaders['test']

In [None]:
# Both feature trees live under <cwd>/data/preprocessed; the paths are resolved
# against the working directory, so run the notebook from the project root.
preprocessed_root = os.path.join(os.getcwd(), "data", "preprocessed")

# Each directory is expected to hold one sub-folder per split
# (train / validation / test), which load_dataloaders appends itself.
noise_dir = os.path.join(preprocessed_root, "noise", "standard", "raw")
voice_dir = os.path.join(preprocessed_root, "standard", "raw")

In [12]:
# Project-local model and training helpers.
# NOTE(review): imports are scattered across cells; consider moving this one to
# the import cell at the top so a fresh "Restart & Run All" surfaces missing
# dependencies immediately.
from CNN_transfomers_implementation import Mel_transformer, train_transformer

# Build the three loaders. prop=3 requests up to three voice samples per noise
# sample in every split (capped by the number of voice files available).
train, val, test = load_dataloaders(noise_dir, voice_dir, batchsize=16, prop=3)

In [13]:
# NOTE(review): the dataset built above only emits labels 0 (noise) and
# 1 (voice), yet the model is created with num_classes=30. Confirm whether this
# is intentional (e.g. reusing a head sized for another task) or should be 2.
spect_model = Mel_transformer(num_classes=30)
# Train for at most 20 epochs against the validation loader and rebind
# spect_model to whatever train_transformer returns. The recorded run below
# stopped early at epoch 4.
spect_model = train_transformer(spect_model, train, val, epochs=20)


Epoch 1/20:   0%|          | 0/263 [00:00<?, ?it/s]

                                                                           

Epoch 1: Train Loss: 0.0698 | Train Acc: 0.9860 | Val Acc: 0.9986


                                                                           

Epoch 2: Train Loss: 0.0192 | Train Acc: 0.9971 | Val Acc: 0.9986


                                                                           

Epoch 3: Train Loss: 0.0168 | Train Acc: 0.9964 | Val Acc: 0.9986


                                                                            

Epoch 4: Train Loss: 0.0096 | Train Acc: 0.9981 | Val Acc: 0.9979
Early stopping triggered at epoch 4
Best Val Acc: 0.9986


In [14]:
# NOTE(review): another late import; consider moving it to the top import cell.
from CNN_transfomers_implementation import evaluate_model

# Score the trained model on the held-out test loader. The cell displays the
# returned pair; presumably (accuracy, loss) — confirm against evaluate_model.
evaluate_model(spect_model, test)

(0.9936, 0.0881)