# 1. Dataset from scratch

In [6]:
import numpy as np 
import pandas as pd 
import os
import torch
import librosa
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from sklearn.preprocessing import StandardScaler


In [None]:
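# TODO: NUM_SEGMENTOS and LABEL_GENERATOR — placeholder for the code that builds the segment-count and label CSV files read by CustomAudioDataset.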

In [None]:
def process_select_signals(data_path, sample_rate, segment_length=0.4,
                           overlap_factor=2, amplitude_threshold=0.30):
    """Load an audio file and cut its active region into overlapping segments.

    The waveform is loaded with librosa at ``sample_rate`` and divided by its
    peak absolute value. The active region runs from the first to the last
    sample whose square-rooted magnitude exceeds ``amplitude_threshold``.
    Windows of ``segment_length`` seconds are taken every
    ``segment_length / overlap_factor`` seconds, starting at the first active
    sample; a window is kept only if it ends at or before the last active
    sample.

    Parameters
    ----------
    data_path : str
        Path of the audio file to load.
    sample_rate : int
        Sampling rate passed to ``librosa.load``.
    segment_length : float, optional
        Segment duration in seconds (default 0.4).
    overlap_factor : int or float, optional
        The hop between segments is ``segment_length / overlap_factor``
        (default 2, i.e. 50 % overlap).
    amplitude_threshold : float, optional
        Threshold applied to ``sqrt(|audio|)`` of the normalised waveform
        (default 0.30).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_segments, sample_length)``; shape
        ``(0, sample_length)`` when no segment is found.

    Raises
    ------
    ValueError
        If the parameters yield a segment or hop shorter than one sample.
    """
    sample_length = int(segment_length * sample_rate)
    hop = int(sample_length / overlap_factor) if overlap_factor > 0 else 0
    if sample_length <= 0 or hop <= 0:
        raise ValueError(
            "segment_length, overlap_factor and sample_rate must give a "
            "segment and a hop of at least one sample"
        )

    audio, _ = librosa.load(data_path, sr=sample_rate)
    audio_len = len(audio)

    # Peak-normalise. Skip empty or all-zero audio: np.max fails on an empty
    # array and dividing by a zero peak would fill the waveform with NaNs.
    peak = np.max(np.abs(audio)) if audio_len else 0.0
    if peak > 0:
        audio = audio / peak

    # Indices of samples above the threshold (vectorised search).
    active = np.flatnonzero(np.sqrt(np.abs(audio)) > amplitude_threshold)

    signals = []
    if active.size > 0 and (active[0] + sample_length) < audio_len:
        first, last = int(active[0]), int(active[-1])
        # Upper bound on the number of windows between first and last sample.
        n_candidates = int((last - first) / (sample_length / overlap_factor))
        for i in range(n_candidates):
            start = first + i * hop
            end = start + sample_length
            if end > last:
                # Starts only increase, so every later window is rejected too.
                break
            signals.append(audio[start:end])

        print(f"Processed audio file: {data_path}")
        print(f"Time audio: {(audio_len - 1) / sample_rate} seconds, Segments: {len(signals)}")
    else:
        print(f"No segments found in audio file: {data_path}")

    return np.stack(signals, axis=0) if signals else np.empty((0, sample_length))

In [None]:
def to_spectrogram(signal, sample_rate, n_fft=2048, n_mels=65,
                   win_seconds=0.015, hop_seconds=0.010):
    """Convert a waveform into a standardised log-mel spectrogram.

    The waveform goes through ``torchaudio.transforms.MelSpectrogram``
    (power spectrogram), is converted to decibels with
    ``librosa.power_to_db`` and is finally passed through a freshly fitted
    ``StandardScaler``.

    Parameters
    ----------
    signal : array-like
        Waveform samples.
    sample_rate : int
        Sampling rate of ``signal`` in Hz.
    n_fft : int, optional
        FFT size (default 2048).
    n_mels : int, optional
        Number of mel bands (default 65).
    win_seconds : float, optional
        Analysis window length in seconds (default 0.015).
    hop_seconds : float, optional
        Hop between windows in seconds (default 0.010).

    Returns
    -------
    numpy.ndarray
        The scaled spectrogram returned by ``StandardScaler.fit_transform``.
    """
    win_length = int(win_seconds * sample_rate)
    hop_length = int(hop_seconds * sample_rate)

    mel_spectrogram = T.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm="slaney",
        onesided=True,
        n_mels=n_mels,
        mel_scale="htk",
    )

    # torch.from_numpy would keep a float64 dtype; cast to float32 so the
    # input matches the transform's default-precision buffers.
    waveform = torch.as_tensor(np.ascontiguousarray(signal), dtype=torch.float32)
    with torch.no_grad():
        mel_power = mel_spectrogram(waveform)

    # Hand librosa / scikit-learn a real NumPy array instead of relying on
    # implicit Tensor -> ndarray conversion.
    mel_db = librosa.power_to_db(mel_power.numpy())

    # The scaler is fitted on this spectrogram only.
    return StandardScaler().fit_transform(mel_db)


In [12]:
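# CustomAudioDataset params: `audio_labels` is the path to a CSV with 'Label', 'Speaker_ID' and 'FileName' columns; `num_segs` is the path to a CSV with a 'Segments' column.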

class CustomAudioDataset(Dataset):
    """Dataset that serves one audio segment (or its spectrogram) per item.

    Every audio file contributes a known number of segments, so a global item
    index is mapped to a ``(file, segment within file)`` pair. The file is
    loaded and segmented on access with ``process_select_signals``.

    Parameters
    ----------
    audio_labels : str
        Path to a CSV with the columns ``'Label'``, ``'Speaker_ID'`` and
        ``'FileName'`` (one row per audio file).
    num_segs : str
        Path to a CSV with a ``'Segments'`` column giving the number of
        segments of each file, in the same row order as ``audio_labels``.
    data_dir : str
        Directory that contains the audio files.
    sample_rate : int
        Sampling rate used when loading the audio.
    to_spectrogram : bool
        If true, items are spectrograms produced by the module-level
        ``to_spectrogram`` function; otherwise raw waveform segments.

    Raises
    ------
    ValueError
        If the two CSV files do not have the same number of rows.
    """

    def __init__(self, audio_labels, num_segs,  data_dir, sample_rate, to_spectrogram):

        # First table: labels, speaker IDs and file names.
        # `.values` is a property, not a method, so use `.to_numpy()`.
        label_table = pd.read_csv(audio_labels)
        self.audio_labels = label_table['Label'].to_numpy()
        self.audio_IDs = label_table['Speaker_ID'].to_numpy()
        self.filenames = label_table['FileName'].to_numpy()

        # Second table: number of segments per file.
        segment_table = pd.read_csv(num_segs)
        self.num_segs = segment_table['Segments'].to_numpy(dtype=np.int64)
        if len(self.num_segs) != len(self.filenames):
            raise ValueError(
                f"{num_segs} has {len(self.num_segs)} rows but {audio_labels} "
                f"has {len(self.filenames)}; they must describe the same files"
            )

        # Global item index -> file index. np.repeat also handles an empty table.
        self.spec_id_to_file_id = np.repeat(np.arange(len(self.num_segs)), self.num_segs)
        # Global index of the first segment of each file.
        self._file_offsets = np.cumsum(self.num_segs) - self.num_segs

        # Sample rate and path to the data directory.
        self.sample_rate = sample_rate
        self.data_dir = data_dir

        # True -> return spectrograms, False -> return raw segments.
        self.to_spectrogram = to_spectrogram

    def __len__(self):
        # Total number of segments/spectrograms, i.e. sum(self.num_segs).
        return len(self.spec_id_to_file_id)

    def __getitem__(self, idx):
        """Return ``(audio, label, subject_group)`` for global index ``idx``.

        Raises ``IndexError`` if ``idx`` is out of range or if the file yields
        fewer segments than the segment table announced.
        """
        idx = int(idx)
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError(f"index {idx} out of range for dataset of size {total}")

        # Map the global index to a file and to a segment inside that file.
        file_index = self.spec_id_to_file_id[idx]
        segment_index = idx - int(self._file_offsets[file_index])

        # data_path = directory + file name
        data_path = os.path.join(self.data_dir, self.filenames[file_index])

        label = self.audio_labels[file_index]
        subject_group = self.audio_IDs[file_index]

        # NOTE: the whole file is re-loaded and re-segmented on every access.
        signals = process_select_signals(data_path, self.sample_rate)
        if segment_index >= len(signals):
            raise IndexError(
                f"{data_path} produced {len(signals)} segments but segment "
                f"{segment_index} was requested; the segment table is out of date"
            )
        segment = signals[segment_index]

        if self.to_spectrogram:
            audio = to_spectrogram(segment, self.sample_rate)
        else:
            audio = segment

        return audio, label, subject_group

In [13]:
# Configuration: everything a reader may want to tune.
SAMPLE_RATE = 44100
BATCH_SIZE = 32
NUM_EPOCHS = 200
# NOTE(review): assumes spectrograms are the intended model input; set to
# False to train on raw waveform segments instead.
USE_SPECTROGRAM = True
# NOTE(review): assumes 'prueba.csv' is the per-file segment-count table
# expected by CustomAudioDataset, since its 'Segments' column is summed
# below. Confirm.
SEGMENTS_CSV = 'prueba.csv'
DATA_PATH_NeuroV = 'neurovoz/zenodo_upload/audios/'
labels = 'labels.csv'

# Total number of segments in the dataset.
DATA_SIZE = pd.read_csv(SEGMENTS_CSV)['Segments'].sum()

# Dataset: pass all five arguments the constructor requires.
datasetAudio = CustomAudioDataset(
    labels, SEGMENTS_CSV, DATA_PATH_NeuroV, SAMPLE_RATE, USE_SPECTROGRAM
)

# DataLoader
dataloader = DataLoader(datasetAudio, batch_size=BATCH_SIZE, shuffle=True)

# Number of full batches per epoch (kept for reference; the loop below
# iterates the DataLoader directly).
iters = int(DATA_SIZE / BATCH_SIZE)

# NOTE(review): `model` is not defined in the cells shown here; it must be
# created in an earlier cell before this loop can run.
for epoch in range(NUM_EPOCHS):

    # Training mode only needs to be set once per epoch.
    model.train()

    # Iterate the DataLoader itself: next(iter(dataloader)) would rebuild the
    # iterator on every step. The dataset yields three values per item.
    for x_batch, y_batch, subject_batch in dataloader:
        pass  # TODO: forward pass, loss, backward pass and optimiser step

