In [48]:
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchvision import datasets
from torch import nn
import wandb
from PIL import Image
import random
import pandas as pd
import numpy as np
import os
import torchaudio
import librosa
from torchaudio import transforms
from torch.utils.data import DataLoader, Dataset,TensorDataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, accuracy_score
from torch.utils.data.dataset import random_split

from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import f1_score

DATA

In [21]:
class AudioDataset(Dataset):
    """Dataset that turns ``.mp3`` files into fixed-length mel spectrograms.

    Labelled mode (``is_label=True``) expects ``data_dir`` to contain one
    sub-directory per entry of ``self.classes`` (``"real"`` and ``"fake"``);
    the label of a file is the index of its class in that list. Unlabelled
    mode (``is_label=False``) reads the ``.mp3`` files directly inside
    ``data_dir``. Both modes share the same feature-extraction pipeline, so
    any change to it applies to labelled and unlabelled audio alike.

    Args:
        data_dir: Directory holding the audio (see the layout above).
        is_label: True for labelled audio, False for unlabelled audio.
        TARGET_SAMPLE_RATE: Sample rate (Hz) every clip is resampled to.
        clip_seconds: Length every clip is padded or truncated to, in seconds.

    Attributes:
        audio_files: Paths of the discovered files, sorted by file name within
            each directory so indices are reproducible across runs/machines.
        labels: Class index per file (left empty in unlabelled mode).
    """

    def __init__(self, data_dir, is_label, TARGET_SAMPLE_RATE = 16000, clip_seconds = 3):
        self.data_dir = data_dir
        self.classes = ["real", "fake"]
        self.audio_files = []
        self.labels = []
        self.is_label = is_label
        self.TARGET_SAMPLE_RATE = TARGET_SAMPLE_RATE
        self.clip_seconds = clip_seconds
        # One Resample module per source sample rate, built lazily and reused
        # instead of being re-created for every item.
        self._resamplers = {}

        if self.is_label:
            for class_idx, class_name in enumerate(self.classes):
                class_dir = os.path.join(data_dir, class_name)
                for path in self._list_mp3(class_dir):
                    self.audio_files.append(path)
                    self.labels.append(class_idx)
        else:
            self.audio_files.extend(self._list_mp3(self.data_dir))

        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=TARGET_SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
        )

    @staticmethod
    def _list_mp3(directory):
        """Return the ``.mp3`` paths directly inside ``directory``, sorted by name.

        ``os.listdir`` yields entries in arbitrary order; sorting keeps the
        index -> file mapping (and therefore any seeded split) deterministic.
        """
        return [
            os.path.join(directory, name)
            for name in sorted(os.listdir(directory))
            if name.endswith(".mp3")
        ]

    def __len__(self):
        """Return the number of audio files discovered at construction time."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load item ``idx`` and return its mel spectrogram.

        Returns:
            ``(spectrogram, label)`` in labelled mode, otherwise just the
            spectrogram.
        """
        audio_file = self.audio_files[idx]

        # Load audio
        audio, sr = torchaudio.load(audio_file)

        # Convert to mono by averaging the channels
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)

        if sr != self.TARGET_SAMPLE_RATE:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(sr, self.TARGET_SAMPLE_RATE)
                self._resamplers[sr] = resampler
            audio = resampler(audio)

        # Pad (with trailing zeros) or truncate the audio to a fixed length
        fixed_length = int(round(self.TARGET_SAMPLE_RATE * self.clip_seconds))
        num_samples = audio.shape[1]
        if num_samples < fixed_length:
            audio = torch.nn.functional.pad(audio, (0, fixed_length - num_samples))
        else:
            audio = audio[:, :fixed_length]

        features = self.mel_spectrogram(audio)
        if self.is_label:
            return features, self.labels[idx]
        return features

In [22]:
# Build the labelled training dataset, then display its first item
# (a spectrogram tensor together with its class index).
audio_train = AudioDataset(
    data_dir='/home/gustavo/Projects/PAV/DEEPFAKE-COMPTETITION-PAV/audios/train',
    is_label=True,
)
audio_train[0]

(tensor([[[5.0537e-04, 5.8895e-04, 9.6446e-04,  ..., 1.7757e-03,
           3.6543e-03, 3.7238e-04],
          [1.6781e-04, 1.3125e-03, 2.7401e-04,  ..., 9.0541e-03,
           9.1673e-03, 2.1658e-03],
          [5.7434e-04, 6.7541e-04, 1.3042e-04,  ..., 2.0726e-02,
           1.6678e-02, 4.8293e-03],
          ...,
          [1.0396e-07, 2.0007e-13, 1.1095e-13,  ..., 4.1733e-10,
           1.3565e-10, 1.3352e-07],
          [1.0269e-07, 1.2887e-13, 1.3506e-13,  ..., 7.3700e-10,
           3.2287e-10, 1.3118e-07],
          [1.0342e-07, 1.5312e-13, 1.2068e-13,  ..., 3.7980e-09,
           1.5164e-09, 1.3202e-07]]]),
 0)

In [23]:
# Build the unlabelled test dataset, then display its first item
# (a spectrogram tensor only, since there is no label).
audio_teste = AudioDataset(
    data_dir='/home/gustavo/Projects/PAV/DEEPFAKE-COMPTETITION-PAV/audios/test',
    is_label=False,
)
audio_teste[0]

tensor([[[1.7918e-03, 2.0068e-02, 4.0089e-03,  ..., 2.7534e-02,
          6.5274e-03, 1.3242e-01],
         [2.5826e-03, 6.3364e-02, 3.4908e-02,  ..., 5.4090e-02,
          4.8813e-02, 1.4507e-01],
         [5.7900e-03, 3.0583e-02, 4.7984e-02,  ..., 2.1832e+01,
          8.3534e+00, 1.1720e+01],
         ...,
         [2.2706e-07, 8.7983e-06, 2.0074e-03,  ..., 2.7007e-02,
          3.6072e-03, 3.8843e-03],
         [2.8828e-08, 3.2975e-10, 2.3564e-08,  ..., 5.4293e-03,
          2.0038e-03, 4.7438e-03],
         [1.7535e-07, 5.8559e-08, 7.8749e-11,  ..., 1.3839e-02,
          2.6288e-03, 5.2364e-03]]])

DataLoader

In [49]:
class Data:
    """Holds the train / validation / test datasets and builds loaders for them.

    Args:
        batch_size: Batch size used by every loader returned by ``get_loader``.
        dataset_train: Training dataset (any sized map-style dataset).
        dataset_test: Test dataset.
        do_split: When True, ``dataset_train`` is randomly split into a
            training part and a validation part.
        train_fraction: Fraction of ``dataset_train`` kept for training when
            ``do_split`` is True; the remainder becomes the validation set.
        seed: Seed of the generator driving the split, so it is reproducible.

    Attributes:
        modes: Names of the available subsets.
        dataloaders: Mapping from mode name to the underlying *dataset*
            (despite the name, ``DataLoader`` objects are only created in
            ``get_loader``).

    Raises:
        ValueError: If ``do_split`` is True and ``train_fraction`` is not
            strictly between 0 and 1.
    """

    def __init__(self, batch_size,dataset_train,dataset_test, do_split, train_fraction=0.8, seed=42):
        self.modes = ['train','test']
        self.dataloaders = {}
        self.batch_size = batch_size
        self.do_split = do_split
        if self.do_split:
            if not 0 < train_fraction < 1:
                raise ValueError(
                    f"train_fraction must be strictly between 0 and 1, got {train_fraction!r}"
                )
            self.modes = ['train','validation','test']
            generator = torch.Generator().manual_seed(seed)
            # Use len() rather than a dataset-specific attribute so that any
            # sized dataset (TensorDataset, Subset, ...) can be split.
            total = len(dataset_train)
            train_size = int(total * train_fraction)
            val_size = total - train_size
            train_set, val_set = random_split(dataset_train, [train_size, val_size], generator=generator)

            self.dataloaders['train'] = train_set
            self.dataloaders['validation'] = val_set
        else:
            self.dataloaders['train'] = dataset_train

        self.dataloaders['test'] = dataset_test

    def get_loader(self, mode):
        """Return a ``DataLoader`` for ``mode``; only ``'train'`` is shuffled.

        Raises:
            KeyError: If ``mode`` is not one of the available subsets (for
                instance ``'validation'`` when no split was requested).
        """
        if mode not in self.dataloaders:
            raise KeyError(
                f"unknown mode {mode!r}; available modes: {list(self.dataloaders)}"
            )
        return DataLoader(
            self.dataloaders[mode],
            batch_size=self.batch_size,
            shuffle=(mode == 'train'),
        )


In [31]:
# Unlabelled test set.
audio_teste = AudioDataset(
    data_dir='/home/gustavo/Projects/PAV/DEEPFAKE-COMPTETITION-PAV/audios/test',
    is_label=False,
)
# Labelled training set.
audio_train = AudioDataset(
    data_dir='/home/gustavo/Projects/PAV/DEEPFAKE-COMPTETITION-PAV/audios/train',
    is_label=True,
)


Not split

In [40]:
# Use the whole training set for training (no validation subset).
data = Data(
    batch_size=100,
    dataset_train=audio_train,
    dataset_test=audio_teste,
    do_split=False,
)

loader_train = data.get_loader('train')
loader_test = data.get_loader('test')


Split

In [50]:
# Hold out part of the training set as a validation subset.
data = Data(
    batch_size=100,
    dataset_train=audio_train,
    dataset_test=audio_teste,
    do_split=True,
)

loader_train = data.get_loader('train')
loader_validation = data.get_loader('validation')
loader_test = data.get_loader('test')
