Dataset Preprocessing & Feature Extraction

In [2]:
%pip install torchaudio librosa transformers datasets torch torchvision torchaudio


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import shutil
import random

# Define dataset paths
dataset_path = "Deep-Voice dataset/KAGGLE/AUDIO"
output_path = "Deep-Voice dataset/KAGGLE/AUDIO_SPLIT" 

# Define split ratios
train_ratio = 0.70
val_ratio = 0.15
test_ratio = 0.15

# Ensure the output directories exist
for split in ['train', 'val', 'test']:
    for category in ['REAL', 'FAKE']:
        os.makedirs(os.path.join(output_path, split, category), exist_ok=True)

# Function to split and copy files
def split_data(category):
    category_path = os.path.join(dataset_path, category)
    files = os.listdir(category_path)
    random.shuffle(files)  # Shuffle for randomness

    total_files = len(files)
    train_end = int(train_ratio * total_files)
    val_end = train_end + int(val_ratio * total_files)

    # Split the data
    train_files = files[:train_end]
    val_files = files[train_end:val_end]
    test_files = files[val_end:]

    # Copy files to respective directories
    for file in train_files:
        shutil.copy(os.path.join(category_path, file), os.path.join(output_path, 'train', category, file))
    for file in val_files:
        shutil.copy(os.path.join(category_path, file), os.path.join(output_path, 'val', category, file))
    for file in test_files:
        shutil.copy(os.path.join(category_path, file), os.path.join(output_path, 'test', category, file))

# Process both Fake and Real categories
for category in ['REAL', 'FAKE']:
    split_data(category)

print("Dataset successfully split into Train, Validation, and Test folders.")


Dataset successfully split into Train, Validation, and Test folders.


Key Improvements in this Code
Extracts multiple features (Mel Spectrograms, MFCCs, Chroma Features).
Adds Data Augmentation (Time-stretching, Pitch-shifting, Noise Injection).
Handles correct dataset structure (real/, fake/ folders).
Stacks features together for a richer input representation.

In [4]:
import os
import librosa
import librosa.display
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import random

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to load and preprocess audio
def load_audio(file_path, sr=22050, duration=3):
    y, sr = librosa.load(file_path, sr=sr, duration=duration)
    return y, sr

# Data augmentation: Time Stretching, Pitch Shifting, Noise Addition
def augment_audio(y, sr):
    if random.random() < 0.3:  # 30% chance to apply augmentation
        y = librosa.effects.time_stretch(y, rate=random.uniform(0.8, 1.2))  # Time-stretch
    if random.random() < 0.3:
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=random.randint(-2, 2))  # Pitch shift
    if random.random() < 0.3:
        noise = np.random.randn(len(y)) * 0.005  # Add slight noise
        y = y + noise
    return y

# Convert audio to Mel Spectrogram
def audio_to_melspectrogram(y, sr, n_mels=128):
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_spec, ref=np.max)

# Convert audio to MFCCs
def audio_to_mfcc(y, sr, n_mfcc=40):
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

# Convert audio to Chroma Features
def audio_to_chroma(y, sr):
    return librosa.feature.chroma_stft(y=y, sr=sr)

# Custom Dataset Class
class AudioDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.files = []
        self.labels = []

        for label, subfolder in enumerate(["REAL", "FAKE"]):  # Use correct dataset folder names
            folder_path = os.path.join(data_dir, subfolder)
            for file in os.listdir(folder_path):
                if file.endswith(".wav"):
                    self.files.append(os.path.join(folder_path, file))
                    self.labels.append(label)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        label = self.labels[idx]

        # Load and augment audio
        y, sr = load_audio(file_path)
        y = augment_audio(y, sr)

        # Feature extraction
        mel_spec = audio_to_melspectrogram(y, sr)
        mfcc = audio_to_mfcc(y, sr)
        chroma = audio_to_chroma(y, sr)

        # Stack features into one tensor
        features = np.vstack([mel_spec, mfcc, chroma])  # Shape: (F, T)

        return torch.tensor(features).unsqueeze(0), torch.tensor(label)

# Load dataset
train_dataset = AudioDataset(data_dir="Deep-Voice dataset/KAGGLE/AUDIO_SPLIT/train")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Check sample shape
sample, label = train_dataset[0]
print("Feature Shape:", sample.shape)  # Expecting (1, F, T)
print("Label:", label)


Feature Shape: torch.Size([1, 180, 160])
Label: tensor(0)
