In [1]:
# ===============================
# STEP 1: LIBRARIES & SETUP
# ===============================
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

2025-10-27 18:58:12.668738: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761591492.991465      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761591493.075244      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Input locations on the Kaggle dataset mount.
# Metadata table: the CSV file name may vary, so check the actual name
# under the dataset folder before running.
CSV_PATH = "/kaggle/input/in-the-wild-audio-deepfake/modified_meta.csv"
# Root directory of the audio release.
AUDIO_PATH = "/kaggle/input/in-the-wild-audio-deepfake/release_in_the_wild"

In [3]:
# Sanity-check the dataset layout: list the top-level entries, then count
# the entries inside each class folder.
print("Available folders:", os.listdir(AUDIO_PATH))
for class_name in ("real", "fake"):
    class_dir = os.path.join(AUDIO_PATH, class_name)
    print(f"Number of {class_name} audio files:", len(os.listdir(class_dir)))

Available folders: ['fake', 'attribution.txt', 'real']
Number of real audio files: 19963
Number of fake audio files: 11816


In [4]:
# ===============================
# STEP 2: LOAD METADATA
# ===============================
# Read CSV containing filenames and labels
df = pd.read_csv(CSV_PATH)
print("CSV Columns:", df.columns)
print(df.head())

# The 'label' column is expected to hold the strings 'real' / 'fake'.
# Encode them as integers: 0=real, 1=fake
label_to_id = {'real': 0, 'fake': 1}
label_ids = df['label'].map(label_to_id)

# Series.map turns every value that is missing from the mapping into NaN.
# Fail loudly here instead of letting NaN labels flow into path building
# (where anything that is not 1 is treated as "real") and into training.
unmapped = label_ids.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'label'].unique().tolist()
    raise ValueError(
        f"Unexpected label values in {CSV_PATH}: {unexpected} "
        f"(expected one of {list(label_to_id)})"
    )

# All labels are mapped, so the column can safely be stored as integers.
df['label'] = label_ids.astype("int64")

CSV Columns: Index(['file', 'label'], dtype='object')
    file label
0  0.wav  fake
1  1.wav  fake
2  2.wav  fake
3  3.wav  fake
4  4.wav  real


In [5]:
def get_path(row):
    """Build the audio file path for one metadata row.

    Rows whose 'label' equals 1 are looked up in the "fake" folder, every
    other label in the "real" folder. The 'file' value is converted to a
    string and gets a ".wav" suffix when it does not already end with one.
    The result is joined onto AUDIO_PATH.
    """
    is_fake = row['label'] == 1
    subdir = "fake" if is_fake else "real"

    name = str(row['file'])
    name = name if name.endswith(".wav") else name + ".wav"

    return os.path.join(AUDIO_PATH, subdir, name)


# Resolve every metadata row to the location of its audio file.
resolved_paths = df.apply(get_path, axis="columns")
df['path'] = resolved_paths
print(f"Total samples: {len(df)}")
print(df.head())

Total samples: 31779
    file  label                                               path
0  0.wav      1  /kaggle/input/in-the-wild-audio-deepfake/relea...
1  1.wav      1  /kaggle/input/in-the-wild-audio-deepfake/relea...
2  2.wav      1  /kaggle/input/in-the-wild-audio-deepfake/relea...
3  3.wav      1  /kaggle/input/in-the-wild-audio-deepfake/relea...
4  4.wav      0  /kaggle/input/in-the-wild-audio-deepfake/relea...


In [6]:
# ===============================
# STEP 3: FEATURE EXTRACTION
# ===============================
# Parameters for audio feature extraction
SAMPLE_RATE = 16000       # Standard for speech tasks
DURATION = 3.0            # Crop or pad all audios to 3 seconds
N_MELS = 128              # Number of Mel bands
N_FFT = 1024              # n_fft passed to the Mel-spectrogram
HOP_LENGTH = 512          # hop_length passed to the Mel-spectrogram

def preprocess_audio(file_path, sr=SAMPLE_RATE, duration=DURATION, n_mels=N_MELS):
    """Load an audio file and convert it to a fixed-length, [0, 1]-scaled Mel-spectrogram.

    The waveform is loaded at ``sr``, cut or zero-padded at the end to exactly
    ``int(sr * duration)`` samples, converted to a Mel power spectrogram
    (using the module-level ``N_FFT`` / ``HOP_LENGTH``), expressed in dB
    relative to its own maximum, and finally min-max scaled.

    Args:
        file_path: Path of the audio file to load.
        sr: Target sampling rate handed to ``librosa.load``.
        duration: Clip length in seconds every waveform is cropped/padded to.
        n_mels: Number of Mel bands.

    Returns:
        The scaled Mel-spectrogram with values in [0, 1]. A spectrogram with
        no dynamic range (e.g. a silent clip) is returned as all zeros rather
        than NaN. Returns ``None`` if any step raises; the error is printed.
    """
    try:
        y, sr = librosa.load(file_path, sr=sr)

        # Trim or pad to a fixed number of samples
        target_len = int(sr * duration)
        if len(y) > target_len:
            y = y[:target_len]
        else:
            y = np.pad(y, (0, target_len - len(y)), mode='constant')

        # Convert to Mel-spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize between 0 and 1.
        # A constant spectrogram has max == min; dividing would be 0/0, which
        # NumPy reports only as a warning and which would silently put an
        # all-NaN sample into the dataset. Return zeros for that case instead.
        lowest = mel_spec_db.min()
        value_range = mel_spec_db.max() - lowest
        if value_range == 0:
            return np.zeros_like(mel_spec_db)
        return (mel_spec_db - lowest) / value_range
    except Exception as e:
        # Best-effort: report the failing file and let the caller skip it.
        print("Error loading:", file_path, e)
        return None

In [7]:
# ===============================
# STEP 4: APPLY FEATURE EXTRACTION
# ===============================
# Run the preprocessing on every metadata row; rows whose audio could not be
# processed (preprocess_audio returned None) are left out of both lists.
X, y = [], []
progress = tqdm(df.iterrows(), total=len(df))
for i, row in progress:
    mel = preprocess_audio(row['path'])
    if mel is None:
        continue
    X.append(mel)
    y.append(row['label'])

X, y = np.array(X), np.array(y)

print("Feature shape before reshape:", X.shape)
# CNN expects 4D input: (samples, height, width, channels)
X = X[..., np.newaxis]
print("Feature shape after reshape:", X.shape)
print("Labels shape:", y.shape)

100%|██████████| 31779/31779 [54:25<00:00,  9.73it/s]


Feature shape before reshape: (31779, 128, 94)
Feature shape after reshape: (31779, 128, 94, 1)
Labels shape: (31779,)


In [8]:
# ===============================
# STEP 5: TRAIN-VALIDATION-TEST SPLIT
# ===============================
# Hold out 30% of the samples first, then cut that hold-out in half to get
# the validation and test sets. Both splits are stratified on the labels and
# use the same fixed seed.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=split_seed
)

for split_name, split_X in (("Training", X_train), ("Validation", X_val), ("Testing", X_test)):
    print(f"{split_name} samples:", len(split_X))


Training samples: 22245
Validation samples: 4767
Testing samples: 4767


In [9]:
# ===============================
# STEP 6: CLASS WEIGHTS
# ===============================
# 'balanced' weights for the training labels.
train_classes = np.unique(y_train)
balanced_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)

# Key each weight by its actual class label rather than by its position in
# the array: positional keys (enumerate) are only correct while the labels
# happen to be exactly 0..n-1. tolist() yields plain Python scalars so the
# dict prints and hashes like ordinary ints/floats.
class_weights = dict(zip(train_classes.tolist(), balanced_weights.tolist()))
print("Class Weights:", class_weights)

# Save preprocessed arrays to disk (optional, to skip reprocessing later)
arrays_to_save = {
    "X_train.npy": X_train,
    "X_val.npy": X_val,
    "X_test.npy": X_test,
    "y_train.npy": y_train,
    "y_val.npy": y_val,
    "y_test.npy": y_test,
}
for out_name, out_array in arrays_to_save.items():
    np.save(out_name, out_array)

Class Weights: {0: 0.7959424645770717, 1: 1.3447587957925282}
