In [None]:
import pandas as pd
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Path to metadata file and dataset
metadata_path = temp # TODO Update this.
audio_dir =  temp # TODO Update this.
output_dir =  temp # TODO Update this.


In [3]:

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Load metadata
df = pd.read_csv(metadata_path)

# Keep only the first 6,000 files
df = df.iloc[:6000]

# Map labels: 'bona-fide' → 0 (real), 'spoof' → 1 (fake)
df["label"] = df["label"].map({"bona-fide": 0, "spoof": 1})

# Save updated metadata
df.to_csv(os.path.join(output_dir, "mel_labels.csv"), index=False)


In [None]:
def save_mel_spectrogram(audio_path, save_path):
    """Converts a .wav file to a Mel spectrogram and saves it as an image."""
    try:
        y, sr = librosa.load(audio_path, sr=16000)  # Load audio
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)  # Convert to decibels

        # Save as an image
        plt.figure(figsize=(4, 4))
        librosa.display.specshow(mel_spec_db, sr=sr, x_axis="time", y_axis="mel")
        plt.axis("off")  # No axes for clean image
        plt.savefig(save_path, bbox_inches="tight", pad_inches=0)
        plt.close()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

# Process first 6,000 files
for i, row in tqdm(df.iterrows(), total=len(df)):
    file_name = row["file"]  # File name from metadata
    file_path = os.path.join(audio_dir, file_name)  # Full path
    save_path = os.path.join(output_dir, f"{file_name}.png")  # Save as PNG
    save_mel_spectrogram(file_path, save_path)


In [5]:
from sklearn.model_selection import train_test_split

# Load the metadata CSV
df = pd.read_csv(os.path.join(output_dir, "mel_labels.csv"))

# Split into train (80%), validation (10%), and test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

# Save new CSV files
train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
val_df.to_csv(os.path.join(output_dir, "val.csv"), index=False)
test_df.to_csv(os.path.join(output_dir, "test.csv"), index=False)


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize for Res2Net
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize
])

class MelSpectrogramDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.data.iloc[idx, 0] + ".png")
        image = Image.open(img_name).convert("RGB")  # Convert grayscale to RGB
        label = self.data.iloc[idx, 2]  # Label (0 for real, 1 for fake)

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long)

# Create datasets
train_dataset = MelSpectrogramDataset(os.path.join(output_dir, "train.csv"), output_dir, transform)
val_dataset = MelSpectrogramDataset(os.path.join(output_dir, "val.csv"), output_dir, transform)
test_dataset = MelSpectrogramDataset(os.path.join(output_dir, "test.csv"), output_dir, transform)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [10]:
print(f"Number of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")
print(f"Number of test batches: {len(test_loader)}")

Number of training batches: 300
Number of validation batches: 38
Number of test batches: 38


In [None]:
import torch.nn as nn
import torchvision.models as models

class Res2NetClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(Res2NetClassifier, self).__init__()
        self.model = models.resnet50(pretrained=False)  # Using ResNet50 as a base
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)  # Modify FC layer

    def forward(self, x):
        return self.model(x)

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = Res2NetClassifier().to(device)


In [6]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)


In [19]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct, total = 0, 0
        
        # for images, labels in train_loader:
        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch"):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / total
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss:.4f}, Accuracy: {train_acc:.4f}")

        # Validation
        model.eval()
        val_correct, val_total = 0, 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == labels).sum().item()
                val_total += labels.size(0)

        val_acc = val_correct / val_total
        print(f"Validation Accuracy: {val_acc:.4f}")



In [20]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)

Epoch 1/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:45<00:00,  1.81batch/s]


Epoch [1/10], Loss: 31.9885, Accuracy: 0.9621
Validation Accuracy: 0.9783


Epoch 2/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:35<00:00,  1.92batch/s]


Epoch [2/10], Loss: 62.7321, Accuracy: 0.9340
Validation Accuracy: 0.9700


Epoch 3/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:35<00:00,  1.93batch/s]


Epoch [3/10], Loss: 26.0170, Accuracy: 0.9729
Validation Accuracy: 0.9733


Epoch 4/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:35<00:00,  1.93batch/s]


Epoch [4/10], Loss: 23.2363, Accuracy: 0.9744
Validation Accuracy: 0.9767


Epoch 5/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [03:16<00:00,  1.53batch/s]


Epoch [5/10], Loss: 16.5695, Accuracy: 0.9844
Validation Accuracy: 0.9867


Epoch 6/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:39<00:00,  1.88batch/s]


Epoch [6/10], Loss: 15.7357, Accuracy: 0.9827
Validation Accuracy: 0.9667


Epoch 7/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:34<00:00,  1.95batch/s]


Epoch [7/10], Loss: 12.4760, Accuracy: 0.9892
Validation Accuracy: 0.9700


Epoch 8/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:36<00:00,  1.92batch/s]


Epoch [8/10], Loss: 13.4184, Accuracy: 0.9871
Validation Accuracy: 0.9767


Epoch 9/10: 100%|█████████████████████████████████████████████████████████████████| 300/300 [02:35<00:00,  1.93batch/s]


Epoch [9/10], Loss: 9.3995, Accuracy: 0.9885
Validation Accuracy: 0.9733


Epoch 10/10: 100%|████████████████████████████████████████████████████████████████| 300/300 [02:36<00:00,  1.92batch/s]


Epoch [10/10], Loss: 13.0961, Accuracy: 0.9852
Validation Accuracy: 0.9833


In [22]:
def evaluate_model(model, test_loader):
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        # for images, labels in test_loader:
        for images, labels in tqdm(test_loader, desc="Progress Bar", unit="batch"):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    test_acc = correct / total
    print(f"Test Accuracy: {test_acc:.4f}")

evaluate_model(model, test_loader)


Progress Bar: 100%|█████████████████████████████████████████████████████████████████| 38/38 [00:18<00:00,  2.07batch/s]

Test Accuracy: 0.9900





In [None]:
# Save the model after training
save_dir=temp # TODO add path to save model.
model_name = "deepfake_audio_detector_.pth"
save_path = os.path.join(save_dir, model_name)
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")