# 🎤 Audio Deepfake Detection (MFCC + CNN)
This notebook trains an **audio branch** for your multimodal deepfake detection project using the ASVspoof2019 dataset.

In [1]:
!pip install numpy scipy librosa scikit-learn matplotlib tqdm torch torchvision torchaudio

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting torchaudio
  Downloading torchaudio-2.8.0-cp310-cp310-win_amd64.whl.metadata (7.2 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Using cached soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-1.0.0-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.1-cp310-cp310-win_amd64.whl.metadata (8.6 kB)
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
Collecting torchaudio
  Downloading torchaudio-2.7.1-cp310-cp310-win_amd64.whl.metadata (6.6 kB)
  Downloading torchaudio-2.7.0-cp310-cp310-win_amd64.whl.metadata (6.7 kB)
  Downloading torchaudio-2.6.0-cp310-cp310-win_amd64.wh


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\dearm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [9]:
import os, urllib.request, zipfile

DATA_DIR = "./VCTK-Corpus-0.92/"
os.makedirs(DATA_DIR, exist_ok=True)

# DATA_URL = "https://datashare.ed.ac.uk/download/DS_10283_3443.zip"
# ZIP_FILE = os.path.join(DATA_DIR, "asvspoof2019_la.zip")

# if not os.path.exists(ZIP_FILE):
#     print("📥 Downloading dataset...")
#     urllib.request.urlretrieve(DATA_URL, ZIP_FILE)
#     print("✅ Download complete.")

# if not os.path.exists(os.path.join(DATA_DIR, "LA")):
#     print("📦 Extracting dataset...")
#     with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
#         zip_ref.extractall(DATA_DIR)
#     print("✅ Extraction complete.")

In [7]:
import librosa
import numpy as np

def extract_mfcc(filepath, max_len=400):
    y, sr = librosa.load(filepath, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0,0),(0,pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc

In [10]:
import torch
import os
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class AudioDeepfakeDataset(Dataset):
    def __init__(self, file_list, labels):
        self.file_list = file_list
        self.labels = labels

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        filepath = self.file_list[idx]
        label = self.labels[idx]
        features = extract_mfcc(filepath)
        features = torch.tensor(features, dtype=torch.float).unsqueeze(0)
        return features, torch.tensor(label, dtype=torch.long)

real_files, fake_files = [], []
train_audio_path = os.path.join(DATA_DIR, "wav48_silence_trimmed", "flac")

for root, _, files in os.walk(train_audio_path):
    for f in files:
        if f.endswith(".flac"):
            if "bonafide" in f.lower():
                real_files.append(os.path.join(root, f))
            else:
                fake_files.append(os.path.join(root, f))

X = real_files + fake_files
y = [0]*len(real_files) + [1]*len(fake_files)

print(f"Loaded {len(real_files)} real and {len(fake_files)} fake samples.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
train_dataset = AudioDeepfakeDataset(X_train, y_train)
test_dataset = AudioDeepfakeDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

Loaded 0 real and 0 fake samples.


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
import torch.nn as nn
import torch.optim as optim

class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 10 * 100, 128)
        self.fc2 = nn.Linear(128, 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
from tqdm import tqdm

EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

In [None]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"✅ Test Accuracy: {100*correct/total:.2f}%")
torch.save(model.state_dict(), "audio_deepfake_detector.pth")

In [None]:
def predict_audio(path, model):
    model.eval()
    feat = extract_mfcc(path)
    feat = torch.tensor(feat, dtype=torch.float).unsqueeze(0).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(feat)
        pred = torch.argmax(output, dim=1).item()
    return "FAKE" if pred == 1 else "REAL"

print(predict_audio(X_test[0], model))