In [1]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim, utils
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F

# ------------------------------------------------------------------------------
# 1️⃣ Enhanced Audio Loading, Normalization, and Augmentation
def extract_features(filename, n_mels=40, sr=16000, max_length=500, augment=False):
    audio, sr = librosa.load(filename, sr=sr)
    audio = librosa.util.normalize(audio)

    if augment:
        # Time-stretch with more aggressive range
        rate = np.random.uniform(0.8, 1.2)
        audio = librosa.effects.time_stretch(y=audio, rate=rate)

        # Pitch shift with wider range
        n_steps = np.random.randint(-3, 4)  # -3 to +3 semitones
        audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=n_steps)
        
        # Add random noise
        noise_amp = 0.005 * np.random.uniform() * np.amax(audio)
        audio = audio + noise_amp * np.random.normal(size=audio.shape[0])
        
        # Random gain
        gain = np.random.uniform(0.8, 1.2)
        audio = audio * gain
        
        # Randomly drop some frequencies (spectral masking)
        if np.random.rand() < 0.3:
            n_fft = 2048
            S = librosa.stft(audio, n_fft=n_fft)
            mask = np.random.rand(*S.shape) > 0.2  # 20% chance to mask
            S = S * mask
            audio = librosa.istft(S)

    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    if mel_db.shape[1] < max_length:
        pad = max_length - mel_db.shape[1]
        mel_db = np.pad(mel_db, ((0, 0), (0, pad)), mode='constant')
    else:
        mel_db = mel_db[:, :max_length]

    return mel_db

# ------------------------------------------------------------------------------
# 2️⃣ Prepare the data with more aggressive augmentation
X, y = [], []

for label, directory in [(0, "/kaggle/input/deep-voice-deepfake-voice-recognition/KAGGLE/AUDIO/REAL"),
                         (1, "/kaggle/input/deep-voice-deepfake-voice-recognition/KAGGLE/AUDIO/FAKE")]:
    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)
        
        # Always extract original features
        mel = extract_features(file_path, augment=False)
        X.append(mel)
        y.append(label)
        
        # Add augmented versions (3 per sample)
        for _ in range(3):
            mel_aug = extract_features(file_path, augment=True)
            X.append(mel_aug)
            y.append(label)

X = np.array(X, dtype='float32')
y = np.array(y, dtype='float32')

# ------------------------------------------------------------------------------
# 3️⃣ Prepare for training
X = np.expand_dims(X, 1)  # (samples, 1, 40, max_length)

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, 
                                                  test_size=0.2, 
                                                  random_state=42,
                                                  stratify=y)

train_ds = utils.data.TensorDataset(X_train, y_train)
val_ds = utils.data.TensorDataset(X_val, y_val)

train_loader = utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = utils.data.DataLoader(val_ds, batch_size=32, shuffle=False)

# ------------------------------------------------------------------------------
# 4️⃣ Improved CNN architecture with regularization
class CNN(nn.Module):
    def __init__(self, n_classes=2):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(negative_slope=0.1),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(0.25)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(negative_slope=0.1),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(0.25)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(negative_slope=0.1),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(0.25)
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Sequential(
            nn.Linear(64 * 5 * 62, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Dropout(0.5)
        )
        self.fc2 = nn.Sequential(
            nn.Linear(128, n_classes)
        )

    def forward(self, x):
        x = self.conv1(x)  # [B, 16, 20, 250]
        x = self.conv2(x)  # [B, 32, 10, 125]
        x = self.conv3(x)  # [B, 64, 5, 62]
        x = self.flatten(x)  # [B, 64*5*62]
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# ------------------------------------------------------------------------------
# 5️⃣ Prepare for training with more regularization
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CNN().to(device)

opt = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-5)
loss_fn = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(opt, mode='max', factor=0.5, patience=5, verbose=True)

# ------------------------------------------------------------------------------
# 6️⃣ Train the network with early stopping
epochs = 100
best_val_accuracy = 0
patience = 10
patience_counter = 0

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        opt.zero_grad()
        preds = model(X_batch)
        loss = loss_fn(preds, y_batch)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        opt.step()

        epoch_loss += loss.item()

    epoch_loss /= len(train_loader)

    # Validation
    model.eval()
    correct = 0
    total = 0
    val_loss = 0
    with torch.no_grad():
        for X_val, y_val in val_loader:
            X_val, y_val = X_val.to(device), y_val.to(device)
            preds = model(X_val)
            val_loss += loss_fn(preds, y_val).item()
            predicted = preds.argmax(1)
            total += y_val.size(0)
            correct += (predicted == y_val).sum().item()

    val_loss /= len(val_loader)
    val_accuracy = 100 * correct / total
    scheduler.step(val_accuracy)  # Update learning rate
    
    print(f'Epoch {epoch + 1}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

    # Early stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch + 1}')
            break

print("Training finished.")
print(f"Best validation accuracy: {best_val_accuracy:.2f}%")

# Load the best model
model.load_state_dict(torch.load('best_model.pth'))



Epoch 1, Train Loss: 0.9414, Val Loss: 0.6568, Val Accuracy: 82.69%
Epoch 2, Train Loss: 0.8681, Val Loss: 0.5718, Val Accuracy: 86.54%
Epoch 3, Train Loss: 0.7676, Val Loss: 0.6255, Val Accuracy: 71.15%
Epoch 4, Train Loss: 0.7650, Val Loss: 0.6810, Val Accuracy: 51.92%
Epoch 5, Train Loss: 0.7357, Val Loss: 0.9693, Val Accuracy: 15.38%
Epoch 6, Train Loss: 0.6840, Val Loss: 0.9030, Val Accuracy: 19.23%
Epoch 7, Train Loss: 0.6742, Val Loss: 0.7940, Val Accuracy: 28.85%
Epoch 8, Train Loss: 0.6561, Val Loss: 0.7273, Val Accuracy: 46.15%
Epoch 9, Train Loss: 0.5779, Val Loss: 0.7882, Val Accuracy: 38.46%
Epoch 10, Train Loss: 0.5714, Val Loss: 0.8213, Val Accuracy: 32.69%
Epoch 11, Train Loss: 0.5424, Val Loss: 0.9616, Val Accuracy: 23.08%
Epoch 12, Train Loss: 0.5510, Val Loss: 0.9762, Val Accuracy: 23.08%
Early stopping at epoch 12
Training finished.
Best validation accuracy: 86.54%


<All keys matched successfully>