In [1]:
from TrainAndTest.test import evaluate_classification_model
from DatasetReaders.AudioDatset import AudioDataset
from DetectionModels.AudioCNN import CNNnetwork
from torch.utils.data import DataLoader
from TrainAndTest.train import train
from torch.utils.data import Subset
import matplotlib.pyplot as plt
from torch import nn
import torchaudio
import warnings
import librosa
import random
import torch

if torch.cuda.is_available():
    device="cuda"
else:
    device= "cpu"
print(device)
warnings.filterwarnings("ignore", message="At least one mel filterbank has all zero values", 
                        category=UserWarning, module="torchaudio.functional.functional")

cuda


## 1. Dataset Initialization

In [2]:
NUM_SAMPLES= int(3.100772*16000)
BATCH_SIZE= 128
SAMPLE_RATE= 16000  
NUM_OF_MELS= 128
WIN_LENGTH= int(0.016 * SAMPLE_RATE)
HOP_LENGTH= int(0.004 * SAMPLE_RATE) 
FAKE_AUDIO_PATH= r"D:\f_uni\sophomore_Uni\semester_1\Profesinal_Project\multimodal_deepfake_detection\datasets\FoR_dataset\for-norm\for-norm\training\fake"
REAL_AUDIO_PATH= r"D:\f_uni\sophomore_Uni\semester_1\Profesinal_Project\multimodal_deepfake_detection\datasets\FoR_dataset\for-norm\for-norm\training\real"
CORRUPTED_PATHS=[
    "file8319.wav_16k.wav_norm.wav_mono.wav_silence.wav",
    "file15440.wav_16k.wav_norm.wav_mono.wav_silence.wav",
    "file11064.wav_16k.wav_norm.wav_mono.wav_silence.wav"
]

mel_spectogram= torchaudio.transforms.MelSpectrogram(
    sample_rate= SAMPLE_RATE,
    n_fft= WIN_LENGTH*2,
    hop_length= HOP_LENGTH,
    n_mels= NUM_OF_MELS,
    window_fn= torch.hamming_window,
    win_length= WIN_LENGTH
)

audio_dataset= AudioDataset(
    FAKE_AUDIO_PATH,
    REAL_AUDIO_PATH,
    mel_spectogram,
    SAMPLE_RATE,
    NUM_SAMPLES,
    device,
    CORRUPTED_PATHS
)

real_indices= [i for i, label in enumerate(audio_dataset.labels) if label==0]
fake_indices= [i for i, label in enumerate(audio_dataset.labels) if label==1]
real_sample= random.sample(real_indices, 5000)
fake_sample= random.sample(fake_indices, 5000)
subset_indices= real_sample + fake_sample
random.shuffle(subset_indices)
small_dataset= Subset(audio_dataset, subset_indices)

## 2. Model training

In [3]:
train_data_loader= DataLoader(
    small_dataset,
    batch_size=BATCH_SIZE,
)
model= CNNnetwork().to(device)
print(model)
loss_fn= nn.CrossEntropyLoss()
optimiser= torch.optim.Adam(model.parameters(), lr=0.001)

CNNnetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv5): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=

In [4]:
torch.manual_seed(42)
train(model,
      train_data_loader,
      loss_fn,
      optimiser,
      device,
      epochs=10)
torch.save(model.state_dict(), "Audio_CNN.pth")
print("model trained and stored")

epoch 1
loss: 0.83709937032265
epoch 2
loss: 0.25845937232805205
epoch 3
loss: 0.16878953151687792
epoch 4
loss: 0.1223969947593876
epoch 5
loss: 0.08709593993151866
epoch 6
loss: 0.07538645325487928
epoch 7
loss: 0.09406193873927562
epoch 8
loss: 0.04943130389622212
epoch 9
loss: 0.04548365080911855
epoch 10
loss: 0.028359877935881857
train completem :)
model trained and stored


## 3. Model Testing

In [7]:
torch.manual_seed(0)

model= CNNnetwork()
state_dict= torch.load("Audio_CNN.pth", weights_only=True)
model.load_state_dict(state_dict=state_dict)
model.eval()

real_indices= [i for i, label in enumerate(audio_dataset.labels) if label == 0]
fake_indices= [i for i, label in enumerate(audio_dataset.labels) if label == 1]
real_sample= random.sample(real_indices, 1000)
fake_sample= random.sample(fake_indices, 1500)
subset_indices= real_sample + fake_sample
random.shuffle(subset_indices)
test_dataset= Subset(audio_dataset, subset_indices)
test_loader= DataLoader(test_dataset, batch_size=128, shuffle=True)

acc, prec, rec, f1= evaluate_classification_model(model, test_loader, device)

Accuracy: 0.9636
Precision: 0.9641
Recall: 0.9636
F1-score: 0.9637
