In [1]:
import os
import glob
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from PIL import Image

import IPython.display as ipd
import torch
import torchaudio
from torchvision import models
from torchvision.transforms import transforms

import librosa
import librosa.display

import yaml

In [2]:
# NOTE: this rebinds `models`, replacing the `torchvision.models` binding made in
# the first import cell; `models.get_model(...)` below relies on this project module.
from src.models import models



### Test Attack Success Rate

In [3]:
# Root of the project checkout. Defaults to the location used when this notebook
# was written; set the ADV_ATTACKS_ROOT environment variable to run it elsewhere.
PROJECT_ROOT = os.environ.get(
    'ADV_ATTACKS_ROOT',
    '/home/hanhnlh/projects/ccs/SPEECH_PROJECT/deepfakes_detectors/audio-deepfake-adversarial-attacks',
)
# Directory of the training run under evaluation (checkpoint plus its config).
MODEL_DIR = os.path.join(PROJECT_ROOT, 'trained_models', 'lcnn_wavefake_22-04-2024-18-52-49')

# The trailing '' component keeps the trailing path separator.
ADV_SAMPLES_PATH = os.path.join(PROJECT_ROOT, 'outputs_adv', 'LCNN_TEST', '')
MODEL_PATH = os.path.join(MODEL_DIR, 'ckpt.pth')
CONFIG_PATH = os.path.join(MODEL_DIR, 'config.yaml')

N_CLASSES = 2
SAMPLE_RATE = 16000  # Hz; used when playing back the adversarial audio
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

TRANSFORM = transforms.Compose([
    transforms.ToTensor(),
])

In [4]:
# Read the run configuration stored at CONFIG_PATH.
# An explicit encoding keeps decoding independent of the platform locale.
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [5]:
# Load model
#
# Build the architecture named in the run config, restore the trained weights,
# and switch the network to inference mode.

model_name, model_parameters = config["model"]["name"], config["model"]["parameters"]

print(f'Load model {model_name} with parameters {model_parameters}')

# Load model architecture.
# `models` is the project's `src.models.models` here: that import rebinds the
# name first bound to `torchvision.models`.
model = models.get_model(
    model_name=model_name,
    config=model_parameters,
    device=DEVICE,
)

if MODEL_PATH:
    # map_location remaps tensors that were saved from a GPU, so the checkpoint
    # also loads when DEVICE has fallen back to 'cpu'.
    state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
    model.load_state_dict(state_dict)
model.eval()  # inference mode (e.g. BatchNorm layers use their running statistics)
model.to(DEVICE)

Load model lcnn with parameters {'frontend_algorithm': ['lfcc'], 'input_channels': 1}
Using ['lfcc'] frontend


LCNN(
  (m_transform): Sequential(
    (0): Conv2d(1, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): MaxFeatureMap2D()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1))
    (4): MaxFeatureMap2D()
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
    (6): Conv2d(32, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): MaxFeatureMap2D()
    (8): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (9): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
    (10): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1))
    (11): MaxFeatureMap2D()
    (12): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
    (13): Conv2d(48, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): MaxFeatureMap2D()
    (15): MaxPool2d(kern

In [6]:
# Read every saved adversarial example (*.npy) under ADV_SAMPLES_PATH into a
# list of tensors, one tensor per file.
npy_pattern = os.path.join(ADV_SAMPLES_PATH, '*.npy')
adv_samples = [
    torch.from_numpy(np.load(npy_path))
    for npy_path in glob.glob(npy_pattern)
]

In [7]:
# Compute the percentage of successful adversarial attacks.
# An attack counts as successful when the detector labels the adversarial
# sample as real (label 1).
if not adv_samples:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f'No adversarial samples were loaded from {ADV_SAMPLES_PATH}')

n_success = 0
with torch.no_grad():  # inference only; entered once for the whole loop
    for adv_sample in adv_samples:
        # Add a leading batch dimension of size 1 and move to the model's device.
        sample = adv_sample.unsqueeze(0).to(DEVICE)
        output = model(sample).squeeze(1)
        batch_pred = torch.sigmoid(output)
        # Adding 0.5 and truncating rounds the probability to the nearest label.
        batch_pred_label = (batch_pred + 0.5).int()
        if batch_pred_label.item() == 1: # 0 is spoof, 1 is real
            n_success += 1
print(f'Percentage of successful adversarial attacks: {n_success / len(adv_samples) * 100:.2f}%')

Percentage of successful adversarial attacks: 99.82%


### Display audio files for testing

In [8]:
# Source clip from the WaveFake dataset directory, loaded for listening.
ori_audio_file = '/mnt/storage/hanhnlh/ccs/dataset/WaveFake/generated_audio/ljspeech_multi_band_melgan/LJ003-0093_gen.wav'

# The loader hands back the waveform tensor together with the file's sample rate.
waveform, sample_rate = torchaudio.load(ori_audio_file)
print(waveform.size())
ipd.Audio(data=waveform.numpy(), rate=sample_rate)


torch.Size([1, 148992])


In [9]:
# Because of limited compute resources, the audio was cut to 64600 samples,
# and the sample rate used when processing the audio is 16000 Hz (SAMPLE_RATE).

# Adversarial counterpart of the LJ003-0093_gen clip, taken from the directory
# of adversarial outputs instead of repeating the absolute path.
adv_audio_file = os.path.join(ADV_SAMPLES_PATH, 'LJ003-0093_gen.npy')

audio_array = np.load(adv_audio_file)
audio_tensor = torch.from_numpy(audio_array).reshape((1, -1))  # (1 channel, samples)
print(audio_tensor.shape)
# Hand IPython a NumPy array, matching how the original clip is played.
ipd.Audio(audio_tensor.numpy(), rate=SAMPLE_RATE)



torch.Size([1, 64600])
