**Py Torch Model load**

In [1]:
import torch
from tqdm import tqdm
import torchaudio
import gc
import os
from BEATs import BEATs, BEATsConfig
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
THRESHOLD = 0.85

In [4]:
import soundfile as sf
sf.available_formats()

{'AIFF': 'AIFF (Apple/SGI)',
 'AU': 'AU (Sun/NeXT)',
 'AVR': 'AVR (Audio Visual Research)',
 'CAF': 'CAF (Apple Core Audio File)',
 'FLAC': 'FLAC (Free Lossless Audio Codec)',
 'HTK': 'HTK (HMM Tool Kit)',
 'SVX': 'IFF (Amiga IFF/SVX8/SV16)',
 'MAT4': 'MAT4 (GNU Octave 2.0 / Matlab 4.2)',
 'MAT5': 'MAT5 (GNU Octave 2.1 / Matlab 5.0)',
 'MPC2K': 'MPC (Akai MPC 2k)',
 'OGG': 'OGG (OGG Container format)',
 'PAF': 'PAF (Ensoniq PARIS)',
 'PVF': 'PVF (Portable Voice Format)',
 'RAW': 'RAW (header-less)',
 'RF64': 'RF64 (RIFF 64)',
 'SD2': 'SD2 (Sound Designer II)',
 'SDS': 'SDS (Midi Sample Dump Standard)',
 'IRCAM': 'SF (Berkeley/IRCAM/CARL)',
 'VOC': 'VOC (Creative Labs)',
 'W64': 'W64 (SoundFoundry WAVE 64)',
 'WAV': 'WAV (Microsoft)',
 'NIST': 'WAV (NIST Sphere)',
 'WAVEX': 'WAVEX (Microsoft)',
 'WVE': 'WVE (Psion Series 3)',
 'XI': 'XI (FastTracker 2)'}

In [2]:
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")

GPU: NVIDIA GeForce RTX 2070 SUPER is available.


**Load the model**

In [3]:
checkpoint = torch.load('BEATs_iter3_plus_AS2M.pt')

cfg = BEATsConfig(checkpoint['cfg'])
BEATs_model = BEATs(cfg)
BEATs_model.load_state_dict(checkpoint['model'])
BEATs_model = BEATs_model.to(device="cuda")
#BEATs_model.eval()



In [16]:
x = torchaudio.load(uri=AUDIO, format="wav")
_x = x[0].to(device="cuda")
rep_x = BEATs_model.extract_features(_x)[0]

In [22]:
audio_name=os.path.join(REF_SET,"maCheSuccede.wav")
sample = torchaudio.load(uri= audio_name,format="wav")[0]
sample = sample.to(device="cuda")
audio_rep = BEATs_model.extract_features(sample)[0]

In [33]:
print(audio_rep.shape)
print(rep_x.shape)
ComputeSimilarity(audio_rep, rep_x)

torch.Size([2, 264, 768])
torch.Size([2, 760, 768])
torch.Size([2, 768])


0.5416378378868103

In [60]:
def calculate_padding(tensor, target_shape):
    padding = []
    for dim, target_dim in zip(tensor.shape[::-1], target_shape[::-1]):
        pad = target_dim - dim
        padding.extend((0, pad))
    return padding

def ComputeSimilarity(t1, t2):
    target_shape = (
    max(t1.shape[0], t2.shape[0]),
    max(t1.shape[1], t2.shape[1]),
    max(t1.shape[2], t2.shape[2]),
    )
    

    padding1 = calculate_padding(t1, target_shape)
    padding2 = calculate_padding(t2, target_shape)

    # Apply padding
    padded_tensor1 = torch.nn.functional.pad(t1, padding1)
    padded_tensor2 = torch.nn.functional.pad(t2, padding2)
        
    _sim = torch.nn.functional.cosine_similarity(padded_tensor1, padded_tensor2)
    print(_sim.shape)
    mean = _sim.max().item()
    return(mean)

In [61]:
REF_SET = "Audio/AlessandroReal/"
AUDIO = "Audio/fake.wav"

**TESTS**


In [62]:
gc.collect()
torch.cuda.empty_cache()
x = torchaudio.load(uri=AUDIO, format="wav")[0]
x = x.to(device="cuda")
rep_x = BEATs_model.extract_features(x)[0]

SIM = []

for file in tqdm(os.listdir(REF_SET), desc="Processing refrence set files"):

    gc.collect()
    torch.cuda.empty_cache()
    if(file.endswith(".wav")):

        audio_name = os.path.join(REF_SET, file)
        sample = torchaudio.load(uri= audio_name,format=os.path.splitext(file)[1])[0]
        sample = sample.to(device="cuda")
        audio_rep = BEATs_model.extract_features(sample)[0]

        sample = None
        
        SIM.append(ComputeSimilarity(rep_x, audio_rep))
        
    

Processing refrence set files:  50%|█████     | 1/2 [00:00<00:00,  2.70it/s]

torch.Size([2, 768])


Processing refrence set files: 100%|██████████| 2/2 [00:00<00:00,  3.16it/s]

torch.Size([2, 768])





In [63]:

# Find the maximum similarity value
max_similarity = max(SIM)

print(max_similarity)

if(max_similarity < THRESHOLD):
    print("fake")
else:
    print("real")

0.6548163890838623
fake
