In [177]:
# Mount Google Drive so the audio clips and VGGish weights under
# /content/drive/MyDrive/... are readable from this runtime.
# force_remount=True is passed so that re-running this cell requests a fresh mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [222]:
import os

# Locations on the mounted Drive: the reference clip, the model-generated clip,
# and the VGGish checkpoint / PCA parameter files used by the cells below.
root = '/content/drive/MyDrive/data/'
original = os.path.join(root, 'original/complete/rihanna.mp3')
generated = os.path.join(root, 'finetune-generated/complete/rihanna.mp3')
checkpoint_path = os.path.join(root, 'vggish_model.ckpt')
pca_params_path = os.path.join(root, 'vggish_pca_params.npz')

# Echo the two clip paths so the comparison pair is visible in the output.
for clip_path in (original, generated):
    print(clip_path)


/content/drive/MyDrive/data/original/complete/rihanna.mp3
/content/drive/MyDrive/data/finetune-generated/complete/rihanna.mp3


In [108]:
import tensorflow as tf
import numpy as np
import scipy
from scipy.io import wavfile
from scipy.linalg import sqrtm
from scipy.stats import entropy
from sklearn.neighbors import KernelDensity

!git clone https://github.com/tensorflow/models.git
!pip install tf_slim
!pip install resampy
!pip install torchaudio

fatal: destination path 'models' already exists and is not an empty directory.


In [None]:
!wget https://storage.googleapis.com/audioset/vggish_model.ckpt
!wget https://storage.googleapis.com/audioset/vggish_pca_params.npz

In [223]:
# Rebind `tf` to the TF1-compatible API and turn off TF2 behavior: the VGGish
# extraction below builds an explicit tf.Graph and runs it inside a tf.Session.
# NOTE: this replaces the plain `import tensorflow as tf` binding from the earlier cell.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [224]:
import sys
sys.path.append('models/research/audioset/vggish')
import vggish_input
import vggish_params
import vggish_postprocess
import vggish_slim
import resampy
import torchaudio

def extract_vggish_features(audio_path):
    """Compute VGGish embeddings for the second half of an audio file.

    The clip is loaded with torchaudio, cut to its second half, mixed down to
    mono, resampled to 16 kHz when necessary, framed by
    ``vggish_input.waveform_to_examples`` and pushed through the VGGish graph
    restored from the module-level ``checkpoint_path``.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The array fetched from ``vggish_params.OUTPUT_TENSOR_NAME`` for the
        batch of examples (presumably one embedding row per example; confirm
        against the VGGish sources).
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # Keep only the second half of the clip along the time axis.
    num_samples = waveform.shape[-1]
    waveform = waveform[:, num_samples // 2:]

    # Always hand NumPy data to resampy / VGGish. Previously the conversion only
    # happened inside the resampling branch, so files that were already at
    # 16 kHz reached waveform_to_examples as a torch tensor.
    waveform = waveform.numpy()

    # Mix the channels down to mono. Flattening a (channels, samples) array
    # would instead play the channels one after another.
    waveform = waveform.mean(axis=0)

    if sample_rate != 16000:
        waveform = resampy.resample(waveform, sample_rate, 16000)
    examples_batch = vggish_input.waveform_to_examples(waveform, 16000)

    # A fresh graph and session per call keeps repeated calls from colliding on
    # variable names inside a shared default graph.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: examples_batch})

    return embedding_batch

# Embed the reference clip first, then the generated one, with the same pipeline
# so the two embedding sets are directly comparable.
original_features, generated_features = (
    extract_vggish_features(clip_path) for clip_path in (original, generated)
)

In [225]:
# Load the PCA parameters (per-dimension means and projection matrix) for VGGish.
pca_params = np.load(pca_params_path)
pca_means = pca_params['pca_means']
# The archive entry is named 'pca_eigen_vectors': it is the projection matrix,
# not a list of eigenvalues.
pca_eigenvectors = pca_params['pca_eigen_vectors']
# Backward-compatible alias: other cells still refer to the old, misleading name.
pca_eigenvalues = pca_eigenvectors

def postprocess_features(features, means, components):
    """Center the features and project them onto the given components.

    Computes ``(features - means) @ components.T`` and returns the result as a
    new array; the caller's ``features`` array is left untouched (the previous
    ``features -= means`` modified it in place, so running the step twice
    subtracted the means twice from the same buffer).

    Args:
        features: Array with one feature vector per row.
        means: Per-dimension means. Either a flat vector or a column vector of
            shape ``(dim, 1)``; a column vector is flattened so it broadcasts
            across rows instead of failing or subtracting along the wrong axis.
        components: Projection matrix whose rows are the components.

    Returns:
        The centered, projected features.
    """
    means = np.asarray(means)
    if means.ndim == 2 and means.shape[1] == 1:
        means = means.ravel()
    centered = np.asarray(features) - means
    return np.dot(centered, components.T)

# Apply the same PCA projection to both embedding sets (reference first, then generated).
# NOTE: the results are stored back under the same names, so this cell is meant to run
# exactly once per extraction.
original_features, generated_features = (
    postprocess_features(embeddings, pca_means, pca_eigenvalues)
    for embeddings in (original_features, generated_features)
)

In [226]:
from scipy.linalg import sqrtm

def compute_fad(features1, features2):
    """Fréchet distance between Gaussians fitted to two embedding sets.

    Each set is summarized by its mean and covariance, and the distance is
    ``||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))``.

    Args:
        features1: 2-D array, one embedding per row (at least two rows).
        features2: 2-D array with the same number of columns as ``features1``
            (at least two rows).

    Returns:
        The Fréchet distance as a scalar.

    Raises:
        ValueError: If either input is not 2-D or has fewer than two rows, in
            which case a covariance cannot be estimated and the result would
            silently be NaN.
    """
    features1 = np.asarray(features1)
    features2 = np.asarray(features2)
    for label, feats in (("features1", features1), ("features2", features2)):
        if feats.ndim != 2 or feats.shape[0] < 2:
            raise ValueError(
                f"{label} must be a 2-D array with at least 2 rows to estimate "
                f"a covariance, got shape {feats.shape}"
            )

    # atleast_2d keeps the covariance a matrix even for a single feature column,
    # where np.cov would otherwise return a 0-d array that sqrtm rejects.
    mu1, sigma1 = np.mean(features1, axis=0), np.atleast_2d(np.cov(features1, rowvar=False))
    mu2, sigma2 = np.mean(features2, axis=0), np.atleast_2d(np.cov(features2, rowvar=False))
    ssdiff = np.sum((mu1 - mu2) ** 2.0)

    covmean = sqrtm(sigma1.dot(sigma2))
    if not np.isfinite(covmean).all():
        # A (near-)singular product can make sqrtm blow up, e.g. when there are
        # fewer frames than embedding dimensions. Retry with a small ridge on
        # both covariances.
        ridge = np.eye(sigma1.shape[0]) * 1e-6
        covmean = sqrtm((sigma1 + ridge).dot(sigma2 + ridge))
    if np.iscomplexobj(covmean):
        # Numerical error can leave a spurious imaginary part; only the real part is used.
        covmean = covmean.real

    fad = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
    return fad

# Score the reference embeddings against the generated ones and report the distance.
fad_score = compute_fad(features1=original_features, features2=generated_features)
print(f"Fréchet Audio Distance: {fad_score}")


Fréchet Audio Distance: 146.86130835079769


In [27]:
!git clone https://github.com/kkoutini/PaSST.git
%cd PaSST
!pip install -r requirements.txt

Cloning into 'PaSST'...
remote: Enumerating objects: 347, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 347 (delta 66), reused 85 (delta 53), pack-reused 222[K
Receiving objects: 100% (347/347), 628.45 KiB | 4.69 MiB/s, done.
Resolving deltas: 100% (164/164), done.
/content/PaSST
Collecting av>=10.0.0 (from -r requirements.txt (line 1))
  Downloading av-12.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting kk-sacred>=0.8.4 (from -r requirements.txt (line 4))
  Downloading kk_sacred-0.8.5-py2.py3-none-any.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.3/112.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting timm>=0.4.12 (from -r requirements.txt (line 6))
  Downloading timm-1.0.3-py3-none-any.whl (2.3 MB)
[2K    

In [227]:
import torch
import torch.nn.functional as F
from torchaudio.transforms import MelSpectrogram

# Extend the import path with the PaSST checkout's src directory before importing
# get_model. `sys` is the module imported in the earlier VGGish cell.
# NOTE(review): a directory named `models` is also cloned from tensorflow/models
# earlier in this notebook — confirm that `models.passt` resolves to the PaSST
# package (it may depend on the working directory set by the `%cd` cell above).
sys.path.append('/content/PaSST/src')
from models.passt import get_model

# Build the pretrained PaSST model and switch it to eval mode for inference.
passt_model = get_model('passt_s_swa_p16_128_ap476', pretrained=True).eval()

def waveform_to_spectrogram(waveform, sample_rate, n_mels=128):
    """Convert a waveform tensor into a batched mel spectrogram.

    Args:
        waveform: Audio tensor passed straight to torchaudio's MelSpectrogram.
        sample_rate: Sample rate of ``waveform``.
        n_mels: Number of mel bands (defaults to 128).

    Returns:
        The MelSpectrogram output with a new leading batch dimension of size 1.
    """
    to_mel = MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)
    # Prepend the batch axis in the same expression as the transform.
    return to_mel(waveform).unsqueeze(0)

def extract_passt_labels(audio_path):
    """Return PaSST class probabilities for the second half of an audio file.

    The clip is loaded with torchaudio, cut to its second half, mixed down to
    mono, resampled to 16 kHz when necessary, converted to a mel spectrogram
    and classified by the module-level ``passt_model``.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        Softmax over the last dimension of the model's first output.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # Keep only the second half of the clip along the time axis.
    num_samples = waveform.shape[-1]
    waveform = waveform[:, num_samples // 2:]

    # Mix down to a single channel (a no-op for mono files). The model's patch
    # embedding is a Conv2d with one input channel, so a stereo clip would
    # otherwise produce a 2-channel spectrogram and fail in the first layer.
    waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    # NOTE(review): this is a plain torchaudio MelSpectrogram at 16 kHz — confirm it matches
    # the front end (sample rate, scaling, normalization) the checkpoint was trained with.
    spectrogram = waveform_to_spectrogram(waveform, 16000)
    with torch.no_grad():
        logits = passt_model(spectrogram)[0]  # first element of the model's output tuple
        labels = logits.softmax(dim=-1)
    return labels

# Classify the reference clip first, then the generated one, with the same model.
original_labels, generated_labels = (
    extract_passt_labels(clip_path) for clip_path in (original, generated)
)

def compute_kl_divergence(labels1, labels2):
    """KL divergence between two batches of class-probability vectors.

    ``F.kl_div(input, target)`` treats ``target`` as the reference
    distribution, so with ``input = log(labels1)`` and ``target = labels2``
    this evaluates ``KL(labels2 || labels1)``, summed over classes and averaged
    over the batch (``reduction='batchmean'``). The measure is asymmetric, so
    swap the arguments to obtain the other direction.

    Args:
        labels1: Probabilities; their log is used as the ``input`` of ``F.kl_div``.
        labels2: Probabilities used as the ``target`` of ``F.kl_div``.

    Returns:
        The divergence as a Python float.
    """
    # Probabilities that underflowed to exactly 0 would give log(0) = -inf and turn the
    # result into inf/NaN. Clamping to the smallest positive normal value of the dtype
    # leaves every other entry unchanged.
    floor = torch.finfo(labels1.dtype).tiny
    log_probs = labels1.clamp_min(floor).log()
    kl_div = F.kl_div(log_probs, labels2, reduction='batchmean')
    return kl_div.item()

# Compare the PaSST label distributions of the two clips and report the divergence.
kl_div_score = compute_kl_divergence(labels1=original_labels, labels2=generated_labels)
print(f"Kullback-Leibler Divergence: {kl_div_score}")




 Loading PaSST pre-trained on AudioSet Patch 16 stride 10 structured patchout mAP=476 SWA 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
 