In [None]:
import os
import urllib.request
import zipfile

# Define the URL of the dataset and file names
dataset_url = "https://github.com/karoldvl/ESC-50/archive/master.zip"
zip_filename = "ESC-50-master.zip"
output_folder = "ESC-50-master"

# Download the dataset.
# The archive is fetched with the standard library (no dependency on the
# IPython `!` shell escape or on a `wget` binary) into a temporary ".part"
# file and renamed only after the transfer finished. An interrupted download
# therefore never leaves a truncated file under `zip_filename`, which the
# existence check below would otherwise treat as a complete archive.
print("Downloading dataset...")
if not os.path.exists(zip_filename):
    partial_filename = zip_filename + ".part"
    try:
        urllib.request.urlretrieve(dataset_url, partial_filename)
        os.replace(partial_filename, zip_filename)
    finally:
        # Only true when the download or the rename failed: drop the leftovers.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
else:
    print(f"{zip_filename} already exists. Skipping download.")

# Extract the dataset into the current working directory.
# The archive is expected to unpack into `output_folder`; its presence is used
# as the "already extracted" marker.
print("Extracting dataset...")
if not os.path.exists(output_folder):
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall()
    print(f"Dataset successfully extracted to: {output_folder}")
else:
    print(f"{output_folder} already exists. Skipping extraction.")


Downloading dataset...
--2024-12-26 07:49:03--  https://github.com/karoldvl/ESC-50/archive/master.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/karolpiczak/ESC-50/archive/master.zip [following]
--2024-12-26 07:49:04--  https://github.com/karolpiczak/ESC-50/archive/master.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master [following]
--2024-12-26 07:49:04--  https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.113.9
Connecting to codeload.github.com (codeload.github.com)|140.82.113.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 645695005 (616M) [application/zip]
Saving to: ‘ESC-50-master.z

In [None]:
import os
import librosa
import pandas as pd
from tqdm import tqdm

# Dataset locations (relative to the working directory)
audio_dir = os.path.join("ESC-50-master", "audio")
metadata_path = os.path.join("ESC-50-master", "meta", "esc50.csv")

# Load the metadata table
print("Loading metadata...")
metadata = pd.read_csv(metadata_path)

# Add a column holding the full path of every audio file
metadata['file_path'] = metadata['filename'].apply(lambda x: os.path.join(audio_dir, x))

# Check that every audio file listed in the metadata exists on disk
missing_files = [path for path in metadata['file_path'] if not os.path.exists(path)]
if missing_files:
    print(f"Warning: Missing audio files! {len(missing_files)} files are missing.")
else:
    print("All audio files are present.")

# Load every clip and resample it to 16 kHz.
# Clips that fail to load are reported and skipped, so `processed_audio` may
# end up shorter than `metadata`.
print("Processing audio files...")
processed_audio = []
for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
    file_path = row['file_path']
    try:
        # librosa resamples to the requested rate while loading (YAMNet expects 16 kHz input)
        y, sr = librosa.load(file_path, sr=16000)
        processed_audio.append({
            'audio': y,
            'sr': sr,
            'label': row['category'],
            'target': row['target'],
            'fold': row['fold'],
            'file_path': file_path
        })
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

print(f"Processed {len(processed_audio)} audio files.")

# Inspect one example as a sanity check.
# Guarded: if every file failed to load the list is empty and indexing it
# would raise an IndexError that hides the real problem reported above.
if processed_audio:
    example = processed_audio[0]
    print(f"Example audio: {example['file_path']}")
    print(f"Sample rate: {example['sr']}")
    print(f"Audio length (seconds): {len(example['audio']) / example['sr']}")
else:
    print("No audio files could be processed; nothing to inspect.")


Loading metadata...
All audio files are present.
Processing audio files...


100%|██████████| 2000/2000 [00:23<00:00, 83.99it/s] 

Processed 2000 audio files.
Example audio: ESC-50-master/audio/1-100032-A-0.wav
Sample rate: 16000
Audio length (seconds): 5.0





In [None]:
import joblib

# Destination file for the serialized list of processed clips
output_file = "processed_audio.joblib"

# Persist `processed_audio` with joblib.
# compress=3 is the level picked by the original author as a size/speed compromise.
joblib.dump(
    value=processed_audio,
    filename=output_file,
    compress=3,
)

print(f"Processed audio data salvato con successo in: {output_file}")


Processed audio data salvato con successo in: processed_audio.joblib


### YAMNet embeddings

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm

# TensorFlow Hub handle of the YAMNet model; hub.load fetches and loads it
yamnet_model_url = "https://tfhub.dev/google/yamnet/1"
yamnet_model = hub.load(yamnet_model_url)

print("YAMNet model loaded successfully.")


YAMNet model loaded successfully.


In [None]:
# Collects one averaged embedding vector per successfully processed clip
yamnet_embeddings = []

print("Generating embeddings with YAMNet...")
for audio in tqdm(processed_audio):
    try:
        # Wrap the raw samples in a float32 tensor
        waveform = tf.convert_to_tensor(audio['audio'], dtype=tf.float32)

        # The model call yields three values; only the middle one (embeddings) is used
        _scores, embeddings, _spectrogram = yamnet_model(waveform)

        # Collapse the first axis by averaging, giving one vector per clip
        averaged_embeddings = tf.reduce_mean(embeddings, axis=0).numpy()
    except Exception as e:
        # Best effort: report the clip and continue; no vector is stored for it
        print(f"Error generating embeddings for {audio['file_path']}: {e}")
    else:
        yamnet_embeddings.append(averaged_embeddings)

# Stack the per-clip vectors into a single NumPy array
yamnet_embeddings = np.array(yamnet_embeddings)
print(f"Generated embeddings with shape: {yamnet_embeddings.shape}")


Generating embeddings with YAMNet...


100%|██████████| 2000/2000 [00:12<00:00, 154.61it/s]

Generated embeddings with shape: (2000, 1024)





In [None]:
# Save embeddings, numeric labels, folds and text categories in one .npz archive
output_file = "yamnet_embeddings_esc50.npz"

labels = np.array([audio['target'] for audio in processed_audio])  # numeric label
folds = np.array([audio['fold'] for audio in processed_audio])    # cross-validation fold
categories = np.array([audio['label'] for audio in processed_audio])  # text category

# The metadata arrays are built from every entry of `processed_audio`, while
# `yamnet_embeddings` only has rows for clips whose embedding was computed.
# np.savez does not check lengths, so a mismatch would silently store
# embeddings paired with the wrong labels. Fail loudly instead.
if len(yamnet_embeddings) != len(labels):
    raise ValueError(
        f"Embedding/label count mismatch: {len(yamnet_embeddings)} embeddings "
        f"for {len(labels)} clips. Some clips failed during embedding generation; "
        "refusing to save misaligned arrays."
    )

np.savez(output_file, embeddings=yamnet_embeddings, labels=labels, folds=folds, categories=categories)
print(f"Embeddings saved to {output_file}")


Embeddings saved to yamnet_embeddings_esc50.npz


### AST (Audio Spectrogram Transformer) embeddings

In [None]:
from transformers import ASTFeatureExtractor
import torch
from tqdm import tqdm

# Load the AST feature extractor that matches the pretrained checkpoint
feature_extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

# One record per successfully preprocessed clip
processed_features = []

print("Preprocessing audio with AST Feature Extractor...")

# Feature extraction
for audio in tqdm(processed_audio):
    try:
        # Turn the 1-D waveform into model input features.
        # The sampling rate stored with the clip is passed instead of a
        # hard-coded 16000, so a clip loaded at another rate is reported by the
        # extractor's own sampling-rate check rather than silently mis-processed.
        inputs = feature_extractor(
            audio['audio'],  # audio waveform (1-D array)
            return_tensors="pt",  # return PyTorch tensors
            sampling_rate=audio['sr']
        )
        # Keep the features together with the clip's metadata so they stay aligned
        processed_features.append({
            'input_values': inputs['input_values'],  # feature tensor
            'label': audio['label'],
            'target': audio['target'],
            'fold': audio['fold']
        })
    except Exception as e:
        print(f"Error processing {audio['file_path']}: {e}")

print(f"Processed {len(processed_features)} audio features.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Preprocessing audio with AST Feature Extractor...


100%|██████████| 2000/2000 [00:10<00:00, 191.68it/s]

Processed 2000 audio features.





In [None]:
from transformers import ASTModel
import torch
from tqdm import tqdm

# Use a GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pretrained AST model, move it to the selected device and switch to eval mode
ast_model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(device)
ast_model.eval()

# Output containers. Only the averaged embeddings are filled in this cell;
# `ast_embeddings_temporal` is kept defined (empty) as in the original notebook.
ast_embeddings_mean = []
ast_embeddings_temporal = []

print("Generating embeddings with AST...")

# Extract one embedding per clip
for idx, feature in enumerate(tqdm(processed_features)):
    try:
        with torch.no_grad():  # inference only, no gradients needed
            # Move the input to the same device as the model
            input_values = feature['input_values'].to(device)

            outputs = ast_model(input_values)

            # Average over dim 1 of the last hidden state, then drop the
            # leading batch dimension explicitly (squeeze(0) rather than a
            # blanket squeeze(), which would also remove any other size-1 axis).
            embeddings_mean = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()

            # Store the embedding together with its metadata so they stay aligned
            ast_embeddings_mean.append({
                'embeddings': embeddings_mean,
                'label': feature['label'],
                'target': feature['target'],
                'fold': feature['fold']
            })

    except Exception as e:
        # Include the position in `processed_features` so the failing clip can be identified
        print(f"Error processing input {idx}: {e}")

# Report how many clips produced an embedding, so silently skipped clips are noticed
print(f"Generated embeddings for {len(ast_embeddings_mean)} audio files.")


Using device: cuda


config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Generating embeddings with AST...


100%|██████████| 2000/2000 [02:45<00:00, 12.06it/s]


In [None]:
import numpy as np


def _collect(key):
    """Gather the value stored under `key` in every AST record into a NumPy array."""
    return np.array([item[key] for item in ast_embeddings_mean])


# Column-wise view of the per-clip records produced by the AST embedding step
mean_embeddings = _collect('embeddings')
labels = _collect('target')
folds = _collect('fold')
categories = _collect('label')  # text categories

# Write everything into a single .npz archive (original file name kept)
np.savez(
    "ast_embeddings_esc50.npz",
    embeddings=mean_embeddings,
    labels=labels,
    folds=folds,
    categories=categories,
)

