In [1]:
import librosa
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import soundfile as sf
import warnings
import json
from typing import Dict, Tuple, Optional

import sys
sys.path.append('..')
from src.deep_learning_audio_preprocessing import *
warnings.filterwarnings('ignore')

## DEAM

In [3]:
# DEAM mel-spectrogram run: model preset, raw-audio location and the folder
# that receives one spectrogram archive per song.
MODEL_TYPE = 'ast'
RAW_AUDIO_DIR = Path("../data/raw/audio_files_DEAM/MEMD_audio/")
PROCESSED_DIR = Path(f"../data/processed/mel_spectrograms_{MODEL_TYPE}/")

# The output folder (and any missing parents) must exist before files are written.
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

In [34]:
# Create the project's AudioPreprocessor for the preset named by MODEL_TYPE.
# Later cells read `preprocessor.config` and call `compute_global_stats` and
# `process_file` on this instance.
preprocessor = AudioPreprocessor(model_type=MODEL_TYPE)

In [35]:
# Collect the input clips: MP3s are preferred, and WAVs are only looked up when
# the directory contains no MP3 at all.
audio_files = list(RAW_AUDIO_DIR.glob("*.mp3")) or list(RAW_AUDIO_DIR.glob("*.wav"))

# Show how many clips were found and which preprocessing settings are active.
n_audio_files = len(audio_files)
print(f"Found {n_audio_files} audio files")
print(f"Using {MODEL_TYPE} configuration:")
print(json.dumps(preprocessor.config, indent=2))

Found 1802 audio files
Using ast configuration:
{
  "target_sr": 16000,
  "n_mels": 128,
  "n_fft": 400,
  "hop_length": 160,
  "duration_secs": 10,
  "normalize": "global"
}


In [36]:
# Dataset-wide statistics are only needed when the preset normalises globally.
if preprocessor.config['normalize'] == 'global':
    preprocessor.compute_global_stats(audio_files)

# Convert every clip into a compressed .npz archive of spectrogram segments.
print("\nProcessing audio files...")

failed_files = []   # file names for which process_file returned None
metadata = {}       # song_id -> {'n_segments', 'shape'} for every available archive

for audio_path in tqdm(audio_files, desc="Processing"):
    song_id = audio_path.stem
    output_path = PROCESSED_DIR / f"{song_id}.npz"

    if output_path.exists():
        # Already converted by an earlier run: skip the expensive recompute but
        # still register the song. Without this, a re-run leaves `metadata`
        # empty and the summary written afterwards reports zero processed files.
        with np.load(output_path) as cached:
            cached_shape = cached['spectrograms'].shape
        metadata[song_id] = {
            # assumes axis 0 of the stored array is the segment axis, matching
            # what process_file reports as 'n_segments' — TODO confirm
            'n_segments': int(cached_shape[0]),
            'shape': cached_shape
        }
        continue

    result = preprocessor.process_file(audio_path)
    if result is None:
        # process_file signals a failure by returning None; remember the file.
        failed_files.append(audio_path.name)
        continue

    # Save as compressed numpy file
    np.savez_compressed(
        output_path,
        spectrograms=result['spectrograms'],
        song_id=result['song_id']
    )

    metadata[song_id] = {
        'n_segments': result['n_segments'],
        'shape': result['spectrograms'].shape
    }

Computing global statistics from 100 files...


Computing stats: 100%|████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.54it/s]


Global stats - Mean: -41.76, Std: 16.17

Processing audio files...


Processing: 100%|███████████████████████████████████████████████████████████████████| 1802/1802 [08:16<00:00,  3.63it/s]


In [37]:
# Write a JSON summary of this preprocessing run next to the spectrograms.
metadata_path = PROCESSED_DIR / 'preprocessing_metadata.json'
n_processed = len(metadata)
n_failed = len(failed_files)

with open(metadata_path, 'w') as f:
    # The statistics are cast to plain floats before serialisation (presumably
    # they are numpy scalars — TODO confirm); a statistic that is None is
    # stored as null.
    global_mean = preprocessor.global_mean
    global_std = preprocessor.global_std
    run_summary = {
        'model_type': MODEL_TYPE,
        'config': preprocessor.config,
        'global_mean': None if global_mean is None else float(global_mean),
        'global_std': None if global_std is None else float(global_std),
        'n_processed': n_processed,
        'n_failed': n_failed,
        'failed_files': failed_files,
        'file_metadata': metadata
    }
    json.dump(run_summary, f, indent=2)

# Console recap of the run.
print("\nProcessing complete!")
print(f"Processed: {n_processed} files")
print(f"Failed: {n_failed} files")
print(f"Metadata saved to: {metadata_path}")


Processing complete!
Processed: 1802 files
Failed: 0 files
Metadata saved to: ../data/processed/mel_spectrograms_ast/preprocessing_metadata.json


In [38]:
# Sanity check: show the stored spectrogram shape of the first recorded song.
example_id = list(metadata)[0]
example_shape = metadata[example_id]['shape']
print(f"\nExample output shape for {example_id}: {example_shape}")


Example output shape for 463: (4, 128, 1001)


### Compute MERT embeddings for the DEAM audio files

In [2]:
from huggingface_hub import list_repo_files

# Confirm the MERT repository is reachable on the Hugging Face Hub and list a
# few of the files it contains.
repo_id = "m-a-p/MERT-v1-95M"
files = list_repo_files(repo_id)
print(f"Found {len(files)} files in {repo_id}")
print("Sample files:", files[:5])

Found 8 files in m-a-p/MERT-v1-95M
Sample files: ['.gitattributes', 'MERT-v1-95M_fairseq.pt', 'README.md', 'config.json', 'configuration_MERT.py']


In [3]:
from huggingface_hub import snapshot_download
from transformers import AutoModel, Wav2Vec2FeatureExtractor

# Download the MERT snapshot, reusing the local Hugging Face cache when it is
# already populated. Forcing a download on every run re-fetches the whole
# repository (two checkpoints of 1.32 GB and 378 MB), so it is now opt-in:
# flip the flag only to repair a corrupted cache. The deprecated
# `resume_download` argument is no longer passed.
FORCE_MERT_REDOWNLOAD = False
mert_dir = snapshot_download("m-a-p/MERT-v1-95M", force_download=FORCE_MERT_REDOWNLOAD)
print("MERT at:", mert_dir)

# Smoke test: both the model and its feature extractor must load from the
# local snapshot. The loaded objects are discarded; a later cell loads the
# instances that are actually used.
AutoModel.from_pretrained(mert_dir, trust_remote_code=True)
Wav2Vec2FeatureExtractor.from_pretrained(mert_dir, trust_remote_code=True)
print("Local MERT loads.")



Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

modeling_MERT.py: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

configuration_MERT.py: 0.00B [00:00, ?B/s]

MERT-v1-95M_fairseq.pt:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

MERT at: /home/georgios/.cache/huggingface/hub/models--m-a-p--MERT-v1-95M/snapshots/12af15fef9d0ac838c3f475bfbbf26d2060dd4f5
Local MERT loads.


In [9]:
# Source clips and destination folder for the DEAM MERT embeddings.
AUDIO_DIR = Path("../data/raw/audio_files_DEAM/MEMD_audio")
MERT_OUTPUT_DIR = Path("../data/processed/mert_embeddings/")

# Create the destination (and missing parents) before anything is saved there.
MERT_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [10]:
import torch
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the MERT model and its feature extractor; trust_remote_code is passed
# because the repository ships its own modelling code.
model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)

# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device).eval()
print(f"MERT loaded on {device}")

MERT loaded on cuda


In [13]:
import librosa
# Layers to use for the final embedding (concatenate L5 and L6 → 1536D vector)
LAYER_A, LAYER_B = 5, 6
MERT_SAMPLE_RATE = 24000   # Hz; the audio is resampled to this rate for MERT
MAX_CLIP_SECONDS = 45.0    # only the first 45 s of each file are embedded

audio_files = sorted(AUDIO_DIR.glob("*.mp3"))
unreadable_files = []      # clips that could not be decoded or were empty

with torch.inference_mode():
    for audio_path in tqdm(audio_files, desc="Extracting MERT L5+L6 embeddings"):
        out_path = MERT_OUTPUT_DIR / f"{audio_path.stem}.npy"
        if out_path.exists():  # skip existing embedding files
            continue

        # Load audio (mono, resampled). A single undecodable file must not
        # abort the whole run, so decoding errors are recorded and skipped.
        try:
            audio, _sr = librosa.load(
                audio_path, sr=MERT_SAMPLE_RATE, mono=True, duration=MAX_CLIP_SECONDS
            )
        except Exception as exc:
            unreadable_files.append(f"{audio_path.name} ({exc})")
            continue
        if audio.size == 0:
            unreadable_files.append(f"{audio_path.name} (empty audio)")
            continue

        # Convert audio into MERT input format and move it to the model's device
        inputs = processor(audio, sampling_rate=MERT_SAMPLE_RATE, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run MERT and keep hidden states from all transformer layers
        outputs = model(**inputs, output_hidden_states=True)
        hs = outputs.hidden_states  # list of tensors: [embedding_output, layer1, ..., layer12]

        # Pick the 5th and 6th transformer layers.
        # If 13 entries, index is shifted by +1 because the first is the embedding output
        offset = 1 if len(hs) == 13 else 0
        h5 = hs[LAYER_A + offset]  # (B,T,768)
        h6 = hs[LAYER_B + offset]

        # Mean over the time dimension, then concatenate -> (1536,)
        pooled = torch.cat([h5.mean(dim=1), h6.mean(dim=1)], dim=-1)  # (B,1536)
        vec = pooled.squeeze(0).float().cpu().numpy()

        # Save atomically: write to a temporary file and rename it into place,
        # so an interrupted run never leaves a truncated .npy that the
        # skip-existing check above would then treat as finished.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        with open(tmp_path, "wb") as fh:
            np.save(fh, vec.astype(np.float32))
        tmp_path.replace(out_path)

# Report skipped clips so they are not silently missing from the embeddings.
if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable files:")
    for entry in unreadable_files:
        print(f"  {entry}")

Extracting MERT L5+L6 embeddings: 100%|████████████████████████████████████████████| 1802/1802 [00:03<00:00, 454.16it/s]


## Deezer

In [2]:
# Read the three Deezer split files in train / val / test order.
split_csvs = (
    '../data/Deezer/train.csv',
    '../data/Deezer/val.csv',
    '../data/Deezer/test.csv',
)
deezer_train, deezer_val, deezer_test = map(pd.read_csv, split_csvs)

# Preprocessing does not care about the split, so stack all rows together.
deezer_all = pd.concat((deezer_train, deezer_val, deezer_test))

In [3]:
# Deezer mel-spectrogram run. The raw audio files are named by dzr_sng_id.
MODEL_TYPE = 'ast'
audio_dir = Path('../data/raw/audio_files_Deezer/')
output_dir = Path(f'../data/processed/DEEZER/mel_spectrograms_{MODEL_TYPE}')

# Create the output folder (and missing parents) before anything is written.
output_dir.mkdir(exist_ok=True, parents=True)

In [4]:
# New AudioPreprocessor instance for the Deezer run, built for the preset
# named by MODEL_TYPE; it replaces the instance bound to this name earlier.
preprocessor = AudioPreprocessor(model_type=MODEL_TYPE)

In [5]:
# Gather the Deezer clips: MP3s first, WAVs only when no MP3 is present.
audio_files = list(audio_dir.glob("*.mp3")) or list(audio_dir.glob("*.wav"))

# Show how many clips were found and which preprocessing settings are active.
n_audio_files = len(audio_files)
print(f"Found {n_audio_files} audio files")
print(f"Using {MODEL_TYPE} configuration:")
print(json.dumps(preprocessor.config, indent=2))

Found 12089 audio files
Using ast configuration:
{
  "target_sr": 16000,
  "n_mels": 128,
  "n_fft": 400,
  "hop_length": 160,
  "duration_secs": 10,
  "normalize": "global"
}


In [2]:
# Dataset-wide statistics are computed only when the preset normalises globally.
uses_global_norm = preprocessor.config['normalize'] == 'global'
if uses_global_norm:
    preprocessor.compute_global_stats(audio_files)

In [3]:
metadata = {}        # dzr_sng_id -> {'n_segments', 'shape'} for every saved archive
failed_files = []    # audio files for which process_file returned None
missing_files = []   # ids listed in the CSVs that have no audio file on disk

# The splits were concatenated, so guard against an id that appears in more
# than one of them being processed (and saved) twice.
unique_ids = deezer_all['dzr_sng_id'].drop_duplicates()

for dzr_id in tqdm(unique_ids, desc="Processing Deezer"):
    audio_path = audio_dir / f"{dzr_id}.mp3"
    if not audio_path.exists():
        # Keep track of what was skipped instead of dropping it silently.
        missing_files.append(audio_path.name)
        continue

    result = preprocessor.process_file(audio_path)
    if result is None:
        # process_file signals a failure by returning None; remember the file.
        failed_files.append(audio_path.name)
        continue

    np.savez_compressed(
        output_dir / f"{dzr_id}.npz",
        spectrograms=result['spectrograms'],
        song_id=result['song_id']
    )

    metadata[dzr_id] = {
        'n_segments': result['n_segments'],
        'shape': result['spectrograms'].shape
    }

# Save metadata (same flat {id: info} layout as before).
# NOTE(review): unlike the DEAM run, the preprocessor's global mean/std are not
# written here — confirm whether downstream code needs them.
with open(output_dir / "preprocessing_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

# Report the outcome; previously failures and missing files were never shown.
print(f"Processed: {len(metadata)} files")
print(f"Missing audio: {len(missing_files)} files")
print(f"Failed: {len(failed_files)} files")

SyntaxError: incomplete input (4037190457.py, line 1)

### MERT embeddings for Deezer

In [4]:
from huggingface_hub import list_repo_files

# Confirm the MERT repository is reachable on the Hugging Face Hub and list a
# few of the files it contains.
repo_id = "m-a-p/MERT-v1-95M"
files = list_repo_files(repo_id)
print(f"Found {len(files)} files in {repo_id}")
print("Sample files:", files[:5])

Found 8 files in m-a-p/MERT-v1-95M
Sample files: ['.gitattributes', 'MERT-v1-95M_fairseq.pt', 'README.md', 'config.json', 'configuration_MERT.py']


In [5]:
from huggingface_hub import snapshot_download
from transformers import AutoModel, Wav2Vec2FeatureExtractor

# Download the MERT snapshot, reusing the local Hugging Face cache when it is
# already populated. Forcing a download on every run re-fetches the whole
# repository (two checkpoints of 1.32 GB and 378 MB), so it is now opt-in:
# flip the flag only to repair a corrupted cache. The deprecated
# `resume_download` argument is no longer passed.
FORCE_MERT_REDOWNLOAD = False
mert_dir = snapshot_download("m-a-p/MERT-v1-95M", force_download=FORCE_MERT_REDOWNLOAD)
print("MERT at:", mert_dir)

# Smoke test: both the model and its feature extractor must load from the
# local snapshot. The loaded objects are discarded; a later cell loads the
# instances that are actually used.
AutoModel.from_pretrained(mert_dir, trust_remote_code=True)
Wav2Vec2FeatureExtractor.from_pretrained(mert_dir, trust_remote_code=True)
print("Local MERT loads.")

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

configuration_MERT.py: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

modeling_MERT.py: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

MERT-v1-95M_fairseq.pt:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

MERT at: /home/georgios/.cache/huggingface/hub/models--m-a-p--MERT-v1-95M/snapshots/12af15fef9d0ac838c3f475bfbbf26d2060dd4f5
Local MERT loads.


In [6]:
# Source clips and destination folder for the Deezer MERT embeddings.
AUDIO_DIR = Path("../data/raw/audio_files_Deezer/")
MERT_OUTPUT_DIR = Path("../data/processed/DEEZER/mert_embeddings/")

# Create the destination (and missing parents) before anything is saved there.
MERT_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [7]:
import torch
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the MERT model and its feature extractor; trust_remote_code is passed
# because the repository ships its own modelling code.
model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)

# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device).eval()
print(f"MERT loaded on {device}")

MERT loaded on cuda


In [8]:
import librosa
# Layers to use for the final embedding (concatenate L5 and L6 → 1536D vector)
LAYER_A, LAYER_B = 5, 6
MERT_SAMPLE_RATE = 24000   # Hz; the audio is resampled to this rate for MERT
MAX_CLIP_SECONDS = 45.0    # only the first 45 s of each file are embedded

audio_files = sorted(AUDIO_DIR.glob("*.mp3"))
unreadable_files = []      # clips that could not be decoded or were empty

with torch.inference_mode():
    for audio_path in tqdm(audio_files, desc="Extracting MERT L5+L6 embeddings"):
        out_path = MERT_OUTPUT_DIR / f"{audio_path.stem}.npy"
        if out_path.exists():  # skip existing embedding files
            continue

        # Load audio (mono, resampled). A single undecodable file must not
        # abort the whole run, so decoding errors are recorded and skipped.
        try:
            audio, _sr = librosa.load(
                audio_path, sr=MERT_SAMPLE_RATE, mono=True, duration=MAX_CLIP_SECONDS
            )
        except Exception as exc:
            unreadable_files.append(f"{audio_path.name} ({exc})")
            continue
        if audio.size == 0:
            unreadable_files.append(f"{audio_path.name} (empty audio)")
            continue

        # Convert audio into MERT input format and move it to the model's device
        inputs = processor(audio, sampling_rate=MERT_SAMPLE_RATE, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run MERT and keep hidden states from all transformer layers
        outputs = model(**inputs, output_hidden_states=True)
        hs = outputs.hidden_states  # list of tensors: [embedding_output, layer1, ..., layer12]

        # Pick the 5th and 6th transformer layers.
        # If 13 entries, index is shifted by +1 because the first is the embedding output
        offset = 1 if len(hs) == 13 else 0
        h5 = hs[LAYER_A + offset]  # (B,T,768)
        h6 = hs[LAYER_B + offset]

        # Mean over the time dimension, then concatenate -> (1536,)
        pooled = torch.cat([h5.mean(dim=1), h6.mean(dim=1)], dim=-1)  # (B,1536)
        vec = pooled.squeeze(0).float().cpu().numpy()

        # Save atomically: write to a temporary file and rename it into place,
        # so an interrupted run never leaves a truncated .npy that the
        # skip-existing check above would then treat as finished.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        with open(tmp_path, "wb") as fh:
            np.save(fh, vec.astype(np.float32))
        tmp_path.replace(out_path)

# Report skipped clips so they are not silently missing from the embeddings.
if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable files:")
    for entry in unreadable_files:
        print(f"  {entry}")

Extracting MERT L5+L6 embeddings: 100%|███████████████████████████████████████████| 12089/12089 [34:58<00:00,  5.76it/s]
