## Setup

This cell installs the necessary libraries, loads the pre-trained MusicGen small model (https://huggingface.co/facebook/musicgen-small), and initializes the generation pipeline. MusicGen generates music from text descriptions by converting text into audio tokens, predicting them using a transformer, and decoding them into 32kHz audio waveforms.


In [None]:
!pip install -q transformers accelerate

import torch
import random
import soundfile as sf
import numpy as np

from scipy.signal import butter, sosfilt
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from IPython.display import Audio, display

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

model_id = "facebook/musicgen-small"

# The processor prepares text prompts as model inputs.
processor = AutoProcessor.from_pretrained(model_id)

# Half-precision weights on GPU, full precision on CPU.
if device == "cuda":
    weights_dtype = torch.float16
else:
    weights_dtype = torch.float32

model = MusicgenForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=weights_dtype,
)

print("Ready on:", device)


## Music Generation

We define a set of text prompts for different music genres and randomly select one for each genre. Then we use the MusicGen model to generate audio for each selected prompt. The generated audio is saved as WAV files.


In [None]:
# Candidate text prompts, keyed by genre name. Each genre maps to a list of
# five alternative descriptions; one of them is picked at random per genre
# before generation. The genre keys are also reused in output file names.
genre_prompts = {
    "pop": [
        "Upbeat pop track with bright synths",
        "Emotional pop ballad with piano, strings and soft drums",
        "Modern pop song with  groovy bass",
        "Dance-pop track with four-on-the-floor kick and sparkling keys",
        "Mid-tempo pop track with airy pads and mellow vocal hooks"
    ],
    "rock": [
        "Energetic rock track with distorted guitars and powerful drums",
        "Classic rock riff with electric guitar and steady bass",
        "Slow rock ballad with clean guitar and emotional solos",
        "Indie rock track with jangly guitars and tight drum groove",
        "Heavy rock track with aggressive rhythm guitars"
    ],
    "jazz": [
        "Jazz quartet with saxophone, double bass, drums and warm piano",
        "Smooth jazz track with electric piano and soft saxophone melody",
        "Jazz fusion track with syncopated drums and fretless bass",
        "Big band swing with brass section, walking bass and fast ride cymbal",
        "Atmospheric jazz with trumpet and piano"
    ],
    "classical": [
        "Romantic classical piece for solo piano with expressive dynamics",
        "String quartet playing a lyrical, slow movement",
        "Full symphony orchestra with sweeping strings and bold brass",
        "Baroque-style piece with harpsichord and chamber ensemble",
        "Soft classical piece for piano and cello duet"
    ],
    "hiphop": [
        "Hip-hop beat with dusty drums and vinyl crackle",
        "Modern trap beat with deep bass and rapid hi-hats",
        "Lo-fi hip-hop beat with warm piano samples and laid-back groove",
        "Aggressive hip-hop instrumental with heavy kicks and brass stabs",
        "Chill hip-hop beat with soft Rhodes chords and subtle percussion"
    ],
    "electronic": [
        "Ambient electronic soundscape with evolving pads and soft pulses",
        "Melodic techno track with steady kick and hypnotic arpeggios",
        "Deep house groove with warm bass and airy chords",
        "Dubstep-style track with bass and sharp snares",
        "Future bass track with detuned synth chords and sidechain pumping"
    ],
}

# Draw one prompt per genre: a single random.choice call for each genre,
# visited in the dictionary's insertion order.
selected_prompts = {
    genre_name: random.choice(candidates)
    for genre_name, candidates in genre_prompts.items()
}

# Report which prompt was drawn for each genre.
print("Chosen prompts:")
for genre, prompt in selected_prompts.items():
    print(f"- {genre}: {prompt}")

# Generation budget: 600 new audio tokens per clip (about 10-12 seconds).
duration_tokens = 600
sr = model.config.audio_encoder.sampling_rate
print(f"Sampling rate: {sr} Hz")

# Sampling settings shared by every clip.
generation_kwargs = dict(
    do_sample=True,
    guidance_scale=2.5,
    max_new_tokens=duration_tokens,
    temperature=1.0,
    top_k=250,
    top_p=0.95,
)

# Mixed precision is only switched on when running on the GPU.
use_cuda = device == "cuda"

for genre, prompt in selected_prompts.items():
    print(f"\nGeneration for '{genre}'...")
    print(f"Prompt: {prompt}")

    inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(device)

    with torch.inference_mode():
        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_cuda):
            audio_values = model.generate(**inputs, **generation_kwargs)

    # Output is indexed as (batch, channels, samples); keep the single batch
    # item and bring it to the CPU as a float32 NumPy array.
    audio = audio_values[0].detach().cpu().float().numpy()

    display(Audio(audio, rate=sr))

    # The array is transposed before it is handed to soundfile.
    filename = f"raw_{genre}.wav"
    sf.write(filename, audio.T, sr)
    print(f"Saved as {filename}")

## Preprocessing: Normalization and High-Pass Filtering
Through this function we normalize the amplitude of each generated audio file and apply a high-pass filter to remove low frequencies (below 30 Hz by default).


In [None]:
def preprocess_normalize_highpass(audio, sr, hp_cutoff=30.0):
    """High-pass filter a clip and peak-normalize the result to 0.99.

    Args:
        audio: 1-D mono signal, or a 2-D array laid out as
            (frames, channels); 2-D input is averaged over axis 1 to get mono.
        sr: Sampling rate in Hz, passed to the filter design as ``fs``.
        hp_cutoff: Cutoff frequency of the high-pass filter in Hz.

    Returns:
        A 1-D array holding the filtered signal, scaled so that its largest
        absolute sample equals 0.99. An all-zero signal is returned as zeros
        and an empty signal is returned as-is.
    """
    audio = np.asarray(audio).astype(np.float32)

    # stereo to mono
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)

    # Nothing to filter or scale; also avoids np.max failing on empty input.
    if audio.size == 0:
        return audio

    sos_hp = butter(
        N=2,
        Wn=hp_cutoff,
        btype="highpass",
        fs=sr,
        output="sos",
    )

    # high-pass filtering
    audio_hp = sosfilt(sos_hp, audio)

    # Peak normalization is done AFTER filtering. The filter is linear, so the
    # result is the same signal up to a gain, but normalizing first would let
    # filter overshoot (e.g. at sharp transitions) push samples above 0.99,
    # which can clip when the clip is written to an integer WAV file.
    peak = np.max(np.abs(audio_hp))
    if peak > 0:
        audio_hp = 0.99 * audio_hp / peak

    return audio_hp

genres = ["pop", "rock", "jazz", "classical", "hiphop", "electronic"]

# Read every raw clip, preprocess it, and write the result to a new file.
for genre in genres:
    audio, sr = sf.read(f"raw_{genre}.wav")
    audio_pre = preprocess_normalize_highpass(audio, sr, hp_cutoff=30.0)

    out_filename = f"preproc_{genre}.wav"
    sf.write(out_filename, audio_pre, sr)
    print(f"Preprocess per {genre}: saved as {out_filename}")



## Denoising: noise reduction with Noisereduce library
We reduce background noise from each preprocessed audio clip using the noisereduce library. The function estimates a noise profile, and applies non-stationary noise reduction.


In [None]:
!pip install -q noisereduce
import noisereduce as nr

def denoise_noisereduce_mono(audio, sr, noise_duration=0.1):
    """Reduce background noise in a clip with noisereduce (non-stationary mode).

    Args:
        audio: 1-D mono signal, or a 2-D array laid out as
            (frames, channels); 2-D input is averaged over axis 1 to get mono.
        sr: Sampling rate in Hz.
        noise_duration: Accepted for backward compatibility but not used.
            No noise clip is passed to noisereduce (``y_noise=None``), so the
            length of a leading "noise-only" segment has no effect on the
            result.
            NOTE(review): non-stationary mode is understood not to consume a
            separate noise clip; confirm against the installed noisereduce
            version before wiring this parameter in.

    Returns:
        The denoised mono signal as returned by ``nr.reduce_noise``.
    """
    audio = audio.astype(np.float32)

    # stereo to mono
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)

    return nr.reduce_noise(
        y=audio,
        y_noise=None,        # no explicit noise clip is supplied
        sr=sr,
        prop_decrease=0.7,   # noise reduction proportion
        stationary=False,    # assuming non-stationary noise
    )

def _preview(label, samples, rate):
    """Print a label and show an inline audio player for the samples."""
    print(label)
    display(Audio(samples, rate=rate))


# Denoise every preprocessed clip, previewing it before and after.
for genre in genres:
    print(f"\nGenre: {genre}")

    audio_pre, sr = sf.read(f"preproc_{genre}.wav")
    _preview("Preprocessed:", audio_pre, sr)

    audio_denoised = denoise_noisereduce_mono(audio_pre, sr)
    _preview("Denoised:", audio_denoised, sr)

    # Save denoised file
    out_file = f"denoised_{genre}.wav"
    sf.write(out_file, audio_denoised, sr)
    print(f"Saved as {out_file}")

In [None]:
import matplotlib.pyplot as plt
import librosa.display

def plot_waveform_and_spec(y, sr, title):
    """Show the waveform of ``y`` in a 10x3 inch figure with the given title.

    Despite the name, only the waveform is drawn; no spectrogram is plotted.
    """
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.set_title(title)
    librosa.display.waveshow(y, sr=sr, ax=ax)
    fig.tight_layout()
    plt.show()
# Plot the preprocessed and denoised waveforms of each genre one after another.
for genre in genres:
    audio_pre, sr = sf.read(f"preproc_{genre}.wav")
    audio_denoised, sr = sf.read(f"denoised_{genre}.wav")

    labelled_clips = ((audio_pre, "Preprocessed"), (audio_denoised, "Denoised"))
    for clip, label in labelled_clips:
        plot_waveform_and_spec(clip, sr, f"{genre} - {label}")


## Bandwidth Extension with HiFi‑GAN BWE (from 24 kHz to 48 kHz)

We use a bandwidth‑extension model, HiFi‑GAN BWE (https://github.com/brentspell/hifi-gan-bwe), to enhance the bandwidth of the denoised audio clips. Note that this is a third‑party implementation, not the official HiFi‑GAN release.  
Each cleaned audio file (assumed to be mono or converted to mono) is downsampled to the 24 kHz input rate expected by the model and then fed to the BWE model, which generates a 48 kHz version with reconstructed high‑frequency content.

If the BWE model fails for any reason, a standard resampling fallback  is applied to produce a 48 kHz output.  

This model is optimized for speech-like audio and it is a generative model that reconstructs high-frequency content lost in low-resolution signals.  


In [None]:
!pip install -q hifi-gan-bwe
import warnings
import torchaudio
import os
from hifi_gan_bwe import BandwidthExtender

# Silence every Python warning from this point on (applies process-wide).
warnings.filterwarnings("ignore")

print("starting BWE...")

# Setup: pick the device and load the pretrained extender once. The
# locals() check skips reloading when this cell is executed again.
device = "cuda" if torch.cuda.is_available() else "cpu"
if 'bwe_model' not in locals():
    bwe_model = BandwidthExtender.from_pretrained("hifi-gan-bwe-13-59f00ca-vctk-24kHz-48kHz").to(device)
    bwe_model.eval()

target_sr_out = 48000  # rate used when writing and previewing the result
fs_in = 24000          # rate the clip is resampled to before the model call

for genre in genres:
    in_path = f"denoised_{genre}.wav"
    out_path = f"bwe_{genre}_48k.wav"

    # Missing inputs are reported and skipped instead of aborting the loop.
    if not os.path.exists(in_path):
        print(f"Skipping {genre}: file not found.")
        continue

    y, sr = sf.read(in_path, dtype="float32")

    # Multi-channel input is averaged over axis 1 to obtain mono.
    if y.ndim == 2:
        y = np.mean(y, axis=1)

    # Two leading singleton dimensions are added before moving to the device.
    x = torch.from_numpy(y).float().unsqueeze(0).unsqueeze(0).to(device)

    # Resample to fs_in only when the file is not already at that rate.
    if sr != fs_in:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=fs_in).to(device)
        x = resampler(x)
        current_sr = fs_in
    else:
        current_sr = sr

    print(f"Processing {genre}...")

    with torch.no_grad():
        try:
            # The model is called with the signal and its current rate.
            # NOTE(review): the output is written below at target_sr_out,
            # so the model is assumed to return 48 kHz audio - confirm.
            y_48k_t = bwe_model(x, current_sr)
            y_48k = y_48k_t.squeeze().cpu().numpy()

            print(f"{genre}: BWE Success")

        except Exception as e:
            # Best-effort fallback: any model failure is reported and the
            # clip is plainly resampled to target_sr_out instead.
            print(f"{genre}: BWE Failed. Error: {e}")
            print(f"   -> Fallback: standard resampling")

            y_48k_t = torchaudio.functional.resample(x, orig_freq=current_sr, new_freq=target_sr_out)
            y_48k = y_48k_t.squeeze().cpu().numpy()

    sf.write(out_path, y_48k, target_sr_out)

    print(f"Preview {genre}:")
    display(Audio(y_48k, rate=target_sr_out))

## Audio Quality Evaluation at Each Pipeline Stage

We evaluate the effect of each stage in the audio pipeline on audio quality.

Two key metrics are computed:
- Log Spectral Distance (LSD): compares the spectral difference between each stage and the original raw audio (resampled to 16 kHz), measuring signal degradation.

- High-Frequency Energy Ratio (HF-ratio): measures the proportion of energy above 8 kHz in the native sampling rate of each audio file, useful to quantify how much high-frequency content is preserved or added.


In [None]:
!pip install -q librosa pandas

import librosa, pandas as pd

def load_mono(path, target_sr=None):
    """Load an audio file as a mono float32 signal, optionally resampled.

    Args:
        path: Path of the audio file to read.
        target_sr: Desired sampling rate in Hz, or None to keep the file's
            own rate.

    Returns:
        A ``(audio, sr)`` tuple, where ``sr`` is the rate of the returned
        samples.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"File non trovato: {path}")

    samples, rate = sf.read(path, dtype="float32")

    # Multi-channel files are averaged over axis 1 to obtain mono.
    if samples.ndim == 2:
        samples = np.mean(samples, axis=1)

    # Keep the native rate when none is requested or it already matches.
    if target_sr is None or rate == target_sr:
        return samples, rate

    resampled = librosa.resample(samples, orig_sr=rate, target_sr=target_sr)
    return resampled, target_sr

def normalize(x):
    """Scale ``x`` by its peak absolute value.

    A 1e-12 offset is added to the peak so an all-zero input does not
    divide by zero.
    """
    peak = np.abs(x).max()
    return x / (peak + 1e-12)

def align_signals(ref, test):
    """Truncate both signals to the length of the shorter one.

    Returns:
        A ``(ref, test)`` tuple whose two items have the same length.
    """
    common_len = len(test) if len(test) < len(ref) else len(ref)
    trimmed_ref = ref[:common_len]
    trimmed_test = test[:common_len]
    return trimmed_ref, trimmed_test

def log_spectral_distance(ref, test, sr, n_fft=1024, hop_length=512, eps=1e-12):
    """Compute the mean log-spectral distance between two signals.

    Both signals are trimmed to a common length and peak-normalized. Their
    STFT magnitudes (offset by ``eps``) are compared on a log10 scale: the
    RMS difference over frequency bins is taken per frame, then averaged
    over frames.

    Args:
        ref: Reference signal.
        test: Signal compared against the reference.
        sr: Sampling rate in Hz; accepted but not used in the computation.
        n_fft: FFT size passed to the STFT.
        hop_length: Hop length passed to the STFT.
        eps: Offset added to the magnitudes before taking the logarithm.

    Returns:
        The distance averaged over all frames.
    """
    ref, test = align_signals(ref, test)

    def _log_magnitude(signal):
        # peak-normalize, take the STFT magnitude, offset by eps, then log10
        spectrum = librosa.stft(normalize(signal), n_fft=n_fft, hop_length=hop_length)
        return np.log10(np.abs(spectrum) + eps)

    log_ref = _log_magnitude(ref)
    log_test = _log_magnitude(test)

    # Compare only the frames present in both spectrograms.
    n_frames = min(log_ref.shape[1], log_test.shape[1])
    diff = log_ref[:, :n_frames] - log_test[:, :n_frames]

    per_frame = np.sqrt(np.mean(diff ** 2, axis=0))
    return np.mean(per_frame)

def hf_energy_ratio(x, sr, hf_min_hz=8000.0):
    """Return the fraction of spectral energy at or above ``hf_min_hz``.

    Args:
        x: 1-D signal.
        sr: Sampling rate in Hz, used to map FFT bins to frequencies.
        hf_min_hz: Lower edge (inclusive) of the high-frequency band in Hz.

    Returns:
        High-band energy divided by total energy. The total carries a 1e-12
        offset, so a silent signal gives 0.0.
    """
    power = np.abs(np.fft.rfft(x)) ** 2
    bin_freqs = np.fft.rfftfreq(len(x), d=1.0 / sr)

    high_band = bin_freqs >= hf_min_hz
    return np.sum(power[high_band]) / (np.sum(power) + 1e-12)

genres = ["pop", "rock", "jazz", "classical", "hiphop", "electronic"]

# Stage name -> function that builds the stage's file name for a genre.
stages = {
    "raw":      lambda g: f"raw_{g}.wav",
    "preproc":  lambda g: f"preproc_{g}.wav",
    "denoised": lambda g: f"denoised_{g}.wav",
    "bwe":      lambda g: f"bwe_{g}_48k.wav",
}

COMMON_SR = 16000  # sampling frequency for LSD

rows = []

for genre in genres:

    # The raw clip is the LSD reference; without it the genre is skipped.
    try:
        raw_audio, _ = load_mono(stages["raw"](genre), target_sr=COMMON_SR)
    except FileNotFoundError as e:
        print(e)
        continue

    for stage_name, path_fn in stages.items():
        path = path_fn(genre)
        if not os.path.exists(path):
            print(f"{stage_name} - file not found: {path}")
            continue

        # LSD against the raw clip, both at COMMON_SR. The "raw" stage is
        # compared with itself. Failures are reported and recorded as None.
        try:
            test_audio, _ = load_mono(path, target_sr=COMMON_SR)
            lsd_val = log_spectral_distance(raw_audio, test_audio, sr=COMMON_SR)
        except Exception as e:
            print(f"LSD error: {e}")
            lsd_val = None

        # HF ratio is measured at the file's own sampling rate.
        try:
            native_audio, native_sr = load_mono(path, target_sr=None)
            hf_ratio = hf_energy_ratio(native_audio, native_sr)
        except Exception as e:
            print(f"HF-Ratio error: {e}")
            hf_ratio = None

        rows.append({
            "genre": genre,
            "stage": stage_name,
            "lsd": lsd_val,
            "hf_energy_ratio": hf_ratio,
        })

# Columns are given explicitly so that an empty result (every raw file
# missing) still has the "genre"/"stage" columns that sorting and grouping
# need, instead of raising KeyError.
results_df = pd.DataFrame(
    rows,
    columns=["genre", "stage", "lsd", "hf_energy_ratio"],
)
results_df = results_df.sort_values(by=["genre", "stage"])

print("\nRESULTS:")
print(results_df)

stage_means = results_df.groupby("stage").mean(numeric_only=True)
print("\nMEAN FOR EVERY STAGE:")
print(stage_means)


Now we compare consecutive stages: each stage is compared with the one immediately before it in the pipeline.

In [None]:
genres = ["pop", "rock", "jazz", "classical", "hiphop", "electronic"]

# Pipeline order; each stage is compared with the one right before it.
stage_order = ["raw", "preproc", "denoised", "bwe"]

# Stage name -> function that builds the stage's file name for a genre.
stage_paths = {
    "raw":      lambda g: f"raw_{g}.wav",
    "preproc":  lambda g: f"preproc_{g}.wav",
    "denoised": lambda g: f"denoised_{g}.wav",
    "bwe":      lambda g: f"bwe_{g}_48k.wav",
}
COMMON_SR = 16000

rows = []

for genre in genres:

    # Walk over (previous stage, next stage) pairs.
    for stage_a, stage_b in zip(stage_order, stage_order[1:]):
        path_a = stage_paths[stage_a](genre)
        path_b = stage_paths[stage_b](genre)

        # LSD between the two stages, both loaded at COMMON_SR.
        try:
            audio_a, _ = load_mono(path_a, target_sr=COMMON_SR)
            audio_b, _ = load_mono(path_b, target_sr=COMMON_SR)
            lsd_val = log_spectral_distance(audio_a, audio_b, sr=COMMON_SR)
        except Exception as e:
            print(f"LSD error {stage_a} → {stage_b}: {e}")
            lsd_val = None

        # HF ratio of the later stage only, at its own sampling rate.
        try:
            audio_native, sr_native = load_mono(path_b, target_sr=None)
            hf_ratio = hf_energy_ratio(audio_native, sr_native)
        except Exception as e:
            print(f"HF-Ratio error {stage_b}: {e}")
            hf_ratio = None

        rows.append({
            "genre": genre,
            "comparison": f"{stage_b} vs {stage_a}",
            "lsd": lsd_val,
            "hf_energy_ratio": hf_ratio,
        })

df = pd.DataFrame(rows).sort_values(by=["genre", "comparison"])

print("\nSTAGES COMPARISON:")
print(df)

means = df.groupby("comparison").mean(numeric_only=True)
print("\nMEAN")
print(means)
