# **🎯 STEP 1 — Prompt Embeddings**

**📍 1.1 — Install required libraries**

In [None]:
!pip install transformers accelerate torchaudio plotly scikit-learn --quiet

**📍 1.2 — Import libraries and set up device**

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import ClapProcessor, ClapModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px

import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

**📍 1.3 — Load CLAP model and processor**

In [None]:
# The processor and the model are both loaded from the same pretrained checkpoint.
CLAP_CHECKPOINT = "laion/clap-htsat-unfused"

processor = ClapProcessor.from_pretrained(CLAP_CHECKPOINT)
model_clap = ClapModel.from_pretrained(CLAP_CHECKPOINT)
model_clap = model_clap.to(device)

**📍 1.4 — Define prompts and semantic labels**

In [None]:
# Prompts grouped by the semantic label they are meant to illustrate.
_PROMPTS_BY_LABEL = {
    "soft": ["a soft bell", "a mellow flute tone", "a gentle whisper"],
    "harsh": ["a harsh buzzer", "a robotic beep", "a metallic grinding"],
    "digital": ["a digital chime", "a pulsing tone", "an alien pulse"],
}

# Flatten to one {"prompt", "label"} record per prompt, keeping the order above.
prompt_data = [
    {"prompt": text, "label": label}
    for label, texts in _PROMPTS_BY_LABEL.items()
    for text in texts
]

**📍 1.5 — Extract CLAP embeddings**

In [None]:
# Labels and prompt strings are read straight off prompt_data, in order.
labels = [example["label"] for example in prompt_data]
prompts = [example["prompt"] for example in prompt_data]

# Encode each prompt on its own and keep its text embedding as a NumPy row.
embeddings = []
for text in prompts:
    inputs = processor(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        text_features = model_clap.get_text_features(**inputs)
    embeddings.append(text_features.squeeze().cpu().numpy())

# One row per prompt, same order as `prompts` and `labels`.
X = np.vstack(embeddings)
print("✅ Extracted embeddings shape:", X.shape)

# **🎯 STEP 2 — Interpolation Experiments**

**📍 2.1 — Select two target prompts**

In [None]:
# Interpolation endpoints: two prompts chosen to be semantically far apart.
prompt_a, prompt_b = "a soft bell", "a harsh buzzer"

# Locate each prompt's row in X (rows of X follow the order of `prompts`).
idx_a, idx_b = (prompts.index(text) for text in (prompt_a, prompt_b))

# Copy the two rows out of X as torch tensors.
z_a, z_b = torch.tensor(X[idx_a]), torch.tensor(X[idx_b])

**📍 2.2 — Define interpolation coefficients**

In [None]:
# Seven evenly spaced mixing weights from 0 (pure z_a) to 1 (pure z_b), endpoints included.
alphas = np.linspace(0, 1, 7)

# Linear blend of the two embeddings: one vector per alpha.
interpolated_z = []
for a in alphas:
    interpolated_z.append((1 - a) * z_a + a * z_b)

**📍 2.3 — Plot interpolated embedding vectors**

In [None]:
# One curve per interpolation step: embedding value plotted against dimension index.
fig_interp, ax_interp = plt.subplots(figsize=(10, 2))
for alpha_value, z in zip(alphas, interpolated_z):
    ax_interp.plot(z.numpy(), label=f"α={alpha_value:.2f}", alpha=0.6)
ax_interp.set_title("Interpolated CLAP Embeddings")
ax_interp.set_xlabel("Embedding Dimension")
ax_interp.set_ylabel("Value")
ax_interp.legend()
fig_interp.tight_layout()
plt.show()

**📍 2.4 — Load trained RealNVP model from GitHub**

In [None]:
import torch.nn as nn
import requests
from io import BytesIO

# 🔗 Download the pretrained RealNVP weights (.pt file) from the project's GitHub repository
github_url = "https://raw.githubusercontent.com/Mariagiusi23/ID-001-AWOL-for-Audio/main/notebook/realnvp_model.pt"
# requests applies no timeout by default; bound the wait so a stalled connection cannot hang the cell.
response = requests.get(github_url, timeout=60)
# Fail on 4xx/5xx here rather than trying to load an error page as weights later.
response.raise_for_status()
# Keep the bytes in memory as a file-like object for torch.load.
weights_buffer = BytesIO(response.content)

# 🧠 Define the RealNVP architecture
class CouplingLayer(nn.Module):
    """Affine coupling layer.

    Entries where ``mask`` is 1 pass through unchanged and condition a scale
    (``s``) and a translation (``t``); entries where ``mask`` is 0 are mapped to
    ``x * exp(s) + t``. ``inverse`` undoes ``forward`` exactly.

    Args:
        dim: size of the last axis of the inputs.
        mask: 0/1 tensor of length ``dim``.
    """

    def __init__(self, dim, mask):
        super().__init__()
        hidden = 256
        # Register the mask as a buffer so it follows the module through
        # .to()/.cuda()/.double(). persistent=False keeps it out of the
        # state_dict, so checkpoints saved without a mask entry still load.
        self.register_buffer("mask", mask, persistent=False)
        self.scale_net = nn.Sequential(
            nn.Linear(dim, hidden), nn.ReLU(), nn.Linear(hidden, dim), nn.Tanh()
        )
        self.translate_net = nn.Sequential(
            nn.Linear(dim, hidden), nn.ReLU(), nn.Linear(hidden, dim)
        )

    def forward(self, x):
        """Transform ``x`` (batch, dim); return ``(y, log_det_jacobian)`` with one log-det per row."""
        x_masked = x * self.mask
        # s and t are zeroed on the pass-through entries, so only the
        # transformed entries contribute to the log-determinant.
        s = self.scale_net(x_masked) * (1 - self.mask)
        t = self.translate_net(x_masked) * (1 - self.mask)
        y = x_masked + (1 - self.mask) * (x * torch.exp(s) + t)
        log_det_jacobian = s.sum(dim=1)
        return y, log_det_jacobian

    def inverse(self, y):
        """Recover ``x`` from ``y``; the pass-through entries of ``y`` equal those of ``x``."""
        y_masked = y * self.mask
        s = self.scale_net(y_masked) * (1 - self.mask)
        t = self.translate_net(y_masked) * (1 - self.mask)
        x = y_masked + (1 - self.mask) * ((y - t) * torch.exp(-s))
        return x

class RealNVP(nn.Module):
    """Stack of coupling layers with alternating half-masks, plus a linear output head.

    ``forward`` chains the coupling layers and sums their log-determinants.
    ``inverse`` runs the layers backwards and then applies ``projection``
    (``input_dim`` -> ``output_dim``).
    """

    def __init__(self, input_dim, output_dim, n_layers):
        super().__init__()
        # Each mask is moved to the notebook-level `device` when the model is built.
        coupling_stack = [
            CouplingLayer(input_dim, self._get_mask(k, input_dim).to(device))
            for k in range(n_layers)
        ]
        self.layers = nn.ModuleList(coupling_stack)
        self.projection = nn.Linear(input_dim, output_dim)

    def _get_mask(self, layer_index, dim):
        """Return a 0/1 vector of length ``dim``: first half set on even layers, second half on odd."""
        half = dim // 2
        keep = slice(None, half) if layer_index % 2 == 0 else slice(half, None)
        mask = torch.zeros(dim)
        mask[keep] = 1
        return mask

    def forward(self, x):
        """Apply every coupling layer in order; return the output and the accumulated log-det."""
        log_det = 0
        out = x
        for coupling in self.layers:
            out, layer_log_det = coupling(out)
            log_det += layer_log_det
        return out, log_det

    def inverse(self, z):
        """Invert the coupling layers last-to-first, then project to ``output_dim``."""
        out = z
        for coupling in reversed(self.layers):
            out = coupling.inverse(out)
        return self.projection(out)

# 📦 Instantiate the model and load the pretrained weights
realnvp = RealNVP(input_dim=512, output_dim=4, n_layers=6).to(device)
# Rewind the in-memory file so this step can be repeated without downloading again.
weights_buffer.seek(0)
# The file comes from the network: weights_only=True restricts unpickling to
# tensors and plain containers instead of arbitrary Python objects.
realnvp_state = torch.load(weights_buffer, map_location=device, weights_only=True)
realnvp.load_state_dict(realnvp_state)
realnvp.eval()

print("✅ RealNVP model loaded from GitHub.")



**📍 2.5 — Define decoder for parameter mapping**

In [None]:
# Function that maps a CLAP embedding to FM parameters using RealNVP
def real_decoder(z):
    """Map one embedding tensor to a list of raw FM parameters using ``realnvp``.

    ``z`` is a single, un-batched embedding: a batch axis is added before the
    model call and squeezed away afterwards. The embedding goes through the
    model's forward pass and the result is fed to ``realnvp.inverse``, which
    runs the coupling layers backwards and applies the final projection.

    Returns:
        A plain Python list with the model's output values.
    """
    batch = z.unsqueeze(0).to(device)
    with torch.no_grad():
        latent, _log_det = realnvp(batch)
        decoded = realnvp.inverse(latent)
    return decoded.squeeze().cpu().numpy().tolist()

**📍 2.6 — Define FM synthesis function**

In [None]:
# Generate audio signal from a 4-parameter FM vector
def synthesize(p, sr=16000, duration=2.0):
    """Render a simple FM tone: a sine carrier phase-modulated by a sine modulator.

    Args:
        p: sequence of four numbers ``(carrier, modulator, index, amplitude)``:
            carrier and modulator frequencies in Hz, modulation index, output gain.
        sr: sample rate in Hz.
        duration: length of the signal in seconds.

    Returns:
        1-D torch tensor with ``int(sr * duration)`` samples.
    """
    # Local import: the function no longer depends on a later cell importing math.
    import math

    n_samples = int(sr * duration)
    # Sample k sits at k / sr seconds. linspace(0, duration, n) includes the
    # endpoint, which stretches the spacing to duration / (n - 1) and slightly
    # detunes every frequency.
    t = torch.arange(n_samples) / sr
    carrier, modulator, index, amplitude = p
    mod_signal = torch.sin(2 * math.pi * modulator * t)
    signal = amplitude * torch.sin(2 * math.pi * carrier * t + index * mod_signal)
    return signal

**📍 2.7 — Generate audio from interpolated embeddings**

In [None]:
# Decode and synthesize audio signals from interpolated embeddings
from IPython.display import Audio
import math

# For each interpolation step: decode, rescale, synthesize, report and play.
for alpha_value, z in zip(alphas, interpolated_z):
    raw_carrier, raw_modulator, raw_index, raw_amplitude = real_decoder(z)

    # Affine rescaling of the decoder outputs into synthesis parameters.
    carrier = 100 + 900 * raw_carrier
    modulator = 50 + 450 * raw_modulator
    index = 10 * raw_index
    amplitude = 0.9 * raw_amplitude

    signal = synthesize([carrier, modulator, index, amplitude])

    print(f"\nα = {alpha_value:.2f}")
    print(f"  Carrier freq:   {carrier:.2f} Hz")
    print(f"  Modulator freq: {modulator:.2f} Hz")
    print(f"  Modulation idx: {index:.2f}")
    print(f"  Amplitude:      {amplitude:.2f}")

    display(Audio(signal.numpy(), rate=16000))

**📍 2.8 — Extrapolation beyond alpha = 1**

In [None]:
# Mixing weights above 1 continue along the z_a -> z_b line past z_b.
extra_alphas = [1.2, 1.5, 2.0]
extra_z = []
for a in extra_alphas:
    extra_z.append((1 - a) * z_a + a * z_b)

for alpha_value, z in zip(extra_alphas, extra_z):
    raw_carrier, raw_modulator, raw_index, raw_amplitude = real_decoder(z)

    # Same affine rescaling as in the interpolation loop.
    carrier = 100 + 900 * raw_carrier
    modulator = 50 + 450 * raw_modulator
    index = 10 * raw_index
    amplitude = 0.9 * raw_amplitude

    signal = synthesize([carrier, modulator, index, amplitude])

    print(f"\nα = {alpha_value:.2f} (extrapolated)")
    print(f"  Carrier freq:   {carrier:.2f} Hz")
    print(f"  Modulator freq: {modulator:.2f} Hz")
    print(f"  Modulation idx: {index:.2f}")
    print(f"  Amplitude:      {amplitude:.2f}")
    display(Audio(signal.numpy(), rate=16000))


**📍 2.9 — Save interpolated audio as .wav**

In [None]:
import os
import soundfile as sf

# Output folder for the interpolation results; created if it does not exist yet.
os.makedirs("interpolated_audio", exist_ok=True)

for i, (alpha_value, z) in enumerate(zip(alphas, interpolated_z)):
    raw_carrier, raw_modulator, raw_index, raw_amplitude = real_decoder(z)

    # Affine rescaling of the decoder outputs into synthesis parameters.
    carrier = 100 + 900 * raw_carrier
    modulator = 50 + 450 * raw_modulator
    index = 10 * raw_index
    amplitude = 0.9 * raw_amplitude

    signal = synthesize([carrier, modulator, index, amplitude])

    # The file name carries both the step number and its alpha value.
    filename = f"interpolated_audio/audio_interp_{i}_alpha_{alpha_value:.2f}.wav"
    sf.write(filename, signal.numpy(), samplerate=16000)
    print(f"✅ Saved: {filename}")


# **🎯 STEP 3 — Latent Space Visualization**

**📍 3.1 — PCA projection to 2D**

In [None]:
# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

labels_arr = np.asarray(labels)

plt.figure(figsize=(6, 4))
# dict.fromkeys keeps labels in first-appearance order. Iterating a set of
# strings can change order between kernel sessions, which would reshuffle the
# colours and the legend on every restart.
for label in dict.fromkeys(labels):
    members = labels_arr == label
    plt.scatter(X_pca[members, 0], X_pca[members, 1], label=label, alpha=0.7)
plt.title("CLAP Embeddings - PCA")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

**📍 3.2 — t-SNE projection (2D)**

In [None]:
# Two-component t-SNE of the embeddings, with a fixed seed.
tsne_2d = TSNE(n_components=2, perplexity=5, random_state=42)
X_tsne = tsne_2d.fit_transform(X)

# Points are coloured by label and annotated with their prompt text.
axis_titles_2d = {"x": "t-SNE 1", "y": "t-SNE 2"}
fig = px.scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    color=labels,
    text=prompts,
    title="CLAP Embeddings - t-SNE (2D)",
    labels=axis_titles_2d,
)
fig.update_traces(textposition='top center')
fig.show()

**📍 3.3 — t-SNE projection (3D)**

In [None]:
# Three-component t-SNE of the embeddings, with a fixed seed.
tsne_3d = TSNE(n_components=3, perplexity=5, random_state=42)
X_tsne_3d = tsne_3d.fit_transform(X)

# Points are coloured by label and annotated with their prompt text.
axis_titles_3d = {"x": "t-SNE 1", "y": "t-SNE 2", "z": "t-SNE 3"}
coord_x, coord_y, coord_z = X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2]
fig3d = px.scatter_3d(
    x=coord_x,
    y=coord_y,
    z=coord_z,
    color=labels,
    text=prompts,
    title="CLAP Embeddings - t-SNE (3D)",
    labels=axis_titles_3d,
)
fig3d.update_traces(marker=dict(size=5), textposition='top center')
fig3d.show()

# **🎯 STEP 4 — Comparison with Baseline MLP**


**📍 4.1 — Load MLP baseline model from GitHub**

In [None]:
import torch.nn as nn
import requests
from io import BytesIO

# Define the MLP architecture
class MLP(nn.Module):
    """Fully connected network: three linear layers with a ReLU after the first two.

    The layers are held in ``self.net`` (an ``nn.Sequential``), so the linear
    layers appear in the state dict as ``net.0``, ``net.2`` and ``net.4``.
    """

    def __init__(self, input_dim=512, hidden_dim=256, output_dim=4):  # ⚠️ hidden_dim must match the saved model
        super().__init__()
        widths = [input_dim, hidden_dim, hidden_dim, output_dim]
        stack = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(nn.ReLU())
        stack.pop()  # no activation after the output layer
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        out = self.net(x)
        return out

# Instantiate the model
mlp = MLP().to(device)

# GitHub raw link to the saved model file
mlp_url = "https://raw.githubusercontent.com/Mariagiusi23/ID-001-AWOL-for-Audio/main/notebook/mlp_baseline_model.pt"

# Download the model weights.
# requests applies no timeout by default; bound the wait so a stalled connection cannot hang the cell.
response = requests.get(mlp_url, timeout=60)
response.raise_for_status()
buffer = BytesIO(response.content)

# Load weights.
# The file comes from the network: weights_only=True restricts unpickling to
# tensors and plain containers instead of arbitrary Python objects.
mlp_state = torch.load(buffer, map_location=device, weights_only=True)
mlp.load_state_dict(mlp_state)
mlp.eval()

print("✅ MLP baseline model loaded from GitHub.")


**📍 4.2 — Generate and save baseline MLP audio**

In [None]:
# Output folder for the baseline results; created if it does not exist yet.
os.makedirs("baseline_audio", exist_ok=True)

for alpha_value, z in zip(alphas, interpolated_z):
    # The MLP maps the interpolated embedding directly to the raw parameters.
    with torch.no_grad():
        pred = mlp(z.to(device)).squeeze()
    raw_carrier, raw_modulator, raw_index, raw_amplitude = pred.cpu().numpy().tolist()

    # Same affine rescaling as used for the RealNVP outputs.
    carrier = 100 + 900 * raw_carrier
    modulator = 50 + 450 * raw_modulator
    index = 10 * raw_index
    amplitude = 0.9 * raw_amplitude

    signal = synthesize([carrier, modulator, index, amplitude])
    samples = signal.numpy()
    filename = f"baseline_audio/audio_mlp_alpha_{alpha_value:.2f}.wav"
    sf.write(filename, samples, samplerate=16000)

    print(f"\nα = {alpha_value:.2f} — MLP")
    print(f"  Carrier freq:   {carrier:.2f} Hz")
    print(f"  Modulator freq: {modulator:.2f} Hz")
    print(f"  Modulation idx: {index:.2f}")
    print(f"  Amplitude:      {amplitude:.2f}")
    print(f"✅ Saved: {filename}")
    display(Audio(samples, rate=16000))


# **🎯 STEP 5 — Gradio Demo: Real-time Semantic Interpolation**

In [None]:
!pip install gradio --quiet

import gradio as gr

def generate_interpolated_sound(prompt_a, prompt_b, alpha):
    """Blend the embeddings of two text prompts and synthesize the resulting FM tone.

    Args:
        prompt_a: text prompt used at ``alpha = 0``.
        prompt_b: text prompt used at ``alpha = 1``.
        alpha: mixing weight applied as ``(1 - alpha) * z_a + alpha * z_b``.

    Returns:
        ``(16000, samples)``: the sample rate and the signal as a NumPy array.
    """
    # Tokenize both prompts first, then embed them without tracking gradients.
    encoded_a = processor(text=prompt_a, return_tensors="pt").to(device)
    encoded_b = processor(text=prompt_b, return_tensors="pt").to(device)
    with torch.no_grad():
        z_a = model_clap.get_text_features(**encoded_a).squeeze()
        z_b = model_clap.get_text_features(**encoded_b).squeeze()

    # Linear interpolation between the two embeddings.
    z_mix = (1 - alpha) * z_a + alpha * z_b

    # Decode to raw parameters, then rescale them for the synthesizer.
    raw_carrier, raw_modulator, raw_index, raw_amplitude = real_decoder(z_mix)
    fm_params = [
        100 + 900 * raw_carrier,
        50 + 450 * raw_modulator,
        10 * raw_index,
        0.9 * raw_amplitude,
    ]

    waveform = synthesize(fm_params)
    return (16000, waveform.numpy())

# UI layout
# Two text boxes for the prompts and a slider for the mixing weight.
demo_inputs = [
    gr.Textbox(label="Prompt A", value="a soft bell"),
    gr.Textbox(label="Prompt B", value="a harsh buzzer"),
    gr.Slider(0, 1, value=0.5, label="α — Interpolation factor"),
]

demo = gr.Interface(
    fn=generate_interpolated_sound,
    inputs=demo_inputs,
    outputs=gr.Audio(label="Generated Audio"),
    title="AWOL: Language-to-Sound Interpolation",
    description="Type two prompts and slide alpha to explore semantic audio interpolation.",
)
demo.launch()


# **🎯 Final Remarks**


This notebook explores the semantic latent space of CLAP text embeddings by linearly interpolating between the embeddings of two text prompts. Each interpolated embedding is mapped to four FM parameters by a pretrained RealNVP model, and a parametric FM synthesizer turns those parameters into frequency-modulated signals. The transition is perceptually continuous, both visually (via latent plots) and sonically (via waveform/audio). This suggests that the model has learned a meaningful latent space for controllable sound generation.

