<a href="https://colab.research.google.com/github/NadiaCarvalho/Aethra/blob/main/Aethra-LatentSpaceCreator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Functions
import itertools

import numpy as np

import bokeh
import bokeh.io
import bokeh.plotting
import bokeh.models

bokeh.io.output_notebook()

def get_dimensionality_reduction(algorithm, latent_spaces, n_components=2):
  if algorithm == 'pca':
      from sklearn.decomposition import PCA
      from sklearn.preprocessing import StandardScaler
      pca = PCA(n_components=n_components)
      predictions = pca.fit_transform(
      StandardScaler().fit_transform(latent_spaces))
  elif algorithm == 'tsne':
      from sklearn.manifold import TSNE
      tsne = TSNE(n_components=n_components, perplexity=30 if len(latent_spaces) > 30 else len(latent_spaces)-1)
      predictions = tsne.fit_transform(np.asarray(latent_spaces))
  elif algorithm == 'mds':
      from sklearn.manifold import MDS
      mds = MDS(n_components=n_components, normalized_stress="auto")
      predictions = mds.fit_transform(latent_spaces)
  elif algorithm == 'isomap':
      from sklearn.manifold import Isomap
      iso = Isomap(n_components=n_components)
      predictions = iso.fit_transform(latent_spaces)
  elif algorithm == 'lle':
      from sklearn.manifold import LocallyLinearEmbedding
      lle = LocallyLinearEmbedding(n_components=n_components)
      predictions = lle.fit_transform(latent_spaces)
  else:
      import umap.umap_ as umap
      reducer = umap.UMAP(n_components=n_components)
      predictions = reducer.fit_transform(latent_spaces)
  return predictions

def get_bokeh_source(predictions, labels, colors=None, radius=.01):
  return dict(
    x=predictions[:,0],
    y=predictions[:,1],
    id=labels,
    label=labels if colors is None else colors[1],
    color=['blue'] * len(labels) if colors is None else colors[0],
    alpha=[.7] * len(labels),
    radius=[radius] * len(labels)
  )

def print_visualization_2d(source):

  hover = bokeh.models.HoverTool(tooltips=[('Label', '@label'), ('ID', '@id')])

  p = bokeh.plotting.figure(
      toolbar_location='below',
      tools=['box_select', 'lasso_select', 'poly_select',
             'tap', 'reset', 'zoom_in', 'zoom_out', 'help',
             'pan', 'wheel_zoom', 'box_zoom', 'save', hover],
      )
  p.circle(x='x', y='y',
           radius='radius',
           color='color',
           alpha='alpha',
           source=source)
  return p

def get_colors(samples):
  anal = [librosa.feature.rms(y=s).mean() for s in samples] #, sr=sr
  edges = np.histogram_bin_edges(anal, bins=18)

  def get_group(s):
    for i, e in enumerate(edges):
      if s < e:
        return i
    return len(edges)

  cl_x = list(bokeh.palettes.Category20_20)
  return [cl_x[get_group(s)] for s in anal], anal

def get_plot(lats, samples, algorithm='lle'):
  points = get_dimensionality_reduction(algorithm, lats, 2)
  rs = points.max(0) - points.min(0)
  return get_bokeh_source(points, list(range(0, len(points), 1)),
                          colors=get_colors(samples), radius=.01)
  # float(rs.min()/((points.shape[0] / 2) if points.shape[0] > 100 else points.shape[0]))

In [2]:
# @title Import Model {"display-mode":"form"}
rave_model = "https://huggingface.co/Intelligent-Instruments-Lab/rave-models/resolve/main/voice_vocalset_b2048_r48000_z16.ts" # @param {"type":"string"}
!wget --no-check-certificate -r {rave_model} -O model.ts

will be placed in the single file you specified.

--2026-01-08 17:26:43--  https://huggingface.co/Intelligent-Instruments-Lab/rave-models/resolve/main/voice_vocalset_b2048_r48000_z16.ts
Resolving huggingface.co (huggingface.co)... 13.226.251.112, 13.226.251.66, 13.226.251.81, ...
Connecting to huggingface.co (huggingface.co)|13.226.251.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/651d934b1793ea8afb571e1d/3a3fff00fc977b947f7b5cf9602b1a19d2eff6afbd7bf60f0fc52a207000f486?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20260108%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20260108T172643Z&X-Amz-Expires=3600&X-Amz-Signature=4593971c2249e2d534955e8799ac5a710eed724aa431b469ded6f3f92a63fae2&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27voice_vocalset_b2048_r48000_z16.ts%3B+filename%3D%22voice_vocalset_b2048

In [3]:
# @title Import Audio From Drive {"display-mode":"form"}
audio_id = "1jRg8PrHAGYY0ZYB5uY9hoU5GlyMuetCC" # @param {"type":"string"}
audio_name = 'EOU_extra.wav' # @param {"type":"string"}

import gdown
gdown.download(id=audio_id, output=audio_name)

Downloading...
From (original): https://drive.google.com/uc?id=1jRg8PrHAGYY0ZYB5uY9hoU5GlyMuetCC
From (redirected): https://drive.google.com/uc?id=1jRg8PrHAGYY0ZYB5uY9hoU5GlyMuetCC&confirm=t&uuid=eb229a22-9775-4589-9a10-678f29376780
To: /content/EOU_extra.wav
100%|██████████| 218M/218M [00:04<00:00, 54.1MB/s]


'EOU_extra.wav'

In [6]:
# @title Extraction Process
import librosa
import torch

import numpy as np
import pandas as pd

print('Loading Model and Audio')

model = torch.jit.load(f'model.ts') # import model
audio, sr = librosa.load(audio_name, sr=48000) # import audio

print('Starting Process')

# Pass Audio By Model
x = torch.from_numpy(audio)
x = x[None, None, :]

with torch.no_grad():
  z = model.encode(x)
  reconst = model(x).squeeze(0).detach().numpy()

lat = z.squeeze(0).detach().numpy().T

print('Loaded Latent Space')

# Prepare for pandas
lat_df = pd.DataFrame(lat)
lat4 = lat_df.groupby(np.arange(len(lat_df)) // 4).mean().values

import numpy as np

m = lat4.shape[0]
n = int(np.ceil(audio.shape[0] / lat4.shape[0]))
pads = m*n - audio.shape[0]

reconstp = reconst.squeeze(0)

if pads > 0:
  samples = np.pad(audio.astype(float), (0, pads), mode='constant', constant_values=0)
  reconstp = np.pad(reconstp.astype(float), (0, pads + (audio.shape[0] - reconstp.shape[0])), mode='constant', constant_values=0)
else:
  samples = audio[:pads]
  reconstp = reconstp[:pads]

samples = np.reshape(samples, (m,n))
rsamples = np.reshape(reconstp, (m,n))

print('Prepared For Visualization')

# Visualization and Datapoints Creation (TouchDesigner)
import bokeh

from IPython.display import display, clear_output
from tqdm.notebook import tqdm

from bokeh.io import curdoc

bokeh.io.output_notebook()
source = bokeh.models.ColumnDataSource(get_plot(lat4, samples, algorithm='pca'))
graph = print_visualization_2d(source)
handle = bokeh.io.show(graph, notebook_handle=True)

display(handle)

print('Saving Datapoints')

df1 = source.to_df()
df1['R'] = df1['color'].apply(lambda x: eval("0x" + x[1:3]))
df1['G'] = df1['color'].apply(lambda x: eval("0x" + x[3:5]))
df1['B'] = df1['color'].apply(lambda x: eval("0x" + x[5:7]))

df2 = pd.DataFrame().from_dict({i: {'latent': lat[i]} for i, s in enumerate(samples)}).T

pd_source = pd.concat([df1, df2], axis=1)

display(pd_source)

print('Saving Data')

output_name = "".join(audio_name.split(".")[:-1])
pd_source.to_csv(f'touchdesigner_datapoints_{output_name}.csv')
#pd_source['latent'] = pd_source['latent'].apply(lambda x: np.fromstring(x[1:-1], dtype=float, sep=' '))
pd_source.to_pickle(f'model_{output_name}.pkl')

Loading Model and Audio
Starting Process
Loaded Latent Space
Prepared For Visualization


Saving Datapoints


Unnamed: 0,x,y,id,label,color,alpha,radius,R,G,B,latent
0,0.706390,0.188690,0,4.662457e-07,#aec7e8,0.7,0.01,174,199,232,"[4.31158, -0.016098864, -1.4334894, 0.6719994,..."
1,1.143595,1.283239,1,2.382394e-05,#aec7e8,0.7,0.01,174,199,232,"[4.1220045, 0.62115127, 1.9695816, 0.06266399,..."
2,0.940314,0.895876,2,1.324826e-04,#aec7e8,0.7,0.01,174,199,232,"[5.6267643, -0.8501216, 1.3822607, 1.3482549, ..."
3,0.920570,1.386840,3,1.925660e-04,#aec7e8,0.7,0.01,174,199,232,"[3.761097, 0.71769917, 0.29603863, 0.111465916..."
4,-0.350952,0.698491,4,2.233555e-04,#aec7e8,0.7,0.01,174,199,232,"[5.1292, 0.445167, 0.103130326, 0.21994618, 1...."
...,...,...,...,...,...,...,...,...,...,...,...
4426,-0.769120,2.743165,4426,2.015814e-04,#aec7e8,0.7,0.01,174,199,232,"[1.8275592, 1.0977074, 0.76811147, -1.234736, ..."
4427,1.018823,1.453832,4427,2.331215e-04,#aec7e8,0.7,0.01,174,199,232,"[1.568912, 0.63413984, -0.97760487, -1.0814228..."
4428,-0.518252,2.166708,4428,1.985054e-04,#aec7e8,0.7,0.01,174,199,232,"[1.1067492, 0.035274267, -1.0160422, -1.069451..."
4429,0.251978,0.770385,4429,2.217626e-04,#aec7e8,0.7,0.01,174,199,232,"[1.4529978, 0.023876213, -0.38188368, -1.31751..."


In [None]:
# @title Listen to Samples
import ipywidgets
import time

secslider = ipywidgets.SelectionRangeSlider(
    options=list(range(0, lat.shape[0], 1)),
    index=(0, 1000),
    description='Samples',
    layout={'width': '90%'},
    disabled=False
)

box = ipywidgets.HBox([secslider])
display(box)

from IPython.display import Audio

start = ipywidgets.Button(
    description='Start',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click me',
    icon='check' # (FontAwesome names without the `fa-` prefix)
)
print()
display(start)
print()

out = ipywidgets.Output(layout={'border': '0px solid black', 'padding': '.5em', 'width': '90%'})

def show_audio(b):
  with out:
    out.clear_output(wait=False)
    time.sleep(1)

    print('Original')
    display(Audio(data=samples[secslider.value[0]:secslider.value[1]].reshape(-1), rate=sr))
    print('Reconstructed')
    display(Audio(data=rsamples[secslider.value[0]:secslider.value[1]].reshape(-1), rate=sr))

start.on_click(show_audio)
display(out)
print()