In [1]:
!pip install torchopenl3



In [None]:
import requests
import soundfile as sf
import torchopenl3

In [None]:
# Download the chirp_44k.wav test clip from the OpenL3 repository and save it locally.
url = 'https://raw.githubusercontent.com/marl/openl3/master/tests/data/audio/chirp_44k.wav'
filename = 'Sample_audio.wav'

r = requests.get(url, allow_redirects=True, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as a .wav file.
r.raise_for_status()

# The context manager guarantees the file is flushed and closed, even if the write fails.
with open(filename, "wb") as audio_file:
    audio_file.write(r.content)

In [None]:
# Load the downloaded clip. Reuse `filename` so the path is defined in exactly one place.
audio, sr = sf.read(filename)
print(f"Audio Shape {audio.shape}")
print(f"Sample Rate {sr}")

In [None]:
# Extract embeddings with every optional argument left at its default;
# the call returns the embeddings together with their timestamps.
emb, ts = torchopenl3.get_audio_embedding(
    audio,
    sr,
)

In [None]:
# Report the dimensions of the embeddings and of their timestamps, one per line.
print(
    f"Embedding Shape {emb.shape}",
    f"TimeStamps Shape {ts.shape}",
    sep="\n",
)

The following code computes an embedding using a model trained on environmental videos using a spectrogram with a linear frequency axis and an embedding dimensionality of 512:

In [None]:
# Select the "env" content type, a linear-frequency input representation,
# and 512-dimensional embeddings.
emb, ts = torchopenl3.get_audio_embedding(
    audio,
    sr,
    content_type="env",
    input_repr="linear",
    embedding_size=512,
)

# Report the resulting dimensions, one per line.
print(
    f"Embedding Shape {emb.shape}",
    f"TimeStamps Shape {ts.shape}",
    sep="\n",
)

By default TorchOpenL3 will pad the beginning of the input audio signal by 0.5 seconds (half of the window size) so that the center of the first window corresponds to the beginning of the signal (“zero centered”), and the returned timestamps correspond to the center of each window. You can disable this centering like this:

In [None]:
# Turn off centering via center=False.
emb, ts = torchopenl3.get_audio_embedding(
    audio,
    sr,
    center=False,
)
# Report the resulting dimensions, one per line.
print(
    f"Embedding Shape {emb.shape}",
    f"TimeStamps Shape {ts.shape}",
    sep="\n",
)

**The hop size used to extract the embedding is 0.1 seconds by default (i.e. an embedding frame rate of 10 Hz). In the following example we change the hop size from 0.1 (10 frames per second) to 0.5 (2 frames per second):**

In [None]:
# Use a hop size of 0.5 seconds between consecutive embedding frames.
emb, ts = torchopenl3.get_audio_embedding(
    audio,
    sr,
    hop_size=0.5,
)
# Report the resulting dimensions, one per line.
print(
    f"Embedding Shape {emb.shape}",
    f"TimeStamps Shape {ts.shape}",
    sep="\n",
)

By default, the model file is loaded from disk every time `get_audio_embedding` is called. To avoid unnecessary I/O when processing multiple files with the same model, you can load it manually and pass it to the function via the `model` parameter:

In [None]:
# Load the embedding model once up front ...
model = torchopenl3.models.load_audio_embedding_model(
    input_repr="mel256",
    content_type="music",
    embedding_size=512,
)

# ... and hand the already-loaded model to the embedding call.
emb, ts = torchopenl3.get_audio_embedding(
    audio,
    sr,
    model=model,
)
# Report the resulting dimensions, one per line.
print(
    f"Embedding Shape {emb.shape}",
    f"TimeStamps Shape {ts.shape}",
    sep="\n",
)