# Environment setup

## Install neccessary tools, libraries, etc.

In [1]:
# Download dataset
!gdown 1JM_MB_0xkpf8jQxsQ8y3GXjVOiG1prBc

# Extract and remove compressed file
!unzip '/content/background_music_generation.zip'
!rm -rf '/content/background_music_generation.zip'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dataset/train/audio/1699168573.2607746.mp3  
  inflating: dataset/train/audio/1699168573.2649837.mp3  
  inflating: dataset/train/audio/1699168573.2694404.mp3  
  inflating: dataset/train/audio/1699168573.2739244.mp3  
  inflating: dataset/train/audio/1699168573.2785115.mp3  
  inflating: dataset/train/audio/1699168573.2833698.mp3  
  inflating: dataset/train/audio/1699168573.2877045.mp3  
  inflating: dataset/train/audio/1699168573.2917523.mp3  
  inflating: dataset/train/audio/1699168573.2950678.mp3  
  inflating: dataset/train/audio/1699168573.2993283.mp3  
  inflating: dataset/train/audio/1699168573.3036797.mp3  
  inflating: dataset/train/audio/1699168573.3076046.mp3  
  inflating: dataset/train/audio/1699168573.311661.mp3  
  inflating: dataset/train/audio/1699168573.3160274.mp3  
  inflating: dataset/train/audio/1699168573.3209093.mp3  
  inflating: dataset/train/audio/1699168573.325484.mp3  
  inflati

In [2]:
!pip install git+https://github.com/microsoft/CLAP.git # Note: CLAP has 2 versions: from microsoft and laion.

Collecting git+https://github.com/microsoft/CLAP.git
  Cloning https://github.com/microsoft/CLAP.git to /tmp/pip-req-build-68z0oqhu
  Running command git clone --filter=blob:none --quiet https://github.com/microsoft/CLAP.git /tmp/pip-req-build-68z0oqhu
  Resolved https://github.com/microsoft/CLAP.git to commit e8a6467b87cd85716e20c6a008126150d9740be0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torchlibrosa<0.2.0,>=0.1.0 (from msclap==1.3.3)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Building wheels for collected packages: msclap
  Building wheel for msclap (pyproject.toml) ... [?25l[?25hdone
  Created wheel for msclap: filename=msclap-1.3.3-py3-none-any.whl size=31119 sha256=9120951d6ad595ea435575a1e67e8e021c98a3e18ec0ac1cf07fb5883403b45c
  Stored in directory: /tmp/pi

## Import important libraries

In [28]:
import os
import json
from glob import glob
from IPython.display import Audio

import heapq

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

from msclap import CLAP

In [29]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Experiment Setup

## Utils

In [4]:
def plot_waveform(waveform, sample_rate):
    waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].plot(time_axis, waveform[c], linewidth=1)
        axes[c].grid(True)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle("waveform")

In [5]:
def plot_specgram(waveform, sample_rate, title="Spectrogram"):
    waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].specgram(waveform[c], Fs=sample_rate)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle(title)

## Dataset

In [6]:
dataset_path = '/content/dataset'
os.listdir(dataset_path)

['tokenizer.json', 'test', 'stopwords.txt', 'train']

In [7]:
stopwords = open('/content/dataset/stopwords.txt', 'r').read()
print(stopwords)

a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
different
do
does
doesn't
doing
done
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
fi

In [8]:
tokens = json.loads(open('/content/dataset/tokenizer.json', 'r').read())
text2labels = tokens[0]
labels2text = tokens[1]

In [9]:
print(text2labels)

{'audio': 0, 'bollywood': 1, 'reminiscing': 2, 'repeated': 3, 'repeats': 4, 'bowls': 5, 'arpeggio': 6, 'middle': 7, 'acid': 8, 'snippet': 9, 'supported': 10, 'violent': 11, 'practicing': 12, 'sensual': 13, 'bouncy': 14, 'alternative': 15, 'claviorgan': 16, 'resounding': 17, 'sub': 18, 'inviting': 19, 'chamber': 20, 'they': 21, 'grim': 22, 'brings': 23, 'male': 24, 'nightclub': 25, 'instrumental': 26, 'sentimental': 27, 'melancholic': 28, 'scenes': 29, 'peace': 30, 'arpeggios': 31, 'cinematic': 32, 'passionately': 33, 'house': 34, 'no': 35, 'of': 36, 'instrumentation': 37, 'genre': 38, 'road': 39, 'crossover': 40, 'backed': 41, 'third': 42, 'flamenco': 43, 'bells': 44, 'beginning': 45, 'up': 46, 'room': 47, 'emphasized': 48, 'trumpets': 49, 'glam': 50, 'clarinet': 51, 'out': 52, 'be': 53, 'pass': 54, 'hungary': 55, 'using': 56, 'white': 57, 'pianos': 58, 'horn': 59, 'match': 60, 'build': 61, 'giving': 62, 'featuring': 63, 'sessions': 64, 'intro': 65, 'village': 66, 'chinese': 67, 'being

In [10]:
print(labels2text)

{'0': 'audio', '1': 'bollywood', '2': 'reminiscing', '3': 'repeated', '4': 'repeats', '5': 'bowls', '6': 'arpeggio', '7': 'middle', '8': 'acid', '9': 'snippet', '10': 'supported', '11': 'violent', '12': 'practicing', '13': 'sensual', '14': 'bouncy', '15': 'alternative', '16': 'claviorgan', '17': 'resounding', '18': 'sub', '19': 'inviting', '20': 'chamber', '21': 'they', '22': 'grim', '23': 'brings', '24': 'male', '25': 'nightclub', '26': 'instrumental', '27': 'sentimental', '28': 'melancholic', '29': 'scenes', '30': 'peace', '31': 'arpeggios', '32': 'cinematic', '33': 'passionately', '34': 'house', '35': 'no', '36': 'of', '37': 'instrumentation', '38': 'genre', '39': 'road', '40': 'crossover', '41': 'backed', '42': 'third', '43': 'flamenco', '44': 'bells', '45': 'beginning', '46': 'up', '47': 'room', '48': 'emphasized', '49': 'trumpets', '50': 'glam', '51': 'clarinet', '52': 'out', '53': 'be', '54': 'pass', '55': 'hungary', '56': 'using', '57': 'white', '58': 'pianos', '59': 'horn', '6

In [11]:
train_json = json.loads(open('/content/dataset/train/train.json', 'r').read())
test_json = json.loads(open('/content/dataset/test/public.json', 'r').read())
train_json

{'1699168556.1432111.mp3': 'The recording features a live performance of a traditional song and it consists of sustained synth pad, punchy kick and snare hits, shimmering cymbals, groovy bass and synth pad. It sounds soft, mellow, passionate and emotional.',
 '1699168565.7955616.mp3': 'The recording features a mellow piano melody, synth pad chords and sustained strings melody. It sounds emotional, passionate and the recording is noisy.',
 '1699168589.6105175.mp3': 'The recording features an electro song that consists of a passionate female vocal singing over punchy kick and snare hits, shimmering hi hats, synth lead melody, groovy synth bass and mellow synth keys melody. It sounds energetic, fun, happy and joyful - like something kids would listen to.',
 '1699168567.8103771.mp3': 'This music is an electronic instrumental. The tempo is fast with synthesiser articulation, electronic arrangements and digital drumming. The music is incessant, psychedelic, hypnotic, trippy and trance like. 

In [12]:
print(len(train_json))
print(len(test_json))

10000
1000


In [13]:
Audio(data='/content/dataset/train/audio/1699168556.1432111.mp3')

## Model

CLAP (Contrastive Language-Audio Pretraining) is a model that learns acoustic concepts from natural language supervision and enables “Zero-Shot” inference. The model has been extensively evaluated in 26 audio downstream tasks achieving SoTA in several of them including classification, retrieval, and captioning.

![CLAP diagram](https://github.com/microsoft/CLAP/blob/main/docs/clap2_diagram.png?raw=true)

In [14]:
clap = CLAP(version = '2023', use_cuda=True)
clap

CLAP_weights_2023.pth:   0%|          | 0.00/690M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

<msclap.CLAPWrapper.CLAPWrapper at 0x7e2cc4090640>

## Experiment

In [15]:
audio_files = glob('/content/dataset/train/audio/**.mp3')
descriptions = list(train_json.values())

In [None]:
def find_top_k_with_indices(arr, k):
  """Finds the top-k largest elements in an array and their indices.

  Args:
    arr: The input array.
    k: The number of top elements to find.

  Returns:
    A list of tuples, where each tuple contains the value and its index.
  """

  # Create a min-heap of tuples (value, index)
  heap = []
  for i, num in enumerate(arr):
    heapq.heappush(heap, (-num, i))
    if len(heap) > k:
      heapq.heappop(heap)

  # Return the elements in the heap
  return [(value, index) for value, index in heap]


### Text to Audio Retrieval


In [None]:
# Extract text embeddings
# text_embeddings = clap.get_text_embeddings(descriptions)

In [17]:
from tqdm import tqdm

In [18]:
# Extract audio embeddings
audio_embeddings = []
n_batch = 100
for i in tqdm(range(0, len(audio_files), n_batch)):
    audio_embeddings.append(clap.get_audio_embeddings(audio_files[i:i+n_batch], resample=True))

len(audio_embeddings)

100%|██████████| 100/100 [03:20<00:00,  2.01s/it]


In [22]:
audio_embeddings[0]

tensor([[ 0.8470, -0.8058,  0.9054,  ...,  1.1910,  1.0200, -0.1791],
        [ 0.8377,  0.1484,  0.2013,  ...,  1.0715,  0.6761, -0.1386],
        [ 0.8878, -0.5095,  0.6133,  ...,  1.4805,  0.9385, -0.1296],
        ...,
        [-0.3398,  0.2989,  0.2200,  ..., -0.2093,  0.4997,  0.1893],
        [ 1.6956,  0.6990,  0.1978,  ...,  1.4635,  0.0726,  0.0742],
        [ 1.0750,  0.3730,  0.1520,  ...,  1.6611,  0.1403,  1.2724]],
       device='cuda:0')

In [64]:
query = "The recording features a cover of a soft rock song that consists of a mellow piano melody, synth pad chords, punchy kick and snare hits, shimmering hi hats, groovy bass and soft kick hits. It sounds soulful and passionate."
text_embedding = clap.get_text_embeddings(query)
text_embedding

tensor([[ 1.4301,  0.0904,  0.1490,  ...,  0.6633,  0.3442,  0.7209],
        [ 1.4555, -0.4822,  0.4498,  ...,  0.9028,  0.5983,  0.9885],
        [ 1.5234, -0.5048,  0.4774,  ...,  0.6875,  0.5409,  1.1387],
        ...,
        [ 1.4162, -0.0243,  0.3544,  ...,  0.5007,  0.3688,  0.9730],
        [ 1.5234, -0.5048,  0.4774,  ...,  0.6875,  0.5409,  1.1387],
        [ 0.8240, -0.8766, -2.0817,  ...,  0.8719,  0.9381,  0.1189]],
       device='cuda:0')

In [61]:
text_embedding.shape

torch.Size([222, 1024])

In [66]:
audio_embedding = audio_embeddings[0]
audio_embedding.shape

torch.Size([100, 1024])

In [68]:
text_embedding_normalized = text_embedding / torch.norm(text_embedding, dim=1, keepdim=True)
audio_embedding_normalized = audio_embedding / torch.norm(audio_embedding, dim=1, keepdim=True)

print(text_embedding_normalized.shape)
print(audio_embedding_normalized.shape)

torch.Size([222, 1024])
torch.Size([100, 1024])


In [80]:
cosine_similarities = torch.matmul(text_embedding_normalized, audio_embedding_normalized.T)
cosine_similarities.mean()

tensor(0.2381, device='cuda:0')

In [89]:
similarities = torch.stack([clap.compute_similarity(text_embedding, audio_embeddings[i]) for i in tqdm(range(len(audio_embeddings)))])
similarities[0].shape

100%|██████████| 100/100 [00:00<00:00, 1746.43it/s]


torch.Size([222, 100])

In [91]:
# Compute similarity between audio and text embeddings
# Calculate cosine leave the dimension of [222, 100], therefore, I
# choose its mean/max and calculate the top_k.

similarities = torch.stack([clap.compute_similarity(text_embedding, audio_embeddings[i]).max() for i in tqdm(range(len(audio_embeddings)))])
similarities = F.softmax(similarities.detach().cpu(), dim=-1).numpy()
similarities

100%|██████████| 100/100 [00:00<00:00, 1937.66it/s]


array([0.0116028 , 0.00785413, 0.01763663, 0.01012329, 0.01180258,
       0.00866984, 0.00673212, 0.01578652, 0.01182968, 0.00831652,
       0.01710806, 0.01069394, 0.00784009, 0.00905653, 0.00978602,
       0.01042647, 0.00819554, 0.01436546, 0.00861467, 0.01290775,
       0.01112057, 0.00767357, 0.0121166 , 0.01101229, 0.00905754,
       0.00750172, 0.01093672, 0.00916381, 0.00903771, 0.00951989,
       0.02487038, 0.0089934 , 0.00929628, 0.00654611, 0.01011369,
       0.00776255, 0.0062    , 0.00970196, 0.01135887, 0.00870927,
       0.00629098, 0.01014406, 0.00740436, 0.01826545, 0.00800521,
       0.00740214, 0.00891509, 0.00897696, 0.00957664, 0.01273715,
       0.01109335, 0.0111116 , 0.01077392, 0.01174745, 0.0086724 ,
       0.01236329, 0.00640127, 0.00918008, 0.00726333, 0.00983046,
       0.00604901, 0.00791191, 0.00942347, 0.00810809, 0.01098936,
       0.00843232, 0.00814483, 0.0089439 , 0.00752261, 0.00780426,
       0.00904078, 0.00724859, 0.012931  , 0.00749717, 0.00949

In [100]:
# Find the top 5 largest elements
top_k = find_top_k_with_indices(similarities, 5)

index_top_k = [idx for _, idx in top_k]
audio_files_top_k = [audio_files[idx] for idx in index_top_k]
audio_files_top_k

['/content/dataset/train/audio/1699168574.9817324.mp3',
 '/content/dataset/train/audio/1699168569.1880898.mp3',
 '/content/dataset/train/audio/1699168573.8484716.mp3',
 '/content/dataset/train/audio/1699168563.8020413.mp3',
 '/content/dataset/train/audio/1699168582.3717365.mp3']

In [101]:
for audio in audio_files_top_k:
  display(Audio(data=audio))

### Audio Captioning

In [None]:
# Load model (Choose version 'clapcap')
clapcap = CLAP(version = 'clapcap', use_cuda=True)

In [None]:
# Generate captions for the recording
captions = clapcap.generate_caption(audio_files, resample=True, beam_size=5, entry_length=67, temperature=0.01)

In [None]:
# Print the result
for i in range(len(audio_files)):
    print(f"Audio file: {audio_files[i]} \n")
    print(f"Generated caption: {captions[i]} \n")