# Environment setup

## Install necessary tools, libraries, etc.

In [None]:
# Fetch the dataset archive from Google Drive by file ID; the recorded run below shows it
# being saved as /content/background_music_generation.zip.
!gdown 1JM_MB_0xkpf8jQxsQ8y3GXjVOiG1prBc

Downloading...
From (original): https://drive.google.com/uc?id=1JM_MB_0xkpf8jQxsQ8y3GXjVOiG1prBc
From (redirected): https://drive.google.com/uc?id=1JM_MB_0xkpf8jQxsQ8y3GXjVOiG1prBc&confirm=t&uuid=a3f4051d-d4b9-453a-852c-88283452f24c
To: /content/background_music_generation.zip
100% 302M/302M [00:03<00:00, 97.9MB/s]


In [None]:
!unzip '/content/background_music_generation.zip'
!rm -rf '/content/background_music_generation.zip'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dataset/train/audio/1699168573.2607746.mp3  
  inflating: dataset/train/audio/1699168573.2649837.mp3  
  inflating: dataset/train/audio/1699168573.2694404.mp3  
  inflating: dataset/train/audio/1699168573.2739244.mp3  
  inflating: dataset/train/audio/1699168573.2785115.mp3  
  inflating: dataset/train/audio/1699168573.2833698.mp3  
  inflating: dataset/train/audio/1699168573.2877045.mp3  
  inflating: dataset/train/audio/1699168573.2917523.mp3  
  inflating: dataset/train/audio/1699168573.2950678.mp3  
  inflating: dataset/train/audio/1699168573.2993283.mp3  
  inflating: dataset/train/audio/1699168573.3036797.mp3  
  inflating: dataset/train/audio/1699168573.3076046.mp3  
  inflating: dataset/train/audio/1699168573.311661.mp3  
  inflating: dataset/train/audio/1699168573.3160274.mp3  
  inflating: dataset/train/audio/1699168573.3209093.mp3  
  inflating: dataset/train/audio/1699168573.325484.mp3  
  inflati

## Import important libraries

In [None]:
import os
import json
from glob import glob
from IPython.display import Audio

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchaudio

# Experiment Setup

## Utils

In [None]:
def plot_waveform(waveform, sample_rate):
    """Draw every channel of an audio tensor as a line plot, one subplot per channel.

    Args:
        waveform: Tensor exposing ``.numpy()`` whose array form is 2-D; it is unpacked
            as ``(num_channels, num_frames)``.
        sample_rate: Frame indices are divided by this value to build the x axis.

    Returns:
        None. A figure titled "waveform" is created as a side effect.
    """
    samples = waveform.numpy()
    channel_count, frame_count = samples.shape
    timeline = torch.arange(0, frame_count) / sample_rate

    fig, panels = plt.subplots(channel_count, 1)
    multichannel = channel_count > 1
    # A single-row grid comes back as one bare Axes; wrap it so the loop stays uniform.
    panels = panels if multichannel else [panels]

    for number, (panel, channel) in enumerate(zip(panels, samples), start=1):
        panel.plot(timeline, channel, linewidth=1)
        panel.grid(True)
        # Channel labels only add information when there is more than one panel.
        if multichannel:
            panel.set_ylabel(f"Channel {number}")
    fig.suptitle("waveform")

In [None]:
def plot_specgram(waveform, sample_rate, title="Spectrogram"):
    """Render one spectrogram panel per channel of an audio tensor.

    Args:
        waveform: Tensor exposing ``.numpy()`` whose array form is 2-D; it is unpacked
            as ``(num_channels, num_frames)``.
        sample_rate: Forwarded to ``Axes.specgram`` as ``Fs``.
        title: Text placed above the whole figure.

    Returns:
        None. The figure is created as a side effect.
    """
    samples = waveform.numpy()
    # Unpacking two values keeps the requirement that the input is exactly 2-D.
    channel_count, _ = samples.shape

    fig, panels = plt.subplots(channel_count, 1)
    multichannel = channel_count > 1
    # A single-row grid comes back as one bare Axes; wrap it so the loop stays uniform.
    panels = panels if multichannel else [panels]

    for number, (panel, channel) in enumerate(zip(panels, samples), start=1):
        panel.specgram(channel, Fs=sample_rate)
        if multichannel:
            panel.set_ylabel(f"Channel {number}")
    fig.suptitle(title)

## Dataset

In [None]:
# Root directory of the extracted dataset; listing it shows what the archive shipped.
dataset_path = '/content/dataset'
os.listdir(dataset_path)

['tokenizer.json', 'stopwords.txt', 'test', 'train']

In [None]:
# Read the stop-word list as one string. The context manager closes the file handle as
# soon as the read finishes, and the path is derived from `dataset_path` rather than
# repeating the absolute location.
with open(os.path.join(dataset_path, 'stopwords.txt'), 'r') as stopwords_file:
    stopwords = stopwords_file.read()
print(stopwords)

a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
different
do
does
doesn't
doing
done
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
fi

In [None]:
# tokenizer.json holds a two-element list: the forward lookup followed by the reverse one.
# Parse it inside a context manager so the file handle is closed deterministically.
with open(os.path.join(dataset_path, 'tokenizer.json'), 'r') as tokenizer_file:
    tokens = json.load(tokenizer_file)
text2labels = tokens[0]
labels2text = tokens[1]

In [None]:
# Forward lookup: token string -> integer id (first element of tokenizer.json).
print(text2labels)

{'audio': 0, 'bollywood': 1, 'reminiscing': 2, 'repeated': 3, 'repeats': 4, 'bowls': 5, 'arpeggio': 6, 'middle': 7, 'acid': 8, 'snippet': 9, 'supported': 10, 'violent': 11, 'practicing': 12, 'sensual': 13, 'bouncy': 14, 'alternative': 15, 'claviorgan': 16, 'resounding': 17, 'sub': 18, 'inviting': 19, 'chamber': 20, 'they': 21, 'grim': 22, 'brings': 23, 'male': 24, 'nightclub': 25, 'instrumental': 26, 'sentimental': 27, 'melancholic': 28, 'scenes': 29, 'peace': 30, 'arpeggios': 31, 'cinematic': 32, 'passionately': 33, 'house': 34, 'no': 35, 'of': 36, 'instrumentation': 37, 'genre': 38, 'road': 39, 'crossover': 40, 'backed': 41, 'third': 42, 'flamenco': 43, 'bells': 44, 'beginning': 45, 'up': 46, 'room': 47, 'emphasized': 48, 'trumpets': 49, 'glam': 50, 'clarinet': 51, 'out': 52, 'be': 53, 'pass': 54, 'hungary': 55, 'using': 56, 'white': 57, 'pianos': 58, 'horn': 59, 'match': 60, 'build': 61, 'giving': 62, 'featuring': 63, 'sessions': 64, 'intro': 65, 'village': 66, 'chinese': 67, 'being

In [None]:
# Reverse lookup: id -> token string. The ids are *string* keys here ('0', '1', ...),
# so an integer id must be converted with str() before indexing.
print(labels2text)

{'0': 'audio', '1': 'bollywood', '2': 'reminiscing', '3': 'repeated', '4': 'repeats', '5': 'bowls', '6': 'arpeggio', '7': 'middle', '8': 'acid', '9': 'snippet', '10': 'supported', '11': 'violent', '12': 'practicing', '13': 'sensual', '14': 'bouncy', '15': 'alternative', '16': 'claviorgan', '17': 'resounding', '18': 'sub', '19': 'inviting', '20': 'chamber', '21': 'they', '22': 'grim', '23': 'brings', '24': 'male', '25': 'nightclub', '26': 'instrumental', '27': 'sentimental', '28': 'melancholic', '29': 'scenes', '30': 'peace', '31': 'arpeggios', '32': 'cinematic', '33': 'passionately', '34': 'house', '35': 'no', '36': 'of', '37': 'instrumentation', '38': 'genre', '39': 'road', '40': 'crossover', '41': 'backed', '42': 'third', '43': 'flamenco', '44': 'bells', '45': 'beginning', '46': 'up', '47': 'room', '48': 'emphasized', '49': 'trumpets', '50': 'glam', '51': 'clarinet', '52': 'out', '53': 'be', '54': 'pass', '55': 'hungary', '56': 'using', '57': 'white', '58': 'pianos', '59': 'horn', '6

In [None]:
# Load the training annotations (audio file name -> text description). The context
# manager closes the file handle instead of leaving it to the garbage collector.
with open(os.path.join(dataset_path, 'train', 'train.json'), 'r') as train_file:
    train_json = json.load(train_file)
train_json

{'1699168556.1432111.mp3': 'The recording features a live performance of a traditional song and it consists of sustained synth pad, punchy kick and snare hits, shimmering cymbals, groovy bass and synth pad. It sounds soft, mellow, passionate and emotional.',
 '1699168565.7955616.mp3': 'The recording features a mellow piano melody, synth pad chords and sustained strings melody. It sounds emotional, passionate and the recording is noisy.',
 '1699168589.6105175.mp3': 'The recording features an electro song that consists of a passionate female vocal singing over punchy kick and snare hits, shimmering hi hats, synth lead melody, groovy synth bass and mellow synth keys melody. It sounds energetic, fun, happy and joyful - like something kids would listen to.',
 '1699168567.8103771.mp3': 'This music is an electronic instrumental. The tempo is fast with synthesiser articulation, electronic arrangements and digital drumming. The music is incessant, psychedelic, hypnotic, trippy and trance like. 

In [None]:
class MusicDataset(Dataset):
    """Map-style dataset pairing each audio clip with its text description.

    Expected layout under ``path``: a ``train.json`` file mapping audio file names to
    descriptions, and the clips themselves inside an ``audio`` sub-directory.
    """

    def __init__(self,
                 path:str='/content/dataset/train',
                 **kwargs):
        """Read the annotation file and fix the sample order.

        Args:
            path: Directory containing ``train.json`` and the ``audio`` folder.
            **kwargs: Accepted so existing call sites keep working; currently unused.
        """
        self.path = path
        # Context manager so the annotation file handle is closed right after parsing.
        with open(os.path.join(path, 'train.json'), 'r') as annotation_file:
            self.data = json.load(annotation_file)
        # Snapshot the file names once. Rebuilding this list inside __getitem__ made every
        # lookup O(len(dataset)). `self.data` is treated as read-only from here on.
        self._keys = list(self.data)

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self._keys)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Args:
            idx: Integer position in annotation-file order (negative values index from
                the end, as with a list).

        Returns:
            dict with keys ``audio_dir`` (full path of the clip), ``waveform`` and
            ``sample_rate`` (as returned by ``torchaudio.load``) and ``description``.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        # Get description
        ds_key = self._keys[idx]
        description = self.data[ds_key]

        # Get waveform from audio
        audio_path = os.path.join(self.path, 'audio', ds_key)
        # File names contain several dots (e.g. "1699168556.1432111.mp3"); only the text
        # after the last one is the container format.
        audio_format = audio_path.rsplit('.', 1)[-1]
        waveform, sample_rate = torchaudio.load(audio_path, format=audio_format)
        return {
            'audio_dir': audio_path,
            'waveform': waveform,
            'sample_rate': sample_rate,
            'description': description
        }

In [None]:
# Build the dataset with its default training-split path.
ds = MusicDataset()

# Pull a single sample as a smoke test of the loading pipeline.
next(iter(ds))

{'audio_dir': '/content/dataset/train/audio/1699168556.1432111.mp3',
 'waveform': tensor([[-1.6466e-07,  2.7580e-08,  4.1839e-07,  ...,  1.2930e-03,
          -2.5797e-04, -2.3841e-03]]),
 'sample_rate': 16000,
 'description': 'The recording features a live performance of a traditional song and it consists of sustained synth pad, punchy kick and snare hits, shimmering cymbals, groovy bass and synth pad. It sounds soft, mellow, passionate and emotional.'}

In [None]:
# Load the same clip as the sample printed above into an IPython Audio object; the file
# path is handed over through the `data` argument.
Audio(data='/content/dataset/train/audio/1699168556.1432111.mp3')

## Model

## Experiment