In [None]:
# @title fix environment - must restart notebook after!!
ROOT_DIR = f'/content'

%cd {ROOT_DIR}

%pip install -q torch==2.1.0
%pip install -q torchaudio==2.1.0
%pip install -q torchtext==0.16.0 --index-url https://download.pytorch.org/whl/cu121
%pip install -q torchvision==0.16.0

%pip install -q triton==2.1.0

import torch
import torchaudio
import torchtext
import torchvision

print(torch.__version__)
print(torchaudio.__version__)
print(torchtext.__version__)
print(torchvision.__version__)

%pip install -q xformers==0.0.22.post7 --index-url https://download.pytorch.org/whl/cu121 # cuda 12.1 version

%run -m xformers.info
!python os.kill(os.getpid(), signal.SIGKILL)

# ^^^ @baltigor

In [None]:
# @title download and install

%cd {ROOT_DIR}
AUDIOCRAFT_DIR = f'{ROOT_DIR}/audiocraft'
AUDIOCRAFT_REPO = f'https://github.com/facebookresearch/audiocraft.git'
!git clone {AUDIOCRAFT_REPO}
%cd {AUDIOCRAFT_DIR}
%pip install -q -e .
%pip install -q dora-search numba wandb

# ^^^ @baltigor


!python -m pip install -U pip setuptools wheel
!python -m pip install --force-reinstall https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz

!python -m pip install laion-clap
!python -m pip install -U transformers==4.30.0

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title scraper

import os
import json
import random
import librosa
from pydub import AudioSegment
import wave
import yt_dlp
import re
import shutil
import numpy as np

# params
# dataset_folder_link = "/content/drive/MyDrive/musicgen_dataset" # @param {type:"string"}
# Playlist whose entries are downloaded by the scraper below.
youtube_playlist_link = "https://www.youtube.com/playlist?list=PLZ4DbyIWUwCq4V8bIEa8jm2ozHZVuREJP" # @param {type:"string"}

# Output root: the split audio chunks, train.jsonl/test.jsonl and train.yaml
# are all written under this folder later in the cell.
dataset_folder = "/content/drive/MyDrive/finetuner" # @param {type:"string"}
os.makedirs(dataset_folder, exist_ok=True)

custom_keywords = ""  # @param {type:"string"}
custom_autolabeler_tags = "House, vocal warmup, Hyperpop, Jazz"  # @param {type:"string"}

# Scratch directory for the raw downloads; each file is deleted from it once
# it has been split into chunks.
temp_output_dir = "/content/temp_dataset"
os.makedirs(temp_output_dir, exist_ok=True)

# @markdown `custom_keywords` will be added to all prompts. `custom_autolabeler_tags` will be used by the autolabeler, and only end up in some prompts. note- CLAP won't necessarily understand the tags you put in for the autolabeler, so try and keep those generic, and specific tags (artist names, etc) should go in `custom_keywords`

# @markdown this cell will take a really long time.

from functools import partial
from tqdm import tqdm
# Rebind tqdm so every progress bar created below uses position=0, leave=True.
tqdm = partial(tqdm, position=0, leave=True)

# youtube scraper
# Downloads the best available audio for each playlist entry into
# temp_output_dir and has ffmpeg transcode it to 128 kbps mp3.
ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': os.path.join(temp_output_dir, '%(title)s.%(ext)s'),
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '128',
    }],
    'quiet': True,
    'extract_flat': True,
    # 'force_generic_extractor': True,
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # extract_info may return None, and a result without 'entries' is not a
    # playlist; both cases mean there is nothing to download.
    info_dict = ydl.extract_info(youtube_playlist_link, download=False) or {}
    entries = [entry for entry in (info_dict.get('entries') or []) if entry]
    # The progress bar wraps the entries themselves (it used to wrap the
    # info dict inside a membership test, which showed no useful progress).
    for i, entry in enumerate(tqdm(entries), start=1):
        url = entry.get('url')
        print(f"extracting {entry.get('title')} {url} ({i}/{len(entries)})")
        try:
            ydl.download([url])
        except Exception as exc:
            # Best effort: skip entries that fail, but let KeyboardInterrupt
            # through so the cell can still be stopped by hand.
            print(f"failed to download {url}: {exc}")
print(len(os.listdir(temp_output_dir)))

# splitter
# Resamples every downloaded file to 44.1 kHz, cuts it into 30-second WAV
# chunks under <dataset_folder>/split, then deletes the source file.
print('splitting and resampling...')
split_dir = os.path.join(dataset_folder, 'split')
os.makedirs(split_dir, exist_ok=True)
chunk_ms = 30000  # pydub slices AudioSegments in milliseconds
for filename in tqdm(os.listdir(temp_output_dir)):
    if not filename.endswith(('.mp3', '.wav', '.flac')):
        continue
    source_path = os.path.join(temp_output_dir, filename)
    audio = AudioSegment.from_file(source_path).set_frame_rate(44100)
    # splitext handles extensions of any length; slicing off 4 characters
    # left a stray "." (turned into "_") on ".flac" names.
    stem = re.sub(r'\W+', '_', os.path.splitext(filename)[0])
    for start in range(0, len(audio), chunk_ms):
        chunk = audio[start:start + chunk_ms]
        chunk_name = f"{stem} - chunk{start // 1000}.wav"
        chunk.export(os.path.join(split_dir, chunk_name), format="wav")
    os.remove(source_path)

# autolabeler
# Candidate text labels. interrogate() scores every entry of this list against
# an audio chunk with the CLAP model, and the best matches are joined into the
# chunk's description.
# NOTE(review): "Electronic" appears twice in the list, so it is scored twice.
tags = [
    "Fast", "Slow", "Upbeat", "Downbeat", "Moderate",
    "Happy", "Sad", "Energetic", "Relaxed", "Melancholic", "Uplifting", "Aggressive", "Peaceful", "Romantic", "Dark", "Light", "Mysterious", "Dreamy", "Somber", "Hopeful", "Gloomy", "Cheerful", "Reflective", "Nostalgic", "Tense", "Calm",
    "Piano", "Guitar", "Violin", "Drums", "Bass", "Synthesizer", "Saxophone", "Trumpet", "Flute", "Cello", "Clarinet", "Harp", "Viola", "Percussion", "Organ", "Accordion", "Banjo", "Oboe", "Trombone", "French Horn", "Double Bass", "Electronic", "Acoustic",
    "Rock", "Pop", "Jazz", "Classical", "Electronic", "Folk", "Hip-Hop", "Blues", "Ambient", "Country", "Reggae", "Funk", "Soul", "Metal", "Dance", "Disco", "House", "Techno", "Trance", "Opera", "Soundtrack", "World", "Indie", "Alternative", "R&B",
    "Bright", "Warm", "Cool", "Thick", "Thin", "Soft", "Hard", "Smooth", "Rough", "Dense", "Sparse", "Clear", "Muddy", "Distorted", "Clean", "Raw", "Processed", "Lo-fi", "Hi-fi", "Layered", "Minimalist",
    "Live", "Studio", "Sampled", "Loop", "Beat", "Mix", "Master", "Edit", "Remix", "Cover", "Acapella", "Instrumental", "Field Recording", "Digital", "Analog",
    "Groovy", "Rhythmic", "Beat-driven", "Syncopated", "Polyrhythmic", "Drumless", "Percussive",
    "African", "Asian", "Latin", "Middle Eastern", "European", "American", "Caribbean", "Celtic", "Indian", "Balkan",
    "Cinematic", "Atmospheric", "Experimental", "Avant-garde", "Psychedelic", "Trippy", "Ethereal", "Spatial", "Dynamic", "Static", "Complex", "Simple", "Progressive", "Retro", "Vintage", "Modern", "Timeless", "Seasonal", "Holiday", "Festive", "Lyrical", "Melodic", "Harmonic", "Dissonant", "Consonant",
    "At a Bar", "At a Club", "Concert", "In the Car", "Study", "Workout", "Party", "Background", "Night Drive", "Morning",
    "Mandolin", "Ukulele", "Harmonica", "Turntable", "Synth Pad", "Electric Guitar", "Acoustic Guitar", "Keyboards", "Digital Piano", "Marimba", "Xylophone", "Tabla", "Sitar", "Didgeridoo", "Steel Drum", "Congas", "Bongos", "Timpani", "Djembe", "Pan Flute",
    "EDM", "Chillwave", "Dubstep", "New Age", "Gospel", "K-Pop", "J-Pop", "C-Pop", "Afrobeats", "Reggaeton", "Ska", "Punk", "Emo", "Grunge", "Shoegaze", "Lo-fi Hip-Hop", "Drum and Bass", "Jazz Fusion", "Neo-Soul", "Trap"
]

# Merge the user's comma-separated tags into the candidate list as individual
# strings. append() used to insert the whole split list as one nested element,
# which is not a valid text label for the CLAP processor.
tags.extend(tag.strip() for tag in custom_autolabeler_tags.split(",") if tag.strip())
# Drop repeats (order preserved) so no label is scored more than once.
tags = list(dict.fromkeys(tags))

from transformers import ClapProcessor, ClapModel
import torch
import librosa
import json

# Pick the GPU when one is visible, otherwise fall back to the CPU, then load
# the CLAP processor/model pair and move the model onto that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
model = ClapModel.from_pretrained("laion/clap-htsat-unfused").to(device)
print(f'loaded laion/clap-htsat-unfused')

def interrogate(audio_file, top_n=10):
    """Return the `top_n` entries of `tags` that best match an audio file.

    The file is loaded at 48 kHz and scored against every string in the
    module-level `tags` list using the module-level CLAP `processor` and
    `model` on `device`.

    Args:
        audio_file: Path to an audio file readable by librosa.
        top_n: Number of labels to return; capped at ``len(tags)``.

    Returns:
        A list of tag strings, best match first.
    """
    # process inputs
    audio, sr = librosa.load(audio_file, sr=48000)
    inputs = processor(text=tags, audios=[audio], sampling_rate=sr, return_tensors="pt", padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # compute similarity; inference only, so skip building the autograd graph
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs.logits_per_audio.softmax(dim=-1)

    # topk raises when asked for more entries than exist, so cap the request
    k = min(top_n, len(tags))
    _, top_indices = probs.topk(k, dim=1)
    return [tags[i] for i in top_indices[0].tolist()]

# label and dump to jsonl
# For every chunk: autolabel it with CLAP, estimate tempo and key with
# librosa, and write one JSON line to train.jsonl (~85%) or test.jsonl (~15%).
print('autolabelling...')
dataset_path = os.path.join(dataset_folder, 'split')
pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
with open(os.path.join(dataset_folder, "train.jsonl"), "w") as train_file, \
     open(os.path.join(dataset_folder, "test.jsonl"), "w") as eval_file:
    dset = os.listdir(dataset_path)
    random.shuffle(dset)
    for filename in tqdm(dset):
        file_path = os.path.join(dataset_path, filename)
        try:
            description = ', '.join(interrogate(file_path, top_n=10))
        except Exception as exc:
            # Best effort: keep the chunk with an empty description, but
            # report why instead of hiding the failure.
            print(f"autolabel failed for {filename}: {exc}")
            description = ''

        y, sr = librosa.load(file_path)
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        # beat_track may return a scalar or a 1-element array depending on the
        # librosa version; reduce it to a plain int that json can serialize.
        tempo = int(round(float(np.atleast_1d(tempo)[0])))
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        # key estimate = pitch class with the largest summed chroma energy
        key = pitch_classes[int(np.argmax(np.sum(chroma, axis=1)))]
        length = float(librosa.get_duration(y=y, sr=sr))
        entry = {
            "key": f"{key}",
            "artist": "",
            "sample_rate": 44100,
            "file_extension": "wav",
            "description": description,
            # was `custom_model_keywords`, a name that is never defined
            "keywords": f"{custom_keywords}",
            "duration": length,
            "bpm": tempo,
            "genre": "", # non-functional right now
            "title": filename,
            "name": "",
            "instrument": "", # non-functional right now
            "moods": [], # non-functional right now
            "path": file_path
        }
        if random.random() < 0.85:
            train_file.write(json.dumps(entry) + '\n')
        else:
            eval_file.write(json.dumps(entry) + '\n')

from numba import cuda

# Reset the current CUDA device. Skipped on CPU-only runtimes, where
# get_current_device() raises, because the autolabeler above also supports a
# CPU fallback. A separate name is used so the torch `device` global read by
# interrogate() is not overwritten.
# NOTE(review): presumably this is meant to free the GPU memory held by the
# CLAP model before training; GPU tensors created earlier in this kernel
# should be treated as unusable afterwards.
if cuda.is_available():
    cuda_device = cuda.get_current_device()
    cuda_device.reset()

# define yaml
# Writes <dataset_folder>/train.yaml: a dataset config that points the
# train/generate splits at egs/train and the valid/evaluate splits at egs/eval.
# NOTE(review): the "#@package" header is assembled from two pieces,
# presumably so Colab does not read it as a form directive -- TODO confirm.
package_str = "package"
yaml_lines = [
    "#@" + package_str + " __global__",
    "",
    "datasource:",
    "  max_channels: 2",
    "  max_sample_rate: 44100",
    "",
    "  evaluate: egs/eval",
    "  generate: egs/train",
    "  train: egs/train",
    "  valid: egs/eval",
]
yaml_contents = "\n".join(yaml_lines) + "\n"
config_path = os.path.join(dataset_folder, "train.yaml")
with open(config_path, 'w') as yaml_file:
    yaml_file.write(yaml_contents)

In [None]:
# @title tuner

%cd /content/audiocraft

dataset_folder = "/content/drive/MyDrive/finetuner" # @param {type:"string"}
num_epochs = 15 # @param {type:"number"}
checkpoint_folder = "/content/drive/MyDrive/finetuner/checkpoint" # @param {type:"string"}
train_in_stereo = True # @param {type:"boolean"}

# @markdown note- stereo models won't work with Audiocraft+ notebook (yet?)

# @markdown this cell will also take a really long time.

os.makedirs("/content/audiocraft/egs/train", exist_ok=True)
os.makedirs("/content/audiocraft/egs/eval", exist_ok=True)
!cp "$dataset_folder/train.jsonl" /content/audiocraft/egs/train/data.jsonl
!cp "$dataset_folder/test.jsonl" /content/audiocraft/egs/eval/data.jsonl
!cp "$dataset_folder/train.yaml" /content/audiocraft/config/dset/audio/train.yaml

%env USER=lyra
if train_in_stereo:
    command = (
        "dora -P audiocraft run "
        " solver=musicgen/musicgen_base_32khz"
        " model/lm/model_scale=small"
        " continue_from=//pretrained/facebook/musicgen-stereo-small"
        " conditioner=text2music"
        " dset=audio/train"
        " dataset.num_workers=2"
        " dataset.valid.num_samples=1"
        " dataset.batch_size=2"
        " schedule.cosine.warmup=8"
        " optim.optimizer=adamw"
        " optim.lr=1e-4"
        f" optim.epochs={num_epochs}"
        " optim.updates_per_epoch=1000"
        " optim.adam.weight_decay=0.01"
        " generate.lm.prompted_samples=False"
        " generate.lm.gen_gt_samples=True"
        # stereo configs
        " channels=2"
        " interleave_stereo_codebooks.use=True"
        " transformer_lm.n_q=8"
        " transformer_lm.card=2048"
        " codebooks_pattern.delay.delays='[0, 0, 1, 1, 2, 2, 3, 3]'"
    )
else:
    command = (
        "dora -P audiocraft run "
        " solver=musicgen/musicgen_base_32khz"
        " model/lm/model_scale=small"
        " continue_from=//pretrained/facebook/musicgen-small"
        " conditioner=text2music"
        " dset=audio/train"
        " dataset.num_workers=2"
        " dataset.valid.num_samples=1"
        " dataset.batch_size=2"
        " schedule.cosine.warmup=8"
        " optim.optimizer=adamw"
        " optim.lr=1e-4"
        f" optim.epochs={num_epochs}"
        " optim.updates_per_epoch=1000"
        " optim.adam.weight_decay=0.01"
        " generate.lm.prompted_samples=False"
        " generate.lm.gen_gt_samples=True"
    )
!{command}

In [None]:
# @title export checkpoint for inference
# Finds the most recently modified run folder and exports its language-model
# checkpoint plus the pretrained EnCodec compression model into
# `checkpoint_folder` (set in the tuner cell).
import os

from audiocraft.utils import export
from audiocraft import train

# `root_dir` had no value before, which made this cell a syntax error.
# NOTE(review): assumes audiocraft's default dora output location,
# /tmp/audiocraft_<USER>/xps, with USER as set in the tuner cell -- adjust
# this if the dora dir has been overridden.
root_dir = f"/tmp/audiocraft_{os.environ.get('USER', 'lyra')}/xps"
run_dirs = [
    os.path.join(root_dir, entry)
    for entry in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, entry))
]
if not run_dirs:
    raise FileNotFoundError(f"no training runs found under {root_dir}")

# The run signature is the folder name of the newest run, not its full path.
SIG = os.path.basename(max(run_dirs, key=os.path.getmtime))

xp = train.main.get_xp_from_sig(SIG)
export.export_lm(xp.folder / 'checkpoint.th', os.path.join(checkpoint_folder, 'state_dict.bin'))
export.export_pretrained_compression_model('facebook/encodec_32khz', os.path.join(checkpoint_folder, 'compression_state_dict.bin'))

In [None]:
# Repeats the two install commands from the "download and install" cell.
# `-e .` installs whatever project is in the current working directory, so
# this cell depends on the kernel's cwd being the audiocraft checkout.
%pip install -q -e .
%pip install -q dora-search numba wandb

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [None]:
# @title generator
# Loads the exported checkpoint, generates one clip for `prompt`, writes it to
# disk with audio_write, and shows an inline audio player.
checkpoint_folder = "" # @param {type:"string"}
prompt = "jazz beat with pop elements" #@param {type:"string"}
generate_length = 50 #@param {type:"number"}

from audiocraft.data.audio import audio_write
import IPython.display as ipd

from audiocraft.models import MusicGen

if not checkpoint_folder:
    # Fail early with a clear message instead of an obscure loading error.
    raise ValueError("set checkpoint_folder to the folder that holds the exported state_dict.bin")

musicgen = MusicGen.get_pretrained(checkpoint_folder)
musicgen.set_generation_params(duration=generate_length)

wavs = musicgen.generate([prompt])
for idx, one_wav in enumerate(wavs):
    wav_cpu = one_wav.cpu()
    audio_write(f'{idx}', wav_cpu, musicgen.sample_rate, strategy="loudness", loudness_compressor=True)
    # Play back at the model's own sample rate rather than a hard-coded 32000,
    # so the player always matches the file written above.
    ipd.display(ipd.Audio(wav_cpu.numpy(), rate=musicgen.sample_rate))

ModuleNotFoundError: No module named 'av'

In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# by the other cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
