In [None]:
from tools.project import INPUT_PATH, LOGS_PATH, OUTPUT_PATH, MODELS_PATH, RAW_PATH
import torch
import os
from datasets import Audio, load_dataset, Dataset
from src.jamendo_utils import read_file
from audiocraft.utils.notebook import display_audio
import numpy as np
import pandas as pd
from transformers import ClapConfig, ClapModel, AutoFeatureExtractor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import default_collate
import tqdm
import json
from audiocraft.data.audio import audio_read, audio_write
from audiocraft.data.audio_utils import convert_audio_channels, convert_audio

import umap
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from collections import Counter
from audiocraft.models import MusicGen
import uuid
from dotenv import load_dotenv
import random

# Run on the GPU whenever PyTorch reports one; otherwise stay on the CPU.
DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda'

In [None]:
# Both the CLAP weights and the matching feature extractor come from the same
# pretrained checkpoint; only the model is moved to DEVICE.
clap_checkpoint = "laion/clap-htsat-unfused"
feature_extractor = AutoFeatureExtractor.from_pretrained(clap_checkpoint)
model = ClapModel.from_pretrained(clap_checkpoint)
model = model.to(DEVICE)

In [None]:
# Gather every caption of a track under its track id: {track_id: [caption, ...]}.
ds = load_dataset(
    'csv',
    data_files=[RAW_PATH('song_describer', 'song_describer.csv')],
    split='train',
)
captions = {}
for row in ds:
    captions.setdefault(row['track_id'], []).append(row['caption'])

In [None]:
base_dir = RAW_PATH('song_describer', 'audio')


def map_path(ex):
    """Store the location of a row's 2-minute clip under the ``audio`` key.

    The row's ``path`` value has ``.mp3`` replaced by ``.2min.mp3`` and is
    joined onto ``base_dir``. The row is modified in place and returned.
    """
    clip_name = ex['path'].replace('.mp3', '.2min.mp3')
    ex['audio'] = os.path.join(base_dir, clip_name)
    return ex


def show_audio(ds, sec=10):
    """Display a player for the first ``sec`` seconds of each entry in ``ds['audio']``.

    Every entry is read through its ``array`` and ``sampling_rate`` keys.
    """
    for clip in ds['audio']:
        rate = clip['sampling_rate']
        samples = torch.Tensor(np.array(clip['array']))
        # Keep the leading `sec` seconds and add a leading axis before display.
        excerpt = samples[:rate * sec][None]
        display_audio(excerpt, rate)


csv_files = [RAW_PATH('song_describer', 'song_describer.csv')]
ds = load_dataset('csv', data_files=csv_files, split='train')
ds = ds.map(map_path)
# Keep only the first row seen for each track_id.
unique_tracks = ds.to_pandas().drop_duplicates(['track_id'])
ds = Dataset.from_pandas(unique_tracks)

tracks, tags, extra = read_file(RAW_PATH('song_describer', 'song_describer_14_04_23.mtg-jamendo.tsv'))
# The five artist ids that occur most often across the track metadata.
artist_ids = pd.Series([track['artist_id'] for track in tracks.values()])
pop_artists = artist_ids.value_counts().nlargest(5).index.tolist()
pop_artists

In [None]:
# NOTE(review): `sr`, `theme_filter` and `pop_filter` are not used by any cell
# visible in this notebook; the audio below is decoded at 48000 Hz, not `sr`.
sr = 44100
pop_artis = pop_artists[1]


def theme_filter(x):
    """Return True when the row's track id is listed under the 'relaxing' mood/theme tag."""
    return x['track_id'] in tags['mood/theme']['relaxing']


def pop_filter(x):
    """Return True when the row belongs to the artist selected in `pop_artis`."""
    return x['artist_id'] == pop_artis


ds_filtered = ds.cast_column('audio', Audio(sampling_rate=48000))
preview = ds_filtered.take(3)
show_audio(preview)

In [None]:
# Embed every track with CLAP, then project the embeddings to 2-D with UMAP.
embeds = []
ids = []
with torch.no_grad():
    for row in tqdm.tqdm(ds_filtered):
        audio = row['audio']
        inputs = feature_extractor(torch.tensor(audio['array']), return_tensors="pt",
                                   sampling_rate=audio['sampling_rate'])
        # The model was moved to DEVICE, so its inputs must live there as well;
        # otherwise the forward pass fails with a device mismatch on CUDA.
        inputs = inputs.to(DEVICE)
        ids.append(row['track_id'])
        # Bring each embedding back to the CPU right away so results do not
        # accumulate in GPU memory over the whole dataset.
        embeds.append(model.get_audio_features(**inputs).cpu())

# Concatenating along the first axis keeps a 2-D (tracks x features) matrix
# even when only one track was embedded (stack + squeeze would collapse it).
dim_input = torch.cat(embeds, dim=0).numpy()
reducer = umap.UMAP(n_neighbors=5, n_components=2, metric='cosine')
# reducer = PCA(n_components=2)

embeddings_2d = reducer.fit_transform(dim_input)

In [None]:
n_clusters = 12
clst = KMeans(n_clusters=n_clusters, random_state=0)
# clst = DBSCAN(eps=0.06, min_samples=5)

labels = clst.fit_predict(embeddings_2d)

# clusters: cluster label -> set of track ids
# rev_lab:  track id      -> row position in `embeddings_2d`
clusters = {}
rev_lab = {}
for row_pos, (track_id, cluster_label) in enumerate(zip(ids, labels)):
    rev_lab[track_id] = row_pos
    clusters.setdefault(cluster_label, set()).add(track_id)

# Attach each row's cluster label to the dataset as the `clst` column.
ds_filtered = ds_filtered.map(lambda example, row_idx: {'clst': labels[row_idx]}, with_indices=True)

for cluster_label, members in clusters.items():
    rows = [rev_lab[track_id] for track_id in members]
    plt.scatter(embeddings_2d[rows, 0], embeddings_2d[rows, 1], label=f'cluster {cluster_label}')
plt.legend()

In [None]:
# Listen to a few tracks assigned to cluster 10.
cl_ids = set(clusters[10])


def id_filter(x):
    """Return True when the row's track id belongs to the selected cluster."""
    return x['track_id'] in cl_ids


cluster_rows = ds_filtered.filter(id_filter, num_proc=12)
show_audio(cluster_rows.take(5))

In [None]:
# Persist track rows with their cluster label (audio column dropped).
# `Dataset.to_json` writes JSON Lines, and the following cell reads
# 'clap_clustered.jsonl' with lines=True, so the file must carry that name.
ds_filtered.remove_columns(['audio']).to_json(RAW_PATH('song_describer', 'clap_clustered.jsonl'))

In [None]:
# Build the per-track index: audio cluster label, Jamendo tags, clip path,
# captions and (when available) text-cluster labels.
df = pd.read_json(RAW_PATH('song_describer', 'clap_clustered.jsonl'), lines=True)[['track_id', 'clst']]

# `text_clustered` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel. NOTE(review): it is presumably the per-track
# label mapping that the text-clustering cell further down writes to
# 'openai_clustered.jsonl' - confirm. Load it when the file exists; otherwise
# fall back to an empty mapping so the index can still be built.
text_clusters_path = RAW_PATH('song_describer', 'openai_clustered.jsonl')
if os.path.exists(text_clusters_path):
    with open(text_clusters_path) as fh:
        text_clustered = json.load(fh)  # JSON object keys are strings
else:
    text_clustered = {}

track_ids = df['track_id']
df['genre'] = track_ids.map(lambda t: tracks.get(t, {}).get('genre', set()))
df['instrument'] = track_ids.map(lambda t: tracks.get(t, {}).get('instrument', set()))
df['mood/theme'] = track_ids.map(lambda t: tracks.get(t, {}).get('mood/theme', set()))
df['path'] = track_ids.map(lambda t: tracks.get(t, {}).get('path', "").replace('.mp3', '.2min.mp3'))
df['descriptions'] = track_ids.map(lambda t: captions.get(t, []))
# Keys loaded from JSON are strings, hence the str() on the lookup.
df['text_clst'] = track_ids.map(lambda t: text_clustered.get(str(t), []))
df.to_json(RAW_PATH('ds-indexes', 'song-describer.jsonl'), lines=True, orient="records", force_ascii=False)
df

In [None]:
def _top_five(collections_per_row):
    """Count items over all rows' collections and return the 5 most frequent as {item: count}."""
    tally = Counter(item for subset in collections_per_row for item in subset)
    return dict(tally.most_common(5))


# grouped_counts: cluster -> top tags per tag family
# clusters:       cluster -> list of track ids in that cluster
grouped_counts = {}
clusters = {}
tag_columns = ('genre', 'instrument', 'mood/theme')

for cluster, group in df.groupby('clst'):
    grouped_counts[cluster] = {name: _top_five(group[name]) for name in tag_columns}
    clusters[cluster] = group['track_id'].values.tolist()

for file_name, payload in (('clusters_stats.json', grouped_counts), ('clusters.json', clusters)):
    with open(RAW_PATH('song_describer', file_name), 'w') as fh:
        json.dump(payload, fh, indent=4)

In [None]:
# NOTE: this rebinds `model`, which referred to the CLAP model in the cells above.
model = MusicGen.get_pretrained('facebook/musicgen-small')
generation_params = {
    'use_sampling': True,
    'top_k': 250,
    'duration': 5,
}
model.set_generation_params(**generation_params)

In [None]:
import shutil


def clear_if_exists(dir_name):
    """Remove any existing tree at ``dir_name``, then (re)create the directory.

    Missing parent directories are created as well.
    """
    stale = os.path.exists(dir_name)
    if stale:
        shutil.rmtree(dir_name)
    os.makedirs(dir_name, exist_ok=True)


def copy_files(split, idxs, label):
    """Copy the clips of ``idxs`` into the dataset folder of ``split``/``label`` and encode them.

    For every track id the 2-minute clip is copied under
    ``data/<split>/<label>/audio`` and its output from
    ``model.compression_model.encode`` is saved under
    ``data/<split>/<label>/encoded`` (both inside 'textual-inversion-v3').
    Both target folders are emptied first.

    Returns a list with one metadata dict per track
    (``track_id``, ``audio_path``, ``encoded_path``, ``concept``).
    """
    clear_if_exists(INPUT_PATH('textual-inversion-v3', 'data', split, label, 'audio'))
    clear_if_exists(INPUT_PATH('textual-inversion-v3', 'data', split, label, 'encoded'))
    res = []
    for idx in tqdm.tqdm(idxs):
        clip_path = tracks[idx]['path'].replace('.mp3', '.2min.mp3')
        file_name = os.path.basename(clip_path)
        src_path = RAW_PATH('song_describer', 'audio', clip_path)
        relative_path = os.path.join('data', split, label, 'audio', file_name)
        enc_path = os.path.join('data', split, label, 'encoded', file_name.replace('.2min.mp3', '.pt'))
        dest_path = INPUT_PATH('textual-inversion-v3', relative_path)
        shutil.copy2(src_path, dest_path)
        with torch.no_grad():
            music, sr = audio_read(dest_path)
            # Add a leading axis, then convert to 32000 Hz / 1 channel before encoding.
            music = convert_audio(music[None], sr, 32000, 1)
            encoded_music, _ = model.compression_model.encode(music.to(DEVICE))
            torch.save(encoded_music.cpu(), INPUT_PATH('textual-inversion-v3', enc_path))
        res.append({
            'track_id': idx,
            'audio_path': relative_path,
            'encoded_path': enc_path,
            'concept': label
        })
    return res


train_data = []
val_data = []
for cluster, idxs in clusters.items():
    # random.sample raises ValueError when asked for more items than the
    # population holds, and needs a sequence - so cap the sample size and
    # materialise a list. Clusters with >= 20 tracks still get a 10/10 split.
    candidates = list(idxs)
    n_samples = min(20, len(candidates))
    rnd_idxs = random.sample(candidates, n_samples)
    n_train = n_samples // 2
    label = f'cluster_{cluster}'

    train_data.extend(copy_files('train', rnd_idxs[:n_train], label))
    val_data.extend(copy_files('valid', rnd_idxs[n_train:], label))
train_data

In [None]:
# Write the train / validation metadata lists next to the exported data.
for file_name, records in (('metadata_train.json', train_data), ('metadata_val.json', val_data)):
    with open(INPUT_PATH('textual-inversion-v3', file_name), 'w') as fh:
        json.dump(records, fh, indent=4)

In [None]:
from openai import OpenAI

# Load variables from a .env file before creating the client.
# NOTE(review): presumably this is where the OpenAI API key comes from - confirm.
load_dotenv()

gpt_client = OpenAI()
# One entry per caption, in the row order of `df`.
descriptions = df['descriptions'].explode(ignore_index=True).tolist()
# The name `embedings` (sic) is kept because the following cell reads it.
embedings = gpt_client.embeddings.create(model="text-embedding-3-small", input=descriptions)

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
import pandas as pd

# Index every caption (with its embedding and track id) in a Chroma collection,
# then project the caption embeddings to 2-D with UMAP.
client = chromadb.Client()
# delete_collection raises when the collection does not exist, which is the
# case the first time this cell runs. Only drop it when it is really there.
# list_collections yields either collection objects or plain names depending
# on the chromadb version, hence the getattr fallback.
existing_names = {getattr(entry, 'name', entry) for entry in client.list_collections()}
if 'audio_descriptions' in existing_names:
    client.delete_collection('audio_descriptions')
collection = client.get_or_create_collection(
    "audio_descriptions",
)

description_rows = df[['track_id', 'descriptions']].explode('descriptions', ignore_index=True)
# The embeddings were requested for the same exploded captions; a length
# mismatch means `df` changed since then and rows would be mis-paired.
assert len(embedings.data) == len(description_rows), "embeddings are out of sync with df descriptions"

embeds = []
for (_, row), item in zip(description_rows.iterrows(), embedings.data):
    embeds.append(item.embedding)
    collection.add(
        documents=[row["descriptions"]],
        embeddings=[item.embedding],
        metadatas=[{"track_id": row["track_id"]}],
        ids=[str(uuid.uuid4())]
    )
dim_input = np.array(embeds)
reducer = umap.UMAP(n_neighbors=5, n_components=2, metric='cosine')
# reducer = PCA(n_components=2)

embeddings_2d = reducer.fit_transform(dim_input)

In [None]:
# One row per element of each `descriptions` list, with the index renumbered from 0.
exploded_df = df.loc[:, ['track_id', 'descriptions']].explode('descriptions', ignore_index=True)

In [None]:
# Cluster the 2-D caption embeddings. n_clusters=8 is KMeans' default, written
# out explicitly; random_state=0 matches the audio clustering cell so that
# label ids (a later cell inspects cluster 4) do not change between runs of
# this cell on the same projection. The UMAP projection itself is created
# without a seed, so a full re-run can still yield a different layout.
clst = KMeans(n_clusters=8, random_state=0)
# clst = DBSCAN(eps=0.06, min_samples=5)

labels = clst.fit_predict(embeddings_2d)
exploded_df['clst'] = labels
# clusters: cluster label -> row positions in `embeddings_2d` / `exploded_df`
clusters = {}
rev_lab = {}  # reset only; nothing in this cell fills it
for idx, label in enumerate(labels):
    clusters.setdefault(label, []).append(idx)

for i, idxs in clusters.items():
    plt.scatter(embeddings_2d[idxs, 0], embeddings_2d[idxs, 1], label=f'cluster {i}')
plt.legend()

In [None]:
# For each track, collect the cluster labels of all its captions and save the mapping.
labels_per_track = exploded_df.groupby('track_id')['clst'].agg(list)
labels_per_track.to_json(RAW_PATH('song_describer', 'openai_clustered.jsonl'))

In [None]:
# Show the captions that were assigned to text cluster 4.
exploded_df.loc[exploded_df['clst'] == 4]

In [None]:
# Find the stored captions closest to a free-text query.
query_text = "relaxing piano music"
# Embed the query itself (the original passed the undefined name `description`),
# using the same embedding model as for the stored captions.
query_embedding = gpt_client.embeddings.create(input=[query_text], model="text-embedding-3-small").data[0].embedding
res = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)
res

In [None]:
# Display the query result again (the same `res` object produced by the previous cell).
res