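This notebook builds the datasets used to fine-tune SBERT: it loads the cleaned Genius lyrics, splits them into train/validation/test sets, and exports labeled examples, triplets, and pairs as CSV files.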

In [10]:
# import required libraries.
import requests
import os
from os import path
import csv
import random
import numpy as np
import pandas as pd
import time
import re
import random
import json
from sklearn.model_selection import train_test_split
from sentence_transformers.readers import InputExample

In [None]:
# Create the local directory intended as the Nextcloud mount point.
# exist_ok=True makes this cell safe to re-run when the directory already exists.
# NOTE(review): mount_dir is not referenced again in the cells visible here — confirm it is still needed.
mount_dir = '/content/nextcloud'
os.makedirs(mount_dir, exist_ok=True)

In [None]:
# WebDAV endpoint and credentials for the Nextcloud share.
# The credentials are read from the environment so real secrets never have to
# be stored in the notebook; the string literals are placeholder fallbacks only
# and keep the previous values when the variables are not set.
webdav_url = "https://cloud.uni-konstanz.de/remote.php/dav/files/removed"
username = os.environ.get("NEXTCLOUD_USERNAME", "removed")
password = os.environ.get("NEXTCLOUD_PASSWORD", "removed")

In [None]:
# Download genius_cleaned_merged.json over WebDAV into the current working directory.
# NOTE(review): the credentials and URL are written inline here rather than taken from
# the webdav_url / username / password variables — consider interpolating those instead.
!curl -u "removed!" -o genius_cleaned_merged.json "https://cloud.uni-konstanz.de/remote.php/dav/files/removed/genius_cleaned_merged.json"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 12.9M  100 12.9M    0     0  14.1M      0 --:--:-- --:--:-- --:--:-- 14.1M


In [2]:
# Load the downloaded JSON dump. The encoding is pinned to UTF-8 so decoding
# does not depend on the platform's default locale encoding.
with open("genius_cleaned_merged.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# keep only what is needed: text (lyrics_clean) + label (primary_tag),
# drop rows missing either value, and renumber the index from 0
df = df[["lyrics_clean", "primary_tag"]]
df = df.dropna().reset_index(drop=True)

print(df.head())
print(len(df))

                                        lyrics_clean primary_tag
0  could sell organs chinese black market would e...         pop
1  undeniable together unbelievable used say fall...         r&b
2  set soul ease chased darkness view left desper...         pop
3  clap hands alright clap hands alright clap han...         pop
4  via billboard honor 25th anniversary billboard...         rap
9500


In [3]:
# Inspect how many rows each genre tag has before any class filtering.
print(df["primary_tag"].value_counts())

primary_tag
rap                        2463
pop                        2461
country                    1596
r&b                        1346
rock                       1110
electronic                  514
chicago drill                 2
#fliptheswitchchallenge       1
rage                          1
drill                         1
funk                          1
edm                           1
hip-hop                       1
trap                          1
hyphy                         1
Name: count, dtype: int64


In [4]:
# Drop rare genres: a row is kept only when its tag occurs more than 2 times.
# transform("size") broadcasts every group's row count back onto its rows,
# so the comparison yields a row-aligned boolean mask.
df = df[df.groupby("primary_tag")["primary_tag"].transform("size") > 2]
print(len(df))

9490


In [5]:
# Re-check the per-genre row counts on the current (filtered) dataframe.
print(df["primary_tag"].value_counts())

primary_tag
rap           2463
pop           2461
country       1596
r&b           1346
rock          1110
electronic     514
Name: count, dtype: int64


Split into Train, Validation, and Test Sets:

In [6]:
# Two-stage split: 30% of df is held out first, then that hold-out is halved
# into validation and test, giving roughly 70% / 15% / 15%.
# stratify keeps genre distribution similar in every split;
# random_state=42 pins the shuffle so the split is reproducible.
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["primary_tag"], random_state=42)
val_df, test_df   = train_test_split(temp_df, test_size=0.5, stratify=temp_df["primary_tag"], random_state=42)

In [7]:
# Report the number of rows in each of the three splits.
print(f"Dataset sizes:\n- Training: {len(train_df)}\n- Validation: {len(val_df)}\n- Test: {len(test_df)}")

Dataset sizes:
- Training: 6643
- Validation: 1423
- Test: 1424


Preparing datasets:

In [8]:
# Training + Validation + Test

def _to_labeled_examples(split_df, label_map):
    """Turn one split into single-text InputExamples carrying an integer label.

    Args:
        split_df: DataFrame with 'lyrics_clean' and 'primary_tag' columns.
        label_map: dict mapping each genre string to its integer class id.

    Returns:
        list of InputExample, one per row of split_df, in row order.
    """
    return [
        InputExample(texts=[lyric], label=label_map[genre])
        for lyric, genre in zip(split_df['lyrics_clean'], split_df['primary_tag'])
    ]


# Class ids are assigned in the order in which genres first appear in df.
unique_genres = df['primary_tag'].unique()
genre_to_id = {genre: i for i, genre in enumerate(unique_genres)}
print(genre_to_id)

# one helper call per split (train / validation / test)
train_set = _to_labeled_examples(train_df, genre_to_id)
val_set = _to_labeled_examples(val_df, genre_to_id)
test_set = _to_labeled_examples(test_df, genre_to_id)

print(f"\nCreated {len(train_set)} examples for training (lyric, label format).")
print(f"Created {len(val_set)} examples for validation loss (lyric, label format).")
print(f"Created {len(test_set)} examples for test loss (lyric, label format).")

{'pop': 0, 'r&b': 1, 'rap': 2, 'electronic': 3, 'rock': 4, 'country': 5}

Created 6643 examples for training (lyric, label format).
Created 1423 examples for validation loss (lyric, label format).
Created 1424 examples for test loss (lyric, label format).


In [None]:
# Sanity check: count how many test examples carry each integer label id.
from collections import Counter

labels = [ex.label for ex in test_set]
print(Counter(labels))

Counter({2: 370, 0: 369, 5: 240, 1: 202, 4: 166, 3: 77})


In [13]:
# Train + Validation + Test (explicit triplets)

def create_triplets(input_df, seed=42):
    """
    Generates triplets (anchor, positive, negative) from a dataframe.
    - anchor: A song lyric.
    - positive: A different song lyric from the same genre.
    - negative: A song lyric from a different genre.

    Args:
        input_df: DataFrame with 'lyrics_clean' (text) and 'primary_tag' (genre) columns.
        seed: Seed of the private random generator used for sampling. The default
            (42) yields the same draws as seeding the global generator with
            random.seed(42) would.

    Returns:
        list of InputExample, each with texts=[anchor, positive, negative].
        A row is skipped when its genre holds no other distinct lyric or when
        no other genre exists in input_df.
    """
    # A private generator keeps the sampling reproducible without resetting the
    # process-wide `random` state as a side effect of calling this function.
    rng = random.Random(seed)
    triplets = []

    # genre -> list of every lyric of that genre
    lyrics_by_genre = input_df.groupby('primary_tag')['lyrics_clean'].apply(list).to_dict()
    all_genres = list(lyrics_by_genre.keys())

    for _, row in input_df.iterrows():
        anchor_lyric = row['lyrics_clean']
        positive_genre = row['primary_tag']

        # --- find positive example ---
        # all lyrics of the same genre, excluding any text equal to the anchor
        positive_pool = [lyric for lyric in lyrics_by_genre[positive_genre] if lyric != anchor_lyric]
        if not positive_pool:
            continue  # skip if no other song of the same genre exists in the set

        positive_lyric = rng.choice(positive_pool)

        # --- find negative example ---
        # choose a random genre that is different from the positive one
        negative_genre_options = [g for g in all_genres if g != positive_genre]
        if not negative_genre_options:
            continue  # skip if there's only one genre in the entire dataset

        negative_genre = rng.choice(negative_genre_options)
        negative_lyric = rng.choice(lyrics_by_genre[negative_genre])

        # the TripletEvaluator in sentence-transformers expects InputExample with three texts
        triplets.append(InputExample(texts=[anchor_lyric, positive_lyric, negative_lyric]))

    return triplets

# create the triplet sets for training and evaluation.
triplet_val = create_triplets(val_df)
triplet_test = create_triplets(test_df)
triplet_train = create_triplets(train_df)

# each message names the split whose length it reports
print(f"Created {len(triplet_val)} triplets for validation evaluation.")
print(f"Created {len(triplet_test)} triplets for the final test set.")
print(f"Created {len(triplet_train)} triplets for the final train set.")

Created 1423 triplets for the final test set.
Created 1424 triplets for validation evaluation.
Created 6643 triplets for the final train set.


In [11]:
def create_pairs(input_df, seed=42):
    """
    Generates labeled pairs (lyric_1, lyric_2, label) where:
    - label = 1 if same genre
    - label = 0 if different genre

    For every row, up to two pairs are emitted: one positive (when the row's
    genre holds another distinct lyric) followed by one negative (when at least
    one other genre exists).

    Args:
        input_df: DataFrame with 'lyrics_clean' (text) and 'primary_tag' (genre) columns.
        seed: Seed of the private random generator used for sampling. The default
            (42) yields the same draws as seeding the global generator with
            random.seed(42) would.

    Returns a DataFrame with columns: lyric_1, lyric_2, label
    """
    # A private generator keeps the sampling reproducible without resetting the
    # process-wide `random` state as a side effect of calling this function.
    rng = random.Random(seed)
    pairs = []

    # genre -> list of every lyric of that genre
    lyrics_by_genre = input_df.groupby('primary_tag')['lyrics_clean'].apply(list).to_dict()
    all_genres = list(lyrics_by_genre.keys())

    for _, row in input_df.iterrows():
        anchor_lyric = row['lyrics_clean']
        anchor_genre = row['primary_tag']

        # positive pair: another lyric of the same genre whose text differs from the anchor
        positive_pool = [lyric for lyric in lyrics_by_genre[anchor_genre] if lyric != anchor_lyric]
        if positive_pool:
            positive_lyric = rng.choice(positive_pool)
            pairs.append({
                'lyric_1': anchor_lyric,
                'lyric_2': positive_lyric,
                'label': 1
            })

        # negative pair: a lyric drawn from a randomly chosen different genre
        negative_genres = [g for g in all_genres if g != anchor_genre and lyrics_by_genre[g]]
        if negative_genres:
            negative_genre = rng.choice(negative_genres)
            negative_lyric = rng.choice(lyrics_by_genre[negative_genre])
            pairs.append({
                'lyric_1': anchor_lyric,
                'lyric_2': negative_lyric,
                'label': 0
            })

    # columns are pinned so the documented schema holds even when no pair was produced
    return pd.DataFrame(pairs, columns=['lyric_1', 'lyric_2', 'label'])


# create the labeled pair sets (same-genre = 1, different-genre = 0) for each split.
pairs_train = create_pairs(train_df)
pairs_val = create_pairs(val_df)
pairs_test = create_pairs(test_df)

print(f"Created {len(pairs_train)} pairs for the final train set.")
print(f"Created {len(pairs_val)} pairs for validation evaluation.")
print(f"Created {len(pairs_test)} pairs for the final test set.")

Created 13286 pairs for the final train set.
Created 2846 pairs for validation evaluation.
Created 2848 pairs for the final test set.


Saving files to CSV:

In [14]:
def to_dataframe_label(examples):
    """Flatten labeled examples into a two-column DataFrame.

    Each example contributes its first text as 'lyric' and its `label`
    attribute as 'label'; row order follows the order of `examples`.
    """
    lyric_column = []
    label_column = []
    for example in examples:
        lyric_column.append(example.texts[0])
        label_column.append(example.label)
    return pd.DataFrame({'lyric': lyric_column, 'label': label_column})

def to_dataframe_triplets(examples):
    """Flatten triplet examples into an anchor / positive / negative DataFrame.

    Column k is filled from position k of every example's `texts`;
    row order follows the order of `examples`.
    """
    column_names = ('anchor', 'positive', 'negative')
    return pd.DataFrame({
        name: [example.texts[position] for example in examples]
        for position, name in enumerate(column_names)
    })


# Flatten the InputExample lists into DataFrames, one per split.
label_train_df, label_val_df, label_test_df = (
    to_dataframe_label(split) for split in (train_set, val_set, test_set)
)
triplet_val_df, triplet_test_df, triplet_train_df = (
    to_dataframe_triplets(split) for split in (triplet_val, triplet_test, triplet_train)
)


# Write every split of every format to its own CSV file, without the index column.
# The mapping is iterated in insertion order.
_csv_outputs = {
    'label_train.csv': label_train_df,
    'label_val.csv': label_val_df,
    'label_test.csv': label_test_df,
    'triplets_val.csv': triplet_val_df,
    'triplets_test.csv': triplet_test_df,
    'triplets_train.csv': triplet_train_df,
    'pairs_train.csv': pairs_train,
    'pairs_val.csv': pairs_val,
    'pairs_test.csv': pairs_test,
}
for _csv_name, _csv_frame in _csv_outputs.items():
    _csv_frame.to_csv(_csv_name, index=False)

In [None]:
# Colab-only: trigger a browser download for each exported CSV file.
from google.colab import files

# Every name carries its '.csv' extension so it matches a file this notebook
# writes; no file named 'triplets_train' or 'pairs_train' is ever created.
for csv_name in (
    'label_train.csv',
    'label_val.csv',
    'label_test.csv',
    'triplets_train.csv',
    'triplets_val.csv',
    'triplets_test.csv',
    'pairs_train.csv',
    'pairs_val.csv',
    'pairs_test.csv',
):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>