In [20]:
import os
import re
import json
from tqdm import tqdm
import pandas as pd
from glob import glob
from openai import OpenAI
import uuid

# Input/output locations, relative to the notebook's working directory.
PATH_PROMPTS = "../dataset/Dataset40k/prompts"  # one `<track_id>.json` per generated prompt set
PATH_SPOTIFY_YOUTUBE_CSV = "../dataset/Dataset40k/spotify_with_youtube_clean.csv"
PATH_SAVE_FINAL_CSV = "../dataset/Dataset40k/spotify_with_youtube_and_prompts.csv"

# Fail fast on bad inputs, and refuse to overwrite a previously written result.
assert os.path.exists(PATH_PROMPTS), f"Received bad path `{PATH_PROMPTS}`"
assert os.path.exists(PATH_SPOTIFY_YOUTUBE_CSV), f"Received bad path `{PATH_SPOTIFY_YOUTUBE_CSV}`"
assert not os.path.exists(PATH_SAVE_FINAL_CSV), f"`{PATH_SAVE_FINAL_CSV}` already exists."

# Read the API key from the environment so the secret never has to be pasted
# into (and accidentally committed with) the notebook.
__OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
assert __OPENAI_API_KEY, "Set the `OPENAI_API_KEY` environment variable before running this notebook."
openai_api = OpenAI(api_key=__OPENAI_API_KEY)

# Create ChatGPT Prompts

Prompt generation is fairly time-consuming, so be patient: it is slow.
<br><br>
I wrote a simple script so that 10 processes can run in parallel; this works fine with the OpenAI API, so consider doing that:

```bash
python .\parallel_chatgpt_prompt_creation.py --start_index 0  
python .\parallel_chatgpt_prompt_creation.py --start_index 1  
...  
python .\parallel_chatgpt_prompt_creation.py --start_index 9  
```  



In [None]:
#######################################################
# Helpers
#######################################################

# System message sent to the chat model for every track. The generation loop
# prepends a "RANDOM SEED" header to it and forwards the result to `get_prompt`
# as `gpt_instructions`.
# The text asks for 10 prompts, which matches the `expected_prompt_count=10`
# passed by the generation loop.
# NOTE(review): the schema below advertises `track_id`, but `get_prompt` builds
# the user message from its own EXPECTED_COLUMNS list, which has no `track_id`
# entry - confirm whether the mismatch is intentional.
GPT_INSTRUCTIONS = """
INSTRUCTIONS:
I'll send you song metadata (from Spotify) like this:
{
    track_id: int,
    artists: [Artist Name(s)],
    track_name: Track Name,
danceability: float, # 0.0 (least danceable) to 1.0 (most danceable)
energy: float, # 0.0 (least energetic) to 1.0 (most energetic)
key: int, 0=C, 1=C#/Db, etc.
mode: int, 1=Major, 0=Minor
speechiness: float, #0.0 (least spoken words) to 1.0 (most spoken)
liveness: float, #0.0 (studio recording) to 1.0 (live performance)
valence: float, #0.0 (sad/negative) to 1.0 (happy/positive)
track_genre: Genre
}

Based on this, generate 10 different, concise, 2-sentence visual image prompts that reflect the song's mood, emotion, and atmosphere. Be extremely creative with the styling—go crazy with different artistic approaches and aesthetics.

Respond simply with 10 lines, one for each prompt, and nothing else!
"""

def get_prompt(api_instance, row, gpt_instructions, model="gpt-4o-mini", expected_prompt_count:int=3):
    """Ask the chat model for image prompts describing a single track.

    Args:
        api_instance: Client object exposing ``chat.completions.create``.
        row: pandas Series containing at least the columns listed in
            ``EXPECTED_COLUMNS`` below.
        gpt_instructions: Text sent as the system message.
        model: Model name forwarded to the API.
        expected_prompt_count: Exact number of prompt lines the reply must contain.

    Returns:
        Tuple ``(response, prompts)``: the raw API response object and the list
        of whitespace-stripped prompt lines extracted from its first choice.

    Raises:
        AssertionError: If ``row`` lacks an expected column, or the reply does
            not contain exactly ``expected_prompt_count`` prompt lines.
        ValueError: If the reply carries no text content.
    """
    # Setup
    EXPECTED_COLUMNS = ["artists", "track_name", "danceability", "energy", "key", "mode", "speechiness", "liveness", "valence", "track_genre"]
    available_columns = set(row.index)
    missing_columns = [c for c in EXPECTED_COLUMNS if c not in available_columns]
    assert not missing_columns, f"Row is missing expected columns: {missing_columns}"
    metadata_prompt = str(dict(row[EXPECTED_COLUMNS]))

    # Get prompt from chatgpt
    response = api_instance.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": gpt_instructions},
            {"role": "user", "content": metadata_prompt},
        ]
    )

    # Format output
    response_string = response.choices[0].message.content
    if response_string is None:
        # Without this guard the `.split` below fails with an opaque AttributeError.
        raise ValueError("Model reply contains no text content.")

    # Strip *before* filtering so whitespace-only lines cannot survive the
    # length check and end up as empty prompts.
    stripped_lines = [line.strip() for line in response_string.split("\n")]
    # NOTE(review): the minimum line length is tied to `expected_prompt_count`,
    # as in the original code; it drops blank and very short lines. Confirm
    # whether a dedicated threshold was intended.
    prompts = [line for line in stripped_lines if len(line) > expected_prompt_count]
    assert len(prompts) == expected_prompt_count, prompts
    return response, prompts

#######################################################
# Prepare dataframe
#######################################################

# Read the cleaned Spotify/YouTube table that drives prompt generation.
df = pd.read_csv(PATH_SPOTIFY_YOUTUBE_CSV)

# Short column names; not referenced again in the cells shown here.
expected_columns = [
    "track_id",
    "artists",
    "track_name",
    "danceability",
    "energy",
    "key",
    "mode",
    "speechiness",
    "liveness",
    "valence",
    "track_genre",
]

# Maps the CSV's `spotify_*` labels onto the short names `get_prompt` looks up.
# Note that `spotify_genre` becomes `track_genre`, not `genre`.
renamer = dict(
    spotify_artists="artists",
    spotify_track_name="track_name",
    spotify_danceability="danceability",
    spotify_energy="energy",
    spotify_key="key",
    spotify_mode="mode",
    spotify_speechiness="speechiness",
    spotify_liveness="liveness",
    spotify_valence="valence",
    spotify_genre="track_genre",
)

#######################################################
# Get prompts from ChatGPT
#######################################################

for row_index, row in tqdm(df.iterrows(), total=len(df)):
    # Setup: expose the short column names `get_prompt` expects, and skip
    # tracks whose JSON already exists so an interrupted run can be resumed.
    row = row.rename(index=renamer)
    track_id = row["spotify_track_id"]
    json_path = f"{PATH_PROMPTS}/{track_id}.json"
    if os.path.exists(json_path):
        continue

    # Auto generate the prompt --> prompt = gpt(spotify_data, some_instructions)
    # A fresh UUID is prepended to the instructions on every request.
    random_seed = str(uuid.uuid4())
    randomly_seeded_instructions = f"RANDOM SEED:\n{random_seed}\n{GPT_INSTRUCTIONS}"
    try:
        response, prompts = get_prompt(
            api_instance=openai_api, 
            row=row, 
            gpt_instructions=randomly_seeded_instructions, 
            expected_prompt_count=10
        )
    except Exception as e:
        # Best effort: report the failure and move on; the track is retried on
        # the next run because no JSON file was written for it.
        print(row_index, ": ", e)
        continue

    # Clean prompt: flatten the response object into plain dicts for JSON.
    response = dict(response)
    expected_keys_from_chatgpt = ['id', 'choices', 'created', 'model', 'object', 'service_tier', 'system_fingerprint', 'usage']
    assert list(response.keys()) == expected_keys_from_chatgpt
    assert len(response["choices"]) == 1
    choices = response["choices"][0]
    choices = dict(choices)
    choices["message"] = dict(choices["message"])
    response["choices"] = choices
    response["usage"] = dict(response["usage"])
    # Store the lines `get_prompt` already split and validated. Re-splitting the
    # raw text on blank lines disagrees with the one-prompt-per-line format the
    # instructions request, and would save replies that fail later checks.
    response["clean_prompts"] = prompts

    # Save json file. Serialise first so a failure cannot leave a truncated
    # file behind that later runs would treat as "already done".
    payload = json.dumps(response, indent=4)
    with open(json_path, 'w') as json_file:
        json_file.write(payload)
    del response

# Add prompts to dataframe

In [21]:
# Reload the source table and keep only the rows that have a prompt file.
df = pd.read_csv(PATH_SPOTIFY_YOUTUBE_CSV)
assert df["spotify_track_id"].is_unique

# Only consider `*.json` files and drop the extension with `splitext`, so stray
# files in the folder cannot produce bogus ids.
prompt_paths = glob(f"{PATH_PROMPTS}/*.json")
prompt_track_ids = [os.path.splitext(os.path.basename(p))[0] for p in prompt_paths]

# Match on `spotify_track_id`, the same key used to build `<track_id>.json`
# when the prompt files are written and read back.
as_track_id_match = df["spotify_track_id"].astype(str).isin(prompt_track_ids)
print(f"Rows with no match: {(~as_track_id_match).sum()}/{len(df)}") 

# Copy so that adding columns later does not write into a slice of the
# original frame.
df = df[as_track_id_match].copy()
print(f"Rows remaining: {len(df)}")

Rows with no match: 1/37453
Rows remaining: 37452


In [22]:
# Load prompts
# A usable file holds exactly this many prompts, numbered "1. " .. "N. ".
EXPECTED_PROMPT_COUNT = 10
openai_prompts = []
failed = []
for track_id in df["spotify_track_id"]:
    json_path = f"{PATH_PROMPTS}/{track_id}.json"
    try:
        with open(json_path, "r") as f:
            prompts = json.load(f)["clean_prompts"]
    except (json.JSONDecodeError, KeyError):
        # Count an unreadable or incomplete file as a failed track instead of
        # aborting the whole cell.
        prompts = []

    is_valid = (
        len(prompts) == EXPECTED_PROMPT_COUNT
        and all(prompt.startswith(f"{i+1}. ") for i, prompt in enumerate(prompts))
    )
    if is_valid:
        # Drop the leading "<n>. " numbering from each prompt.
        prompts = [re.sub(rf"^{i+1}\. ", "", prompt) for i, prompt in enumerate(prompts)]
    else:
        # An empty list marks the row as bad; it is filtered out below.
        prompts = []
        failed.append(track_id)
    openai_prompts.append(prompts)

# Remove bad prompts
# Work on an explicit copy so the new column is not written into a filtered
# view of another frame (avoids pandas' SettingWithCopyWarning).
df = df.copy()
df["openai_prompts"] = openai_prompts
bad_prompts = df["openai_prompts"].apply(len) == 0
print(f"Rows with at least one bad prompt: {bad_prompts.sum()}/{len(df)}")
# Sanity checks: the rows flagged as bad are exactly the tracks recorded in `failed`.
assert df[bad_prompts]["spotify_track_id"].isin(failed).all()
assert bad_prompts.sum() == len(failed)
df = df[~bad_prompts]
df.to_csv(PATH_SAVE_FINAL_CSV, index=False)
print(f"Rows remaining: {len(df)}")

Rows with at least one bad prompt: 682/37452
Rows remaining: 36770
