# Using LLM-as-a-judge
This notebook asks the GPT-3.5-turbo model to directly compare two expert rewrites of each novice prompt in terms of diversity and relevance, and to answer with '1' or '2' to indicate which rewrite is better.

In [2]:
# Load the novice prompts and narrow them down to the evaluation rows
import pandas as pd

df_novice = pd.read_csv("initial_dataset/step1_novice_gpt.csv")

# Row labels of the 24 hand-curated examples, which are dropped below
hand_curated = [
    0, 5, 13, 15, 19, 30, 37, 41, 82, 91, 103, 114,
    124, 139, 202, 436, 441, 455, 588, 614, 966, 971, 997, 1342,
]
df_novice_minus_24 = df_novice.drop(index=hand_curated)

# Keep the "novice" text of the rows flagged as part of the balanced subset
in_balanced_subset = df_novice_minus_24['is_balanced_subset'] == True
df_novice_test = df_novice_minus_24.loc[in_balanced_subset, "novice"]
len(df_novice_test)
df_novice_test.head(5)

64     A folk festival song with drums, bass, keyboar...
78     A happy outdoor festival song with drums, bass...
104    A song with a male singer and backup harmonies...
160    Fun country dance music with drum beat, male v...
171    Male singer with friends, piano, drums, bass, ...
Name: novice, dtype: object

In [3]:
import csv

# Load Expert Version 1: keep the first field of every CSV row as a string
with open('generation_result/gen_res_80_0.6.csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

gen_res = [str(row[0]) for row in data]

len(gen_res)
# Wrap the list in a Series (default 0..N-1 index) like the other test sets
df_expert1_test = pd.Series(gen_res)
df_expert1_test.head(5)

0    This folk song features a percussionist playin...
1    An acoustic drum is playing a four on the floo...
2    A male singer sings this operatic melody with ...
3    This music is a country dance piece. The tempo...
4    A male singer sings this gospel song with back...
dtype: object

In [5]:
# Load Expert Version 2
df_expert2 = pd.read_csv("paired_dataset/paired_dataset_LoRA.csv")
# Drop the rows listed in hand_curated, as was done for the novice data
df_expert2_minus_24 = df_expert2.drop(index=hand_curated)

# Keep the "caption" text of the rows flagged as part of the balanced subset
expert2_in_subset = df_expert2_minus_24['is_balanced_subset'] == True
df_expert2_test = df_expert2_minus_24.loc[expert2_in_subset, "caption"]
len(df_expert2_test)
df_expert2_test.head(5)

64     Digital drums are playing a four on the floor ...
78     An acoustic drum is playing along with a bassl...
104    A male singer sings this beautiful melody with...
160    A digital drum is playing a simple beat along ...
171    The R&B music features a male voice singing an...
Name: caption, dtype: object

In [16]:
# Print position 500 of each series so the three texts can be compared by eye
for header, series in (
    ("Check row 500 match with other two datasets \n", df_novice_test),
    ("\n", df_expert1_test),
    ("\n", df_expert2_test),
):
    print(header, series.iloc[500])

Check row 500 match with other two datasets 
 A catchy pop song from Finland with male vocals, clean guitar, bass, and keyboard, perfect for a teen drama or disco party.

 This is a pop music piece from Finland. There is a male vocal in the lead singing in a manner that is suitable for pop music. The clean guitar and the bass guitar are playing a simple tune. There is a keyboard playing in the melodic background. The rhythm is provided by an acoustic drum beat. The atmosphere is easygoing and generic. This piece could be used in the soundtrack of a teenage drama. It could also be playing in the background at a disco party. The music would also suit well with advertisement jingles.

 This is a Finnish pop piece. There is a male vocalist singing melodically in the Finnish language. In the background, a clean guitar and a groovy bass guitar can be heard playing the theme with the accompaniment of a keyboard. An 80s disco type beat is being played by the acoustic drums in the rhythmic back

In [18]:
%%capture
!pip install openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet

In [8]:
import os
import re

import openai
from openai import OpenAI
# The key is read from the OPENAI_API_KEY environment variable (None if unset)
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
def _parse_choice(content):
    """Map a raw model reply to 1, 2, or 0 (0 = no unambiguous choice)."""
    if not content:
        return 0
    # Accept replies such as "2", "'2'", "2." or "Rewrite 2". A reply that
    # names both options, or neither, is ambiguous and counts as invalid.
    picks = set(re.findall(r"\b[12]\b", content))
    if len(picks) == 1:
        return int(picks.pop())
    return 0


def evaluate_with_openai(novice_prompts, expert_version1, expert_version2):
    """Ask the chat model which of two expert rewrites is better for each novice prompt.

    The three iterables are walked in parallel with ``zip``, so iteration
    stops at the shortest one. Requests are sent through the module-level
    ``client``.

    Args:
        novice_prompts: iterable of novice prompt texts.
        expert_version1: iterable of the first expert rewrite per prompt.
        expert_version2: iterable of the second expert rewrite per prompt.

    Returns:
        list[int]: one entry per compared triple -- 1 or 2 for the rewrite the
        model chose, 0 when the reply could not be interpreted or the request
        raised an exception.
    """
    # NOTE(review): the rewrites are always presented in the same order, so any
    # position bias of the judge model is not controlled for -- consider also
    # running with the two rewrites swapped.
    scores = []
    for novice, expert1, expert2 in zip(novice_prompts, expert_version1, expert_version2):
        prompt = (
            f"Novice prompt: {novice}\n"
            f"Expert rewrite 1: {expert1}\n"
            f"Expert rewrite 2: {expert2}\n"
            f"Which rewrite is better in terms of diversity and relevance? Respond with either '1' or '2' only."
        )
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are an expert musician evaluating quality of prompts used for text-to-music generation."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=5,  # Limit response length
                temperature=0.1,  # Encourage deterministic responses
            )
            # message.content can be None; treat that as an empty reply
            content = (response.choices[0].message.content or "").strip()
        except Exception as e:
            # Best effort: one failed request must not abort the whole run
            print(f"Error: {e}")
            scores.append(0)  # Default score for errors
            continue

        print(f"Response: {content}")
        score = _parse_choice(content)
        if score == 0:
            print(f"Invalid response: {content}")
        scores.append(score)  # 0 is the default score for invalid responses
    return scores


In [9]:
def main_evaluation(novice_prompts, expert1_prompts, expert2_prompts):
    """Run the LLM judge over the prompt triples and print each version's win rate.

    Args:
        novice_prompts: iterable of novice prompt texts.
        expert1_prompts: iterable of the first expert rewrite per prompt.
        expert2_prompts: iterable of the second expert rewrite per prompt.

    Returns:
        None. Results are printed. Win rates are fractions of all comparisons,
        so comparisons scored 0 (invalid reply or failed request) count
        against both versions; their share is printed when non-zero.
    """
    scores = evaluate_with_openai(novice_prompts, expert1_prompts, expert2_prompts)

    total = len(scores)
    if total == 0:
        # Nothing was compared; the divisions below would raise ZeroDivisionError
        print("No comparisons were evaluated; win rates are undefined.")
        return

    # Calculate win rates
    expert1_wins = scores.count(1)
    expert2_wins = scores.count(2)
    undecided = total - expert1_wins - expert2_wins

    print(f"Expert Version 1 Win Rate: {expert1_wins / total:.2%}")
    print(f"Expert Version 2 Win Rate: {expert2_wins / total:.2%}")
    if undecided:
        # Explains why the two rates above do not add up to 100%
        print(f"Invalid or failed comparisons: {undecided / total:.2%}")


In [10]:
# Judge the first 100 entries of each series, converted to plain lists
eval_inputs = [
    series.head(100).tolist()
    for series in (df_novice_test, df_expert1_test, df_expert2_test)
]
main_evaluation(*eval_inputs)

Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: I would choose 'Expert
Invalid response: I would choose 'Expert
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 1
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 1
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 1
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Response: 2
Re