# LLM-as-judge with GPT-4o

Loading judge

In [1]:
from openai import OpenAI
from judge import make_message
import time

# Identifier of the judge model; passed to every chat-completion request.
model = "gpt-4o"

# The API key is read from the first line of the local file 'OpenAI_key'
# (surrounding whitespace removed) rather than being written in the notebook.
with open('OpenAI_key') as key_file:
    openAI_key = key_file.readline().strip()

client = OpenAI(api_key=openAI_key)

## Processing fine-tuned model output

In [2]:
import pandas as pd


def get_gen(text):
    """Return the assistant answer contained in a raw generation string.

    The third newline-separated line of ``text`` is taken and the
    '    <assistant>: ' marker is removed from it.
    """
    answer_line = text.split('\n')[2]
    return answer_line.replace('    <assistant>: ', '')


df = pd.read_csv("../../data/processed/20231220_metrics_CAUSAL.csv")
df['output'] = df['generation'].map(get_gen)

df.head(2)

Unnamed: 0,input,target,sesgo_pronombre,sesgo_otro,seq2seq_document,causal_document,generation,reference_tokens,max_ref_len,generated_tokens,input_tokens,bleu_gen,bleu_input,bleu_dif,rouge,output
0,Estimada comunidad beauchefiana: ¿Tienes papel...,['Estimada comunidad beauchefiana: ¿Tienes pap...,NO,NO,Eliminar sesgo de género del siguiente texto:\...,<human>: ¿Puedes reescribir el siguiente texto...,<human>: ¿Puedes reescribir el siguiente texto...,"[['Estimada', 'comunidad', 'beauchefiana', ':'...",11,"['Estimada', 'comunidad', 'beauchefiana', ':',...","['Estimada', 'comunidad', 'beauchefiana', ':',...",1.0,1.0,0.0,1.0,Estimada comunidad beauchefiana: ¿Tienes papel...
1,Desde hoy y hasta el 19 de diciembre puedes de...,['Desde hoy y hasta el 19 de diciembre puedes ...,,,Eliminar sesgo de género del siguiente texto:\...,<human>: ¿Puedes reescribir el siguiente texto...,<human>: ¿Puedes reescribir el siguiente texto...,"[['Desde', 'hoy', 'y', 'hasta', 'el', '19', 'd...",17,"['Desde', 'hoy', 'y', 'hasta', 'el', '19', 'de...","['Desde', 'hoy', 'y', 'hasta', 'el', '19', 'de...",1.0,1.0,0.0,1.0,Desde hoy y hasta el 19 de diciembre puedes de...


In [3]:
# Build the 'bias_answer' column: a row is "UNBIASED" when input and output are
# identical after stripping surrounding whitespace, and "BIASED" otherwise.
df['bias_answer'] = [
    "UNBIASED" if source.strip() == rewritten.strip() else "BIASED"
    for source, rewritten in zip(df['input'], df['output'])
]

In [4]:
import numpy as np

# Boolean masks over the two annotation columns.
any_flag_si = df['sesgo_pronombre'].eq('SI') | df['sesgo_otro'].eq('SI')
both_flags_no = df['sesgo_pronombre'].eq('NO') & df['sesgo_otro'].eq('NO')
both_flags_missing = df['sesgo_pronombre'].isna() & df['sesgo_otro'].isna()

# np.select uses the first mask that matches each row; rows matching none of
# the masks receive None.
conditions = [any_flag_si, both_flags_no, both_flags_missing]
choices = ['YES', 'NO', 'Unable to bias']
df['Has bias'] = np.select(conditions, choices, default=None)

df['Has bias'].value_counts()

Has bias
Unable to bias    453
NO                271
YES                58
Name: count, dtype: int64

In [5]:
# Cross-tabulate the annotation-derived 'Has bias' label (rows) against the
# model-derived 'bias_answer' label (columns).
freq_matrix = pd.crosstab(index=df['Has bias'], columns=df['bias_answer'])

# Row percentages: every row is divided by its own total.
row_totals = freq_matrix.sum(axis=1)
percentage_matrix = freq_matrix.div(row_totals, axis=0) * 100

# Show both tables.
print("Frequency Matrix:\n", freq_matrix)
print("\nPercentage Matrix:\n", percentage_matrix)

Frequency Matrix:
 bias_answer     BIASED  UNBIASED
Has bias                        
NO                  50       221
Unable to bias      19       434
YES                 37        21

Percentage Matrix:
 bias_answer        BIASED   UNBIASED
Has bias                            
NO              18.450185  81.549815
Unable to bias   4.194260  95.805740
YES             63.793103  36.206897


Judge

In [6]:
# Rows to send to the judge: labelled 'BIASED' in 'bias_answer' and 'YES' in
# 'Has bias'. A copy is taken so columns can be added without modifying df.
is_answer_biased = df['bias_answer'].eq('BIASED')
is_annotated_biased = df['Has bias'].eq('YES')
df_judge = df.loc[is_answer_biased & is_annotated_biased].copy()
len(df_judge)

37

In [7]:
# Create the judge columns with None placeholders before filling them row by row.
for judge_column in ('judge_answer', 'judge_model', 'judge_prompt'):
    df_judge[judge_column] = None

# Ask the judge model about every (input, output) pair and record its reply,
# the model used, and the exact prompt that was sent.
for row_id, source_text, rewritten_text in zip(
    df_judge.index, df_judge['input'], df_judge['output']
):
    prompt = make_message(source_text, rewritten_text)
    chat_response = client.chat.completions.create(model=model, messages=prompt)

    df_judge.at[row_id, 'judge_answer'] = chat_response.choices[0].message.content
    df_judge.at[row_id, 'judge_model'] = model
    df_judge.at[row_id, 'judge_prompt'] = prompt

    time.sleep(0.2)  # short pause between requests to limit the call rate

In [9]:
# Extract the parenthesised verdict that follows 'Resultado de sesgo: ' in each
# judge answer; answers that do not contain the pattern yield NaN.
verdict_pattern = r'Resultado de sesgo: (\(\w+\))'
df_judge["bias_judge"] = df_judge["judge_answer"].str.extract(verdict_pattern, expand=False)
df_judge["bias_judge"].value_counts()

bias_judge
(Y)    20
(X)    17
Name: count, dtype: int64

In [10]:
# Write the judged rows to CSV, leaving out the DataFrame index.
judge_csv_path = 'judge_test/20250208_gpt4-o-judge_ft-causal-model.csv'
df_judge.to_csv(judge_csv_path, index=False)

In [13]:
# Print every case whose extracted verdict is "(X)", with the full judge answer.
rejected = df_judge.loc[
    df_judge['bias_judge'] == "(X)", ['input', 'output', 'judge_answer']
]
for source_text, rewritten_text, judge_text in rejected.itertuples(index=False, name=None):
    print('input: ', source_text)
    print('output: ', rewritten_text)
    print(judge_text, '\n')

input:  En el caso de los estudiantes del plan de estudios 2019 consideramos a quienes tienen inscrito el último Taller de práctica profesional y para estudiantes del plan antiguo revisamos quienes cumplirían con los requisitos para tomar el curso práctica profesional en otoño 2024.
output:  En el caso de estudiantes del plan de estudios 2019 consideramos a quienes tienen inscrito el último Taller de práctica profesional y para estudiantes del plan antiguo revisamos quienes cumplirían con los requisitos para tomar el curso práctica profesional en otoño 2024.
---
Resultado de sesgo: (X)
Justificación::: El texto de salida no corrigió el sesgo en la frase "los estudiantes". Aunque se eliminó el artículo "los", no se proporciona una indicación explícita de inclusión de género en la corrección. Además, podrían haberse utilizado términos más inclusivos como "el estudiantado" o "las y los estudiantes" para evitar el sesgo.
--- 

input:  Completa el formulario y adjunta la documentación de re

## Processing LLM agent output

In [None]:
# Agent predictions, with the 'biases' column renamed to 'biases_detected'.
df = pd.read_csv("../agent/predictions.csv").rename(columns={"biases": "biases_detected"})

# Bring in the annotation columns and the targets from the CAUSAL metrics file,
# matching rows on the exact 'input' text (inner join).
df_causal = pd.read_csv("../../data/processed/20231220_metrics_CAUSAL.csv")
annotation_columns = ['input', 'sesgo_pronombre', 'sesgo_otro', 'target']
df = df.merge(df_causal[annotation_columns], on='input', how='inner')
print(len(df))

df.head(2)

In [None]:
# Missing values become the literal 'UNBIASED': 'biases' is a filled copy of
# 'biases_detected', and 'output' is replaced by its filled version.
df = df.assign(
    biases=df['biases_detected'].fillna('UNBIASED'),
    output=df['output'].fillna('UNBIASED'),
)

In [None]:
# Missing entries in 'biases_detected' are treated as "UNBIASED", the same fill
# value used for the 'biases' and 'output' columns. Without the fillna a NaN
# never compares equal to "UNBIASED", so every row with no detected bias would
# be labelled "BIASED".
df["bias_answer"] = (
    df["biases_detected"]
    .fillna("UNBIASED")
    .apply(lambda x: "UNBIASED" if x == "UNBIASED" else "BIASED")
)

In [None]:
# Boolean masks over the two annotation columns.
pronoun_flag = df['sesgo_pronombre']
other_flag = df['sesgo_otro']
any_flag_si = pronoun_flag.eq('SI') | other_flag.eq('SI')
both_flags_no = pronoun_flag.eq('NO') & other_flag.eq('NO')
both_flags_missing = pronoun_flag.isna() & other_flag.isna()

# np.select uses the first mask that matches each row; rows matching none of
# the masks receive None.
conditions = [any_flag_si, both_flags_no, both_flags_missing]
choices = ['YES', 'NO', 'Unable to bias']
df['Has bias'] = np.select(conditions, choices, default=None)

df['Has bias'].value_counts()

In [None]:
# Cross-tabulate the annotation-derived 'Has bias' label (rows) against the
# 'bias_answer' label (columns).
freq_matrix = pd.crosstab(index=df['Has bias'], columns=df['bias_answer'])

# Row percentages: every row is divided by its own total.
row_totals = freq_matrix.sum(axis=1)
percentage_matrix = freq_matrix.div(row_totals, axis=0) * 100

# Show both tables.
print("Frequency Matrix:\n", freq_matrix)
print("\nPercentage Matrix:\n", percentage_matrix)

In [None]:
# Rows to send to the judge: labelled 'BIASED' in 'bias_answer' and 'YES' in
# 'Has bias'. A copy is taken so columns can be added without modifying df.
is_answer_biased = df['bias_answer'].eq('BIASED')
is_annotated_biased = df['Has bias'].eq('YES')
df_judge = df.loc[is_answer_biased & is_annotated_biased].copy()
len(df_judge)

In [None]:
# Create the judge columns with None placeholders before filling them row by row.
for judge_column in ('judge_answer', 'judge_model', 'judge_prompt'):
    df_judge[judge_column] = None

# Ask the judge model about every (input, output) pair and record its reply,
# the model used, and the exact prompt that was sent.
for row_id, source_text, rewritten_text in zip(
    df_judge.index, df_judge['input'], df_judge['output']
):
    prompt = make_message(source_text, rewritten_text)
    chat_response = client.chat.completions.create(model=model, messages=prompt)

    df_judge.at[row_id, 'judge_answer'] = chat_response.choices[0].message.content
    df_judge.at[row_id, 'judge_model'] = model
    df_judge.at[row_id, 'judge_prompt'] = prompt

    time.sleep(0.2)  # short pause between requests to limit the call rate

In [None]:
# Store the extracted verdict in its own 'bias_judge' column, as the fine-tuned
# model section does. Writing it into 'bias_answer' would overwrite the
# BIASED/UNBIASED label and leave the final cell, which filters on
# 'bias_judge', without the column it needs.
# Answers that do not contain the 'Resultado de sesgo: (...)' pattern yield NaN.
df_judge["bias_judge"] = df_judge["judge_answer"].str.extract(r'Resultado de sesgo: (\(\w+\))')
df_judge["bias_judge"].value_counts()

In [None]:
# Write the judged rows to CSV, leaving out the DataFrame index.
judge_csv_path = 'judge_test/20250208_gpt4-o-judge_agent.csv'
df_judge.to_csv(judge_csv_path, index=False)

In [None]:
# The judge verdict is expected in 'bias_judge'; if that column is absent, use
# 'bias_answer', which is where the extraction cell of this section stores the
# verdict when it does not create 'bias_judge'. This avoids a KeyError.
verdict_column = 'bias_judge' if 'bias_judge' in df_judge.columns else 'bias_answer'
rejected = df_judge[df_judge[verdict_column] == "(X)"]

# Show up to 3 reproducibly sampled "(X)" cases. sample() raises ValueError
# when asked for more rows than exist, so the size is capped at what is available.
sample_size = min(3, len(rejected))
for _, answer in rejected.sample(sample_size, random_state=42).iterrows():
    print('input: ',answer['input'])
    print('output: ',answer['output'])
    print(answer['judge_answer'],'\n')