In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
def metrics(predicted_labels,true_labels):
    """Print the F1 score and a weighted misclassification cost for binary labels.

    Args:
        predicted_labels: sequence of predicted labels (0 or 1).
        true_labels: sequence of ground-truth labels (0 or 1), aligned
            element-wise with ``predicted_labels``.

    Returns:
        None. The two figures are written to stdout.
    """
    f1 = f1_score(true_labels, predicted_labels)
    # Pin the label order so the matrix is always 2x2; without `labels`,
    # inputs containing a single class yield a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    _tn, fp, fn, _tp = confusion_matrix(
        true_labels, predicted_labels, labels=[0, 1]
    ).ravel()
    print(f"F1 Score: {f1:.4f}")
    # A missed positive (true 1 predicted as 0) is charged five times as much
    # as a false alarm (true 0 predicted as 1).
    print(f"Weighted cost: {5*fn+fp}")

In [3]:
# Llama
llama_bad = pd.read_csv("output/llama_bad_deberta_finetuned.csv")
llama_good = pd.read_csv("output/llama_good_deberta_finetuned.csv")

# Rows from the "good" file: keep only the first line of each generation.
# When a row has no second generation, the first-round line is reused.
llama_good_first = [
    answer.split("\n")[0] for answer in llama_good["model_output"]
]
llama_good_second = [
    head if pd.isna(retry) else retry.split("\n")[0]
    for head, retry in zip(llama_good_first, llama_good["second_generation"])
]
# Verdict encoding: 0 if the kept line contains "good" (case-insensitive), else 1.
llama_good_first_result = [
    0 if "good" in line.lower() else 1 for line in llama_good_first
]
llama_good_second_result = [
    0 if "good" in line.lower() else 1 for line in llama_good_second
]

# Rows from the "bad" file: identical extraction and encoding.
llama_bad_first = [
    answer.split("\n")[0] for answer in llama_bad["model_output"]
]
llama_bad_second = [
    head if pd.isna(retry) else retry.split("\n")[0]
    for head, retry in zip(llama_bad_first, llama_bad["second_generation"])
]
llama_bad_first_result = [
    0 if "good" in line.lower() else 1 for line in llama_bad_first
]
llama_bad_second_result = [
    0 if "good" in line.lower() else 1 for line in llama_bad_second
]

In [4]:
# Ground truth is built from the actual number of parsed rows instead of a
# hard-coded 50/50 split: 0 for every row of the "good" file, 1 for every row
# of the "bad" file. First- and second-round lists have the same lengths, so
# one label vector serves both calls.
llama_true_labels = [0]*len(llama_good_first_result) + [1]*len(llama_bad_first_result)
print("Llama first round generation without feedback: ")
metrics(llama_good_first_result+llama_bad_first_result, llama_true_labels)
print("Llama second round generation with deberta feedback: ")
metrics(llama_good_second_result+llama_bad_second_result, llama_true_labels)

Llama first round generation without feedback: 
F1 Score: 0.7642
Weighted cost: 41
Llama second round generation with deberta feedback: 
F1 Score: 0.7966
Weighted cost: 36


In [5]:
# Gemma
gemma_bad = pd.read_csv("output/gemma_bad_deberta_finetuned.csv")
gemma_good = pd.read_csv("output/gemma_good_deberta_finetuned.csv")

# Rows from the "good" file: keep only the first line of each generation.
# When a row has no second generation, the first-round line is reused.
gemma_good_first = [
    answer.split("\n")[0] for answer in gemma_good["model_output"]
]
gemma_good_second = [
    head if pd.isna(retry) else retry.split("\n")[0]
    for head, retry in zip(gemma_good_first, gemma_good["second_generation"])
]
# Verdict encoding: 0 if the kept line contains "good" (case-insensitive), else 1.
gemma_good_first_result = [
    0 if "good" in line.lower() else 1 for line in gemma_good_first
]
gemma_good_second_result = [
    0 if "good" in line.lower() else 1 for line in gemma_good_second
]

# Rows from the "bad" file: identical extraction and encoding.
gemma_bad_first = [
    answer.split("\n")[0] for answer in gemma_bad["model_output"]
]
gemma_bad_second = [
    head if pd.isna(retry) else retry.split("\n")[0]
    for head, retry in zip(gemma_bad_first, gemma_bad["second_generation"])
]
gemma_bad_first_result = [
    0 if "good" in line.lower() else 1 for line in gemma_bad_first
]
gemma_bad_second_result = [
    0 if "good" in line.lower() else 1 for line in gemma_bad_second
]

In [6]:
# Ground truth is built from the actual number of parsed rows instead of a
# hard-coded 50/50 split: 0 for every row of the "good" file, 1 for every row
# of the "bad" file. First- and second-round lists have the same lengths, so
# one label vector serves both calls.
gemma_true_labels = [0]*len(gemma_good_first_result) + [1]*len(gemma_bad_first_result)
print("Gemma first round generation without feedback: ")
metrics(gemma_good_first_result+gemma_bad_first_result, gemma_true_labels)
print("Gemma second round generation with deberta feedback: ")
metrics(gemma_good_second_result+gemma_bad_second_result, gemma_true_labels)

Gemma first round generation without feedback: 
F1 Score: 0.6711
Weighted cost: 49
Gemma second round generation with deberta feedback: 
F1 Score: 0.6897
Weighted cost: 45


In [7]:
# Phi
phi_bad = pd.read_csv("output/phi_bad_deberta_finetuned.csv")
phi_good = pd.read_csv("output/phi_good_deberta_finetuned.csv")

# Rows from the "good" file: keep only the text before the first ";" of each
# generation. When a row has no second generation, the first-round text is reused.
# NOTE(review): the Llama and Gemma cells cut at the first newline instead of
# ";" -- confirm the different delimiter is intentional for Phi.
phi_good_first = [
    answer.split(";")[0] for answer in phi_good["model_output"]
]
phi_good_second = [
    head if pd.isna(retry) else retry.split(";")[0]
    for head, retry in zip(phi_good_first, phi_good["second_generation"])
]
# Verdict encoding: 0 if the kept text contains "good" (case-insensitive), else 1.
phi_good_first_result = [
    0 if "good" in piece.lower() else 1 for piece in phi_good_first
]
phi_good_second_result = [
    0 if "good" in piece.lower() else 1 for piece in phi_good_second
]

# Rows from the "bad" file: identical extraction and encoding.
phi_bad_first = [
    answer.split(";")[0] for answer in phi_bad["model_output"]
]
phi_bad_second = [
    head if pd.isna(retry) else retry.split(";")[0]
    for head, retry in zip(phi_bad_first, phi_bad["second_generation"])
]
phi_bad_first_result = [
    0 if "good" in piece.lower() else 1 for piece in phi_bad_first
]
phi_bad_second_result = [
    0 if "good" in piece.lower() else 1 for piece in phi_bad_second
]

In [8]:
# Ground truth is built from the actual number of parsed rows instead of a
# hard-coded 50/50 split: 0 for every row of the "good" file, 1 for every row
# of the "bad" file. First- and second-round lists have the same lengths, so
# one label vector serves both calls.
phi_true_labels = [0]*len(phi_good_first_result) + [1]*len(phi_bad_first_result)
print("Phi first round generation without feedback: ")
metrics(phi_good_first_result+phi_bad_first_result, phi_true_labels)
print("Phi second round generation with deberta feedback: ")
metrics(phi_good_second_result+phi_bad_second_result, phi_true_labels)

Phi first round generation without feedback: 
F1 Score: 0.6711
Weighted cost: 49
Phi second round generation with deberta feedback: 
F1 Score: 0.6711
Weighted cost: 49
