In [161]:
import os
import re
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

from nltk import agreement
from nltk.metrics.distance import interval_distance

In [162]:
dataset = pd.read_csv("testset.tsv", sep="\t")

In [163]:
transdict = {"overall":"Insgesamt",
             "style":"Stil",
             "form":"Form",
             "content":"Inhalt",
             "emotion":"Emotion"}

gpt_trans = {"B":4,"A":0,"H":2,"E":2,"L":3}
dataset_trans = {"left":0, "right":4, "same":2}

In [164]:
def transform_eval_frame_to_rel_data (eval_frame):
    rel_data = []

    for i, annotation in enumerate(eval_frame["true"]):
        rel_data.append(["true", f"Doc{i}", annotation])
    
    for i, annotation in enumerate(eval_frame["pred"]):
        rel_data.append(["pred", f"Doc{i}", annotation])

    return rel_data

def get_alpha_from_eval_frame(eval_frame):
    rel_data = transform_eval_frame_to_rel_data(eval_frame)
    task = agreement.AnnotationTask(distance=interval_distance)
    task.load_array(rel_data)

    alpha = task.alpha()

    return alpha

# Baselines

In [165]:
logger = []

In [166]:
files = [x for x in os.listdir("results") if x.startswith("eng_baseline_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o-mini","eng", aspect, three_class, two_class, alpha, both_true, both_pred])

form
overall
content
style
emotion
multi


In [167]:
files = [x for x in os.listdir("results") if x.startswith("ger_baseline_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue
    
    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)

    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o-mini","ger", aspect, three_class, two_class, alpha, both_true, both_pred])

form
multi
overall
content
style
emotion


In [168]:
files = [x for x in os.listdir("results") if x.startswith("eng_baseline_4o")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, alpha, both_true, both_pred])

style
emotion
content
form
overall
multi


In [169]:
files = [x for x in os.listdir("results") if x.startswith("ger_baseline_4o")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)

    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","ger", aspect, three_class, two_class, alpha, both_true, both_pred])

multi
content
overall
form
emotion
style


In [170]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "alpha", "both_true", "both_pred"]

In [171]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,alpha,both_true,both_pred
0,4o-mini,eng,form,0.33,0.454545,-0.003617,34,13
1,4o-mini,eng,overall,0.44,0.589041,0.164743,27,7
2,4o-mini,eng,content,0.38,0.6,0.088126,40,5
3,4o-mini,eng,style,0.32,0.537037,0.091713,46,8
4,4o-mini,eng,emotion,0.38,0.596774,0.119023,38,3
5,4o-mini,ger,form,0.41,0.621212,0.177382,34,0
6,4o-mini,ger,overall,0.39,0.534247,0.009587,27,0
7,4o-mini,ger,content,0.4,0.666667,0.24229,40,0
8,4o-mini,ger,style,0.34,0.62963,0.16764,46,0
9,4o-mini,ger,emotion,0.4,0.645161,0.221401,38,0


In [172]:
frame.groupby(["model","lang"])[['three class', 'two class', 'alpha']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class,alpha
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4o,eng,0.48,0.646627,0.364126
4o,ger,0.474,0.69275,0.335615
4o-mini,eng,0.37,0.55548,0.091998
4o-mini,ger,0.388,0.619383,0.16366


## Multi vs sinlge aspect

In [173]:
logger = []

In [174]:
data = pd.read_csv("results/ger_baseline_mini_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","ger", aspect, three_class, two_class, alpha, both_true, both_pred])
    

In [175]:
data = pd.read_csv("results/eng_baseline_mini_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","eng", aspect, three_class, two_class, alpha, both_true, both_pred])

In [176]:
data = pd.read_csv("results/eng_baseline_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, alpha, both_true, both_pred])

In [177]:
data = pd.read_csv("results/ger_baseline_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","ger", aspect, three_class, two_class, alpha, both_true, both_pred])

In [178]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "alpha", "both_true", "both_pred"]

In [179]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,alpha,both_true,both_pred
0,mini,ger,Inhalt,0.38,0.633333,0.201505,40,0
1,mini,ger,Stil,0.22,0.388889,-0.146451,46,1
2,mini,ger,Form,0.29,0.212121,-0.14774,34,48
3,mini,ger,Emotion,0.32,0.5,-0.030996,38,4
4,mini,ger,Insgesamt,0.5,0.684932,0.259946,27,0
5,mini,eng,Inhalt,0.42,0.65,0.248958,40,5
6,mini,eng,Stil,0.41,0.111111,0.030929,46,78
7,mini,eng,Form,0.37,0.333333,0.006821,34,40
8,mini,eng,Emotion,0.42,0.596774,0.197894,38,8
9,mini,eng,Insgesamt,0.52,0.712329,0.359349,27,0


In [180]:
frame.groupby(["model","lang"])[['three class', 'two class', 'alpha']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class,alpha
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4o,eng,0.498,0.508777,0.347744
4o,ger,0.468,0.565502,0.311156
mini,eng,0.428,0.480709,0.16879
mini,ger,0.342,0.483855,0.027253


# few shot

In [181]:
logger = []

In [182]:
data = pd.read_csv("results/eng_few_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, alpha, both_true, both_pred])

In [183]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "alpha", "both_true", "both_pred"]

In [184]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,alpha,both_true,both_pred
0,4o,eng,Inhalt,0.48,0.65,0.451711,40,22
1,4o,eng,Stil,0.45,0.481481,0.250188,46,37
2,4o,eng,Form,0.38,0.212121,-0.056671,34,60
3,4o,eng,Emotion,0.4,0.5,0.186882,38,22
4,4o,eng,Insgesamt,0.55,0.630137,0.408051,27,21


In [185]:
frame.groupby(["model","lang"])[['three class', 'two class', 'alpha']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class,alpha
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4o,eng,0.452,0.494748,0.248032


# Explain Dimensions

In [186]:
logger = []

In [187]:
data = pd.read_csv("results/eng_multi_explaindims.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","eng", aspect, three_class, two_class, alpha, both_true, both_pred])

In [188]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "alpha", "both_true", "both_pred"]

In [189]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,alpha,both_true,both_pred
0,mini,eng,Inhalt,0.37,0.616667,0.173796,40,1
1,mini,eng,Stil,0.42,0.074074,0.08585,46,87
2,mini,eng,Form,0.32,0.318182,0.054175,34,41
3,mini,eng,Emotion,0.34,0.306452,-0.061061,38,35
4,mini,eng,Insgesamt,0.44,0.60274,0.166925,27,0


In [190]:
frame.groupby(["model","lang"])[['three class', 'two class', 'alpha']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class,alpha
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mini,eng,0.378,0.383623,0.083937


# Reasoning

In [191]:
logger = []

In [192]:
files = [x for x in os.listdir("results") if x.startswith("eng_reasoning_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["answer_short"] = [x.split("  \n")[0] for x in data["answer"]]
    data["pred"] = data["answer_short"].apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    eval_frame = eval_frame.query("true != 2 and pred != 2")
    double_two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    
    logger.append(["4o-mini","eng", aspect, three_class, two_class, double_two_class, alpha, both_true, both_pred])

overall
content
style
form
emotion


In [193]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "double two class", "alpha", "both_true", "both_pred"]

In [194]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,double two class,alpha,both_true,both_pred
0,4o-mini,eng,overall,0.41,0.561644,0.569444,0.051787,27,1
1,4o-mini,eng,content,0.43,0.716667,0.716667,0.289192,40,0
2,4o-mini,eng,style,0.32,0.592593,0.615385,0.143049,46,2
3,4o-mini,eng,form,0.3,0.439394,0.453125,-0.139002,34,3
4,4o-mini,eng,emotion,0.42,0.677419,0.677419,0.233516,38,0


In [195]:
frame.groupby(["model","lang"])[['three class', 'two class', 'alpha']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class,alpha
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4o-mini,eng,0.376,0.597543,0.115708


# Reinforce Both

In [196]:
logger = []

In [197]:
files = [x for x in os.listdir("results") if x.startswith("eng_reinforceboth_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data["answer"].apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    eval_frame = eval_frame.query("true != 2 and pred != 2")
    double_two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    
    logger.append(["4o-mini","eng", aspect, three_class, two_class, double_two_class, alpha, both_true, both_pred])

overall
content
form
style
emotion


In [198]:
files = [x for x in os.listdir("results") if x.startswith("eng_reinforceboth_4o")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data["answer"].apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    eval_frame = eval_frame.query("true != 2 and pred != 2")
    double_two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, double_two_class, alpha, both_true, both_pred])

emotion
content
form
style
overall


In [199]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "double two class", "alpha", "both_true", "both_pred"]

In [200]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,double two class,alpha,both_true,both_pred
0,4o-mini,eng,overall,0.36,0.328767,0.648649,0.147344,27,48
1,4o-mini,eng,content,0.44,0.383333,0.676471,0.165795,40,47
2,4o-mini,eng,form,0.31,0.212121,0.7,0.141178,34,63
3,4o-mini,eng,style,0.39,0.277778,0.576923,0.051612,46,52
4,4o-mini,eng,emotion,0.4,0.403226,0.694444,0.189392,38,41
5,4o,eng,emotion,0.45,0.370968,0.605263,0.139869,38,46
6,4o,eng,content,0.52,0.533333,0.864865,0.464208,40,43
7,4o,eng,form,0.39,0.348485,0.621622,0.137295,34,45
8,4o,eng,style,0.42,0.388889,0.724138,0.237619,46,46
9,4o,eng,overall,0.43,0.465753,0.772727,0.358754,27,38


In [201]:
frame.groupby(["model","lang"])[['three class', 'two class', 'alpha']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class,alpha
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4o,eng,0.442,0.421486,0.267549
4o-mini,eng,0.38,0.321045,0.139064


# Baseline Binary

In [202]:
logger = []

In [203]:
files = [x for x in os.listdir("results") if x.startswith("eng_baseline_binary_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data["answer"].apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    alpha = get_alpha_from_eval_frame(eval_frame)
    
    eval_frame = eval_frame[eval_frame.true != 2]
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    two_class_balanced = balanced_accuracy_score(eval_frame["true"], eval_frame["pred"])

    eval_frame = eval_frame.query("true != 2 and pred != 2")
    double_two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o-mini","eng", aspect, three_class, two_class, double_two_class, two_class_balanced, alpha, both_true, both_pred])

form
emotion
style
content
overall


In [204]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "double two class", "balanced two class", "alpha", "both_true", "both_pred"]

In [205]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,double two class,balanced two class,alpha,both_true,both_pred
0,4o-mini,eng,form,0.35,0.530303,0.530303,0.551613,-0.071059,34,0
1,4o-mini,eng,emotion,0.36,0.580645,0.580645,0.560084,-0.049547,38,0
2,4o-mini,eng,style,0.26,0.481481,0.481481,0.537167,-0.189003,46,0
3,4o-mini,eng,content,0.35,0.583333,0.583333,0.572303,-0.050369,40,0
4,4o-mini,eng,overall,0.36,0.493151,0.493151,0.509774,-0.214642,27,0


In [206]:
frame.groupby(["model","lang"])[['three class', 'two class', 'balanced two class', 'alpha']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class,balanced two class,alpha
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4o-mini,eng,0.336,0.533783,0.546188,-0.114924
