In [121]:
import os
import re
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

In [122]:
dataset = pd.read_csv("testset.tsv", sep="\t")

In [123]:
transdict = {"overall":"Insgesamt",
             "style":"Stil",
             "form":"Form",
             "content":"Inhalt",
             "emotion":"Emotion"}

gpt_trans = {"B":1,"A":0,"H":2,"E":2,"L":3}
dataset_trans = {"left":0, "right":1, "same":2}

# Baselines

In [124]:
logger = []

In [125]:
files = [x for x in os.listdir("results") if x.startswith("eng_baseline_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o-mini","eng", aspect, three_class, two_class, both_true, both_pred])

form
overall
content
style
emotion
multi


In [126]:
files = [x for x in os.listdir("results") if x.startswith("ger_baseline_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue
    
    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o-mini","ger", aspect, three_class, two_class, both_true, both_pred])

form
multi
overall
content
style
emotion


In [127]:
files = [x for x in os.listdir("results") if x.startswith("eng_baseline_4o")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, both_true, both_pred])

style
emotion
content
form
overall
multi


In [128]:
files = [x for x in os.listdir("results") if x.startswith("ger_baseline_4o")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","ger", aspect, three_class, two_class, both_true, both_pred])

multi
content
overall
form
emotion
style


In [129]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "both_true", "both_pred"]

In [130]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,both_true,both_pred
0,4o-mini,eng,form,0.33,0.454545,34,13
1,4o-mini,eng,overall,0.44,0.589041,27,7
2,4o-mini,eng,content,0.38,0.6,40,5
3,4o-mini,eng,style,0.32,0.537037,46,8
4,4o-mini,eng,emotion,0.38,0.596774,38,3
5,4o-mini,ger,form,0.41,0.621212,34,0
6,4o-mini,ger,overall,0.39,0.534247,27,0
7,4o-mini,ger,content,0.4,0.666667,40,0
8,4o-mini,ger,style,0.34,0.62963,46,0
9,4o-mini,ger,emotion,0.4,0.645161,38,0


In [131]:
frame.groupby(["model","lang"])[['three class', 'two class']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
4o,eng,0.48,0.646627
4o,ger,0.474,0.69275
4o-mini,eng,0.37,0.55548
4o-mini,ger,0.388,0.619383


## Multi vs sinlge aspect

In [132]:
logger = []

In [133]:
data = pd.read_csv("results/ger_baseline_mini_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","ger", aspect, three_class, two_class, both_true, both_pred])
    

In [134]:
data = pd.read_csv("results/eng_baseline_mini_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","eng", aspect, three_class, two_class, both_true, both_pred])

In [135]:
data = pd.read_csv("results/eng_baseline_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, both_true, both_pred])

In [136]:
data = pd.read_csv("results/ger_baseline_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","ger", aspect, three_class, two_class, both_true, both_pred])

In [137]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "both_true", "both_pred"]

In [138]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,both_true,both_pred
0,mini,ger,Inhalt,0.38,0.633333,40,0
1,mini,ger,Stil,0.22,0.388889,46,1
2,mini,ger,Form,0.29,0.212121,34,48
3,mini,ger,Emotion,0.32,0.5,38,4
4,mini,ger,Insgesamt,0.5,0.684932,27,0
5,mini,eng,Inhalt,0.42,0.65,40,5
6,mini,eng,Stil,0.41,0.111111,46,78
7,mini,eng,Form,0.37,0.333333,34,40
8,mini,eng,Emotion,0.42,0.596774,38,8
9,mini,eng,Insgesamt,0.52,0.712329,27,0


In [139]:
frame.groupby(["model","lang"])[['three class', 'two class']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
4o,eng,0.498,0.508777
4o,ger,0.468,0.565502
mini,eng,0.428,0.480709
mini,ger,0.342,0.483855


# few shot

In [140]:
logger = []

In [141]:
data = pd.read_csv("results/eng_few_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, both_true, both_pred])

In [142]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "both_true", "both_pred"]

In [143]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,both_true,both_pred
0,4o,eng,Inhalt,0.48,0.65,40,22
1,4o,eng,Stil,0.45,0.481481,46,37
2,4o,eng,Form,0.38,0.212121,34,60
3,4o,eng,Emotion,0.4,0.5,38,22
4,4o,eng,Insgesamt,0.55,0.630137,27,21


In [144]:
frame.groupby(["model","lang"])[['three class', 'two class']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
4o,eng,0.452,0.494748


# Explain Dimensions

In [145]:
logger = []

In [146]:
data = pd.read_csv("results/eng_multi_explaindims.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub(r"\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred

    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","eng", aspect, three_class, two_class, both_true, both_pred])

In [147]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "both_true", "both_pred"]

In [148]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,both_true,both_pred
0,mini,eng,Inhalt,0.37,0.616667,40,1
1,mini,eng,Stil,0.42,0.074074,46,87
2,mini,eng,Form,0.32,0.318182,34,41
3,mini,eng,Emotion,0.34,0.306452,38,35
4,mini,eng,Insgesamt,0.44,0.60274,27,0


In [149]:
frame.groupby(["model","lang"])[['three class', 'two class']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
mini,eng,0.378,0.383623


# Reasoning

In [150]:
logger = []

In [151]:
files = [x for x in os.listdir("results") if x.startswith("eng_reasoning_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["answer_short"] = [x.split("  \n")[0] for x in data["answer"]]
    data["pred"] = data["answer_short"].apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    eval_frame = eval_frame.query("true != 2 and pred != 2")
    double_two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    
    logger.append(["4o-mini","eng", aspect, three_class, two_class, double_two_class, both_true, both_pred])

overall
content
style
form
emotion


In [152]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "double two class", "both_true", "both_pred"]

In [153]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,double two class,both_true,both_pred
0,4o-mini,eng,overall,0.41,0.561644,0.569444,27,1
1,4o-mini,eng,content,0.43,0.716667,0.716667,40,0
2,4o-mini,eng,style,0.32,0.592593,0.615385,46,2
3,4o-mini,eng,form,0.3,0.439394,0.453125,34,3
4,4o-mini,eng,emotion,0.42,0.677419,0.677419,38,0


# Reinforce Both

In [154]:
logger = []

In [155]:
files = [x for x in os.listdir("results") if x.startswith("eng_reinforceboth_mini")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data["answer"].apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    eval_frame = eval_frame.query("true != 2 and pred != 2")
    double_two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    
    logger.append(["4o-mini","eng", aspect, three_class, two_class, double_two_class, both_true, both_pred])

overall
content
form
style
emotion


In [156]:
files = [x for x in os.listdir("results") if x.startswith("eng_reinforceboth_4o")]

for fname in files:
    
    aspect = re.sub(r".*\_|\.tsv", "", fname)
    print(aspect)
    if aspect == "multi":
        continue

    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data["answer"].apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    both_true = eval_frame[eval_frame["true"] == 2].shape[0]
    both_pred = eval_frame[eval_frame["pred"] == 2].shape[0]

    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])

    eval_frame = eval_frame.query("true != 2 and pred != 2")
    double_two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", aspect, three_class, two_class, double_two_class, both_true, both_pred])

emotion
content
form
style
overall


In [157]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang", "aspect", "three class","two class", "double two class", "both_true", "both_pred"]

In [158]:
frame

Unnamed: 0,model,lang,aspect,three class,two class,double two class,both_true,both_pred
0,4o-mini,eng,overall,0.36,0.328767,0.648649,27,48
1,4o-mini,eng,content,0.44,0.383333,0.676471,40,47
2,4o-mini,eng,form,0.31,0.212121,0.7,34,63
3,4o-mini,eng,style,0.39,0.277778,0.576923,46,52
4,4o-mini,eng,emotion,0.4,0.403226,0.694444,38,41
5,4o,eng,emotion,0.45,0.370968,0.605263,38,46
6,4o,eng,content,0.52,0.533333,0.864865,40,43
7,4o,eng,form,0.39,0.348485,0.621622,34,45
8,4o,eng,style,0.42,0.388889,0.724138,46,46
9,4o,eng,overall,0.43,0.465753,0.772727,27,38
