In [12]:
import os
import re
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv("testset.tsv", sep="\t")

In [26]:
transdict = {"overall":"Insgesamt",
             "style":"Stil",
             "form":"Form",
             "content":"Inhalt",
             "emotion":"Emotion"}

gpt_trans = {"B":1,"A":0,"H":2,"E":2,"L":3}
dataset_trans = {"left":0, "right":1, "same":2}

# Baselines

In [4]:
logger = []

In [5]:
files = [x for x in os.listdir("results") if x.startswith("eng_baseline_mini")]

for fname in files:
    
    aspect = re.sub(".*\_|\.tsv", "", fname)
    print(aspect)
    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o-mini","eng", three_class, two_class])

overall
style
emotion
content
form


  aspect = re.sub(".*\_|\.tsv", "", fname)


In [6]:
files = [x for x in os.listdir("results") if x.startswith("ger_baseline_mini")]

for fname in files:
    
    aspect = re.sub(".*\_|\.tsv", "", fname)
    print(aspect)
    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o-mini","ger", three_class, two_class])

overall
content
form
emotion
style


  aspect = re.sub(".*\_|\.tsv", "", fname)


In [7]:
files = [x for x in os.listdir("results") if x.startswith("eng_baseline_4o")]

for fname in files:
    
    aspect = re.sub(".*\_|\.tsv", "", fname)
    print(aspect)
    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", three_class, two_class])

emotion
style
form
overall
content


  aspect = re.sub(".*\_|\.tsv", "", fname)


In [8]:
files = [x for x in os.listdir("results") if x.startswith("ger_baseline_4o")]

for fname in files:
    
    aspect = re.sub(".*\_|\.tsv", "", fname)
    print(aspect)
    data = pd.read_csv("results/"+fname, sep="\t")
    data["pred"] = data.answer.apply(lambda x: gpt_trans[x[-1]])
    true = dataset[transdict[aspect]]
    true = [dataset_trans[x] for x in true]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = true
    eval_frame["pred"] = list(data["pred"])
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","ger", three_class, two_class])

content
style
emotion
form
overall


  aspect = re.sub(".*\_|\.tsv", "", fname)


In [14]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang","three class","two class"]

In [16]:
frame.groupby(["model","lang"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
4o,eng,0.48,0.646627
4o,ger,0.474,0.69275
4o-mini,eng,0.37,0.55548
4o-mini,ger,0.388,0.619383


## Multi vs sinlge aspect

In [38]:
logger = []

In [39]:
data = pd.read_csv("results/ger_baseline_mini_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub("\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","ger", three_class, two_class])
    

  ans = [re.sub("\s+$","",x) for x in ans]


In [40]:
data = pd.read_csv("results/eng_baseline_mini_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub("\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["mini","eng", three_class, two_class])

  ans = [re.sub("\s+$","",x) for x in ans]


In [43]:
data = pd.read_csv("results/eng_baseline_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub("\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","eng", three_class, two_class])

  ans = [re.sub("\s+$","",x) for x in ans]


In [45]:
data = pd.read_csv("results/ger_baseline_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub("\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","ger", three_class, two_class])

  ans = [re.sub("\s+$","",x) for x in ans]


In [46]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang","three class","two class"]
frame.groupby(["model","lang"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
4o,eng,0.498,0.508777
4o,ger,0.468,0.565502
mini,eng,0.428,0.480709
mini,ger,0.342,0.483855


# few shot

In [78]:
logger = []

In [79]:
data = pd.read_csv("results/eng_few_4o_multi.tsv",sep="\t")

answers = []
for ans in data["answer"]:
    
    ans = ans.split("\n")
    ans = [re.sub("\s+$","",x) for x in ans]
    ins = gpt_trans[ans[-1][-1]]
    emo = gpt_trans[ans[-2][-1]]
    form = gpt_trans[ans[-3][-1]]
    stil = gpt_trans[ans[-4][-1]]
    inhalt = gpt_trans[ans[-5][-1]]
    
    answers.append([inhalt, stil, form, emo, ins])
    
prediction = pd.DataFrame(answers)
prediction.columns = ["Inhalt", "Stil", "Form", "Emotion", "Insgesamt"]

y_true = []
y_pred = []
for aspect in prediction.columns:
    
    y_true = [dataset_trans[x] for x in dataset[aspect]]
    y_pred = prediction[aspect]
    
    eval_frame = pd.DataFrame()
    eval_frame["true"] = y_true
    eval_frame["pred"] = y_pred
    
    three_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    eval_frame = eval_frame[eval_frame.true != 2]
    
    two_class = accuracy_score(eval_frame["true"], eval_frame["pred"])
    
    logger.append(["4o","ger", three_class, two_class])

  ans = [re.sub("\s+$","",x) for x in ans]


In [80]:
frame = pd.DataFrame(logger)
frame.columns = ["model","lang","three class","two class"]
frame.groupby(["model","lang"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,three class,two class
model,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
4o,ger,0.452,0.494748


In [76]:
frame

Unnamed: 0,model,lang,three class,two class
0,4o,ger,0.43,0.533333
1,4o,ger,0.36,0.425926
2,4o,ger,0.33,0.257576
3,4o,ger,0.38,0.387097
4,4o,ger,0.47,0.506849
