In [1]:
import numpy as np
import re
import json
import pandas as pd

In [26]:
model = "gpt-3.5-turbo"

labels = pd.read_csv("results.txt", delimiter = ",").loc[:,["pairid", " varA","varB","dataset","groundtruth"]]
labels['pairid'] = labels['pairid'].str.extract('(\d{4})').astype(int) # convert pairid to int
labels["groundtruth"] = labels["groundtruth"].str.replace(r'\s+', '', regex=True) # remove white space in labels

weights = pd.read_csv("pairmeta.txt", delimiter = " ", header = None).iloc[:,[0,5]]
weights.rename(columns = {0:"pairid", 5:"weight"}, inplace=True)
weights['pairid'] = weights['pairid'].astype(int) # convert pairid to int

merged_df = labels.merge(weights, on='pairid')

with open(model + "_response_nosys.json", "r") as file:
    loaded_data = json.load(file)
    
merged_df.head()

Unnamed: 0,pairid,varA,varB,dataset,groundtruth,weight
0,1,Altitude,Temperature,DWD,->,0.166
1,2,Altitude,Precipitation,DWD,->,0.166
2,3,Longitude,Temperature,DWD,->,0.167
3,4,Altitude,Sunshine hours,DWD,->,0.166
4,5,Age,Length,Abalone,->,0.143


In [36]:
def add_prediction(merged_df, loaded_data, model):
    merged_df = merged_df.set_index('pairid')
    for response in loaded_data:
        pairid, resid, answer = extract_answers(response)
        print(pairid,resid)
        col = "AtoB" if resid == "1" else "BtoA"
        merged_df.loc[int(pairid),col] = "Yes" if answer == "Yes." else "No" if answer == "No." else answer
        print("Converted:", merged_df.loc[int(pairid),col])
    return merged_df


def extract_answers(s):
    # Compile regex patterns
    
    pairid_pattern = re.compile(r"\{'pair0(\d{3}),_res(\d)")
    
    # Search for pairid
    pairid_match = pairid_pattern.search(str(s))
    pairid = pairid_match.group(1) if pairid_match else None
    resid = pairid_match.group(2) if pairid_match else None
    
    # Search for answer
    ans = s["pair0"+ str(pairid) + ',_res' + str(resid)]["choices"][0]["message"]['content']
    
    return pairid, resid, ans

In [37]:
loaded_data[1]

{'pair0001,_res2': {'id': 'chatcmpl-7z6lm8UI9dfx5AzCeFfm8FOJb77Mg',
  'object': 'chat.completion',
  'created': 1694798386,
  'model': 'gpt-3.5-turbo-0613',
  'choices': [{'index': 0,
    'message': {'role': 'assistant', 'content': 'No.'},
    'finish_reason': 'stop'}],
  'usage': {'prompt_tokens': 27, 'completion_tokens': 2, 'total_tokens': 29}}}

In [38]:
results = add_prediction(merged_df, loaded_data, model)

001 1
Converted: Yes
001 2
Converted: No
002 1
Converted: Yes
002 2
Converted: No
003 1
Converted: No
003 2
Converted: No
004 1
Converted: No
004 2
Converted: No
005 1
Converted: No
005 2
Converted: No
006 1
Converted: Yes
006 2
Converted: No
007 1
Converted: No
007 2
Converted: No
008 1
Converted: No
008 2
Converted: No
009 1
Converted: No
009 2
Converted: No
010 1
Converted: No
010 2
Converted: No
011 1
Converted: Yes
011 2
Converted: No
012 1
Converted: No
012 2
Converted: No
013 1
Converted: Yes
013 2
Converted: No
014 1
Converted: Yes
014 2
Converted: No
015 1
Converted: Yes
015 2
Converted: No
016 1
Converted: Yes
016 2
Converted: No
017 1
Converted: No
017 2
Converted: No
018 1
Converted: No
018 2
Converted: No
019 1
Converted: No
019 2
Converted: No
020 1
Converted: Yes
020 2
Converted: No
021 1
Converted: No
021 2
Converted: No
022 1
Converted: No
022 2
Converted: No
023 1
Converted: No
023 2
Converted: No
024 1
Converted: Yes
024 2
Converted: No
025 1
Converted: Yes
025 2
Con

In [40]:
results["AtoB"].unique()

array(['Yes', 'No'], dtype=object)

In [41]:
results["gt_AtoB"] = np.where(results["groundtruth"] == "->", "Yes", "No")
results["gt_BtoA"] = np.where(results["groundtruth"] == "->", "No", "Yes")
results["correct_AtoB"] = np.where(results["gt_AtoB"] == results["AtoB"], 1, 0)
results["correct_BtoA"] = np.where(results["gt_BtoA"] == results["BtoA"], 1, 0)
wtd_acc = (sum(results["correct_AtoB"]*results["weight"]) + sum(results["correct_BtoA"]*results["weight"])) / (results["weight"].sum()*2)
acc = (sum(results["correct_AtoB"]) + sum(results["correct_BtoA"])) / 108 / 2
print("acc:", acc, "weighted acc:", wtd_acc)

acc: 0.7407407407407407 weighted acc: 0.7556919731575289


In [42]:
results

Unnamed: 0_level_0,varA,varB,dataset,groundtruth,weight,AtoB,BtoA,gt_AtoB,gt_BtoA,correct_AtoB,correct_BtoA
pairid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Altitude,Temperature,DWD,->,0.166,Yes,No,Yes,No,1,1
2,Altitude,Precipitation,DWD,->,0.166,Yes,No,Yes,No,1,1
3,Longitude,Temperature,DWD,->,0.167,No,No,Yes,No,0,1
4,Altitude,Sunshine hours,DWD,->,0.166,No,No,Yes,No,0,1
5,Age,Length,Abalone,->,0.143,No,No,Yes,No,0,1
...,...,...,...,...,...,...,...,...,...,...,...
104,time for passing 1. segment,time for passing 2. segment,D. Janzing,->,0.200,No,No,Yes,No,0,1
105,pixel vector of a patch,total brightness at the screen,D. Janzing,->,0.500,No,No,Yes,No,0,1
106,time required for one round,voltage,D. Janzing,<-,1.000,No,No,No,Yes,1,0
107,strength of contrast,answer correct or not,"Schuett, edited by D. Janzing",->,1.000,Yes,No,Yes,No,1,1
