In [1]:
import numpy as np
import re
import json
import pandas as pd

In [2]:
model = "gpt-3.5-turbo"

# Ground-truth table: one row per variable pair with its labelled direction.
# NOTE(review): " varA" carries a leading space - presumably it mirrors the
# header written in results.txt; confirm before normalising it.
labels = pd.read_csv("results.txt", delimiter=",").loc[:, ["pairid", " varA", "varB", "dataset", "groundtruth"]]
# Raw string: '\d' inside a plain literal is an invalid escape sequence and
# triggers a warning on current Python versions. expand=False yields a Series
# rather than a one-column DataFrame, so a plain column is assigned.
labels['pairid'] = labels['pairid'].str.extract(r'(\d{4})', expand=False).astype(int)  # convert pairid to int
labels["groundtruth"] = labels["groundtruth"].str.replace(r'\s+', '', regex=True)  # remove white space in labels

# Per-pair weights: columns 0 and 5 of the space-separated, header-less file.
weights = (
    pd.read_csv("pairmeta.txt", delimiter=" ", header=None)
    .iloc[:, [0, 5]]
    .rename(columns={0: "pairid", 5: "weight"})
)
weights['pairid'] = weights['pairid'].astype(int)  # convert pairid to int

# Inner join on pairid (the merge default); only ids present in both tables survive.
merged_df = labels.merge(weights, on='pairid')
merged_df[model] = ""  # placeholder column that add_prediction fills in

# Raw model responses saved for this model.
with open(model + "_response_nosys.json", "r") as file:
    loaded_data = json.load(file)

merged_df.head()

Unnamed: 0,pairid,varA,varB,dataset,groundtruth,weight,gpt-3.5-turbo
0,1,Altitude,Temperature,DWD,->,0.166,
1,2,Altitude,Precipitation,DWD,->,0.166,
2,3,Longitude,Temperature,DWD,->,0.167,
3,4,Altitude,Sunshine hours,DWD,->,0.166,
4,5,Age,Length,Abalone,->,0.143,


In [3]:
def add_prediction(merged_df, loaded_data, model, verbose=True):
    """Write each parsed model answer into the ``model`` column of ``merged_df``.

    Every element of ``loaded_data`` is converted with ``str()`` and handed to
    ``extract_answers``. An answer whose first token is a standalone ``A`` is
    stored as ``'->'`` and a standalone ``B`` as ``'<-'``. The literal
    ``'A/B'`` and any other text (including the ``'Error: ...'`` markers) are
    stored unchanged, apart from surrounding whitespace being stripped.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame holding the pair ids either in a ``'pairid'`` column or already
        in its index. It is modified in place: re-indexed by ``'pairid'``
        when that column is present, and its ``model`` column is filled.
    loaded_data : iterable
        Raw responses; each must stringify to text in which
        ``extract_answers`` can find a pair id.
    model : str
        Name of the column that receives the predictions.
    verbose : bool, default True
        Print the raw and the converted answer for every response.

    Returns
    -------
    pandas.DataFrame
        The same ``merged_df`` object, indexed by pair id.

    Raises
    ------
    ValueError
        If no pair id can be extracted from a response.
    """
    # Only index by pair id the first time: on a re-run of the calling cell
    # the column has already been moved into the index and an unconditional
    # set_index would raise KeyError.
    if 'pairid' in merged_df.columns:
        merged_df.set_index('pairid', inplace=True)

    for response in loaded_data:
        text = str(response)
        pairid, answer = extract_answers(text)
        if verbose:
            print(pairid, answer)
        if pairid is None:
            # Fail with context instead of the opaque TypeError from int(None).
            raise ValueError("no pair id found in response: " + text[:80])

        answer = answer.strip()
        # Require A/B to be a token of its own ("A", "A:", "B.") so that words
        # such as "Both" are not mistaken for an option; this also copes with
        # an empty answer, where indexing answer[0] would raise IndexError.
        if answer != 'A/B' and re.match(r'[AB]\b', answer):
            answer = '->' if answer[0] == 'A' else '<-'

        merged_df.loc[int(pairid), model] = answer
        if verbose:
            print("Converted:", answer, "\n")
    return merged_df


def extract_answers(s):
    """Pull the pair id and the tagged answer out of a stringified response.

    Parameters
    ----------
    s : str
        Text to scan, e.g. ``str()`` of one response record.

    Returns
    -------
    tuple
        ``(pairid, answer)``. ``pairid`` is the three digits that follow
        ``{'pair0`` (as a string), or ``None`` when that prefix is absent.
        ``answer`` is the text between the single ``<Answer>...</Answer>``
        pair, ``"Error: empty"`` when there is no such pair, or
        ``"Error: more than one"`` when there are several.
    """
    # Pair id: first occurrence of the "{'pair0NNN," prefix.
    id_hit = re.search(r"\{'pair0(\d{3}),", s)
    pairid = None if id_hit is None else id_hit.group(1)

    # Every non-greedy <Answer>...</Answer> span in the text.
    tagged = re.findall(r'<Answer>(.*?)</Answer>', s)

    # Exactly one tagged answer is expected; anything else becomes a marker.
    if not tagged:
        return pairid, "Error: empty"
    if len(tagged) > 1:
        return pairid, "Error: more than one"
    return pairid, tagged[0]

In [4]:
# Parse every stored response into the `model` column. add_prediction returns
# the frame it was given, so `results` and `merged_df` are the same object.
results = add_prediction(merged_df=merged_df, loaded_data=loaded_data, model=model)

001 B
Converted: <- 

002 B
Converted: <- 

003 B
Converted: <- 

004 A
Converted: -> 

005 A
Converted: -> 

006 A
Converted: -> 

007 A
Converted: -> 

008 A
Converted: -> 

009 B
Converted: <- 

010 A
Converted: -> 

011 A
Converted: -> 

012 Error: empty
Converted: Error: empty 

013 A
Converted: -> 

014 A
Converted: -> 

015 A
Converted: -> 

016 A
Converted: -> 

017 A
Converted: -> 

018 A
Converted: -> 

019 A
Converted: -> 

020 B
Converted: <- 

021 B
Converted: <- 

022 A
Converted: -> 

023 A
Converted: -> 

024 Neither A nor B
Converted: Neither A nor B 

025 A
Converted: -> 

026 A
Converted: -> 

027 B
Converted: <- 

028 A
Converted: -> 

029 A
Converted: -> 

030 A
Converted: -> 

031 A
Converted: -> 

032 A
Converted: -> 

033 A
Converted: -> 

034 A
Converted: -> 

035 A
Converted: -> 

036 A
Converted: -> 

037 A
Converted: -> 

038 A
Converted: -> 

039 A
Converted: -> 

040 A
Converted: -> 

041 A
Converted: -> 

042 Error: empty
Converted: Error: empty 

043 A
C

In [9]:
# Zero-based positions in `loaded_data` of the responses to read in full.
positions_to_inspect = [11, 23, 41, 82, 83, 86, 91]
for position in positions_to_inspect:
    # The printed label is position + 1.
    # NOTE(review): this presumes list order follows pair id - confirm.
    print("Pairid ", position + 1, ": \n", loaded_data[position])

Pairid  12 : 
 {'pair0012,': {'id': 'chatcmpl-7z6JPpqtPLzBaUmmeUFtDWzHU4mM5', 'object': 'chat.completion', 'created': 1694796627, 'model': 'gpt-3.5-turbo-0613', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "To determine the more likely cause-and-effect relationship, we need to consider the logical sequence of events and the plausibility of each scenario.\n\nA: Changing the age of a person causes a change in the wage per hour of a person.\nThis relationship suggests that as a person's age changes, their wage per hour also changes. It is plausible that as individuals gain more experience and skills with age, their wage per hour may increase. However, it is also possible that other factors such as education, job performance, or market demand for certain skills may have a more significant impact on wage changes. Therefore, this relationship is less likely.\n\nB: Changing their wage per hour causes a change in their age.\nThis relationship suggests that as a person's

In [6]:
# Hand-assigned directions, keyed by pair id, that replace the parsed values.
# NOTE(review): presumably taken from reading the raw responses printed in the
# inspection cell; pairs 24 and 42 were printed there too but get no override.
manual_directions = {12: "->", 83: "->", 84: "<-", 87: "->", 92: "->"}
for pair, direction in manual_directions.items():
    results.loc[pair, model] = direction

In [7]:
# Display the table of ground-truth directions and model predictions.
results

Unnamed: 0_level_0,varA,varB,dataset,groundtruth,weight,gpt-3.5-turbo
pairid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Altitude,Temperature,DWD,->,0.166,<-
2,Altitude,Precipitation,DWD,->,0.166,<-
3,Longitude,Temperature,DWD,->,0.167,<-
4,Altitude,Sunshine hours,DWD,->,0.166,->
5,Age,Length,Abalone,->,0.143,->
...,...,...,...,...,...,...
104,time for passing 1. segment,time for passing 2. segment,D. Janzing,->,0.200,<-
105,pixel vector of a patch,total brightness at the screen,D. Janzing,->,0.500,->
106,time required for one round,voltage,D. Janzing,<-,1.000,<-
107,strength of contrast,answer correct or not,"Schuett, edited by D. Janzing",->,1.000,->


In [8]:
# 1 where the predicted direction equals the ground truth, 0 otherwise.
results["correct"] = (results["groundtruth"] == results[model]).astype(int)
# Weighted accuracy: mean of `correct` weighted by each pair's weight.
wtd_acc = np.average(results["correct"], weights=results["weight"])
# Plain accuracy over the rows actually present in `results`, rather than a
# hard-coded count of 108, so it stays correct if the number of rows changes.
acc = results["correct"].mean()
print("acc:", acc, "weighted acc:", wtd_acc)

acc: 0.8333333333333334 weighted acc: 0.8704904623069447


In [46]:
# NOTE(review): this cell repeats the accuracy computation of the previous
# cell; its recorded output differs, so it was presumably executed against a
# different state of `results` - consider keeping a single copy.
# 1 where the predicted direction equals the ground truth, 0 otherwise.
results["correct"] = (results["groundtruth"] == results[model]).astype(int)
# Weighted accuracy: mean of `correct` weighted by each pair's weight.
wtd_acc = np.average(results["correct"], weights=results["weight"])
# Plain accuracy over the rows actually present in `results`, rather than a
# hard-coded count of 108, so it stays correct if the number of rows changes.
acc = results["correct"].mean()
print("acc:", acc, "weighted acc:", wtd_acc)

acc: 0.7962962962962963 weighted acc: 0.8197082407001405
