In [1]:
import numpy as np
import re
import json
import pandas as pd

In [2]:
# Name of the evaluated model; it doubles as the prediction column name and
# as the prefix of the response file loaded below.
model = "gpt-3.5-turbo"

# Ground-truth labels. Note that the " varA" column is selected with a leading
# space in its name, exactly as written here.
labels = pd.read_csv("results.txt", delimiter=",").loc[
    :, ["pairid", " varA", "varB", "dataset", "groundtruth"]
]
# Raw string: "\d" inside a plain string literal is an invalid escape sequence
# and triggers a warning on current Python versions.
labels["pairid"] = labels["pairid"].str.extract(r"(\d{4})", expand=False).astype(int)  # convert pairid to int
labels["groundtruth"] = labels["groundtruth"].str.replace(r"\s+", "", regex=True)  # remove white space in labels

# Per-pair weights: columns 0 and 5 of the space-delimited metadata file are
# kept and named "pairid" and "weight". Built as one chain (no inplace
# mutation) so the cell can be re-run safely.
weights = (
    pd.read_csv("pairmeta.txt", delimiter=" ", header=None)
    .iloc[:, [0, 5]]
    .rename(columns={0: "pairid", 5: "weight"})
    .astype({"pairid": int})  # convert pairid to int
)

# Join labels and weights on the pair id and add an empty prediction column.
merged_df = labels.merge(weights, on="pairid")
merged_df[model] = ""

# Raw model responses, as stored in the JSON file for this model.
with open(model + "_response_trial2.json", "r") as file:
    loaded_data = json.load(file)

merged_df.head()

Unnamed: 0,pairid,varA,varB,dataset,groundtruth,weight,gpt-3.5-turbo
0,1,Altitude,Temperature,DWD,->,0.166,
1,2,Altitude,Precipitation,DWD,->,0.166,
2,3,Longitude,Temperature,DWD,->,0.167,
3,4,Altitude,Sunshine hours,DWD,->,0.166,
4,5,Age,Length,Abalone,->,0.143,


In [3]:
def add_prediction(merged_df, loaded_data, model):
    """Parse every raw response and record the predicted causal direction.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Table of pairs, either with a 'pairid' column or already indexed by
        pair id. It is NOT modified; a pairid-indexed copy is returned.
    loaded_data : iterable
        Raw responses; each one is passed to ``extract_answers`` as ``str(response)``.
    model : str
        Name of the column that receives the prediction.

    Returns
    -------
    pandas.DataFrame
        Copy of ``merged_df`` indexed by pair id, with ``model`` filled in.
        An answer that is exactly the option letter, or the letter followed by
        punctuation/space (e.g. "A: ..."), becomes '->' for A and '<-' for B.
        Anything else ('A/B', free text, the "Error: ..." markers) is stored
        verbatim.
    """
    # Index a copy instead of calling set_index(inplace=True) on the caller's
    # frame: the in-place version fails with KeyError when the cell is re-run,
    # because 'pairid' is no longer a column the second time.
    if 'pairid' in merged_df.columns:
        predictions = merged_df.set_index('pairid')
    else:
        predictions = merged_df.copy()

    for response in loaded_data:
        pairid, answer = extract_answers(str(response))
        print(pairid, answer)
        if pairid is None:
            # No pair id could be parsed; there is no row to write to.
            print("Skipped: no pair id found in response", "\n")
            continue
        # Accept only a standalone option letter. The lookahead rejects
        # 'A/B' as well as ordinary words that merely start with A or B
        # ("Altitude ...", "Both ..."), and an empty answer simply fails to match.
        choice = re.match(r'\s*([AB])(?![\w/])', answer)
        if choice:
            answer = '->' if choice.group(1) == 'A' else '<-'
        predictions.loc[int(pairid), model] = answer
        print("Converted:", answer, "\n")
    return predictions


def extract_answers(s):
    """Pull the pair id and the tagged answer out of one response string.

    The pair id is the three digits captured from the first ``{'pair0NNN,``
    occurrence in ``s`` (``None`` when absent). The answer is the text between
    ``<Answer>`` and ``</Answer>``.

    Returns a ``(pairid, answer)`` tuple. When the tags are missing the answer
    is ``"Error: empty"``; when they occur more than once it is
    ``"Error: more than one"``.
    """
    id_hit = re.search(r"\{'pair0(\d{3}),", s)
    pairid = None if id_hit is None else id_hit.group(1)

    tagged = re.findall(r'<Answer>(.*?)</Answer>', s)

    # Guard clauses for the two malformed cases, then the single-answer case.
    if not tagged:
        return pairid, "Error: empty"
    if len(tagged) > 1:
        return pairid, "Error: more than one"
    return pairid, tagged[0]

In [4]:
# Parse the loaded responses into the model's prediction column; the frame
# returned by add_prediction is kept as `results` for the cells below.
results = add_prediction(merged_df, loaded_data, model)

001 B
Converted: <- 

002 B
Converted: <- 

003 Error: empty
Converted: Error: empty 

004 A: Changing the altitude causes a change in the sunshine hours.
Converted: -> 

005 A
Converted: -> 

006 A
Converted: -> 

007 A
Converted: -> 

008 A
Converted: -> 

009 A: Changing the age of an abalone causes a change in the whole weight of an abalone.
Converted: -> 

010 A
Converted: -> 

011 A
Converted: -> 

012 A
Converted: -> 

013 A
Converted: -> 

014 A
Converted: -> 

015 A
Converted: -> 

016 A
Converted: -> 

017 Error: empty
Converted: Error: empty 

018 A: Changing the age of a person causes a change in the concentration of GAG of a person.
Converted: -> 

019 B
Converted: <- 

020 B
Converted: <- 

021 B
Converted: <- 

022 Error: empty
Converted: Error: empty 

023 Error: empty
Converted: Error: empty 

024 A
Converted: -> 

025 A
Converted: -> 

026 A
Converted: -> 

027 A
Converted: -> 

028 A
Converted: -> 

029 A
Converted: -> 

030 A
Converted: -> 

031 A
Converted: -> 

03

In [9]:
# Print the raw response stored at selected list positions. The label shown is
# position + 1. NOTE(review): these look like the pairs whose parsed answer
# needed a manual look (e.g. the "Error: empty" ones) - confirm.
for position in (2, 16, 21, 22, 40, 70):
    print("Pairid ", position + 1, ": \n", loaded_data[position])

Pairid  3 : 
 {'pair0003,': {'id': 'chatcmpl-7zB3vdhbNfo99LUnJLcbhLh3y15lV', 'object': 'chat.completion', 'created': 1694814887, 'model': 'gpt-3.5-turbo-0613', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'To determine the more likely cause-and-effect relationship between changing longitude and changing temperature, we can consider the following:\n\n1. Causality: We need to determine if there is a logical connection between the two variables. \n\n- Changing longitude: Longitude refers to the angular distance of a location east or west of the Prime Meridian. It is a geographic coordinate and does not directly influence temperature.\n- Changing temperature: Temperature refers to the degree of hotness or coldness of a substance or environment. It can be influenced by various factors such as weather patterns, altitude, proximity to bodies of water, and more.\n\n2. Plausibility: We need to assess the plausibility of the cause-and-effect relationship.\n\n- Changing lo

In [10]:
# Manual override: set the prediction stored for pair 71 to "<-".
# NOTE(review): presumably decided by reading the raw response by hand - the
# justification is not recorded here; confirm and document it.
results.loc[71,model] = "<-"

In [11]:
# Show the results table.
# NOTE(review): the saved output of this cell already contains a `correct`
# column, which is only created by the accuracy cell further down - so this
# cell was last executed out of order. Re-check with Restart & Run All.
results

Unnamed: 0_level_0,varA,varB,dataset,groundtruth,weight,gpt-3.5-turbo,correct
pairid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Altitude,Temperature,DWD,->,0.166,<-,0
2,Altitude,Precipitation,DWD,->,0.166,<-,0
3,Longitude,Temperature,DWD,->,0.167,Error: empty,0
4,Altitude,Sunshine hours,DWD,->,0.166,->,1
5,Age,Length,Abalone,->,0.143,->,1
...,...,...,...,...,...,...,...
104,time for passing 1. segment,time for passing 2. segment,D. Janzing,->,0.200,"Therefore, the likelihood of either scenario A...",0
105,pixel vector of a patch,total brightness at the screen,D. Janzing,->,0.500,->,1
106,time required for one round,voltage,D. Janzing,<-,1.000,<-,1
107,strength of contrast,answer correct or not,"Schuett, edited by D. Janzing",->,1.000,->,1


In [12]:
# Mark each pair 1 when the stored prediction equals the ground-truth arrow,
# 0 otherwise (unparsed or error answers therefore count as wrong).
results["correct"] = (results["groundtruth"] == results[model]).astype(int)

# Weighted accuracy: correctness weighted by each pair's weight, normalised by
# the total weight.
wtd_acc = (results["correct"] * results["weight"]).sum() / results["weight"].sum()

# Unweighted accuracy over the rows actually present, instead of dividing by a
# fixed pair count that silently goes stale when the table changes.
acc = results["correct"].sum() / len(results)
print("acc:", acc, "weighted acc:", wtd_acc)

acc: 0.7685185185185185 weighted acc: 0.7774162198477356
