In [4]:
%load_ext autoreload
%autoreload 2
from art_uw import nl_program, conf_edit
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# tool use distribution
import parsimonious
import json
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import ast

# tool use extraction utils
class Node:
    """Plain record of one parse-tree node: its grammar rule name and matched text."""

    def __init__(self, expr_name, text):
        self.expr_name = expr_name
        self.text = text

    def get_content(self):
        """Return the node as a dict with the keys ``expr_name`` and ``text``."""
        return dict(expr_name=self.expr_name, text=self.text)

    def __str__(self):
        """Render the node as JSON pretty-printed with a 2-space indent."""
        return json.dumps(self.get_content(), indent=2)
    
    
def recursive_node_visit(node, selection_criterion, node_list):
    """Collect matching nodes from the tree rooted at ``node``, children first.

    Every child subtree is walked before ``node`` itself is examined
    (post-order). A visited node whose ``expr_name`` is contained in
    ``selection_criterion`` is appended to ``node_list`` as
    ``Node(expr_name, text)``. ``node_list`` is mutated in place and nothing
    is returned.
    """
    for descendant in node.children:
        recursive_node_visit(descendant, selection_criterion, node_list)

    # Only nodes produced by one of the requested grammar rules are recorded.
    if node.expr_name not in selection_criterion:
        return
    node_list.append(Node(node.expr_name, node.text))
    
def clean_text(node: list) -> list:
    """Extract every bracketed tag such as ``[search]`` from a list of strings.

    A tag is an opening bracket, one or more ASCII letters, underscores or
    spaces, and a closing bracket. Tags are returned in order of appearance,
    flattened across all input strings; strings without a tag contribute
    nothing.
    """
    tag_re = re.compile(r"\[[A-Za-z_ ]+\]")
    return [tag for entry in node for tag in tag_re.findall(entry)]
    
    
def collect_usage(df: pd.DataFrame) -> list:
    """Extract the tool tags used by each program in ``df["after_art"]``.

    Each entry of the ``after_art`` column is parsed with the grammar below
    (``Input: ...`` header, then ``Qn: [tool] instruction`` / ``#n: output``
    pairs, then ``Ans: ...``). For every parsed ``Qn``/``#n`` pair the
    lowercased ``command_start`` text is scanned for bracketed tags.

    Args:
        df: frame with an ``after_art`` column holding the program text.

    Returns:
        One list per row, in row order, holding the lowercased tags (e.g.
        ``"[search]"``) in order of appearance. A row that cannot be parsed
        or processed yields an empty list.
    """
    grammar = parsimonious.grammar.Grammar(
r"""
program = program_start*node*partial_command*final_answer
program_start = input_start~r"( |\n)"text~r"\n"
input_start = ~r"Input:"
text = ~r"(?<=Input:( |\n))(.|\n|\t)*?(?=\nQ[0-9]+:)"
node = command_node~r"\n"output_node~r"\n"
command_node = command_start~r"( |\n)"command_instruction
output_node = begin_answer~r"( |\n)"output
command_instruction = ~r"(?<=\]( |\n))(.|\n|\t)*?(?=\n\#[0-9]+)"
command_start = ~r"Q[0-9]+:[ ]+\[[A-Za-z_ ]+\]"
begin_answer = ~r"\#[0-9]+:"
output = ~r"(?<=\#[0-9]+:( |\n))(.|\n|\t)*?(?=\nQ[0-9]+:)"
partial_command = command_start~r"\n"
final_answer = ~r"Ans:( |\n)(.|\n)*$"
""")

    wanted_rules = ["command_start", "command_instruction", "begin_answer", "output"]
    tool_use = []

    for p in df["after_art"]:
        try:
            parsed_program = grammar.parse(p)
            # `program` is a four-part sequence; index 1 is the `node*`
            # repetition, whose children are the individual Q/# pairs.
            command_nodes = parsed_program.children[1]
            command_node_list = []

            for node in command_nodes.children:
                child_node_list = []
                recursive_node_visit(node, wanted_rules, child_node_list)
                command_node_list.append(child_node_list)

            # The traversal is post-order, so the first collected entry of
            # each pair is its `command_start` ("Qn: [tool]").
            commands = [node[0].text.lower() for node in command_node_list]
            tool_use.append(clean_text(commands))

        except Exception:
            # Best effort: an unparsable or malformed program counts as "no
            # tools used". Unlike a bare `except:`, this lets
            # KeyboardInterrupt / SystemExit propagate, so a long run can
            # still be interrupted.
            tool_use.append([])

    return tool_use


def collect_conf(df: pd.DataFrame) -> list:
    """Compute the mean verbalized confidence of each program in ``df["after_art"]``.

    The grammar matches programs whose commands look like
    ``Qn: [tool] [score] instruction`` followed by ``#n: output``; ``[score]``
    is a bracketed run of digits such as ``[90]``.

    Args:
        df: frame with an ``after_art`` column holding the program text.

    Returns:
        One value per row, in row order: the average of the row's confidence
        scores, or ``0`` when the row cannot be parsed or has no scored
        command.
    """
    grammar = parsimonious.grammar.Grammar(
r"""
program = program_start*node*partial_command*final_answer
program_start = input_start~r"( |\n)"text~r"\n"
input_start = ~r"\n*Input:"
text = ~r"(?<=Input:( |\n))(.|\n|\t)*?(?=\nQ[0-9]+:)"
node = command_node~r"\n"output_node~r"\n"
command_node = command_start~r"( |\n)"confidence_score~r"( |\n)"command_instruction
output_node = begin_answer~r"( |\n)"output
command_instruction = ~r"(?<=[0-9]\]( |\n))(.|\n|\t)*?(?=\n\#[0-9]+)"
command_start = ~r"Q[0-9]+:[ \n]+\[[A-Za-z_ ]+\]"
confidence_score = ~r"(?<=\]( |\n))\[[0-9]+\]"
begin_answer = ~r"\#[0-9]+:"
output = ~r"(?<=\#[0-9]+:( |\n))(.|\n|\t)*?(?=\nQ[0-9]+:)"
partial_command = command_start~r"\n"
final_answer = ~r"Ans:( |\n)(.|\n)*$"
""")

    wanted_rules = ["command_start", "confidence_score", "command_instruction", "begin_answer", "output"]
    conf = []

    for p in df["after_art"]:
        try:
            parsed_program = grammar.parse(p)
            # `program` is a four-part sequence; index 1 is the `node*`
            # repetition, whose children are the individual Q/# pairs.
            command_nodes = parsed_program.children[1]
            command_node_list = []

            for node in command_nodes.children:
                child_node_list = []
                recursive_node_visit(node, wanted_rules, child_node_list)
                command_node_list.append(child_node_list)

            # Post-order traversal: entry 0 is `command_start`, entry 1 is
            # the `confidence_score` that follows it.
            conf.append([node[1].text for node in command_node_list])

        except Exception:
            # Best effort: a malformed program contributes no scores. Unlike
            # a bare `except:`, KeyboardInterrupt / SystemExit still propagate.
            conf.append([])

    # Convert each "[90]"-style score to a number and average per row.
    final = []
    for scores in conf:
        if scores:
            # Strip the brackets and parse the digits directly: int() accepts
            # leading zeros ("[05]"), which ast.literal_eval rejects with a
            # SyntaxError even though the grammar's [0-9]+ allows them.
            final.append(np.average([int(s[1:-1]) for s in scores]))
        else:
            final.append(0)
    return final

def jaccard_sim(actual_tool, ref_tools): # ref tools is a ndarray
    """Return the best Jaccard similarity between the used tools and any reference.

    Args:
        actual_tool: iterable of tool tags that were actually used; duplicates
            are ignored.
        ref_tools: iterable of reference tool collections to compare against.

    Returns:
        The largest ``|A & R| / |A | R|`` over all references ``R``, where
        ``A`` is the set of used tools. Two empty sets count as identical
        (``1.0``) instead of raising ``ZeroDivisionError``; with no
        references at all the result is ``0.0`` instead of a ``ValueError``
        from ``max([])``.
    """
    used = set(actual_tool)
    best = 0.0
    for ref in ref_tools:
        ref_set = set(ref)
        union = used | ref_set
        # An empty union means both sets are empty, i.e. they agree exactly.
        score = len(used & ref_set) / len(union) if union else 1.0
        best = max(best, score)
    return best

def check_answer(row):
    """Return True when the prediction and the gold answer contain one another.

    ``row`` must expose ``prediction`` and ``answer`` string attributes. The
    comparison is case-insensitive and passes if either string is a substring
    of the other. An empty prediction is always judged incorrect.
    """
    if row.prediction == "":
        return False
    gold = row.answer.lower()
    guess = row.prediction.lower()
    return gold in guess or guess in gold



def collect_conf_acc(f, metric="qa"):
    """Bin the confidences of one result file and report per-bin accuracy and ECE.

    Reads the first 1000 records of ``f``, obtains one confidence per record
    (from the ``logprobs`` column when that works, otherwise from the scores
    verbalized in ``after_art``), groups the records by confidence bin, prints
    the expected calibration error and returns the per-bin accuracy.

    Args:
        f: path of a JSON file readable by ``pd.read_json``.
        metric: what counts as "accuracy" inside a bin. ``"qa"`` (default,
            the original behavior) uses ``check_answer`` on each row;
            ``"tool_use"`` uses the Jaccard similarity between the tools a
            row used and the reference tool sets.

    Returns:
        ``(conf_range, acc)``: two parallel lists of ints, sorted by bin, with
        the bin label and the mean accuracy of that bin, both scaled to 0-100
        and truncated.

    Raises:
        ValueError: if ``metric`` is not ``"qa"`` or ``"tool_use"``.
    """
    if metric not in ("qa", "tool_use"):
        raise ValueError(f"metric must be 'qa' or 'tool_use', got {metric!r}")

    print(f"Reading file {f}...")
    # read single file
    df = pd.read_json(f)[:1000]

    # logit based conf collection
    try:
        # NOTE(review): filter_tool_log is not defined in the visible part of
        # this file; if it is missing, or the file has no `logprobs` column,
        # the fallback below is used.
        output = 100*np.e**(df['logprobs'].apply(filter_tool_log))
    # verbalized conf collection
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not mistaken for
        # "no logprobs available".
        output = collect_conf(df)

    # NOTE(review): linspace(0, 100, 10) yields 10 edges, i.e. 9 intervals of
    # width ~11.1, while the bin label below treats index b as the decile
    # [10*(b-1), 10*b). Left unchanged because the labels are passed on to
    # conf_edit; confirm which binning is intended before changing it.
    bins = np.linspace(0, 100, 10)
    bin_indices = np.digitize(output, bins=bins)

    if metric == "tool_use":
        # Tags are lowercase because collect_usage lowercases every tool tag
        # before extracting it; a mixed-case reference could never match.
        ref_tools = [
            ['[search]', '[check answer type]', '[compare]'],
            ['[search]', '[check answer type]', '[internal knowledge]']
        ]
        scores = [jaccard_sim(each, ref_tools) for each in collect_usage(df)]
    else:
        df["adj_correct"] = df.apply(check_answer, axis=1)
        # Positional values: independent of whatever index read_json produced.
        scores = df["adj_correct"].to_numpy()

    # Group the per-record scores by bin label (midpoint of the decile, 0-1).
    bin_acc = {}
    for b, score in zip(bin_indices, scores):
        which_bin = 0.1 * np.average([b-1, b])
        bin_acc.setdefault(which_bin, []).append(score)

    # ECE: |mean accuracy - bin label| per bin, weighted by the bin's share
    # of all records.
    ece = sum(
        len(members) / len(df) * np.abs(np.average(members) - label)
        for label, members in bin_acc.items()
    )

    print(f"ECE for {f}: {format(ece, '.4f')}\n")

    plt_data = sorted(bin_acc.items())
    conf_range = [int(100 * d[0]) for d in plt_data]
    acc = [int(100*np.average(d[1])) for d in plt_data]
    return conf_range, acc
    


In [6]:
# Result files computed on the development split. Each one is binned by
# collect_conf_acc to obtain a confidence -> accuracy table. Commented-out
# entries are earlier runs kept for reference.
# NOTE: dev_files and files are paired by position; keep both lists the same
# length and in the same order, because zip() below silently stops at the
# shorter one.
dev_files = [
#     "mintaka_(dev)_llama_3_70b_out.json",
#     "pop_qa_(dev)_llama_3_70b_out.json",
#     "entity_ques_(dev)_llama_3_70b_out.json",
#     "pop_qa_(dev)_gpt-3.5-turbo-instruct-0914_out.json",
#     "pop_qa_(dev)_gpt-3.5-turbo_out.json",
#     "pop_qa_(dev)_gpt-4_out.json",
#     "entity_ques_(dev)_gpt-3.5-turbo-instruct-0914_out.json",
#     "whole_data_uncalibrated/entity_ques_1_gpt-3.5-turbo_out.json",
#     "entity_ques_(dev)_gpt-4_out.json",
#     "mintaka_(dev)_gpt-3.5-turbo-instruct-0914_out.json",
#     "mintaka_(dev)_gpt-3.5-turbo_out.json",
#     "mintaka_(dev)_gpt-4_out.json",
    "mintaka_ablation(dev)_gpt-3.5-turbo-0613_out.json",
    "mintaka_ablation(dev)_gpt-4_out.json",
    
]

# Matching evaluation-split files, in the same order as dev_files; these are
# the files handed to conf_edit.
files = [
#     "mintaka_(eval)_llama_3_70b_out.json",
#     "pop_qa_(eval_v3.0)_llama_3_70b_out.json",
#     "entity_ques_(eval)_llama_3_70b_out.json",
#     "pop_qa_(eval)_gpt-3.5-turbo-instruct-0914_out.json",
#     "pop_qa_(eval_v2.0)_gpt-3.5-turbo_out.json",
#     "pop_qa_(eval_v3.0)_gpt-4_out.json",
#     "entity_ques_(eval)_gpt-3.5-turbo-instruct-0914_out.json",
#     "entity_ques_(eval)_gpt-3.5-turbo_out.json",
#     "entity_ques_(eval)_gpt-4_out.json",
#     "mintaka_(eval)_gpt-3.5-turbo-instruct-0914_out.json",
#     "mintaka_(eval)_gpt-3.5-turbo_out.json",
#     "mintaka_(eval_v2.0)_gpt-4_out.json",
    "mintaka_ablation_gpt-3.5-turbo-0613_out.json",
    "mintaka_ablation_gpt-4_out.json",
]

# For each (dev, eval) pair: derive the per-bin accuracy table from the dev
# file, print it, then pass it with the eval file to art_uw.conf_edit.
# NOTE(review): conf_edit is imported from art_uw and not visible here;
# presumably it rewrites the eval file's confidences using this table —
# confirm against its definition.
for f1, f2 in zip(dev_files, files):
    conf_range, acc_lst = collect_conf_acc(f1)
    print(conf_range, acc_lst)
    conf_edit(f2, conf_range, acc_lst)

Reading file mintaka_ablation(dev)_gpt-3.5-turbo-0613_out.json...
ECE for mintaka_ablation(dev)_gpt-3.5-turbo-0613_out.json: 0.2635

[5, 55, 65, 75, 85] [18, 0, 25, 48, 58]
Editing file mintaka_ablation_gpt-3.5-turbo-0613_out.json...
Create an empty list in json file!


151it [03:33,  1.41s/it]


Reading file mintaka_ablation(dev)_gpt-4_out.json...
ECE for mintaka_ablation(dev)_gpt-4_out.json: 0.2100

[55, 65, 75, 85] [100, 29, 64, 62]
Editing file mintaka_ablation_gpt-4_out.json...
Create an empty list in json file!


151it [03:40,  1.46s/it]
