In [2]:
from evaluate_retrieval import is_successful_retrieval
from utils import load_obj

# tfidf_retrieval_output_dev_100.json
# tfidf_retrieval_output_dev_1000.json
# mistral_retrieval_output_dev_100.json
# mistral_retrieval_output_dev_1000.json

def calculate_success_percentage_supported(data, retrieval_key="retrieved"):
    """Return per-hop success rates and their average for SUPPORTED items only.

    Keeps the entries of ``data`` whose ``label`` equals ``"SUPPORTED"`` and
    hands them, together with ``retrieval_key``, to
    ``calculate_success_percentage``; its two results are returned as-is.
    """
    supported_only = []
    for entry in data:
        if entry['label'] == "SUPPORTED":
            supported_only.append(entry)
    per_hop, average = calculate_success_percentage(supported_only, retrieval_key)
    return per_hop, average

def calculate_success_percentage_not_supported(data, retrieval_key="retrieved"):
    """Return per-hop success rates and their average for NOT_SUPPORTED items only.

    Keeps the entries of ``data`` whose ``label`` equals ``"NOT_SUPPORTED"``
    and hands them, together with ``retrieval_key``, to
    ``calculate_success_percentage``; its two results are returned as-is.
    """
    refuted_only = list(filter(lambda entry: entry['label'] == "NOT_SUPPORTED", data))
    per_hop, average = calculate_success_percentage(refuted_only, retrieval_key)
    return per_hop, average


def calculate_success_percentage(data, retrieval_key="retrieved"):
    """Compute the retrieval success rate per hop count and their average.

    Args:
        data: Iterable of item dicts. Each item must have a ``num_hops`` entry
            equal to 2, 3 or 4 and is passed to ``is_successful_retrieval``.
        retrieval_key: Second argument forwarded to
            ``is_successful_retrieval`` for every item.

    Returns:
        A tuple ``(success_percentages, average_total_percentage)``.
        ``success_percentages`` maps every hop count that occurs in ``data``
        to its success rate in percent. ``average_total_percentage`` is the
        unweighted mean of those rates, or ``0.0`` when ``data`` is empty.

    Raises:
        KeyError: If an item's ``num_hops`` is not 2, 3 or 4.
    """
    hop_counts = {hops: {'total': 0, 'successful': 0} for hops in (2, 3, 4)}

    for obj in data:
        counts = hop_counts[obj['num_hops']]
        counts['total'] += 1
        if is_successful_retrieval(obj, retrieval_key):
            counts['successful'] += 1

    # Hop counts without any item are left out to avoid dividing by zero.
    success_percentages = {
        num_hops: (counts['successful'] / counts['total']) * 100
        for num_hops, counts in hop_counts.items()
        if counts['total'] > 0
    }

    # Average over the hop counts that actually occurred. Indexing 2, 3 and 4
    # unconditionally raised KeyError as soon as one of them had no items.
    if success_percentages:
        average_total_percentage = sum(success_percentages.values()) / len(success_percentages)
    else:
        average_total_percentage = 0.0
    return success_percentages, average_total_percentage

def calculate_success_percentage_qa(data, retrieval_key, support_type):
    """Compute per-hop retrieval success rates for the nested QA data layout.

    Args:
        data: Mapping ``hop_count -> group key -> list of item dicts``. The
            hop-count keys must be convertible with ``int``.
        retrieval_key: Forwarded to ``is_successful_retrieval`` as
            ``retrieval_key`` for every counted item.
        support_type: ``"SUPPORTED"`` or ``"NOT_SUPPORTED"`` counts only items
            whose ``label`` equals that value; any other value counts every
            item without looking at its label.

    Returns:
        A tuple ``(success_percentages, average_total_percentage)``.
        ``success_percentages`` maps every hop count with at least one counted
        item to its success rate in percent. ``average_total_percentage`` is
        the unweighted mean of those rates, or ``0.0`` when nothing was
        counted.

    Raises:
        KeyError: If a counted item sits under a hop count other than 2, 3, 4.
    """
    hop_counts = {hops: {'total': 0, 'successful': 0} for hops in (2, 3, 4)}
    filter_by_label = support_type in ("SUPPORTED", "NOT_SUPPORTED")

    for hop_count in data:
        int_hop_count = int(hop_count)
        for key in data[hop_count]:
            for item in data[hop_count][key]:
                if filter_by_label and item["label"] != support_type:
                    continue
                counts = hop_counts[int_hop_count]
                counts["total"] += 1
                if is_successful_retrieval(item, retrieval_key=retrieval_key):
                    counts['successful'] += 1

    # Hop counts without any counted item are left out to avoid dividing by zero.
    success_percentages = {
        num_hops: (counts['successful'] / counts['total']) * 100
        for num_hops, counts in hop_counts.items()
        if counts['total'] > 0
    }

    # Average over the hop counts that actually occurred. Indexing 2, 3 and 4
    # unconditionally raised KeyError as soon as one of them had no items.
    if success_percentages:
        average_total_percentage = sum(success_percentages.values()) / len(success_percentages)
    else:
        average_total_percentage = 0.0
    return success_percentages, average_total_percentage


def generate_latex_entry_simple(success_percentages, average_total_percentage, method, dataset, retrieved):
    """Print one LaTeX table row, terminated by an hline, for a single method.

    Args:
        success_percentages: Mapping with entries for hop counts 2, 3 and 4.
        average_total_percentage: Average value for the last column.
        method: Label for the first column.
        dataset: Unused; kept so existing calls keep working.
        retrieved: Unused; kept so existing calls keep working.

    All numbers are rounded to two decimals. Nothing is returned.
    """
    hops_2 = round(success_percentages[2], 2)
    hops_3 = round(success_percentages[3], 2)
    hops_4 = round(success_percentages[4], 2)
    avg_total = round(average_total_percentage, 2)

    # "\\%" yields the same backslash-percent text as the former "\%", without
    # relying on an invalid escape sequence.
    latex_line = f"{method} & {hops_2}\\% & {hops_3}\\% & {hops_4}\\% & {avg_total}\\% \\\\ \\hline\n"
    print(latex_line)

def generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, method):
    """Print one LaTeX row with SUPPORTED columns followed by NOT_SUPPORTED columns.

    Args:
        success_percentages_supp: Mapping with entries for hop counts 2, 3, 4.
        average_total_percentage_supp: Average for the SUPPORTED half.
        success_percentages_not_supp: Mapping with entries for hop counts 2, 3, 4.
        average_total_percentage_not_supp: Average for the NOT_SUPPORTED half.
        method: Label for the first column.

    All numbers are rounded to two decimals. Nothing is returned.
    """
    hops_2_supp = round(success_percentages_supp[2], 2)
    hops_3_supp = round(success_percentages_supp[3], 2)
    hops_4_supp = round(success_percentages_supp[4], 2)
    avg_total_supp = round(average_total_percentage_supp, 2)

    hops_2_not_supp = round(success_percentages_not_supp[2], 2)
    hops_3_not_supp = round(success_percentages_not_supp[3], 2)
    hops_4_not_supp = round(success_percentages_not_supp[4], 2)
    avg_total_not_supp = round(average_total_percentage_not_supp, 2)

    # Example row:
    # TF-IDF & 83.64\% & 58.13\% & 29.17\% & 56.98\% & 84.84\% & 58.78\% & 34.25\% & 59.29\% \\
    # "\\%" yields the same text as the former "\%" without an invalid escape.
    latex_line = (
        f"{method} & {hops_2_supp}\\% & {hops_3_supp}\\% & {hops_4_supp}\\% & {avg_total_supp}\\%"
        f"& {hops_2_not_supp}\\% & {hops_3_not_supp}\\% & {hops_4_not_supp}\\% & {avg_total_not_supp}\\% \\\\"
    )
    print(latex_line)

def generate_latex_entry_multicolumn_with_total(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, total, method):
    """Print one LaTeX row: SUPPORTED columns, NOT_SUPPORTED columns, overall total.

    Args:
        success_percentages_supp: Mapping with entries for hop counts 2, 3, 4.
        average_total_percentage_supp: Average for the SUPPORTED half.
        success_percentages_not_supp: Mapping with entries for hop counts 2, 3, 4.
        average_total_percentage_not_supp: Average for the NOT_SUPPORTED half.
        total: Value for the final column.
        method: Label for the first column.

    All numbers are rounded to two decimals. Nothing is returned.
    """
    hops_2_supp = round(success_percentages_supp[2], 2)
    hops_3_supp = round(success_percentages_supp[3], 2)
    hops_4_supp = round(success_percentages_supp[4], 2)
    avg_total_supp = round(average_total_percentage_supp, 2)

    hops_2_not_supp = round(success_percentages_not_supp[2], 2)
    hops_3_not_supp = round(success_percentages_not_supp[3], 2)
    hops_4_not_supp = round(success_percentages_not_supp[4], 2)
    avg_total_not_supp = round(average_total_percentage_not_supp, 2)

    total = round(total, 2)
    # "\\%" yields the same text as the former "\%" without an invalid escape.
    latex_line = (
        f"{method} & {hops_2_supp}\\% & {hops_3_supp}\\% & {hops_4_supp}\\% & {avg_total_supp}\\%"
        f"& {hops_2_not_supp}\\% & {hops_3_not_supp}\\% & {hops_4_not_supp}\\% & {avg_total_not_supp}\\%"
        f" & {total}\\%\\\\"
    )
    print(latex_line)
# initial Benchmark

In [19]:
# Initial benchmark on the *_dev_100 result files: one LaTeX row per retriever,
# SUPPORTED columns first, NOT_SUPPORTED columns second.
for method, data_path in [
    ("TF-IDF", "data/tfidf_retrieval_output_dev_100.json"),
    ("Mistral", "data/mistral_retrieval_output_dev_100.json"),
]:
    # Load each result file once and reuse it for both label subsets
    # (it used to be read from disk twice per row).
    retrieval_output = load_obj(data_path)
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_supported(retrieval_output)
    success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_not_supported(retrieval_output)
    generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, method)


TF-IDF & 69.67\% & 35.43\% & 14.09\% & 39.73\%& 70.91\% & 34.14\% & 12.5\% & 39.18\% \\
Mistral & 74.47\% & 39.46\% & 16.44\% & 43.46\%& 66.12\% & 34.95\% & 17.05\% & 39.37\% \\


In [3]:
# Same evaluation for the Mistral run stored in the *_multi_hop_prompt file.
data_path = "data/mistral_retrieval_output_dev_100_multi_hop_prompt.json"
# Load once and reuse for both label subsets (previously read twice).
retrieval_output = load_obj(data_path)
success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_supported(retrieval_output)
success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_not_supported(retrieval_output)
generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, "Mistral")


Mistral & 71.21\% & 37.6\% & 15.46\% & 41.42\%& 63.31\% & 32.76\% & 15.15\% & 37.07\% \\


In [20]:
# Benchmark on the *_dev_1000 result files: one LaTeX row per retriever,
# SUPPORTED columns first, NOT_SUPPORTED columns second.
for method, data_path in [
    ("TF-IDF", "data/tfidf_retrieval_output_dev_1000.json"),
    ("Mistral", "data/mistral_retrieval_output_dev_1000.json"),
]:
    # Load each result file once and reuse it for both label subsets
    # (it used to be read from disk twice per row).
    retrieval_output = load_obj(data_path)
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_supported(retrieval_output)
    success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_not_supported(retrieval_output)
    generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, method)


TF-IDF & 84.84\% & 58.78\% & 34.25\% & 59.29\%& 83.64\% & 58.13\% & 29.17\% & 56.98\% \\
Mistral & 92.71\% & 69.21\% & 45.6\% & 69.17\%& 87.77\% & 63.67\% & 44.7\% & 65.38\% \\


In [2]:
# Build a single candidate list of at most 100 entries per item by interleaving
# the per-sub-claim retrieval lists stored under "decomposed_claims_retrieval_0",
# then hand the augmented data to save_obj under the path it was loaded from.
#data = load_obj("data/decomp_baseline_FULL_DATASET.json")
#data = load_obj('/home/sander/code/thesis/hover/leon/data/decomp_TFIDF_baseline.json')
data = load_obj("data/decomp_baseline_FULL_DATASET_9shot_refined.json")
from utils import save_obj
run_count = 0
## BASELINE
for hop_count in data:
    for key in data[hop_count]:
        for item in data[hop_count][key]:

            decomp_retrieval_100 = []
            # Round-robin: take position 0 of every list, then position 1, ...
            for index in range(100):
                for retrieval in item[f"decomposed_claims_retrieval_{run_count}"]:
                    # Cap reached. `break` only leaves the inner loop, so the
                    # outer loop keeps running but appends nothing further.
                    if len(decomp_retrieval_100) >= 100:
                        break
                    # Lists shorter than the current position are skipped.
                    if index < len(retrieval):
                        decomp_retrieval_100.append(retrieval[index])
            # No de-duplication: the same entry may appear more than once.
            item[f"decomposed_claims_retrieval_100_mistral_no_filter"] = decomp_retrieval_100
save_obj(data, "data/decomp_baseline_FULL_DATASET_9shot_refined.json")

<h1 style="color:green;">Qualitative Analysis Data</h1>


In [3]:
# One LaTeX row per threshold: SUPPORTED vs. NOT_SUPPORTED success rates for
# the "tfidf_retrieved_1" results of the iterative cross test.
for threshhold in [5, 10, 20, 60]: #, 80, 125, 300
    #data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA.json"
    data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA_TFIDF.json"
    # Load the file once per threshold instead of once per label subset.
    cross_test_data = load_obj(data_path)
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_qa(cross_test_data, retrieval_key="tfidf_retrieved_1", support_type="SUPPORTED")
    success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_qa(cross_test_data, retrieval_key="tfidf_retrieved_1", support_type="NOT_SUPPORTED")
    generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, threshhold)


5 & 72.55\% & 38.43\% & 18.98\% & 43.32\%& 61.82\% & 34.26\% & 16.1\% & 37.39\% \\
10 & 72.17\% & 39.26\% & 18.98\% & 43.47\%& 64.96\% & 37.37\% & 16.48\% & 39.6\% \\
20 & 71.79\% & 41.12\% & 19.77\% & 44.22\%& 63.64\% & 36.22\% & 17.23\% & 39.03\% \\
60 & 73.51\% & 39.46\% & 19.37\% & 44.12\%& 67.6\% & 34.83\% & 17.42\% & 39.95\% \\


In [2]:
def generate_latex_entry_multicolumn_all(success_percentages, average_total_percentage, method):
    """Print one LaTeX row with the hop 2/3/4 rates and their average.

    Args:
        success_percentages: Mapping with entries for hop counts 2, 3 and 4.
        average_total_percentage: Average value for the last column.
        method: Label for the first column.

    All numbers are rounded to two decimals. Nothing is returned.
    """
    hops_2 = round(success_percentages[2], 2)
    hops_3 = round(success_percentages[3], 2)
    hops_4 = round(success_percentages[4], 2)
    avg_total = round(average_total_percentage, 2)

    # "\\%" yields the same text as the former "\%" without an invalid escape.
    latex_line = f"{method} & {hops_2}\\% & {hops_3}\\% & {hops_4}\\% & {avg_total}\\% \\\\"
    print(latex_line)

# One LaTeX row per threshold over all items: support_type="ALL" matches
# neither label filter in calculate_success_percentage_qa, so every item counts.
for threshhold in [5, 10, 20, 60]: #, 80, 125, 300
    #data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA.json"
    data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA_TFIDF.json"
    # NOTE: despite the "_supp" suffix these variables hold the all-items result.
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_qa(load_obj(data_path), retrieval_key="tfidf_retrieved_1", support_type="ALL")
    generate_latex_entry_multicolumn_all(success_percentages_supp, average_total_percentage_supp, threshhold)


5 & 66.79\% & 36.46\% & 17.52\% & 40.25\% \\
10 & 68.29\% & 38.37\% & 17.71\% & 41.46\% \\
20 & 67.41\% & 38.8\% & 18.48\% & 41.56\% \\
60 & 70.34\% & 37.28\% & 18.38\% & 42.0\% \\


In [2]:
# One LaTeX row per threshold: SUPPORTED columns, NOT_SUPPORTED columns and the
# average over all items as the final column.
for threshhold in [5, 10, 20, 60]: #, 80, 125, 300
    #data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA.json"
    data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA_TFIDF.json"
    # Load the file once per threshold; it used to be read three times.
    cross_test_data = load_obj(data_path)
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_qa(cross_test_data, retrieval_key="tfidf_retrieved_1", support_type="SUPPORTED")
    success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_qa(cross_test_data, retrieval_key="tfidf_retrieved_1", support_type="NOT_SUPPORTED")
    _, total = calculate_success_percentage_qa(cross_test_data, retrieval_key="tfidf_retrieved_1", support_type="ALL")
    generate_latex_entry_multicolumn_with_total(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, total,threshhold)

5 & 69.67\% & 39.46\% & 17.61\% & 42.25\%& 68.43\% & 36.91\% & 14.39\% & 39.91\% & 41.08\%\\
10 & 72.36\% & 40.19\% & 18.59\% & 43.71\%& 71.07\% & 40.6\% & 17.99\% & 43.22\% & 43.45\%\\
20 & 72.55\% & 41.12\% & 20.16\% & 44.61\%& 71.24\% & 41.64\% & 18.18\% & 43.69\% & 44.12\%\\
60 & 72.74\% & 41.01\% & 20.94\% & 44.9\%& 71.24\% & 39.33\% & 18.56\% & 43.04\% & 43.96\%\\


In [5]:
# Build a single candidate list of at most 100 entries per item by interleaving
# the per-sub-claim retrieval lists stored under "decomposed_claims_retrieval_0",
# then hand the augmented data to save_obj under the path it was loaded from.
#data = load_obj("data/decomp_baseline_FULL_DATASET.json")
#data = load_obj('/home/sander/code/thesis/hover/leon/data/decomp_TFIDF_baseline.json')
from utils import save_obj, load_obj
data = load_obj("data/decomp_baseline_FULL_DATASET_9shot_NOINSTRUCT.json")

run_count = 0
## BASELINE
for hop_count in data:
    for key in data[hop_count]:
        for item in data[hop_count][key]:

            decomp_retrieval_100 = []
            # Round-robin: take position 0 of every list, then position 1, ...
            for index in range(100):
                for retrieval in item[f"decomposed_claims_retrieval_{run_count}"]:
                #for retrieval in item[f"decomposed_claims_tfidf_retrieved"]:
                    
                    # Cap reached. `break` only leaves the inner loop, so the
                    # outer loop keeps running but appends nothing further.
                    if len(decomp_retrieval_100) >= 100:
                        break
                    # Lists shorter than the current position are skipped.
                    if index < len(retrieval):
                        decomp_retrieval_100.append(retrieval[index])
            # No de-duplication: the same entry may appear more than once.
            #item[f"decomposed_claims_retrieval_100_tfidf_no_filter"] = decomp_retrieval_100
            item[f"decomposed_claims_retrieval_100_mistral_no_filter"] = decomp_retrieval_100
save_obj(data, "data/decomp_baseline_FULL_DATASET_9shot_NOINSTRUCT.json")

In [12]:
# Same round-robin merge as the "no filter" cells, but reading the lists under
# "decomposed_claims_tfidf_retrieved" and skipping entries that are already in
# the merged list. The result is passed to save_obj under the loaded path.
#data = load_obj("data/decomp_baseline_FULL_DATASET.json")
#data = load_obj('/home/sander/code/thesis/hover/leon/data/decomp_TFIDF_baseline.json')
data = load_obj("/home/sander/code/thesis/hover/leon/data/decomp_baseline_FULL_DATASET_FINAL_tfidf.json")

from utils import save_obj
# run_count is only referenced by the commented-out loop header below.
run_count = 0
## BASELINE with filter
for hop_count in data:
    for key in data[hop_count]:
        for item in data[hop_count][key]:

            decomp_retrieval_100 = []
            # Round-robin: take position 0 of every list, then position 1, ...
            for index in range(100):
                #for retrieval in item[f"decomposed_claims_retrieval_{run_count}"]:
                for retrieval in item[f"decomposed_claims_tfidf_retrieved"]:
                    
                    # Cap reached. `break` only leaves the inner loop, so the
                    # outer loop keeps running but appends nothing further.
                    if len(decomp_retrieval_100) >= 100:
                        break
                    if index < len(retrieval):
                        # The "filter": keep only the first occurrence of an entry.
                        if retrieval[index] not in decomp_retrieval_100:
                            decomp_retrieval_100.append(retrieval[index])

            item[f"decomposed_claims_retrieval_100_tfidf_filter"] = decomp_retrieval_100
            #item[f"decomposed_claims_retrieval_100_mistral_filter"] = decomp_retrieval_100
save_obj(data, "/home/sander/code/thesis/hover/leon/data/decomp_baseline_FULL_DATASET_FINAL_tfidf.json")

In [4]:
from utils import save_obj, load_obj
#data = load_obj("/home/sander/code/thesis/hover/leon/data/iterative_FULL_DATASET_with_questions_NEW2_TFIDF.json")
####
##### ITERATIVE
#####
# For every iteration of every item, interleave the per-sub-claim retrieval
# lists into one list of at most 100 entries and store it under
# "decomposed_claims_retrieval_100_combined_<iteration>".

data = load_obj("data/iterative_decomp_FULL_DATASET_1.json")
for hop_count in data:
    for key in data[hop_count]:
        for item in data[hop_count][key]:


            # Iterations 0..4, limited to iterations <= the item's hop count.
            for run_count in range(5):
                if run_count <= int(hop_count):


                    decomp_retrieval_100 = []
                    # Round-robin: position 0 of every list, then position 1, ...
                    for index in range(100):
                        for retrieval in item[f"decomposed_claims_retrieval_{run_count}"]:
                            # Cap reached. `break` only leaves the inner loop.
                            if len(decomp_retrieval_100) >= 100:
                                break
                            # Lists shorter than the current position are skipped.
                            if index < len(retrieval):
                                decomp_retrieval_100.append(retrieval[index])
                    item[f"decomposed_claims_retrieval_100_combined_{run_count}"] = decomp_retrieval_100
# Passed to save_obj under the same path the data was loaded from.
save_obj(data, "data/iterative_decomp_FULL_DATASET_1.json")

In [1]:
from utils import save_obj, load_obj
#data = load_obj("/home/sander/code/thesis/hover/leon/data/iterative_FULL_DATASET_with_questions_NEW2_TFIDF.json")
####
##### ITERATIVE COMBINATION OF DECOMPOSED RETRIEVALS
#####
# For every iteration of every item, concatenate all per-sub-claim retrieval
# lists (no cap, no de-duplication) into "decomposed_combined_<iteration>".

data = load_obj("data/iterative_decomp_FULL_DATASET_1.json")
for hop_count in data:
    for key in data[hop_count]:
        for item in data[hop_count][key]:
            # Iterations 0..4, limited to iterations <= the item's hop count.
            for run_count in range(5):
                if run_count <= int(hop_count):
                    decomp_combined_retrieval = []

                    for retrieval in item[f"decomposed_claims_retrieval_{run_count}"]:
                        decomp_combined_retrieval.extend(retrieval)
                    item[f"decomposed_combined_{run_count}"] = decomp_combined_retrieval
# Saved under a different name ("..._combined.json") than the input file.
save_obj(data, "data/iterative_decomp_FULL_DATASET_1_combined.json")

In [6]:
# LaTeX row for the 9-shot "No Instruction" decomposition run, evaluated on the
# merged list stored under "decomposed_claims_retrieval_100_mistral_no_filter".
#data_path= "/home/sander/code/thesis/hover/leon/data/decomp_baseline_FULL_DATASET_FINAL_tfidf.json"
#data_path ="/home/sander/code/thesis/hover/leon/data/decomp_baseline_FULL_DATASET_FINAL.json"
data_path = "data/decomp_baseline_FULL_DATASET_9shot_NOINSTRUCT.json"
# Load once and reuse for both label subsets (previously read twice).
decomp_data = load_obj(data_path)
success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_qa(decomp_data, retrieval_key="decomposed_claims_retrieval_100_mistral_no_filter", support_type="SUPPORTED")
success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_qa(decomp_data, retrieval_key="decomposed_claims_retrieval_100_mistral_no_filter", support_type="NOT_SUPPORTED")
generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, "No Instruction")


No Instruction & 77.93\% & 43.29\% & 21.92\% & 47.71\%& 72.56\% & 38.75\% & 20.08\% & 43.8\% \\


In [4]:
def generate_latex_rows_for_method_qa(data, max_iter, method, retrieval_key):
    """Print a LaTeX header line and result row per hop count and group key.

    Args:
        data: Mapping ``hop_count -> group key -> list of item dicts``. The
            hop-count keys must be convertible with ``int``.
        max_iter: Highest iteration index; each row has ``max_iter + 1`` cells.
        method: Label for the first column of every row.
        retrieval_key: Prefix of the per-iteration key; iteration ``i`` is
            evaluated with ``f"{retrieval_key}_{i}"``.

    Iterations greater than the hop count are printed as ``-``. Items for which
    ``is_successful_retrieval`` raises are reported on stdout and counted as
    unsuccessful. An empty group yields a rate of 0. Nothing is returned.
    """
    for hop_count in data:
        for key in data[hop_count]:
            items = data[hop_count][key]
            row_entries = []
            print(f"\\multicolumn{{6}}{{l}}{{\\textit{{Hop {hop_count}, {key}}}}} \\\\")
            for i in range(max_iter + 1):
                if i > int(hop_count):
                    row_entries.append("-")
                    continue

                success = 0
                for item_index, item in enumerate(items):
                    try:
                        success += is_successful_retrieval(item, f"{retrieval_key}_{i}")
                    # Best effort per item, but do not swallow
                    # KeyboardInterrupt/SystemExit as the bare except did.
                    except Exception:
                        print(f"error for item: {hop_count}, {key}, {i}, {item_index}")

                total = len(items)
                success_rate = success / total * 100 if total > 0 else 0
                # "\\%" yields the same text as the former invalid "\%" escape.
                row_entries.append(f"{success_rate:.1f}\\%")
            print(method + " & " + " & ".join(row_entries) + " \\\\")

# Print the per-iteration rows for the concatenated lists stored under
# "decomposed_combined_<i>" in the "..._1_combined.json" file.
#data = load_obj("data/iterative_FULL_DATASET_with_questions_60_no_filter.json")
data = load_obj("data/iterative_decomp_FULL_DATASET_1_combined.json")
#data = load_obj("data/iterative_decomp_FULL_DATASET_2.json")
generate_latex_rows_for_method_qa(data, 4, "decomp 1", "decomposed_combined")

## TODO: double-check that the numbers produced here are actually correct.

\multicolumn{6}{l}{\textit{Hop 2, SUPPORTED}} \\
decomp 1 & 80.0\% & 82.7\% & 80.0\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 2, NOT_SUPPORTED}} \\
decomp 1 & 76.2\% & 75.4\% & 72.7\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 3, SUPPORTED}} \\
decomp 1 & 48.2\% & 53.4\% & 51.1\% & 50.5\% & - \\
\multicolumn{6}{l}{\textit{Hop 3, NOT_SUPPORTED}} \\
decomp 1 & 43.9\% & 46.9\% & 45.1\% & 43.6\% & - \\
\multicolumn{6}{l}{\textit{Hop 4, SUPPORTED}} \\
decomp 1 & 27.4\% & 36.6\% & 36.8\% & 34.6\% & 33.7\% \\
\multicolumn{6}{l}{\textit{Hop 4, NOT_SUPPORTED}} \\
decomp 1 & 23.7\% & 26.7\% & 25.2\% & 23.3\% & 23.3\% \\


In [6]:
def total_retrieval_success_across_iterations(data, max_iter):
    """Print the overall retrieval success rate for iterations 0..max_iter.

    Args:
        data: Mapping ``hop_count -> group key -> list of item dicts``; the
            hop counts are compared as strings (``"2"``, ``"3"``).
        max_iter: Highest iteration index to evaluate.

    Every item counts towards every iteration. Items under hop count ``"2"``
    are evaluated with ``retrieved_2`` for all iterations above 2, and items
    under ``"3"`` with ``retrieved_3`` for all iterations above 3 (presumably
    because those claims have no later retrieval; confirm against the
    pipeline). Items for which ``is_successful_retrieval`` raises are reported
    on stdout and counted as unsuccessful. Prints a header line and one row
    labelled ``tfidf``; nothing is returned.
    """
    iterations = range(max_iter + 1)
    # One counter per requested iteration. The former literal for 0..4 raised
    # KeyError for max_iter > 4 and divided by zero for max_iter < 4.
    success_dict = {run_count: {"total": 0, "successful": 0} for run_count in iterations}

    for run_count in iterations:
        for hop_count in data:
            for key in data[hop_count]:
                for item_index, item in enumerate(data[hop_count][key]):
                    success_dict[run_count]["total"] += 1
                    if run_count > 2 and hop_count == "2":
                        retrieval_key = f"retrieved_{2}"
                    elif run_count > 3 and hop_count == "3":
                        retrieval_key = f"retrieved_{3}"
                    else:
                        retrieval_key = f"retrieved_{run_count}"
                    try:
                        if is_successful_retrieval(item, retrieval_key):
                            success_dict[run_count]["successful"] += 1
                    # Best effort per item, but do not swallow
                    # KeyboardInterrupt/SystemExit as the bare except did.
                    except Exception:
                        print(f"Error for item {hop_count}, {key}, index {item_index}")

    # NOTE(review): the header has no backslash before "textbf", so it is not a
    # LaTeX command as printed. Left unchanged; confirm whether that is intended.
    print(" & ".join(f"textbf{{Iter {run_count}}}" for run_count in iterations))

    cells = []
    for run_count in iterations:
        counts = success_dict[run_count]
        # An iteration without any item reports 0 instead of dividing by zero.
        rate = counts['successful'] / counts['total'] * 100 if counts['total'] > 0 else 0.0
        cells.append(f"{rate:.2f}\\%")
    print("tfidf & " + " & ".join(cells) + "\\\\")

from utils import load_obj
# Overall per-iteration success for the "..._with_questions_NEW2" run.
data = load_obj("data/iterative_FULL_DATASET_with_questions_NEW2.json")
total_retrieval_success_across_iterations(data, 4)

textbf{Iter 0} & textbf{Iter 1} & textbf{Iter 2} & textbf{Iter 3} & textbf{Iter 4}
tfidf & 41.17\% & 41.40\% & 39.45\% & 38.67\% & 38.50\%\\


In [1]:
from utils import load_obj, save_obj
# Extract a reduced copy of the SUPPORTED items (first claim and its
# sub-questions, with an empty retrieval slot) into baleen_comparison.json and
# report how many sub-questions an item has on average.
data = load_obj("data/iterative_FULL_DATASET_with_questions_NEW2.json")
new_data = {}
total = 0  # number of SUPPORTED items seen
subquestions = 0  # summed length of "sub_questions_0" over those items
for hop_count in data:
    new_data[hop_count] = {"SUPPORTED" : []}
    for item in data[hop_count]["SUPPORTED"]:
        total += 1
        subquestions += len(item["sub_questions_0"])
        new_item = {}
        new_item["claim_0"] = item["claim_0"]
        new_item["sub_questions_0"] = item["sub_questions_0"]
        # Empty placeholder; nothing in this cell fills it.
        new_item[f"sub_question_retrieval_{0}"] = []
        new_data[hop_count]["SUPPORTED"].append(new_item)
print(total)
print(subquestions)
# Average number of sub-questions per item (raises ZeroDivisionError if total is 0).
print(subquestions/total)

save_obj(new_data, "data/baleen_comparison.json")

2000
4764
2.382


In [9]:
# 2.382 is the average number of sub-questions per item printed by the
# baleen_comparison cell. The meaning of 1000 and of the factor 3 is not stated
# here; presumably a retrieval budget split across sub-questions (TODO confirm).
1000 / (2.382 * 3) 

139.93842709207948

In [8]:
# Denominator of the budget computation: average sub-questions per item (2.382)
# times 3; what the factor 3 stands for is not stated here (TODO confirm).
(2.382 * 3)

7.146000000000001

In [3]:
# Inspect the fields of `item`. It is not defined in this cell, so it relies on
# the loop variable left behind by a previously executed cell.
item.keys()

dict_keys(['uid', 'supporting_facts', 'label', 'num_hops', 'hpqa_id', 'previous_iteration_sentences', 'claim_0', 'retrieved_0', 'sub_questions_0', 'sub_question_retrieval_0', 'sub_question_top_sentences_0', 'sub_question_sentences_0', 'top_sentences_0', 'claim_1', 'retrieved_1', 'sub_questions_1', 'sub_question_retrieval_1', 'sub_question_sentences_1', 'top_sentences_1', 'claim_2', 'retrieved_2'])

In [6]:
import pprint
# Show which fields the first 2-hop SUPPORTED item carries in this file.
data = load_obj("data/iterative_decomp_FULL_DATASET2.json")
pprint.pprint(data["2"]["SUPPORTED"][0].keys())


dict_keys(['uid', 'supporting_facts', 'label', 'num_hops', 'hpqa_id', 'previous_iteration_sentences', 'claim_0', 'retrieved_0', 'sub_questions_0', 'sub_question_retrieval_0', 'top_sentences_0', 'claim_1', 'retrieved_1', 'sub_questions_1', 'sub_question_retrieval_1', 'top_sentences_1', 'claim_2', 'retrieved_2', 'tfidf_retrieved_0', 'tfidf_retrieved_1', 'tfidf_retrieved_2', 'decomposed_claims_0', 'decomposed_claims_retrieval_0', 'decomposed_claims_1', 'decomposed_claims_retrieval_1', 'decomposed_claims_2', 'decomposed_claims_retrieval_2'])


In [None]:
# data/qualitative_analysis_claims_10_base.json
# All of the following experiments are based on data/qualitative_analysis_claims_10_base.json

# data/iterative_qualitative_analysis_question_answering_60.json
# data/iterative_test_base_60_no_filter.json
# data/iterative_test_base_60.json
# data/iterative_test2.json
# data/iterative_test_with_questions.json -> double cross




In [6]:
from utils import load_obj
# Sanity check: the item at each (hop_count, key, index) position must carry
# the same "claim" text in both files; print "ERROR" for every mismatch.
qa_data = load_obj("data/qualitative_analysis_claims_10_base.json")
setting_data = load_obj("data/iterative_test_with_questions.json")
for hop_count in qa_data:
    for key in qa_data[hop_count]:
        for index, item in enumerate(qa_data[hop_count][key]):
            if item["claim"] != setting_data[hop_count][key][index]["claim"]:
                print("ERROR")

<h1 style="color:green;">Not Supported analysis</h1>


In [2]:
from evaluate_retrieval import is_successful_retrieval
from utils import load_obj

# Data for the NOT_SUPPORTED analysis; iterated per hop count further down.
data = load_obj("data/iterative_qualitative_analysis_not_supported_question_answering_80.json")



def calculate_iteration_success(data, hop_count):
    """Print per-iteration success rates for items and their counterparts.

    Args:
        data: Iterable of item dicts for one hop count. Each item is evaluated
            itself (reported as SUPPORTED) and via its
            ``"not_supported_counterpart"`` entry (reported as NOT_SUPPORTED).
        hop_count: Hop count of the group; must be convertible with ``int``.
            Iterations ``0..int(hop_count)`` are evaluated with the keys
            ``retrieved_<i>``.

    Prints two lines per iteration. An empty ``data`` reports ``0.0%`` instead
    of raising ZeroDivisionError. Nothing is returned.
    """
    for i in range(int(hop_count) + 1):
        retrieval_key = f"retrieved_{i}"
        total = 0
        success = 0
        success_not_supported = 0
        for item in data:
            total += 1
            if is_successful_retrieval(item, retrieval_key):
                success += 1
            if is_successful_retrieval(item["not_supported_counterpart"], retrieval_key):
                success_not_supported += 1

        # Guard the division so an empty hop group does not crash the report.
        supported_rate = success / total * 100 if total > 0 else 0.0
        not_supported_rate = success_not_supported / total * 100 if total > 0 else 0.0
        print(f"Success rate for hop {hop_count} iteration {i} SUPPORTED is {supported_rate}%")
        print(f"Success rate for hop {hop_count} iteration {i} NOT_SUPPORTED is {not_supported_rate}%")

# Report the per-iteration success rates separately for every hop count.
for hop_count in data:
    calculate_iteration_success(data[hop_count], hop_count)

Success rate for hop 2 iteration 0 SUPPORTED is 70.0%
Success rate for hop 2 iteration 0 NOT_SUPPORTED is 70.0%
Success rate for hop 2 iteration 1 SUPPORTED is 70.0%
Success rate for hop 2 iteration 1 NOT_SUPPORTED is 70.0%
Success rate for hop 2 iteration 2 SUPPORTED is 60.0%
Success rate for hop 2 iteration 2 NOT_SUPPORTED is 60.0%
Success rate for hop 3 iteration 0 SUPPORTED is 40.0%
Success rate for hop 3 iteration 0 NOT_SUPPORTED is 40.0%
Success rate for hop 3 iteration 1 SUPPORTED is 40.0%
Success rate for hop 3 iteration 1 NOT_SUPPORTED is 30.0%
Success rate for hop 3 iteration 2 SUPPORTED is 50.0%
Success rate for hop 3 iteration 2 NOT_SUPPORTED is 30.0%
Success rate for hop 3 iteration 3 SUPPORTED is 60.0%
Success rate for hop 3 iteration 3 NOT_SUPPORTED is 30.0%
Success rate for hop 4 iteration 0 SUPPORTED is 40.0%
Success rate for hop 4 iteration 0 NOT_SUPPORTED is 40.0%
Success rate for hop 4 iteration 1 SUPPORTED is 60.0%
Success rate for hop 4 iteration 1 NOT_SUPPORTED i

In [20]:
def generate_latex_table(data, max_hop_count, max_iter):
    """Print LaTeX rows per hop count and configuration, split by support.

    Args:
        data: Iterable of item dicts with ``config`` and ``supported`` entries.
            It is iterated several times, so it must be re-iterable. Items with
            a falsy ``supported`` are evaluated through their
            ``"not_supported_counterpart"`` entry.
        max_hop_count: Sections are printed for hop counts 1..max_hop_count.
        max_iter: Highest iteration index; each row has ``max_iter + 1`` cells.

    Iterations greater than the hop count are printed as ``-``. A
    configuration without matching items reports a rate of 0. Percent signs
    are escaped, matching the other row generators in this file; a bare
    percent sign would start a LaTeX comment and drop the rest of the row.
    Nothing is returned.
    """
    configs = (
        'Decomposed',
        'Base like Decomposed',
        'Base 60 like Decomposed',
        'Base 60 no filter like Decomposed',
        'Questions Double Cross like Decomposed',
        'Questions 60 like Decomposed',
        'Questions 60 no filter like Decomposed',
    )
    # (section title, item filter, object handed to is_successful_retrieval)
    sections = (
        ("Supported", lambda item: item['supported'], lambda item: item),
        ("Not Supported", lambda item: not item['supported'], lambda item: item["not_supported_counterpart"]),
    )

    # NOTE(review): hop_count only drives the header and the "-" cut-off; the
    # items are not filtered by hop count, so every hop section shows the same
    # rates. Behavior kept as in the original; confirm whether this is intended.
    for hop_count in range(1, max_hop_count + 1):
        for title, belongs, target in sections:
            print(f"\\multicolumn{{6}}{{l}}{{\\textit{{Hop {hop_count}, {title}}}}} \\\\")
            for config in configs:
                selected = [item for item in data if item['config'] == config and belongs(item)]
                total = len(selected)
                row_entries = []
                for i in range(max_iter + 1):
                    if i > hop_count:
                        row_entries.append("-")
                        continue
                    success = sum(is_successful_retrieval(target(item), f"retrieved_{i}") for item in selected)
                    success_rate = success / total * 100 if total > 0 else 0
                    row_entries.append(f"{success_rate:.1f}\\%")
                print(config + " & " + " & ".join(row_entries) + " \\\\")



In [24]:
def generate_latex_row_for_method(data, max_iter, method, retrieval_key):
    """Print Supported and Not Supported LaTeX rows per hop count for one method.

    Args:
        data: Mapping ``hop_count -> list of item dicts``. The hop-count keys
            must be convertible with ``int``. Each item is evaluated itself
            (Supported row) and via its ``"not_supported_counterpart"`` entry
            (Not Supported row).
        max_iter: Highest iteration index; each row has ``max_iter + 1`` cells.
        method: Label for the first column of every row.
        retrieval_key: Prefix of the per-iteration key; iteration ``i`` is
            evaluated with ``f"{retrieval_key}_{i}"``.

    Iterations greater than the hop count are printed as ``-``. An empty item
    list reports a rate of 0. Nothing is returned.
    """
    # (section title, object handed to is_successful_retrieval)
    sections = (
        ("Supported", lambda item: item),
        ("Not Supported", lambda item: item["not_supported_counterpart"]),
    )

    for hop_count in data:
        items = data[hop_count]
        total = len(items)
        for title, target in sections:
            row_entries = []
            print(f"\\multicolumn{{6}}{{l}}{{\\textit{{Hop {hop_count}, {title}}}}} \\\\")
            for i in range(max_iter + 1):
                if i > int(hop_count):
                    row_entries.append("-")
                    continue
                success = sum(is_successful_retrieval(target(item), f"{retrieval_key}_{i}") for item in items)
                success_rate = success / total * 100 if total > 0 else 0
                # "\\%" yields the same text as the former invalid "\%" escape.
                row_entries.append(f"{success_rate:.1f}\\%")
            print(method + " & " + " & ".join(row_entries) + " \\\\")


textbf{Iter 0} & textbf{Iter 1} & textbf{Iter 2} & textbf{Iter 3} & textbf{Iter 4}
tfidf & 39.23\% & 43.35\% & 41.08\% & 39.67\% & 39.42\%\\


In [11]:
# Show the fields of one 4-hop SUPPORTED item. `data` is whatever an earlier
# executed cell left in the notebook namespace.
print(data["4"]["SUPPORTED"][10].keys())

dict_keys(['uid', 'supporting_facts', 'label', 'num_hops', 'hpqa_id', 'previous_iteration_sentences', 'claim_0', 'retrieved_0', 'sub_questions_0', 'sub_question_retrieval_0', 'sub_question_sentences_0', 'top_sentences_0', 'claim_1', 'retrieved_1', 'sub_questions_1', 'sub_question_retrieval_1', 'sub_question_sentences_1', 'top_sentences_1', 'claim_2', 'retrieved_2', 'sub_questions_2', 'sub_question_retrieval_2', 'sub_question_sentences_2', 'top_sentences_2', 'claim_3', 'sub_questions_3', 'sub_question_retrieval_3', 'sub_question_sentences_3', 'top_sentences_3', 'claim_4', 'retrieved_3', 'retrieved_4'])


In [None]:
from utils import load_obj
# Sanity check for the layout without a label level: the item at each
# (hop_count, index) position must carry the same "claim" text in both files;
# print "ERROR" for every mismatch.
qa_data = load_obj("data/qualitative_analysis_claims_10_base.json")
setting_data = load_obj("data/iterative_test_with_questions.json")
for hop_count in qa_data:
    for index, item in enumerate(qa_data[hop_count]):
        if item["claim"] != setting_data[hop_count][index]["claim"]:
            print("ERROR")

# not-supported style data
# raw data: data/qualitative_analysis_not_supported.json

# iterative_qualitative_analysis_not_supported_no_filter_80.json
# iterative_qualitative_analysis_not_supported_question_answering_80.json
# iterative_qualitative_analysis_not_supported_no_filter_80.json -> regular sub-question retrieval

# iterative_not_supported_subquestions_filter_60.json
# iterative_not_supported_base_no_filter_60_100_docs_retrieved.json -> running in screen 2, DONE
# data/iterative_not_supported_subquestions_no_filter_60.json -> DONE
# data/iterative_not_supported_subquestions_no_filter_change_prompt_60.json -> DONE
# data/iterative_not_supported_question_answering_60.json -> running
# data/iterative_not_supported_base_no_filter_60_based_on_subquestions.json 

# "data/iterative_qualitative_analysis_question_answering_60.json" screen 1 

In [3]:
from utils import load_obj
# Rows for the "change_prompt" sub-question run. generate_latex_row_for_method
# must already be defined in the session; the recorded output of this cell is
# a NameError because it was not.
data= load_obj("data/iterative_not_supported_subquestions_no_filter_change_prompt_60.json")
generate_latex_row_for_method(data=data, max_iter=4, method="Subquestions Entity Correction", retrieval_key="retrieved")


NameError: name 'generate_latex_row_for_method' is not defined