In [1]:
from evaluate_retrieval import is_successful_retrieval
from utils import load_obj

# Retrieval output files (under data/) evaluated in the benchmark cells below:
# tfidf_retrieval_output_dev_100.json
# tfidf_retrieval_output_dev_1000.json
# mistral_retrieval_output_dev_100.json
# mistral_retrieval_output_dev_1000.json

def calculate_success_percentage_supported(data, retrieval_key="retrieved"):
    """Compute per-hop retrieval success for the SUPPORTED items only.

    Keeps the items of ``data`` whose ``'label'`` equals ``"SUPPORTED"`` and
    hands them to ``calculate_success_percentage``.

    Args:
        data: Iterable of item dicts, each with a ``'label'`` entry.
        retrieval_key: Forwarded unchanged to ``calculate_success_percentage``.

    Returns:
        The ``(success_percentages, average_total_percentage)`` pair produced
        by ``calculate_success_percentage``.
    """
    supported_only = list(filter(lambda entry: entry['label'] == "SUPPORTED", data))
    per_hop, average = calculate_success_percentage(supported_only, retrieval_key)
    return per_hop, average

def calculate_success_percentage_not_supported(data, retrieval_key="retrieved"):
    """Compute per-hop retrieval success for the NOT_SUPPORTED items only.

    Keeps the items of ``data`` whose ``'label'`` equals ``"NOT_SUPPORTED"``
    and hands them to ``calculate_success_percentage``.

    Args:
        data: Iterable of item dicts, each with a ``'label'`` entry.
        retrieval_key: Forwarded unchanged to ``calculate_success_percentage``.

    Returns:
        The ``(success_percentages, average_total_percentage)`` pair produced
        by ``calculate_success_percentage``.
    """
    refuted_only = list(filter(lambda entry: entry['label'] == "NOT_SUPPORTED", data))
    per_hop, average = calculate_success_percentage(refuted_only, retrieval_key)
    return per_hop, average


def calculate_success_percentage(data, retrieval_key="retrieved"):
    """Compute the retrieval success percentage per hop count (2, 3 and 4).

    Args:
        data: Iterable of item dicts. Each item needs a ``'num_hops'`` entry
            (2, 3 or 4) and is passed to ``is_successful_retrieval``.
        retrieval_key: Forwarded to ``is_successful_retrieval``.

    Returns:
        A tuple ``(success_percentages, average_total_percentage)``:
        ``success_percentages`` maps each hop count that has at least one item
        to its success percentage; ``average_total_percentage`` is the
        unweighted mean of those per-hop percentages, or ``0.0`` when no hop
        bucket has any item.

    Raises:
        KeyError: If an item's ``'num_hops'`` is not 2, 3 or 4.
    """
    hop_counts = {hops: {'total': 0, 'successful': 0} for hops in (2, 3, 4)}

    for obj in data:
        bucket = hop_counts[obj['num_hops']]
        bucket['total'] += 1
        if is_successful_retrieval(obj, retrieval_key):
            bucket['successful'] += 1

    # Empty buckets are left out so they cannot cause a division by zero.
    success_percentages = {
        num_hops: (counts['successful'] / counts['total']) * 100
        for num_hops, counts in hop_counts.items()
        if counts['total'] > 0
    }

    # Average over the buckets that exist. The previous hard-coded
    # "[2] + [3] + [4]" raised KeyError whenever a bucket was empty; with all
    # three buckets present the result is identical.
    if not success_percentages:
        return success_percentages, 0.0
    average_total_percentage = sum(success_percentages.values()) / len(success_percentages)
    return success_percentages, average_total_percentage

def calculate_success_percentage_qa(data, retrieval_key, support_type):
    """Compute per-hop retrieval success for the nested hop -> group -> items layout.

    Args:
        data: Mapping of hop count (convertible with ``int``, e.g. ``"2"``) to
            a mapping of group key to a list of item dicts.
        retrieval_key: Forwarded to ``is_successful_retrieval``.
        support_type: ``"SUPPORTED"`` or ``"NOT_SUPPORTED"`` restricts counting
            to items whose ``'label'`` equals that value; any other value
            counts every item.

    Returns:
        A tuple ``(success_percentages, average_total_percentage)``:
        ``success_percentages`` maps each hop count (as ``int``) that has at
        least one counted item to its success percentage;
        ``average_total_percentage`` is the unweighted mean of those values,
        or ``0.0`` when nothing was counted.

    Raises:
        KeyError: If a counted item sits under a hop count other than 2, 3, 4.
    """
    hop_counts = {hops: {'total': 0, 'successful': 0} for hops in (2, 3, 4)}
    # Only these two values filter by label; anything else means "all items".
    filter_by_label = support_type in ("SUPPORTED", "NOT_SUPPORTED")

    for hop_count in data:
        int_hop_count = int(hop_count)
        for key in data[hop_count]:
            for item in data[hop_count][key]:
                if filter_by_label and item["label"] != support_type:
                    continue
                counts = hop_counts[int_hop_count]
                counts["total"] += 1
                if is_successful_retrieval(item, retrieval_key=retrieval_key):
                    counts['successful'] += 1

    # Empty buckets are left out so they cannot cause a division by zero.
    success_percentages = {
        num_hops: (counts['successful'] / counts['total']) * 100
        for num_hops, counts in hop_counts.items()
        if counts['total'] > 0
    }

    # Average over the buckets that exist. The previous hard-coded
    # "[2] + [3] + [4]" raised KeyError whenever a bucket was empty; with all
    # three buckets present the result is identical.
    if not success_percentages:
        return success_percentages, 0.0
    average_total_percentage = sum(success_percentages.values()) / len(success_percentages)
    return success_percentages, average_total_percentage


def generate_latex_entry_simple(success_percentages, average_total_percentage, method, dataset, retrieved):
    """Print one LaTeX table row with the 2/3/4-hop and average success rates.

    Args:
        success_percentages: Mapping with keys 2, 3 and 4 holding percentages.
        average_total_percentage: Percentage shown in the last column.
        method: Label placed in the first column.
        dataset: Unused; kept so existing calls keep working.
        retrieved: Unused; kept so existing calls keep working.

    All values are rounded to two decimals. The row is printed, not returned.
    """
    values = [round(success_percentages[hops], 2) for hops in (2, 3, 4)]
    values.append(round(average_total_percentage, 2))
    # Backslashes are doubled: "\%" is not a valid Python escape sequence and
    # triggers a SyntaxWarning on Python 3.12+. The printed text is unchanged.
    cells = " & ".join(f"{value}\\%" for value in values)
    latex_line = f"{method} & {cells} \\\\ \\hline\n"
    print(latex_line)

def generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, method):
    """Print one LaTeX row: SUPPORTED columns followed by NOT_SUPPORTED columns.

    Args:
        success_percentages_supp: Mapping with keys 2, 3, 4 (SUPPORTED rates).
        average_total_percentage_supp: SUPPORTED average.
        success_percentages_not_supp: Mapping with keys 2, 3, 4
            (NOT_SUPPORTED rates).
        average_total_percentage_not_supp: NOT_SUPPORTED average.
        method: Label placed in the first column.

    All values are rounded to two decimals. The row is printed, not returned.
    """
    supp = [round(success_percentages_supp[hops], 2) for hops in (2, 3, 4)]
    supp.append(round(average_total_percentage_supp, 2))
    not_supp = [round(success_percentages_not_supp[hops], 2) for hops in (2, 3, 4)]
    not_supp.append(round(average_total_percentage_not_supp, 2))

    # Backslashes are doubled: "\%" is not a valid Python escape sequence and
    # triggers a SyntaxWarning on Python 3.12+. The printed text is unchanged.
    supp_cells = " & ".join(f"{value}\\%" for value in supp)
    not_supp_cells = " & ".join(f"{value}\\%" for value in not_supp)
    # No space before the '&' that joins the two halves, exactly as before.
    latex_line = f"{method} & {supp_cells}& {not_supp_cells} \\\\"
    print(latex_line)

def generate_latex_entry_multicolumn_with_total(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, total, method):
    """Print one LaTeX row: SUPPORTED columns, NOT_SUPPORTED columns, then a total.

    Args:
        success_percentages_supp: Mapping with keys 2, 3, 4 (SUPPORTED rates).
        average_total_percentage_supp: SUPPORTED average.
        success_percentages_not_supp: Mapping with keys 2, 3, 4
            (NOT_SUPPORTED rates).
        average_total_percentage_not_supp: NOT_SUPPORTED average.
        total: Overall percentage printed in the final column.
        method: Label placed in the first column.

    All values are rounded to two decimals. The row is printed, not returned.
    """
    supp = [round(success_percentages_supp[hops], 2) for hops in (2, 3, 4)]
    supp.append(round(average_total_percentage_supp, 2))
    not_supp = [round(success_percentages_not_supp[hops], 2) for hops in (2, 3, 4)]
    not_supp.append(round(average_total_percentage_not_supp, 2))
    total = round(total, 2)

    # Backslashes are doubled: "\%" is not a valid Python escape sequence and
    # triggers a SyntaxWarning on Python 3.12+. The printed text is unchanged.
    supp_cells = " & ".join(f"{value}\\%" for value in supp)
    not_supp_cells = " & ".join(f"{value}\\%" for value in not_supp)
    # Spacing quirks (no space before the joining '&' nor before the final
    # row terminator) are kept exactly as before.
    latex_line = f"{method} & {supp_cells}& {not_supp_cells} & {total}\\%\\\\"
    print(latex_line)
    
# Initial benchmark

In [19]:
# Benchmark on the "*_dev_100" retrieval outputs: one table row per retriever
# with SUPPORTED and NOT_SUPPORTED success rates side by side.
for data_path, method in [
    ("data/tfidf_retrieval_output_dev_100.json", "TF-IDF"),
    ("data/mistral_retrieval_output_dev_100.json", "Mistral"),
]:
    # Load each file once and reuse it for both label subsets (it used to be
    # read from disk twice per retriever).
    retrieval_output = load_obj(data_path)
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_supported(retrieval_output)
    success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_not_supported(retrieval_output)
    generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, method)


TF-IDF & 69.67\% & 35.43\% & 14.09\% & 39.73\%& 70.91\% & 34.14\% & 12.5\% & 39.18\% \\
Mistral & 74.47\% & 39.46\% & 16.44\% & 43.46\%& 66.12\% & 34.95\% & 17.05\% & 39.37\% \\


In [20]:
# Benchmark on the "*_dev_1000" retrieval outputs: one table row per retriever
# with SUPPORTED and NOT_SUPPORTED success rates side by side.
for data_path, method in [
    ("data/tfidf_retrieval_output_dev_1000.json", "TF-IDF"),
    ("data/mistral_retrieval_output_dev_1000.json", "Mistral"),
]:
    # Load each file once and reuse it for both label subsets (it used to be
    # read from disk twice per retriever).
    retrieval_output = load_obj(data_path)
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_supported(retrieval_output)
    success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_not_supported(retrieval_output)
    generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, method)


TF-IDF & 84.84\% & 58.78\% & 34.25\% & 59.29\%& 83.64\% & 58.13\% & 29.17\% & 56.98\% \\
Mistral & 92.71\% & 69.21\% & 45.6\% & 69.17\%& 87.77\% & 63.67\% & 44.7\% & 65.38\% \\


In [14]:
data = load_obj("/home/sander/code/thesis/hover/leon/data/decomp_baseline_FULL_DATASET_FINAL.json")
retrieval_key = "decomposed_claims_retrieval_100_mistral_no_filter"
base_retrieval_key = "retrieved_0"
decomposed_claims_key = "decomposed_claims_0"

# One set of counters per hop count and per claim label.
results = {}
for hop in ["2", "3", "4"]:
    results[hop] = {
        label_name: {
            "worse_results_counter": 0,
            "better_results_counter": 0,
            "same_results_counter": 0,
            "single_answer": 0,
            "perfect_result": 0,
        }
        for label_name in ("SUPPORTED", "NOT_SUPPORTED")
    }

# Compare, item by item, how many supporting facts the decomposed-claims
# retrieval found against how many the base retrieval found.
for hop_count in data:
    for key in data[hop_count]:
        for item in data[hop_count][key]:
            # Only the first element of each supporting fact is matched.
            fact_ids = [fact[0] for fact in item["supporting_facts"]]
            found = [fact_id for fact_id in fact_ids if fact_id in item[retrieval_key]]
            found_base = [fact_id for fact_id in fact_ids if fact_id in item[base_retrieval_key]]
            not_found_count = len(fact_ids) - len(found)
            not_found_base_count = len(fact_ids) - len(found_base)

            counters = results[hop_count][key]
            # Uses decomposed_claims_key instead of repeating the literal.
            if len(item[decomposed_claims_key]) == 1:
                counters["single_answer"] += 1
            if not_found_count > not_found_base_count:
                counters["worse_results_counter"] += 1
            if len(found) > len(found_base):
                counters["better_results_counter"] += 1
            if len(found) == len(found_base):
                counters["same_results_counter"] += 1
            # "Perfect" = strictly better than base AND every fact found.
            if len(found) > len(found_base) and len(found) == len(fact_ids):
                counters["perfect_result"] += 1


# Debug dump of the raw counters (disabled):
# for hop_count in results:
#     for key in results[hop_count]:
#         print(f'Results for hop_count {hop_count}, key {key}:')
#         for result_type, count in results[hop_count][key].items():
#             print(f'{result_type}: {count}')

# Function to format LaTeX line for the table
def format_latex_line(label, results):
    """Build one LaTeX table row for a single counter name.

    The row starts with ``label``, followed by the counter for hop counts
    "2", "3" and "4" of the "SUPPORTED" group and their sum, then the same
    four cells for the "NOT_SUPPORTED" group. Cells are joined with " & "
    and the row ends with a LaTeX row terminator.

    Args:
        label: Counter name; used both as row title and as lookup key.
        results: Nested mapping ``results[hop][group][label]`` -> number.

    Returns:
        The formatted row as a string.
    """
    cells = [label]
    for group in ("SUPPORTED", "NOT_SUPPORTED"):
        per_hop = [results[hop][group][label] for hop in ("2", "3", "4")]
        cells.extend(str(count) for count in per_hop)
        # Group total goes right after the three per-hop cells.
        cells.append(str(sum(per_hop)))
    return ' & '.join(cells) + ' \\\\'


# Render one table row per counter, in the order the rows should appear.
labels = ["better_results_counter", "worse_results_counter", "same_results_counter", "single_answer", "perfect_result"]
latex_lines = [format_latex_line(label, results) for label in labels]

for line in latex_lines:
    print(line)

better_results_counter & 58 & 223 & 194 & 475 & 87 & 223 & 180 & 490 \\
worse_results_counter & 21 & 118 & 73 & 212 & 35 & 114 & 81 & 230 \\
same_results_counter & 442 & 627 & 244 & 1313 & 483 & 530 & 267 & 1280 \\
single_answer & 55 & 80 & 3 & 138 & 42 & 46 & 7 & 95 \\
perfect_result & 48 & 100 & 67 & 215 & 75 & 89 & 45 & 209 \\


<h1 style="color:green;">Qualitative Analysis Data</h1>


In [2]:
# Threshold sweep: one table row per threshold with SUPPORTED and
# NOT_SUPPORTED success rates for the "retrieved_1" results.
for threshhold in [5, 10, 20, 60]: #, 80, 125, 300
    data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA.json"
    # Load the file once and reuse it for both label subsets (it used to be
    # read from disk twice per threshold).
    cross_test_data = load_obj(data_path)
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_qa(cross_test_data, retrieval_key="retrieved_1", support_type="SUPPORTED")
    success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_qa(cross_test_data, retrieval_key="retrieved_1", support_type="NOT_SUPPORTED")
    generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, threshhold)


5 & 72.55\% & 38.43\% & 18.98\% & 43.32\%& 61.82\% & 34.26\% & 16.1\% & 37.39\% \\
10 & 72.17\% & 39.26\% & 18.98\% & 43.47\%& 64.96\% & 37.37\% & 16.48\% & 39.6\% \\
20 & 71.79\% & 41.12\% & 19.77\% & 44.22\%& 63.64\% & 36.22\% & 17.23\% & 39.03\% \\
60 & 73.51\% & 39.46\% & 19.37\% & 44.12\%& 67.6\% & 34.83\% & 17.42\% & 39.95\% \\


In [3]:
def generate_latex_entry_multicolumn_all(success_percentages, average_total_percentage, method):
    """Print one LaTeX table row with the 2/3/4-hop and average success rates.

    Args:
        success_percentages: Mapping with keys 2, 3 and 4 holding percentages.
        average_total_percentage: Percentage shown in the last column.
        method: Label placed in the first column.

    All values are rounded to two decimals. The row is printed, not returned.
    """
    values = [round(success_percentages[hops], 2) for hops in (2, 3, 4)]
    values.append(round(average_total_percentage, 2))
    # Backslashes are doubled: "\%" is not a valid Python escape sequence and
    # triggers a SyntaxWarning on Python 3.12+. The printed text is unchanged.
    cells = " & ".join(f"{value}\\%" for value in values)
    latex_line = f"{method} & {cells} \\\\"
    print(latex_line)

# Same threshold sweep without label filtering: support_type="ALL" is neither
# "SUPPORTED" nor "NOT_SUPPORTED", so calculate_success_percentage_qa counts
# every item. The "_supp" variable names below therefore hold the all-items
# numbers.
for threshhold in [5, 10, 20, 60]: #, 80, 125, 300
    data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA.json"
    #data_path = f"data/iterative_cross_test_{threshhold}_FULL_DATA_TFIDF.json"
    success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_qa(load_obj(data_path), retrieval_key="retrieved_1", support_type="ALL")
    generate_latex_entry_multicolumn_all(success_percentages_supp, average_total_percentage_supp, threshhold)


5 & 66.79\% & 36.46\% & 17.52\% & 40.25\% \\
10 & 68.29\% & 38.37\% & 17.71\% & 41.46\% \\
20 & 67.41\% & 38.8\% & 18.48\% & 41.56\% \\
60 & 70.34\% & 37.28\% & 18.38\% & 42.0\% \\


In [6]:
# Alternative inputs kept for reference:
#data_path= "/home/sander/code/thesis/hover/leon/data/decomp_baseline_FULL_DATASET_FINAL_tfidf.json"
#data_path ="/home/sander/code/thesis/hover/leon/data/decomp_baseline_FULL_DATASET_FINAL.json"
data_path = "data/decomp_baseline_FULL_DATASET_9shot_NOINSTRUCT.json"
# Load the file once and reuse it for both label subsets (it used to be read
# from disk twice).
no_instruction_data = load_obj(data_path)
success_percentages_supp, average_total_percentage_supp = calculate_success_percentage_qa(no_instruction_data, retrieval_key="decomposed_claims_retrieval_100_mistral_no_filter", support_type="SUPPORTED")
success_percentages_not_supp, average_total_percentage_not_supp = calculate_success_percentage_qa(no_instruction_data, retrieval_key="decomposed_claims_retrieval_100_mistral_no_filter", support_type="NOT_SUPPORTED")
generate_latex_entry_multicolumn(success_percentages_supp, average_total_percentage_supp, success_percentages_not_supp, average_total_percentage_not_supp, "No Instruction")


No Instruction & 77.93\% & 43.29\% & 21.92\% & 47.71\%& 72.56\% & 38.75\% & 20.08\% & 43.8\% \\


In [15]:
def detailed_retrieval_success_across_iterations(data, max_iter, method, retrieval_key):
    """Print LaTeX rows with the retrieval success rate at every iteration.

    For each hop count and each group key in ``data`` a ``multicolumn`` header
    row is printed, followed by one row holding the success percentage of
    iterations ``0..max_iter``. Iterations beyond the hop count are printed
    as "-".

    Args:
        data: Mapping of hop count (convertible with ``int``) to a mapping of
            group key to a list of item dicts.
        max_iter: Highest iteration index to print (inclusive).
        method: Label placed in the first column of every data row.
        retrieval_key: Prefix of the per-iteration key; iteration ``i`` is
            evaluated with ``f"{retrieval_key}_{i}"``.

    Items for which ``is_successful_retrieval`` raises are reported on stdout
    and counted as unsuccessful.
    """
    for hop_count in data:
        for key in data[hop_count]:
            items = data[hop_count][key]
            # The item count does not depend on the iteration; compute it once.
            total = len(items)
            row_entries = []
            print(f"\\multicolumn{{6}}{{l}}{{\\textit{{Hop {hop_count}, {key}}}}} \\\\")
            for i in range(max_iter + 1):
                if i > int(hop_count):
                    row_entries.append("-")
                    continue
                success = 0
                for item_index, item in enumerate(items):
                    try:
                        success += is_successful_retrieval(item, f"{retrieval_key}_{i}")
                    # Narrowed from a bare "except:" so that KeyboardInterrupt
                    # and SystemExit are no longer swallowed.
                    except Exception:
                        print(f"error for item: {hop_count}, {key}, {i}, {item_index}")

                success_rate = success / total * 100 if total > 0 else 0
                # Backslash doubled: "\%" is an invalid escape sequence
                # (SyntaxWarning on Python 3.12+); printed text is unchanged.
                row_entries.append(f"{success_rate:.2f}\\%")
            print(method + " & " + " & ".join(row_entries) + " \\\\")


# Per-iteration success rows for the ablation dataset, labelled "Subquestion Ablation".
data = load_obj("data/iterative_decomp_FULL_DATASET_Ablation.json")
detailed_retrieval_success_across_iterations(data, 4, "Subquestion Ablation", "decomposed_claims_retrieval_100_combined")



\multicolumn{6}{l}{\textit{Hop 2, SUPPORTED}} \\
Subquestion Ablation & 78.12\% & 81.38\% & 79.27\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 2, NOT_SUPPORTED}} \\
Subquestion Ablation & 73.39\% & 73.22\% & 72.23\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 3, SUPPORTED}} \\
Subquestion Ablation & 41.12\% & 44.21\% & 43.70\% & 43.49\% & - \\
\multicolumn{6}{l}{\textit{Hop 3, NOT_SUPPORTED}} \\
Subquestion Ablation & 37.02\% & 43.37\% & 42.56\% & 39.56\% & - \\
\multicolumn{6}{l}{\textit{Hop 4, SUPPORTED}} \\
Subquestion Ablation & 22.70\% & 26.81\% & 25.83\% & 24.85\% & 23.29\% \\
\multicolumn{6}{l}{\textit{Hop 4, NOT_SUPPORTED}} \\
Subquestion Ablation & 18.18\% & 23.11\% & 20.83\% & 20.27\% & 20.27\% \\


In [16]:
# Per-iteration success rows for the "1_combined" dataset, labelled "Refined Decomp Prompt".
data = load_obj("data/iterative_decomp_FULL_DATASET_1_combined.json")
detailed_retrieval_success_across_iterations(data, 4, "Refined Decomp Prompt", "decomposed_claims_retrieval_100_combined")


\multicolumn{6}{l}{\textit{Hop 2, SUPPORTED}} \\
Refined Decomp Prompt & 78.12\% & 78.89\% & 76.78\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 2, NOT_SUPPORTED}} \\
Refined Decomp Prompt & 73.39\% & 72.40\% & 68.76\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 3, SUPPORTED}} \\
Refined Decomp Prompt & 41.12\% & 48.55\% & 45.76\% & 45.25\% & - \\
\multicolumn{6}{l}{\textit{Hop 3, NOT_SUPPORTED}} \\
Refined Decomp Prompt & 37.02\% & 42.21\% & 40.25\% & 38.18\% & - \\
\multicolumn{6}{l}{\textit{Hop 4, SUPPORTED}} \\
Refined Decomp Prompt & 22.70\% & 28.96\% & 28.57\% & 26.42\% & 25.05\% \\
\multicolumn{6}{l}{\textit{Hop 4, NOT_SUPPORTED}} \\
Refined Decomp Prompt & 18.18\% & 21.97\% & 19.89\% & 18.37\% & 16.86\% \\


In [17]:
# Per-iteration success rows for the "FULL_DATASET_0" dataset, labelled "Baseline Decomp Prompt".
data = load_obj("data/iterative_decomp_FULL_DATASET_0.json")
detailed_retrieval_success_across_iterations(data, 4, "Baseline Decomp Prompt", "decomposed_claims_retrieval_100_combined")


\multicolumn{6}{l}{\textit{Hop 2, SUPPORTED}} \\
Baseline Decomp Prompt & 79.65\% & 78.12\% & 75.82\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 2, NOT_SUPPORTED}} \\
Baseline Decomp Prompt & 73.39\% & 72.89\% & 69.42\% & - & - \\
\multicolumn{6}{l}{\textit{Hop 3, SUPPORTED}} \\
Baseline Decomp Prompt & 43.39\% & 47.93\% & 45.25\% & 43.60\% & - \\
\multicolumn{6}{l}{\textit{Hop 3, NOT_SUPPORTED}} \\
Baseline Decomp Prompt & 38.52\% & 41.06\% & 39.56\% & 37.37\% & - \\
\multicolumn{6}{l}{\textit{Hop 4, SUPPORTED}} \\
Baseline Decomp Prompt & 25.44\% & 28.38\% & 26.42\% & 22.70\% & 23.29\% \\
\multicolumn{6}{l}{\textit{Hop 4, NOT_SUPPORTED}} \\
Baseline Decomp Prompt & 20.08\% & 20.83\% & 18.56\% & 17.80\% & 17.61\% \\


In [6]:
def total_retrieval_success_across_iterations(data, max_iter):
    """Print the overall retrieval success rate for iterations ``0..max_iter``.

    Every item of every hop count and group is counted at every iteration.
    Items under hop count "2" are evaluated with ``retrieved_2`` for all
    iterations above 2, and items under hop count "3" with ``retrieved_3`` for
    all iterations above 3; otherwise ``retrieved_<iteration>`` is used.

    Args:
        data: Mapping of hop count to a mapping of group key to a list of
            item dicts.
        max_iter: Highest iteration index to report (inclusive). The counters
            and both printed rows are sized from it; previously they were
            hard-coded for exactly five iterations, so any other value failed.

    Two lines are printed: a header and a row labelled
    "e5-mistral-7b-instruct". Items for which ``is_successful_retrieval``
    raises are reported on stdout and counted as unsuccessful. An iteration
    with no items is reported as 0.00 instead of raising ZeroDivisionError.
    """
    # Last iteration key to use for the hop counts that are capped.
    last_iteration_for_hop = {"2": 2, "3": 3}
    iterations = range(max_iter + 1)
    success_dict = {run_count: {"total": 0, "successful": 0} for run_count in iterations}

    for run_count in iterations:
        counts = success_dict[run_count]
        for hop_count in data:
            cap = last_iteration_for_hop.get(hop_count)
            iteration = run_count if cap is None else min(run_count, cap)
            for key in data[hop_count]:
                for item_index, item in enumerate(data[hop_count][key]):
                    counts["total"] += 1
                    try:
                        if is_successful_retrieval(item, f"retrieved_{iteration}"):
                            counts["successful"] += 1
                    # Narrowed from a bare "except:" so that KeyboardInterrupt
                    # and SystemExit are no longer swallowed.
                    except Exception:
                        print(f"Error for item {hop_count}, {key}, index {item_index}")

    rates = [
        counts["successful"] / counts["total"] * 100 if counts["total"] else 0.0
        for counts in success_dict.values()
    ]

    # NOTE(review): the header has no backslash before "textbf" and no column
    # for the method label; kept byte-identical - confirm whether intended.
    print(" & ".join(f"textbf{{Iter {run_count}}}" for run_count in iterations))
    # Backslashes doubled: "\%" is an invalid escape sequence (SyntaxWarning
    # on Python 3.12+); the printed text is unchanged.
    print("e5-mistral-7b-instruct & " + " & ".join(f"{rate:.2f}\\%" for rate in rates) + "\\\\")

from utils import load_obj
# Overall success rate for iterations 0..4 on the "with_questions_NEW2" dataset.
data = load_obj("data/iterative_FULL_DATASET_with_questions_NEW2.json")
total_retrieval_success_across_iterations(data, 4)

textbf{Iter 0} & textbf{Iter 1} & textbf{Iter 2} & textbf{Iter 3} & textbf{Iter 4}
tfidf & 41.17\% & 41.40\% & 39.45\% & 38.67\% & 38.50\%\\


<h1 style="color:green;">Not Supported analysis</h1>


In [2]:
from evaluate_retrieval import is_successful_retrieval
from utils import load_obj

# Qualitative-analysis sample; it is iterated by hop count further down in this cell.
data = load_obj("data/iterative_qualitative_analysis_not_supported_question_answering_80.json")



def calculate_iteration_success(data, hop_count):
    """Print SUPPORTED / NOT_SUPPORTED success rates for every iteration.

    For each iteration ``i`` in ``0..int(hop_count)`` every item is checked
    with the key ``retrieved_<i>``, once for the item itself and once for its
    ``"not_supported_counterpart"`` entry. Two lines are printed per iteration.

    Args:
        data: Re-iterable collection of item dicts, each holding a
            ``"not_supported_counterpart"`` entry.
        hop_count: Hop count (convertible with ``int``); also echoed in the
            printed lines.

    With no items the rates are reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    for i in range(int(hop_count) + 1):
        iteration_key = f"retrieved_{i}"
        success = 0
        success_not_supported = 0
        total = 0
        for item in data:
            total += 1
            if is_successful_retrieval(item, iteration_key):
                success += 1
            if is_successful_retrieval(item["not_supported_counterpart"], iteration_key):
                success_not_supported += 1

        # Guard the division so an empty hop bucket does not crash the report.
        supported_rate = success / total * 100 if total else 0.0
        not_supported_rate = success_not_supported / total * 100 if total else 0.0
        print(f"Success rate for hop {hop_count} iteration {i} SUPPORTED is {supported_rate}%")
        print(f"Success rate for hop {hop_count} iteration {i} NOT_SUPPORTED is {not_supported_rate}%")

# Report the per-iteration success rates separately for every hop count in the sample.
for hop_count in data:
    calculate_iteration_success(data[hop_count], hop_count)

Success rate for hop 2 iteration 0 SUPPORTED is 70.0%
Success rate for hop 2 iteration 0 NOT_SUPPORTED is 70.0%
Success rate for hop 2 iteration 1 SUPPORTED is 70.0%
Success rate for hop 2 iteration 1 NOT_SUPPORTED is 70.0%
Success rate for hop 2 iteration 2 SUPPORTED is 60.0%
Success rate for hop 2 iteration 2 NOT_SUPPORTED is 60.0%
Success rate for hop 3 iteration 0 SUPPORTED is 40.0%
Success rate for hop 3 iteration 0 NOT_SUPPORTED is 40.0%
Success rate for hop 3 iteration 1 SUPPORTED is 40.0%
Success rate for hop 3 iteration 1 NOT_SUPPORTED is 30.0%
Success rate for hop 3 iteration 2 SUPPORTED is 50.0%
Success rate for hop 3 iteration 2 NOT_SUPPORTED is 30.0%
Success rate for hop 3 iteration 3 SUPPORTED is 60.0%
Success rate for hop 3 iteration 3 NOT_SUPPORTED is 30.0%
Success rate for hop 4 iteration 0 SUPPORTED is 40.0%
Success rate for hop 4 iteration 0 NOT_SUPPORTED is 40.0%
Success rate for hop 4 iteration 1 SUPPORTED is 60.0%
Success rate for hop 4 iteration 1 NOT_SUPPORTED i

In [20]:
def generate_latex_table(data, max_hop_count, max_iter):
    """Print LaTeX rows with per-iteration success rates for every configuration.

    For each hop count in ``1..max_hop_count`` a "Supported" section and a
    "Not Supported" section are printed. Each section has a ``multicolumn``
    header row followed by one row per configuration. A row holds the success
    percentage of iterations ``0..max_iter``; iterations beyond the hop count
    are printed as "-".

    Args:
        data: Re-iterable collection of item dicts with ``'config'`` and
            ``'supported'`` entries. Items that are not supported are
            evaluated through their ``"not_supported_counterpart"`` entry.
        max_hop_count: Highest hop count to print (inclusive, starting at 1).
        max_iter: Highest iteration index to print (inclusive).

    The percent sign is emitted escaped. It used to be printed bare, which
    LaTeX treats as the start of a comment and which differed from the other
    row printers in this file.
    """
    configs = [
        'Decomposed',
        'Base like Decomposed',
        'Base 60 like Decomposed',
        'Base 60 no filter like Decomposed',
        'Questions Double Cross like Decomposed',
        'Questions 60 like Decomposed',
        'Questions 60 no filter like Decomposed',
    ]
    # (section title, membership test, record handed to is_successful_retrieval)
    sections = (
        ("Supported", lambda item: item['supported'], lambda item: item),
        ("Not Supported", lambda item: not item['supported'], lambda item: item["not_supported_counterpart"]),
    )

    # NOTE(review): hop_count only controls the header text and the "-" cells;
    # items are not filtered by hop count, so every hop section reports the
    # same numbers for the iterations it shows - confirm this is intended.
    for hop_count in range(1, max_hop_count + 1):
        for title, belongs, record_of in sections:
            print(f"\\multicolumn{{6}}{{l}}{{\\textit{{Hop {hop_count}, {title}}}}} \\\\")
            for config in configs:
                selected = [item for item in data if item['config'] == config and belongs(item)]
                total = len(selected)
                row_entries = []
                for i in range(max_iter + 1):
                    if i > hop_count:
                        row_entries.append("-")
                        continue
                    success = sum(is_successful_retrieval(record_of(item), f"retrieved_{i}") for item in selected)
                    success_rate = success / total * 100 if total > 0 else 0
                    row_entries.append(f"{success_rate:.1f}\\%")
                print(config + " & " + " & ".join(row_entries) + " \\\\")



In [24]:
def generate_latex_row_for_method(data, max_iter, method, retrieval_key):
    """Print Supported / Not Supported LaTeX rows for one method, per hop count.

    For each hop count in ``data`` two sections are printed, each with a
    ``multicolumn`` header row and one data row. The "Supported" row evaluates
    every item itself; the "Not Supported" row evaluates the item's
    ``"not_supported_counterpart"`` entry. A row holds the success percentage
    of iterations ``0..max_iter``; iterations beyond the hop count are printed
    as "-".

    Args:
        data: Mapping of hop count (convertible with ``int``) to a list of
            item dicts.
        max_iter: Highest iteration index to print (inclusive).
        method: Label placed in the first column of every data row.
        retrieval_key: Prefix of the per-iteration key; iteration ``i`` is
            evaluated with ``f"{retrieval_key}_{i}"``.
    """
    # (section title, record handed to is_successful_retrieval)
    sections = (
        ("Supported", lambda item: item),
        ("Not Supported", lambda item: item["not_supported_counterpart"]),
    )

    for hop_count in data:
        items = data[hop_count]
        # Both sections divide by the same item count.
        total = len(items)
        for title, record_of in sections:
            row_entries = []
            print(f"\\multicolumn{{6}}{{l}}{{\\textit{{Hop {hop_count}, {title}}}}} \\\\")
            for i in range(max_iter + 1):
                if i > int(hop_count):
                    row_entries.append("-")
                    continue
                success = sum(is_successful_retrieval(record_of(item), f"{retrieval_key}_{i}") for item in items)
                success_rate = success / total * 100 if total > 0 else 0
                # Backslash doubled: "\%" is an invalid escape sequence
                # (SyntaxWarning on Python 3.12+); printed text is unchanged.
                row_entries.append(f"{success_rate:.1f}\\%")
            print(method + " & " + " & ".join(row_entries) + " \\\\")
