# Qualitative evaluation

In [1]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
from collections import defaultdict
from scipy.stats import mode

In [None]:
def extract_model_name(filename):
    """
    Extracts the model name from the filename.

    The model name is the text before the first underscore of the file's
    base name. Directory components are stripped first (as the other
    extract_* helpers do), so passing a full path whose folders contain
    underscores gives the same result as passing the bare file name.
    """
    return os.path.basename(filename).split('_')[0]

def extract_k_value(filename):
    """
    Extracts the k value from the filename.

    Takes the second underscore-separated field of the file's base name and
    converts the text that follows its first 'k' (up to any further 'k')
    to an int, e.g. a field of "k5" gives 5.
    """
    base_name = os.path.basename(filename)
    k_field = base_name.split('_')[1]
    k_text = k_field.split('k')[1]
    return int(k_text)

def extract_retriever_method(filename):
    """
    Extracts the retriever method from the filename.

    The third and fourth underscore-separated fields of the file's base name
    are read as a (retriever name, retriever type) pair and mapped to a
    method label. A pair that is not in the table raises ValueError.
    """
    fields = os.path.basename(filename).split('_')
    retriever_name, retriever_type = fields[2], fields[3]

    # (retriever name, retriever type) -> label of the retriever method.
    method_by_combination = {
        ("Original", "Original"): "Original",
        ("Original", "Pagerank"): "Pruning",
        ("Reranker", "Original"): "Reranking",
        ("Reranker", "Pagerank"): "Reranking+Pruning",
    }

    combination = (retriever_name, retriever_type)
    if combination not in method_by_combination:
        raise ValueError(f"Unknown retriever method: {retriever_name} {retriever_type}")
    return method_by_combination[combination]

#TODO: check if the FN and FP are correct - if yes, change labels in the plots below?
def classify_outcome(row):
    """
    Maps a result row to an outcome label.

    Reads row['rag_answer_correct'] (interpreted by truthiness) and
    row['target'] and returns:

    - 'Target':                 'TP' if the answer is correct, else 'FP'
    - 'Non-Target' / 'Unknown': 'TN' if the answer is correct, else 'FN'
    - any other target value:   'Unclassified'

    NOTE(review): whether 'FP' and 'FN' are assigned the right way round is
    still an open question for this analysis; the mapping is kept exactly as
    it was and only the unterminated 'Unclassified' string literal is fixed.
    """
    answer_correct = bool(row['rag_answer_correct'])
    target = row['target']

    if target == 'Target':
        return 'TP' if answer_correct else 'FP'
    # 'Non-Target' and 'Unknown' share the same labels.
    if target == 'Non-Target' or target == 'Unknown':
        return 'TN' if answer_correct else 'FN'
    return 'Unclassified'

## Experiment 1 (LLM choice)

## Experiment 2 (Retriever)

### 2a: K-value

In [3]:
# Collect every CSV file in the k-value experiment folder.
csv_folder = "2_Retriever/2a_k_value"
csv_pattern = os.path.join(csv_folder, "*.csv")
csv_files = glob.glob(csv_pattern)
print(f"Found {len(csv_files)} CSV files in {csv_folder}")

Found 134 CSV files in 2_Retriever/2a_k_value


In [9]:
# Nested grouping: all_dfs[model name][k value] -> list of loaded DataFrames.
all_dfs = defaultdict(lambda: defaultdict(list))

for file_path in csv_files:
    # Both grouping keys are parsed from the file's base name.
    base_name = os.path.basename(file_path)
    model_name = extract_model_name(base_name)
    k_value = extract_k_value(base_name)
    df = pd.read_csv(file_path)
    all_dfs[model_name][k_value].append(df)


In [14]:
# config_prompt_answers[model][k] -> most frequent 'rag_answer_correct' value
# per prompt across all DataFrames loaded for that model and k value.
config_prompt_answers = defaultdict(dict)

for model, k_values in all_dfs.items():
    for k, dfs in k_values.items():
        # Each DataFrame contributes one row; the columns are the prompt indices.
        correctness_rows = [run_df['rag_answer_correct'] for run_df in dfs]
        answers = pd.DataFrame(correctness_rows)
        # mode() may return several rows when values tie; keep the first one.
        mode_answers = answers.mode(axis=0).iloc[0]
        config_prompt_answers[model][k] = mode_answers

# print(config_prompt_answers)

In [15]:
# For each model, list the prompt indices whose mode answer is not the same
# for every k value.
changed_prompt_indices_per_model = {}

for model, k_answers in config_prompt_answers.items():
    # rows = prompts, columns = k values
    answer_df = pd.DataFrame(k_answers)
    # Number of distinct answers each prompt received across the k values.
    distinct_counts = answer_df.nunique(axis=1)
    changed = distinct_counts > 1
    changed_prompt_indices_per_model[model] = answer_df.loc[changed].index.tolist()

print("Changed prompt indices per model:")
for model, indices in changed_prompt_indices_per_model.items():
    print(f"{model}: {indices}")

Changed prompt indices per model:
falcon: [9, 15, 23, 24, 30, 38, 57, 61, 66, 69, 72, 78, 86, 87, 113]
gpt-4.1-nano: [13, 15, 18, 26, 28, 30, 33, 34, 37, 45, 46, 55, 64, 68, 70, 72, 75, 78, 80, 81, 83, 86, 89, 97, 104, 105, 108, 112, 114, 116]
qwen2.5: [0, 3, 9, 16, 22, 26, 32, 41, 44, 45, 50, 58, 67, 71, 80, 86, 89, 90, 97, 98, 100, 101, 104, 108, 110, 112, 116, 120]
deepseek-v2: [6, 7, 9, 15, 17, 22, 23, 34, 36, 38, 41, 43, 56, 58, 60, 65, 68, 76, 81, 85, 88, 89, 92, 97, 101, 104, 105, 114, 124]
mistral: [5, 9, 10, 12, 14, 17, 20, 22, 27, 30, 31, 34, 41, 47, 52, 58, 59, 60, 63, 64, 67, 70, 73, 74, 75, 80, 83, 84, 91, 93, 95, 98, 106, 110, 114, 116, 118, 124]
gemini-2.0-flash: []
llama3.2: [1, 4, 8, 10, 12, 14, 20, 21, 24, 26, 28, 33, 35, 36, 37, 41, 46, 48, 53, 56, 65, 75, 84, 85, 88, 89, 90, 92, 95, 101, 105, 110, 111, 117, 118, 123]


### 2b: Retriever method

In [17]:
# Collect every CSV file in the retriever-method experiment folder.
csv_folder = "2_Retriever/2b_Retriever"
csv_pattern = os.path.join(csv_folder, "*.csv")
csv_files = glob.glob(csv_pattern)
print(f"Found {len(csv_files)} CSV files in {csv_folder}")

Found 122 CSV files in 2_Retriever/2b_Retriever


In [18]:
# Nested grouping: all_dfs[model name][retriever method] -> list of loaded DataFrames.
all_dfs = defaultdict(lambda: defaultdict(list))

for file_path in csv_files:
    # All keys are parsed from the file's base name.
    base_name = os.path.basename(file_path)
    model_name = extract_model_name(base_name)
    # The k value is parsed as well, but it is not used as a grouping key here.
    k_value = extract_k_value(base_name)
    retriever_method = extract_retriever_method(base_name)
    df = pd.read_csv(file_path)
    all_dfs[model_name][retriever_method].append(df)

In [19]:
# config_prompt_answers[model][retriever method] -> most frequent
# 'rag_answer_correct' value per prompt across all DataFrames loaded for
# that model and retriever method.
config_prompt_answers = defaultdict(dict)

for model, retrievers in all_dfs.items():
    for r, dfs in retrievers.items():
        # Each DataFrame contributes one row; the columns are the prompt indices.
        correctness_rows = [run_df['rag_answer_correct'] for run_df in dfs]
        answers = pd.DataFrame(correctness_rows)
        # mode() may return several rows when values tie; keep the first one.
        mode_answers = answers.mode(axis=0).iloc[0]
        config_prompt_answers[model][r] = mode_answers

In [20]:
# For each model, list the prompt indices whose mode answer is not the same
# for every retriever method.
changed_prompt_indices_per_model = {}

for model, retrievers in config_prompt_answers.items():
    # rows = prompts, columns = retriever methods
    answer_df = pd.DataFrame(retrievers)
    # Number of distinct answers each prompt received across the retriever methods.
    distinct_counts = answer_df.nunique(axis=1)
    changed = distinct_counts > 1
    changed_prompt_indices_per_model[model] = answer_df.loc[changed].index.tolist()

print("Changed prompt indices per model:")
for model, indices in changed_prompt_indices_per_model.items():
    print(f"{model}: {indices}")

Changed prompt indices per model:
deepseek-v2: [6, 8, 9, 10, 14, 24, 26, 29, 32, 34, 41, 49, 56, 66, 67, 68, 72, 76, 78, 80, 83, 85, 87, 92, 97, 102, 104, 111, 124]
llama3.2: [1, 6, 13, 20, 21, 28, 48, 51, 53, 54, 60, 72, 75, 77, 101, 109, 117]
gpt-4.1-nano: [7, 8, 11, 15, 16, 17, 30, 33, 34, 45, 51, 55, 62, 64, 72, 78, 80, 83, 85, 86, 97, 103, 104, 112, 116]
falcon: [2, 7, 17, 18, 19, 23, 24, 61, 78]
qwen2.5: [9, 11, 15, 16, 22, 23, 25, 41, 49, 56, 58, 62, 66, 76, 97, 100, 109, 110, 112, 113, 122]
mistral: [7, 9, 12, 14, 17, 29, 30, 52, 60, 61, 66, 67, 68, 73, 80, 82, 83, 91, 93, 94, 97, 99, 106, 116, 117, 118, 123, 124]
