In [1]:
from utils import load_dict_from_file, all_int_in_set, flatten
from utils_nlp import get_number_tokens
import numpy as np
import os
import pandas as pd
import re
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu


from dotenv import load_dotenv

# Populate the process environment (python-dotenv) before the settings below are read.
load_dotenv()

# os.getenv returns None when the variable is not set.
dataset_name = os.getenv("dataset_name")
# Used throughout this notebook as a plain string prefix: `data_path + <results file name>`.
data_path = os.getenv("data_path")

In [2]:
# Ids of the questions that are scored; get_dict_compiled_results skips every other id.
list_question_ids = [
    0, 2, 3, 5, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 17, 19,
    20, 22, 23, 24, 25, 26, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 49,
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 63, 64, 65, 66, 67, 69,
    70, 71, 72, 73, 74, 75, 77, 78,
    80, 81, 83, 84, 85, 86, 87, 89,
    90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
    100, 101, 102, 104, 105, 107, 108,
]
print(len(list_question_ids))

92


In [3]:
def get_dict_to_df(dict_):
    """
    Build a pandas DataFrame from `dict_` (handed unchanged to the DataFrame constructor)
    """
    return pd.DataFrame(dict_)

def get_precision(y_pred, y_true):
    """
    Share of the predicted ids that also appear in the ground truth.

    y_pred must be a set (its `intersection` method is used); returns 0 when it is empty.
    """
    if len(y_pred) > 0:
        return len(y_pred.intersection(y_true)) / len(y_pred)
    return 0

def get_recall(y_pred, y_true):
    """
    Share of the ground-truth ids that were predicted.

    y_pred must be a set (its `intersection` method is used); returns 0 when y_true is empty.
    """
    if len(y_true) > 0:
        return len(y_pred.intersection(y_true)) / len(y_true)
    return 0

def get_f1_score(y_pred, y_true):
    """
    Harmonic mean of get_precision and get_recall for the same pair of id sets.

    Returns 0 when precision and recall are both 0.
    """
    precision = get_precision(y_pred, y_true)
    recall = get_recall(y_pred, y_true)
    total = precision + recall
    if total > 0:
        return 2 * precision * recall / total
    return 0

def get_response_in_context(response, context):
    """
    Count the answers that occur in the context and measure how much context covers them.

    Matching is a case-insensitive substring search; only the first occurrence of each
    answer is considered.

    Args:
        response: sequence of answer strings.
        context: text in which the answers are searched.

    Returns:
        A tuple (count_in_context, largest_indx):
        - count_in_context: number of answers found in the context.
        - largest_indx: get_number_tokens(...) of the lower-cased context prefix that ends
          where the latest-starting answer begins; np.nan when `response` is empty or when
          at least one answer is absent from the context.
    """
    context = context.lower()
    if len(response) == 0:
        return 0, np.nan

    count_in_context = 0
    furthest_start = 0
    # Explicit flag instead of a numeric sentinel: a sentinel index could be overwritten
    # by (or confused with) a genuine match position in a very long context.
    all_found = True
    for answer in response:
        position = context.find(answer.lower())
        if position == -1:
            all_found = False
            continue
        count_in_context += 1
        furthest_start = max(furthest_start, position)

    if not all_found:
        return count_in_context, np.nan
    return count_in_context, get_number_tokens(context[:furthest_start])

def get_matching_reponse_id(response_id, response_name):
    """
    Ratio of the number of matched ids to the number of response names.

    Returns np.nan when `response_name` is empty.
    """
    n_names = len(response_name)
    if n_names > 0:
        return len(response_id) / n_names
    return np.nan

def get_name_response(y_pred, y_true, response_name):
    """
    Split the response names into correct and incorrect ones.

    The i-th element produced by iterating `y_pred` is paired with `response_name[i]`;
    the name goes to the first list when that element is in `y_true`, otherwise to the second.

    Returns:
        (correct_responses, incorrect_responses)
    """
    # NOTE(review): callers in this notebook pass `y_pred` as a set, so the positional
    # pairing with `response_name` follows set iteration order, and an IndexError is raised
    # when there are more ids than names — confirm this pairing is intended.
    correct_responses, incorrect_responses = [], []
    for position, predicted in enumerate(y_pred):
        bucket = correct_responses if predicted in y_true else incorrect_responses
        bucket.append(response_name[position])
    return correct_responses, incorrect_responses


def get_self_knowledge(y_pred, y_true, response_name, context):
    """
    Correct responses that do NOT occur in the context, divided by |y_pred ∪ y_true|.

    Returns np.nan when the union of both id sets is empty.
    """
    correct_names = get_name_response(y_pred, y_true, response_name)[0]
    found_in_context = get_response_in_context(correct_names, context)[0]
    missing_from_context = len(correct_names) - found_in_context

    # Normalise by every id that was either predicted or expected.
    number_responses = len(y_pred.union(y_true))
    if number_responses > 0:
        return missing_from_context / number_responses
    return np.nan


def get_faithfulness(y_pred, y_true, response_name, context):
    """
    Responses (correct or not) that occur in the context, divided by |y_pred ∪ y_true|.

    Returns np.nan when the union of both id sets is empty.
    """
    found_in_context = get_response_in_context(response_name, context)[0]
    number_responses = len(y_pred.union(y_true))
    if number_responses > 0:
        return found_in_context / number_responses
    return np.nan


def get_hallucination(y_pred, y_true, response_name, context):
    """
    Incorrect responses that do NOT occur in the context, divided by |y_pred ∪ y_true|.

    Returns np.nan when the union of both id sets is empty.
    """
    incorrect_names = get_name_response(y_pred, y_true, response_name)[1]
    found_in_context = get_response_in_context(incorrect_names, context)[0]
    missing_from_context = len(incorrect_names) - found_in_context
    number_responses = len(y_pred.union(y_true))
    if number_responses > 0:
        return missing_from_context / number_responses
    return np.nan


def get_context_utilization(y_pred, y_true, response_name, context):
    """
    Among the responses that occur in the context, the fraction that are correct.

    Returns np.nan when no response occurs in the context.
    """
    correct_names, incorrect_names = get_name_response(y_pred, y_true, response_name)
    good_in_context = get_response_in_context(correct_names, context)[0]
    bad_in_context = get_response_in_context(incorrect_names, context)[0]
    in_context_total = good_in_context + bad_in_context
    if in_context_total > 0:
        return good_in_context / in_context_total
    return np.nan


def get_noise_sensitivity(y_pred, y_true, response_name, context):
    """
    Incorrect responses that DO occur in the context, divided by |y_pred ∪ y_true|.

    Returns np.nan when the union of both id sets is empty.
    """
    incorrect_names = get_name_response(y_pred, y_true, response_name)[1]
    bad_in_context = get_response_in_context(incorrect_names, context)[0]
    number_responses = len(y_pred.union(y_true))
    if number_responses > 0:
        return bad_in_context / number_responses
    return np.nan


def get_number_different_word_counter(context):
    """
    Number of distinct whitespace-separated words in `context`, after lower-casing and
    removing every character that is neither a word character nor whitespace.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', context.lower())
    return len(set(without_punctuation.split()))


def get_count_text_in_context(text, context):
    """
    Case-insensitive count of non-overlapping occurrences of `text` inside `context`.
    """
    return context.lower().count(text.lower())


def get_variability_context(response_name, context):
    """
    Total number of occurrences of the responses in the context, divided by the number
    of distinct words of the context; 0 when the context has no words.
    """
    num_words = get_number_different_word_counter(context)
    total_mentions = sum(get_count_text_in_context(text, context) for text in response_name)
    if num_words > 0:
        return total_mentions / num_words
    return 0


def get_rouge_score(correct_response, predicted_response):
    """
    ROUGE-L F-measure (stemming enabled) between the space-joined reference strings
    and the space-joined predicted strings.
    """
    reference_text = ' '.join(correct_response)
    predicted_text = ' '.join(predicted_response)
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    return scorer.score(reference_text, predicted_text)['rougeL'].fmeasure

def get_bleu_score(correct_response, predicted_response):
    """
    Sentence-level BLEU with weights=[1] between the whitespace tokens of the joined
    reference strings (single reference) and those of the joined predicted strings.
    """
    reference_tokens = ' '.join(correct_response).split()
    predicted_tokens = ' '.join(predicted_response).split()
    return sentence_bleu([reference_tokens], predicted_tokens, weights = [1])


def get_meteor_score(correct_response, predicted_response):
    """
    METEOR score between the whitespace tokens of the joined reference strings
    (single reference) and those of the joined predicted strings.
    """
    reference_tokens = ' '.join(correct_response).split()
    predicted_tokens = ' '.join(predicted_response).split()
    return meteor_score([reference_tokens], predicted_tokens)


def get_dict_compiled_results(results_dict):
    """
    Compute every evaluation metric for each selected question, column-wise.

    Only entries whose key, converted with int(), is contained in the module-level
    `list_question_ids` are scored; all others are skipped.

    Args:
        results_dict: mapping of question id -> entry. Each entry is read through
            entry["answer_id_"], entry["answer_name"] and
            entry["answer_0"]["matching_ids" | "response" | "context"].

    Returns:
        dict mapping each metric name to a list holding one value per scored question,
        in the iteration order of `results_dict`. All keys are present (with empty
        lists) even when no question is scored.
    """
    metric_names = (
        "question_id",
        "precision",
        "recall",
        "f1_score",
        "answer_in_context",
        "len_context",
        "smallest_context_needed",
        "matching_reponse_id",
        "self_knowledge",
        "faithfulness",
        "hallucination",
        "context_utilization",
        "noise_sensitivity",
        "variability_context",
        "rouge_score",
        "bleu_score",
        "meteor_score",
    )
    compiled = {name: [] for name in metric_names}

    for item, entry in results_dict.items():
        item_as_int = int(item)
        if item_as_int not in list_question_ids:
            continue

        prediction = entry['answer_0']
        answer_id = set(entry["answer_id_"])
        answer_name = list(entry["answer_name"])
        response_id = set(flatten(prediction["matching_ids"]))
        # response_id = set(flatten(prediction["matching_ids_sim"]))
        response_name = list(prediction["response"])
        context_ = str(prediction["context"])

        # A lone literal 'None' response is given the placeholder id -1
        # (presumably never a valid answer id — TODO confirm).
        if response_name == ['None']:
            response_id = {-1}

        in_context, indx = get_response_in_context(answer_name, context_)

        row = {
            "question_id": item_as_int,
            "precision": get_precision(response_id, answer_id),
            "recall": get_recall(response_id, answer_id),
            "f1_score": get_f1_score(response_id, answer_id),
            # Guarded like the other ratios in this file: NaN instead of ZeroDivisionError
            # when the ground truth has no ids.
            "answer_in_context": in_context / len(answer_id) if len(answer_id) > 0 else np.nan,
            "len_context": get_number_tokens(context_),
            "smallest_context_needed": indx,
            "matching_reponse_id": get_matching_reponse_id(response_id, response_name),
            "self_knowledge": get_self_knowledge(response_id, answer_id, response_name, context_),
            "faithfulness": get_faithfulness(response_id, answer_id, response_name, context_),
            "hallucination": get_hallucination(response_id, answer_id, response_name, context_),
            "context_utilization": get_context_utilization(response_id, answer_id, response_name, context_),
            "noise_sensitivity": get_noise_sensitivity(response_id, answer_id, response_name, context_),
            "variability_context": get_variability_context(response_name, context_),
            "rouge_score": get_rouge_score(answer_name, response_name),
            "bleu_score": get_bleu_score(answer_name, response_name),
            "meteor_score": get_meteor_score(answer_name, response_name),
        }
        for name, value in row.items():
            compiled[name].append(value)

    return compiled


def get_metrics(dict_):
    """
    Compute every evaluation metric for a single question entry.

    Applies the same rules as get_dict_compiled_results, so the per-question values
    agree with the aggregated ones: a lone literal 'None' response gets the placeholder
    id -1, and "answer_in_context" is NaN when the ground truth has no ids.

    Args:
        dict_: one results entry, read through dict_["answer_id_"], dict_["answer_name"]
            and dict_["answer_0"]["matching_ids" | "response" | "context"].

    Returns:
        dict mapping metric name -> value for this question.
    """
    prediction = dict_['answer_0']
    answer_id = set(dict_["answer_id_"])
    answer_name = list(dict_["answer_name"])
    response_id = set(flatten(prediction["matching_ids"]))
    # response_id = set(flatten(prediction["matching_ids_sim"]))
    response_name = list(prediction["response"])
    context_ = str(prediction["context"])

    # Same placeholder rule as get_dict_compiled_results.
    if response_name == ['None']:
        response_id = {-1}

    in_context, smallest_context_needed = get_response_in_context(answer_name, context_)

    metrics_ = {
        "precision": get_precision(response_id, answer_id),
        "recall": get_recall(response_id, answer_id),
        "f1_score": get_f1_score(response_id, answer_id),
        "smallest_context_needed": smallest_context_needed,
        # NaN instead of ZeroDivisionError when the ground truth has no ids.
        "answer_in_context": in_context / len(answer_id) if len(answer_id) > 0 else np.nan,
        "len_context": get_number_tokens(context_),
        "matching_reponse_id": get_matching_reponse_id(response_id, response_name),
        "self_knowledge": get_self_knowledge(response_id, answer_id, response_name, context_),
        "faithfulness": get_faithfulness(response_id, answer_id, response_name, context_),
        "hallucination": get_hallucination(response_id, answer_id, response_name, context_),
        "context_utilization": get_context_utilization(response_id, answer_id, response_name, context_),
        "noise_sensitivity": get_noise_sensitivity(response_id, answer_id, response_name, context_),
        "variability_context": get_variability_context(response_name, context_),
        "rouge_score": get_rouge_score(answer_name, response_name),
        "bleu_score": get_bleu_score(answer_name, response_name),
        "meteor_score": get_meteor_score(answer_name, response_name),
    }
    return metrics_


def compare_results_Q(Q_number, names):
    """
    Print one question's ground truth and each file's prediction, then tabulate the metrics.

    Args:
        Q_number: key of the question inside each loaded results dict.
        names: results file names, each appended to `data_path`; names[0] supplies the
            question text and ground truth printed first.

    Returns:
        DataFrame with one row per metric and one column per entry of `names`.
    """
    reference_Q = load_dict_from_file(data_path + names[0])[Q_number]

    print(reference_Q)

    headers = (
        ("Questions: ", 'question_'),
        ("Answer ID: ", 'answer_id_'),
        ("True Answer: ", 'answer_name'),
    )
    for label, key in headers:
        print(label, reference_Q[key])

    print("-" * 50)
    metrics_ = []

    for file_name in names:
        # names[0] is loaded again here, just like every other file.
        prediction_Q = load_dict_from_file(data_path + file_name)[Q_number]
        answer = prediction_Q['answer_0']

        print("Pred Answer : ", answer['response'])
        print("Matching ID: ", answer['matching_ids'])

        results = get_metrics(prediction_Q)
        metrics_.append(results.values())
        print("*" * 50)

    # `results` is the metrics dict of the last file; get_metrics always yields the same keys.
    df = pd.DataFrame(metrics_, columns=results.keys()).T
    df.columns = names
    return df



In [13]:
# Results files (relative to data_path) that are compared side by side.
compare_files = [
    '04_09_RAG_v2.json',
    '16_10_both_low_top_k_no_text_tf_no_type_token7k.json',
    '16_10_both_low_top_k_no_text_tf_no_type_token7k_2.json',
]

In [14]:
# Per-file summary: mean and standard deviation of every metric over the scored questions.
list_mean_values, list_std_values = [], []
for name in compare_files:
    results_dict = load_dict_from_file(data_path + name)
    df = get_dict_to_df(get_dict_compiled_results(results_dict))
    list_columns = df.columns
    # NOTE(review): np.mean/np.std on a DataFrame presumably delegate to the pandas
    # reductions (NaN skipped); np.std passes ddof=0 (population std), unlike the
    # DataFrame.std default of ddof=1 — confirm this is the intended estimator.
    list_mean_values.append(np.mean(df, axis=0))
    list_std_values.append(np.std(df, axis=0))

# Transposed so that rows are metrics and columns are results files.
df_mean = pd.DataFrame(list_mean_values, columns=list_columns).T
df_std = pd.DataFrame(list_std_values, columns=list_columns).T

df_mean.columns = compare_files
df_std.columns = compare_files



Dictionary loaded from ../data/results/mini/04_09_RAG_v2.json
Dictionary loaded from ../data/results/mini/16_10_both_low_top_k_no_text_tf_no_type_token7k.json
Dictionary loaded from ../data/results/mini/16_10_both_low_top_k_no_text_tf_no_type_token7k_2.json


In [15]:
# Display the per-file metric means (rows: metrics, columns: results files).
df_mean

Unnamed: 0,04_09_RAG_v2.json,16_10_both_low_top_k_no_text_tf_no_type_token7k.json,16_10_both_low_top_k_no_text_tf_no_type_token7k_2.json
question_id,54.717391,54.717391,54.717391
precision,0.331522,0.35971,0.344328
recall,0.274819,0.377536,0.325
f1_score,0.288509,0.318435,0.298962
answer_in_context,0.429891,0.713225,0.691486
len_context,7176.782609,7218.565217,7219.956522
smallest_context_needed,1064.424242,1156.590164,1148.762712
matching_reponse_id,0.971571,0.974638,0.993577
self_knowledge,0.0,0.0,0.0
faithfulness,0.598148,0.67454,0.665682


16_10_both_no_text_max_no_type: nlp_id=3, nlp_emb=5, q_emb=15, q_emb_top_5=3, check_ID>10

16_10_both_low_top_k_no_text_max_no_type: nlp_id=2, nlp_emb=3, q_emb=15, q_emb_top_5=2, check_ID>10

16_10_both_low_top_k_low_check_id_no_text_max_no_type: nlp_id=2, nlp_emb=3, q_emb=15, q_emb_top_5=2, check_ID >= 4

16_10_both_low_top_k_no_q_emb_top_5_no_text_max_no_type: nlp_id=2, nlp_emb=3, q_emb=5, q_emb_top_5=NONE, check_ID>10

16_10_sp_low_top_k_no_text_max_no_type: sp nlp_id=2, nlp_emb=3, q_emb=15, q_emb_top_5=2, check_ID>10

16_10_both_low_top_k_no_text_max: nlp_id=2, nlp_emb=3, q_emb=15, q_emb_top_5=2, check_ID>10, 

16_10_both_low_top_k_no_text_strange_tdidf_no_type: using the old TF-IDF, nlp_id=2, nlp_emb=3, q_emb=15, q_emb_top_5=2, check_ID>10

16_10_both_low_top_k_no_text_tf_no_type: tf nlp_id=2, nlp_emb=3, q_emb=15, q_emb_top_5=2, check_ID>10

16_10_both_low_top_k_no_text_tf_no_type_token7k: tf nlp_id=2, nlp_emb=3, q_emb=15, q_emb_top_5=2, check_ID>10 token7k


In [22]:
results_dict = load_dict_from_file(data_path + "16_10_both_low_top_k_no_text_tf_no_type_token7k.json")

# Split the questions in two groups according to the shape of their context lines.
dict_path = {}
dict_link = {}
for item in results_dict:
    context_ = str(results_dict[item]['answer_0']["context"])
    splits_ = context_.split("\n")
    len_splits_ = [len(line.split("->")) for line in splits_]

    # "path" = at least one context line with more than 3 "->"-separated parts.
    is_path = any(n_parts > 3 for n_parts in len_splits_)
    target = dict_path if is_path else dict_link
    target[item] = results_dict[item]

print(len(dict_path))
print(len(dict_link))

Dictionary loaded from ../data/results/mini/16_10_both_low_top_k_no_text_tf_no_type_token7k.json
54
55


In [23]:
# Mean / std of every metric, computed separately for the "path" and the "link" questions.
list_mean_values, list_std_values = [], []

for results_subset in (dict_path, dict_link):
    df = get_dict_to_df(get_dict_compiled_results(results_subset))
    list_columns = df.columns
    list_mean_values.append(np.mean(df, axis=0))
    list_std_values.append(np.std(df, axis=0))

# Rows: metrics; columns: the two subsets, in the order they were aggregated above.
df_mean = pd.DataFrame(list_mean_values, columns=list_columns).T
df_mean.columns = ["path", "link"]

df_std = pd.DataFrame(list_std_values, columns=list_columns).T
df_std.columns = ["path", "link"]

df_mean



Unnamed: 0,path,link
question_id,60.489796,48.139535
precision,0.420918,0.28996
recall,0.346939,0.412403
f1_score,0.355653,0.276024
answer_in_context,0.740476,0.682171
len_context,7175.44898,7267.697674
smallest_context_needed,818.529412,1582.296296
matching_reponse_id,0.979592,0.968992
self_knowledge,0.0,0.0
faithfulness,0.656755,0.694807


In [21]:
# NOTE(review): this cell repeats the path-vs-link aggregation of the preceding cell and
# carries an earlier execution count (In [21]); its stored output presumably comes from a
# different `dict_path` / `dict_link` — confirm whether it is still needed.
list_mean_values, list_std_values = [], []

for results_subset in (dict_path, dict_link):
    df = get_dict_to_df(get_dict_compiled_results(results_subset))
    list_columns = df.columns
    list_mean_values.append(np.mean(df, axis=0))
    list_std_values.append(np.std(df, axis=0))

# Rows: metrics; columns: the two subsets, in the order they were aggregated above.
df_mean = pd.DataFrame(list_mean_values, columns=list_columns).T
df_mean.columns = ["path", "link"]

df_std = pd.DataFrame(list_std_values, columns=list_columns).T
df_std.columns = ["path", "link"]

df_mean



Unnamed: 0,path,link
question_id,60.489796,48.139535
precision,0.435374,0.240578
recall,0.360544,0.284496
f1_score,0.371429,0.216383
answer_in_context,0.740476,0.635659
len_context,7175.408163,7270.72093
smallest_context_needed,801.617647,1620.88
matching_reponse_id,1.0,0.986258
self_knowledge,0.0,0.0
faithfulness,0.644558,0.689754


In [None]:
# Sanity check: print every question whose number of responses differs from its number
# of matching-id groups.
# The results are loaded here so the cell runs top-to-bottom on a fresh kernel instead of
# relying on a variable that is only assigned further down the notebook.
results_dict_rag = load_dict_from_file(data_path + "04_09_RAG_v2.json")

for item, entry in results_dict_rag.items():
    answer = entry["answer_0"]
    response_name = list(answer["response"])
    response_id = list(answer["matching_ids"])
    if len(response_name) != len(response_id):
        print(item)
        print(answer["response"])
        print(answer["matching_ids"])

In [None]:
# Raw results and compiled per-question metrics for the RAG run.
results_dict_rag = load_dict_from_file(data_path + "04_09_RAG_v2.json")
results_rag = get_dict_compiled_results(results_dict_rag)

# Same for the second run that the plots below compare against.
# NOTE(review): the variable is named `results_sp_10k_hybrid` but is built from
# "16_10_both_no_text_max_no_type.json" — confirm the name still matches the file.
results_dict_sp = load_dict_from_file(data_path + "16_10_both_no_text_max_no_type.json")
results_sp_10k_hybrid = get_dict_compiled_results(results_dict_sp)


In [None]:
# NOTE(review): displays the whole results dict (contexts included); consider showing a single entry.
results_dict_sp

In [None]:
# NOTE(review): displays the whole results dict (contexts included); consider showing a single entry.
results_dict_rag

In [None]:
# Side-by-side metrics of question "8" for two results files (one column per file).
compare_results_Q("8", ['04_09_RAG_v2.json', '12_09_both_10k_hybrid099.json'])

# By question

In [34]:
import matplotlib.pyplot as plt
import numpy as np

def get_variable_name(variable):
    """
    Return the first module-level name bound to the very same object as `variable`
    (identity comparison, not equality), or None when no global refers to it.
    """
    module_globals = globals()
    return next(
        (name for name, value in module_globals.items() if value is variable),
        None,
    )

def plot_by_question(data, name_of_metric="metric", data2=None, where_=None):
    """
    Scatter one or two per-question series against the question ids.

    Args:
        data: first series of values, one per question.
        name_of_metric: y-axis / title label. When left at the default "metric", the
            label is looked up with get_variable_name(data).
        data2: optional second series, drawn with 'x' markers.
        where_: optional index or boolean mask applied to both the question ids and the
            series; defaults to selecting everything.

    NaN values are drawn at y=0 in a separate colour. The mean (ignoring NaN) and the
    number of NaN values of each series are printed.
    """
    # NOTE(review): the x positions come from the global `list_question_ids`, not from the
    # "question_id" column of the compiled results — confirm both are always aligned.
    # NOTE(review): the legend/title mention "Mean" and "Error Bars", but the plotted
    # values are the raw series and no error bars are drawn — confirm the wording.
    if name_of_metric == "metric":
        name_of_metric = get_variable_name(data)

    if where_ is None:
        where_ = np.ones_like(data, dtype=bool)

    questions = np.array(list_question_ids)[where_]

    plt.figure(figsize=(14, 3))

    def _scatter_series(values, value_color, nan_color, marker, name):
        missing = np.isnan(values)
        print("mean of data: ", np.nanmean(values))
        plt.scatter(
            questions[~missing],
            np.array(values)[~missing],
            marker=marker,
            s=20,
            color=value_color,
            label=f"{name} Mean",
        )

        nan_count = np.sum(missing)
        print(f"Number of NaN points: {nan_count}")

        if nan_count > 0:
            # NaN entries are pinned to y=0 so they stay visible.
            nan_questions = questions[missing]
            plt.scatter(
                nan_questions,
                np.zeros_like(nan_questions),
                color=nan_color,
                marker=marker,
                s=20,
                label="NaN Points",
            )

    series = [(data, "blue", "green", 'o', 'first')]
    if data2 is not None:
        series.append((data2, "purple", "darkgreen", 'x', 'second'))
    for values, value_color, nan_color, marker, name in series:
        _scatter_series(np.array(values)[where_], value_color, nan_color, marker, name)

    plt.xlabel('Question', fontsize=14)
    plt.ylabel(f"{name_of_metric}", fontsize=14)
    plt.title(f"Mean {name_of_metric} with Error Bars for Each Question", fontsize=16)
    plt.xticks(questions, rotation=90)
    plt.grid(True, linestyle='--', alpha=0.7)

    # Legend goes outside the axes, to the right.
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

    plt.tight_layout()
    plt.show()

In [None]:
# Positions in the compiled result lists where the RAG run has a strictly lower
# "answer_in_context" than the second run; used below to restrict the plots.
where_ = np.where(np.array(results_rag["answer_in_context"])<np.array(results_sp_10k_hybrid["answer_in_context"]))[0]
where_

In [None]:
# Precision per question at the positions in `where_` ('first': results_rag, 'second': results_sp_10k_hybrid).
plot_by_question(results_rag["precision"], "precision", results_sp_10k_hybrid["precision"], where_)

In [None]:
# Recall per question at the positions in `where_` ('first': results_rag, 'second': results_sp_10k_hybrid).
plot_by_question(results_rag["recall"], "recall", results_sp_10k_hybrid["recall"], where_)

In [None]:
# ROUGE-L per question at the positions in `where_` ('first': results_rag, 'second': results_sp_10k_hybrid).
plot_by_question(results_rag["rouge_score"], "rouge_score", results_sp_10k_hybrid["rouge_score"], where_)

In [None]:
# "smallest_context_needed" per question at the positions in `where_`; NaN entries are drawn at y=0 by plot_by_question.
plot_by_question(results_rag["smallest_context_needed"], "smallest_context_needed", results_sp_10k_hybrid["smallest_context_needed"], where_)

In [None]:
# Context length (get_number_tokens) per question at the positions in `where_` ('first': results_rag, 'second': results_sp_10k_hybrid).
plot_by_question(results_rag["len_context"], "len_context", results_sp_10k_hybrid["len_context"], where_)