In [1]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Load one results file per retrieval pool size k.
data = []  # not used by the cells shown here; binding kept in case another cell reads it
k_values = [50, 100, 200, 500]

# Upper bound on rows read from each results file.
MAX_ROWS = 10001

# Derive the file names from k_values so df_all[i] always corresponds to
# k_values[i]; the plotting cells rely on that pairing via zip(k_values, df_all).
df_all = [pd.read_csv(f"results_{k}.csv", nrows=MAX_ROWS) for k in k_values]

# Per-k names kept so any cell that refers to them directly still works.
df_50, df_100, df_200, df_500 = df_all


In [None]:

# For every retrieval pool size k, draw one figure with one facet per
# table_info value, overlaying the BM25 and DPR outcomes per top_k as bars.
sns.set(style="whitegrid")  # theme does not depend on k, so set it once

for k, df in zip(k_values, df_all):
    # Encode the outcome columns as 0/1 (truthy -> 1, falsy -> 0).
    # NOTE: this writes back into the frames held in df_all; the conversion is
    # idempotent, so re-running the cell gives the same columns.
    df["bm_25"] = df["bm_25"].astype(bool).astype(int)
    df["dpr"] = df["dpr"].astype(bool).astype(int)

    # Each facet only sees its own subset of rows. Fixing the category order
    # up front keeps every top_k value at the same x position in all facets
    # and in both overlaid layers, even if a facet is missing some value.
    top_k_order = sorted(df["top_k"].dropna().unique())

    # Create a FacetGrid for subplots by table_info
    g = sns.FacetGrid(df, col="table_info", height=5, aspect=1.2, sharey=False)

    # Map barplot to each subplot; alpha < 1 keeps both layers visible
    g.map_dataframe(sns.barplot, x="top_k", y="bm_25", order=top_k_order,
                    color="blue", label="BM25", alpha=0.6)
    g.map_dataframe(sns.barplot, x="top_k", y="dpr", order=top_k_order,
                    color="orange", label="DPR", alpha=0.6)

    # Adjust the layout and add titles
    g.set_axis_labels("Top-k", "Boolean Value (True=1, False=0)")
    g.set_titles("{col_name}")
    g.add_legend(title="Metrics")

    # Show the plot; y > 1 lifts the figure title above the facet titles
    plt.suptitle(f"Comparison of BM25 and DPR Across Retrieved from {k} by different Table Info", y=1.02)
    plt.show()

In [None]:
# For every retrieval pool size k, plot the mean of the bm_25 and dpr columns
# against top_k as lines, one facet per table_info value.
sns.set(style="whitegrid")  # theme does not depend on k, so set it once

# Fixed left-to-right facet order. table_info values not listed here are not
# drawn; a listed value with no rows gets an empty facet.
facet_order = ['title_tab-description', 'title_column_header', 'title_col_table', 'exact_row']

# Fixed hue order so each approach gets the same colour in every facet.
approach_order = ['bm_25', 'dpr']

for k, df in zip(k_values, df_all):
    # Mean outcome per (top_k, table_info), then long format: one row per
    # (top_k, table_info, Approach) with the mean in 'Mean Value'.
    mean_values = df.groupby(['top_k', "table_info"])[['bm_25', 'dpr']].mean().reset_index()
    mean_values_melted = mean_values.melt(
        id_vars=['top_k', "table_info"], var_name='Approach', value_name='Mean Value'
    )

    # Create a FacetGrid for subplots by table_info
    g = sns.FacetGrid(mean_values_melted, col="table_info", col_order=facet_order,
                      height=5, aspect=1.2, sharey=False)

    # Map lineplot to each subplot. No data= argument is passed:
    # map_dataframe supplies each facet's own subset as `data` itself.
    g.map_dataframe(sns.lineplot, x='top_k', y='Mean Value',
                    hue='Approach', hue_order=approach_order, marker='o')

    # Adjust the layout and add titles
    g.set_axis_labels("Top-k", "Boolean Value (True=1, False=0)")
    g.set_titles("{col_name}")
    g.add_legend(title="Metrics")

    # Show the plot
    plt.suptitle(f'Mean Value of Approaches by {k}', fontsize=14)
    plt.show()
   

In [None]:
# For every retrieval pool size k, show how the BM25 and DPR outcomes agree:
# rows are the BM25 outcome, columns are the DPR outcome.
# df_all is a list of frames (one per k), so pair it with k_values instead of
# indexing it by a "k" column.
for k, df in zip(k_values, df_all):
    # Normalise both outcome columns to 0/1 so the result is the same whether
    # the columns are still booleans or were already converted to integers.
    bm25_outcome = df['bm_25'].astype(bool).astype(int)
    dpr_outcome = df['dpr'].astype(bool).astype(int)

    # Fix the label set so the matrix is always 2x2 and lines up with the two
    # display labels, even when only one outcome occurs in a frame.
    conf_matrix = confusion_matrix(bm25_outcome, dpr_outcome, labels=[0, 1])

    # Display the confusion matrix
    disp = ConfusionMatrixDisplay(conf_matrix, display_labels=["False", "True"])
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix( for k = {k}): BM25 vs DPR')
    # First argument of confusion_matrix indexes rows (y axis), second indexes
    # columns (x axis), so relabel the default "True/Predicted label" axes.
    plt.xlabel("DPR")
    plt.ylabel("BM25")
    plt.show()