In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines

# Relative locations of the two result tables (one per prompting method).
mcp_path = "./multiple_choice_prompt.csv"
poe_path = "./process_of_elimination.csv"

# Load the PoE table first, then the MCP table, into raw (unprocessed) frames.
raw_poe_df, raw_mcp_df = (pd.read_csv(csv_path) for csv_path in (poe_path, mcp_path))

# rename mask_token: nan to "empty"
# raw_df['mask_token'] = raw_df['mask_token'].fillna("empty")
# the [N/A] is actually N/A in implementation.
# raw_df['mask_token'].unique()

In [2]:
def process(df, drop_columns=None):
    """Deduplicate a results frame, drop bookkeeping columns, shorten checkpoint names.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw results. Must contain a ``checkpoint`` column plus the columns
        ``model_family``, ``seed``, ``batch_size``, ``loading_precision`` and
        ``sample`` (these are always dropped).
    drop_columns : list or tuple of str, optional
        Extra columns to drop in addition to the fixed set above. The
        argument is never modified. Any other type (including ``None``) is
        treated as "no extra columns".

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.

    Raises
    ------
    KeyError
        If any column scheduled for dropping is missing from ``df``.
    """
    always_dropped = ["model_family", "seed", "batch_size", "loading_precision", "sample"]
    # Build a fresh list instead of ``+=`` so the caller's list is not mutated
    # (re-running a cell with a shared list used to grow it on every call).
    extra = list(drop_columns) if isinstance(drop_columns, (list, tuple)) else []
    to_drop = extra + always_dropped

    # Get rid of identical rows, then remove the unwanted columns.
    cleaned = df.drop_duplicates().drop(columns=to_drop)
    # Shorten checkpoint names: keep only the part after the last "/".
    # The .str accessor leaves missing values as NaN instead of raising.
    return cleaned.assign(checkpoint=cleaned["checkpoint"].str.split("/").str[-1])

def process_v2(df):
    """Aggregate per-run results into mean and standard deviation per task.

    Keeps only rows whose ``dataset`` is one of the eight tasks listed below,
    groups by ``dataset``, ``checkpoint`` and ``method``, and computes the
    mean and the sample standard deviation of every numeric column. The
    ``checkpoint`` column is dropped from both results, and ``accuracy`` is
    rounded to 3 decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``dataset``, ``checkpoint``, ``method`` and a numeric
        ``accuracy`` column. Non-numeric columns other than the group keys
        are excluded from the aggregation.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_mean, df_error)``, one row per (dataset, checkpoint, method)
        group. Groups with a single row have NaN standard deviation.
    """
    datasets = "code_line_description reasoning_about_colored_objects crass_ai evaluating_information_essentiality identify_math_theorems identify_odd_metaphor logical_args riddle_sense".split()
    group_keys = ["dataset", "checkpoint", "method"]
    grouped = df[df["dataset"].isin(datasets)].groupby(group_keys)

    # numeric_only=True: without it, pandas >= 2.0 raises TypeError as soon as
    # a leftover string column is present; older versions silently dropped
    # such columns, so the result is the same where both work.
    df_mean = grouped.mean(numeric_only=True).reset_index().drop(columns=["checkpoint"])
    df_error = grouped.std(numeric_only=True).reset_index().drop(columns=["checkpoint"])

    # accuracy: 3 decimal places
    df_mean["accuracy"] = df_mean["accuracy"].round(3)
    df_error["accuracy"] = df_error["accuracy"].round(3)
    return df_mean, df_error

In [3]:
# Clean both raw tables; the PoE table has extra PoE-specific columns to drop.
poe_df = process(raw_poe_df, drop_columns=["n_shot", "prompting_method", "scoring_method", "mask_strategy", "mask_accuracy", "mask_token"])
mcp_df = process(raw_mcp_df, drop_columns=["n_shot"])

# Aggregate to mean / standard deviation per task.
poe_df, poe_error = process_v2(poe_df)
mcp_df, mcp_error = process_v2(mcp_df)

# Save the per-method means to csv.
poe_df.to_csv("poe.csv", index=False)
mcp_df.to_csv("mcp.csv", index=False)

# Give each method's accuracy its own column name and drop the "method"
# column so the two frames can be joined side by side.
poe_df = poe_df.rename(columns={"accuracy": "poe_accuracy"}).drop(columns=["method"])
mcp_df = mcp_df.rename(columns={"accuracy": "mcp_accuracy"}).drop(columns=["method"])

# Merge on the task name and keep: dataset, mcp_accuracy, poe_accuracy.
# NOTE(review): joining on "dataset" alone assumes one row per dataset in
# each frame (i.e. a single checkpoint) -- confirm against the input CSVs.
df = pd.merge(poe_df, mcp_df, on=["dataset"])[["dataset", "mcp_accuracy", "poe_accuracy"]]

# Convert accuracies to percentages. .assign builds a new frame, which avoids
# the SettingWithCopyWarning raised when assigning onto a column subset.
df = df.assign(
    mcp_accuracy=df["mcp_accuracy"] * 100,
    poe_accuracy=df["poe_accuracy"] * 100,
)
# Difference PoE - MCP in percentage points, 3 decimal places.
df = df.assign(difference=(df["poe_accuracy"] - df["mcp_accuracy"]).round(3))
# Sort by difference: descending.
df = df.sort_values(by=["difference"], ascending=False)

# Save to csv.
df.to_csv("logical_reasoning.csv", index=False)

In [4]:
# Display headers for the three value columns.
COLUMN_LABELS = {
    "mcp_accuracy": "MCP",
    "poe_accuracy": "PoE",
    "difference": "PoE - MCP",
}
# Short labels for the task names. Each task appears exactly once
# (the original mapping listed "identify_odd_metaphor" twice).
TASK_ABBREVIATIONS = {
    "logical_args": "LA",
    "identify_math_theorems": "IMT",
    "code_line_description": "CLD",
    "reasoning_about_colored_objects": "RACO",
    "crass_ai": "CAI",
    "evaluating_information_essentiality": "EIE",
    "identify_odd_metaphor": "IOM",
    "riddle_sense": "RS",
}

# Every step returns a new frame, so `df` is left untouched without an
# explicit copy. The task names are moved into the index so they can be
# relabelled together with the columns, then moved back out as "Task".
temp_df = (
    df.set_index("dataset")
    .rename(columns=COLUMN_LABELS, index=TASK_ABBREVIATIONS)
    .reset_index()
    .rename(columns={"dataset": "Task"})
)

# save to csv
temp_df.to_csv("logical_reasoning_v2.csv", index=False)
# save to markdown
temp_df.to_markdown("logical_reasoning_v2.md", index=False)

In [20]:
# set_index returns a new frame, so temp_df is not modified.
latex_df = temp_df.set_index("Task")

# Format every value with one decimal place. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated since pandas 2.1.
latex_df = latex_df.apply(lambda column: column.map(lambda value: f"{value:.1f}"))
# Prefix strictly positive differences with an explicit "+" sign.
latex_df["PoE - MCP"] = latex_df["PoE - MCP"].map(
    lambda text: "+" + text if float(text) > 0 else text
)

# Write the table to a .tex file.
# NOTE(review): the caption hard-codes "8 new tasks" and the 4/4 split; it is
# not derived from the data -- confirm it still matches the sorted table.
latex_df.style.to_latex(
    buf="logical_reasoning.tex",
    column_format="l|c|c|c",
    position="h",
    position_float="centering",
    label="tab:logical_reasoning",
    caption=r"Comparison of MCP and PoE accuracy scores on 8 new tasks. The top 4 tasks are logical reasoning tasks. PoE largely outperforms MCP on 4 logical reasoning tasks, and underperforms MCP on other 4 tasks.",
    multicol_align="c",
    multirow_align="c",
)