- rename models in df
- rename tasks in df
- print table for tasks, datasets

In [56]:
import pandas as pd
from IPython.display import display, Markdown, Latex

In [52]:
import pandas as pd

def _latex_cell(score, se, is_best):
    """Render a score as LaTeX math with its standard error as a subscript (score bold if best)."""
    value = rf"\mathbf{{{score:.1f}}}" if is_best else f"{score:.1f}"
    return rf"${value}_{{ {se:.1f} }}$"


def _markdown_cell(score, se, is_best):
    """Render a score as 'score (se)' for Markdown (whole cell bold if best)."""
    text = f"{score:.1f} ({se:.1f})"
    return f"**{text}**" if is_best else text


def _score_cells(model, tasks, scores, errors, best_scores, render):
    """Return one rendered cell per task for `model`; a missing score yields an empty string."""
    cells = []
    for task in tasks:
        if task not in scores.columns:
            cells.append("")
            continue
        score = scores.loc[model, task]
        if pd.isna(score):
            cells.append("")
            continue
        se = errors.loc[model, task]
        # Values are stored as fractions and reported as percentages.
        cells.append(render(100 * score, 100 * se, score == best_scores[task]))
    return cells


def generate_table(df, metric="F1", caption="", label="", output_format="latex"):
    """
    Generate a LaTeX or Markdown table of per-task scores with standard errors.

    Rows are models and columns are tasks, both in order of first appearance in `df`.
    Scores and standard errors are multiplied by 100 and shown with one decimal.
    The highest score in each task column is bolded (ties are all bolded).

    Parameters:
    df (pd.DataFrame): Long-format frame with columns 'Model', 'Task', `metric` and
        f'{metric}_SE'. Each (Model, Task) pair must occur at most once, otherwise
        `DataFrame.pivot` raises ValueError.
    metric (str, optional): Name of the score column (default is "F1").
    caption (str, optional): Table caption. Defaults to "" (Markdown output then has no caption line).
    label (str, optional): LaTeX label. Defaults to "". Ignored for Markdown output.
    output_format (str, optional): "latex" (default) or "markdown", case-insensitive.

    Returns:
    str: A formatted table as a string in the specified format.

    Raises:
    ValueError: If `output_format` is neither "latex" nor "markdown".

    Notes:
    Model and task labels are inserted verbatim. For LaTeX output, the caller must
    escape characters that are special in LaTeX (for example '_').
    """
    fmt = output_format.lower()
    if fmt not in ("latex", "markdown"):
        raise ValueError("Unsupported output_format. Use 'latex' or 'markdown'.")

    # Row and column order follow first appearance in the DataFrame.
    tasks = df['Task'].unique()
    models = df['Model'].unique()
    # Labels are coerced to str so non-string labels (e.g. integer ids) can be joined.
    task_headers = [str(task) for task in tasks]

    # Pivot the DataFrame so that rows = Model and columns = Task.
    scores = df.pivot(index='Model', columns='Task', values=f'{metric}')
    errors = df.pivot(index='Model', columns='Task', values=f'{metric}_SE')

    # Best (maximum) score for each task column.
    best_scores = scores.max()

    if fmt == "latex":
        lines = [
            r"\begin{table}[htbp]",
            r"\centering",
            fr'\caption{{{caption}}}',
            fr"\label{{{label}}}",
            r"\begin{tabular}{p{3cm}" + "p{.9cm}" * len(tasks) + r"}",
            r"\toprule",
            "Model & " + " & ".join(task_headers) + r" \\",
            # NOTE(review): booktabs convention would be \midrule here; kept to preserve output.
            r"\toprule"
        ]
        for model in models:
            cells = _score_cells(model, tasks, scores, errors, best_scores, _latex_cell)
            lines.append(str(model) + " & " + " & ".join(cells) + r" \\[1ex]")
        lines.extend([r"\bottomrule", r"\end{tabular}", r"\end{table}"])
        return "\n".join(lines)

    # Markdown output.
    lines = []
    if caption:
        lines.append(f"**{caption}**")
        lines.append("")  # blank line for spacing

    lines.append("Model | " + " | ".join(task_headers))
    # Separator row: one cell for Model and one for each task.
    lines.append(" | ".join(["---"] * (len(tasks) + 1)))

    for model in models:
        cells = _score_cells(model, tasks, scores, errors, best_scores, _markdown_cell)
        lines.append(" | ".join([str(model)] + cells))

    return "\n".join(lines)

def sort_df(df, colname, order):
    """
    Return `df` sorted so that values of `colname` appear in the sequence given by `order`.

    The sort is stable: rows sharing the same `colname` value keep their original
    relative order.

    df: The dataframe to sort
    colname: The column to sort by
    order: The order in which values are to appear (if a value is listed more than
        once, its first position is used)

    Raises ValueError if `colname` contains a value that is not listed in `order`.
    """
    # Precompute each value's position once instead of scanning `order` per row.
    rank = {}
    for position, value in enumerate(order):
        rank.setdefault(value, position)

    def _to_rank(column):
        unknown = [value for value in column.unique() if value not in rank]
        if unknown:
            raise ValueError(f"values in column {colname!r} are missing from order: {unknown}")
        return column.map(rank)

    # mergesort is stable; the default quicksort may shuffle rows with equal keys.
    return df.sort_values(by=colname, key=_to_rank, kind="mergesort")

In [29]:
# Load the results table; later cells read its 'Model' and 'Task' columns.
df = pd.read_csv('../data/results_matrix.csv')

In [30]:
# Display names for the model identifiers stored in the 'Model' column.
modnames = dict(
    base_nli="DeBERTa_Base",
    large_nli="DeBERTa_Large",
    base_debate="DEBATE_Base",
    large_debate="DEBATE_Large",
    base_modern="DEBATE_Base(MB)",
    large_modern="DEBATE_Large(MB)",
    llama="Llama 3.1_8B",
    sonnet="Claude 3.5",
)
# Values without an entry in `modnames` are left unchanged.
df["Model"] = df["Model"].replace(to_replace=modnames)

In [31]:
# Display names for the identifiers stored in the 'Task' column.
tasknames = {
    # Labels used as columns of the category-level table.
    "overall": "Overall",
    "event extraction": "Event",
    "hatespeech and toxicity": "Hatespeech",
    "stance detection": "Stance",
    "topic classification": "Topic",
    # Remaining identifiers, presumably individual datasets (verify against the CSV).
    "PoliStance_Affect": "Polistance Affect",
    "PoliStance_Affect_QT": "Polistance Affect Qt",
    "acled_event_entailment": "ACLED",
    "argument_quality_ranking_entailment": "Argument Quality Ranking",
    "bill_summary_entailment": "Bill Summary",
    "dehumanizing_hatespeech_entailment": "Dehumanizing Hatespeech",
    "dem_rep_party_platform_topics": "Party Platforms",
    "ibm_claimstance_entailment": "Claimstance",
    "ibm_claimstance_topic_entailment": "Claimstance Topic",
    "polistance_issue_tweets": "Polistance Issue Tweets",
    "scad_event_entailment": "SCAD",
    "targeted_hatespeech_entailment": "Targeted Hatespeech",
    "violent_hatespeech_entailment": "Violent Hatespeech",
}
# Values without an entry in `tasknames` are left unchanged.
df["Task"] = df["Task"].replace(to_replace=tasknames)

In [59]:
# Columns of the category-level table, in display order.
tasks = ["Stance", "Topic", "Hatespeech", "Event", "Overall"]

# Keep only the rows for these categories and order them to match `tasks`.
is_category_row = df["Task"].isin(tasks)
task = sort_df(df.loc[is_category_row], "Task", tasks)

# Render the table as Markdown and show it inline.
task_md = generate_table(task, output_format="markdown")
display(Markdown(task_md))

Model | Stance | Topic | Hatespeech | Event | Overall
--- | --- | --- | --- | --- | ---
Llama 3.1_8B | 80.0 (0.6) | 96.0 (0.3) | 79.9 (0.7) | **90.5 (0.5)** | 86.3 (0.3)
DEBATE_Large(MB) | 96.7 (0.3) | 97.1 (0.2) | 95.5 (0.4) | 88.1 (0.6) | 95.0 (0.2)
DEBATE_Base(MB) | 94.9 (0.3) | 96.4 (0.3) | 94.1 (0.4) | 83.7 (0.7) | 93.1 (0.2)
DEBATE_Large | **98.1 (0.2)** | **97.6 (0.2)** | **95.7 (0.4)** | 90.4 (0.5) | **96.1 (0.2)**
Claude 3.5 | 89.8 (0.4) | 97.4 (0.2) | 85.4 (0.7) | 88.1 (0.6) | 90.9 (0.2)
DeBERTa_Base | 80.5 (0.6) | 92.0 (0.4) | 83.6 (0.8) | 74.7 (0.8) | 83.5 (0.3)
DEBATE_Base | 97.0 (0.2) | 96.9 (0.3) | 95.2 (0.4) | 86.3 (0.7) | 94.6 (0.2)
DeBERTa_Large | 81.3 (0.6) | 94.1 (0.4) | 86.0 (0.7) | 86.0 (0.7) | 87.0 (0.3)

In [None]:
# NOTE(review): `stance_data` is still empty, so the filter below keeps no rows.
# TODO: list the stance dataset labels here before running this cell.
stance_data = []

# Keep only the rows for the listed datasets and order them to match `stance_data`.
in_stance_data = df["Task"].isin(stance_data)
stance = sort_df(df.loc[in_stance_data], "Task", stance_data)

# Render the table as Markdown and show it inline.
stance_md = generate_table(stance, output_format="markdown")
display(Markdown(stance_md))