In [None]:
import lang2vec.lang2vec as l2v
import matplotlib.pyplot as plt
from datasets import load_dataset
import copy
import numpy as np
import pandas as pd
import pickle
import scipy
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import seaborn as sns
from collections import Counter
from tqdm import tqdm
from tabulate import tabulate
import sys

# Make the local MBBQ helper package importable; the path is relative to the
# notebook's working directory.
sys.path.append("../mbbq")
from mbbq import detect_answers, get_samples
import random
# Seed both the stdlib and the NumPy global RNGs so re-runs are reproducible.
random.seed(42)
np.random.seed(42)

# Defaults and functions

In [None]:
# One entry per evaluated checkpoint, unpacked in the loading cells as
# (model_name, model, dataset, instruct):
#   model_name - prefix of the result file names on disk
#   model      - short model identifier stored in the result tables
#   dataset    - fine-tuning data used ("base" for the unmodified checkpoint)
#   instruct   - "instruct" or "base", stored in the "instruction" column
file_names = [
    ("aya-expanse-8b", "ayaexpanse", "base", "instruct"),
    ("aya-expanse-8b_lora_biasdpo_model", "ayaexpanse", "biasdpo", "instruct"),
    ("aya-expanse-8b_lora_biassft_model", "ayaexpanse", "biassft", "instruct"),
    ("aya-expanse-8b_lora_panda_model", "ayaexpanse", "panda", "instruct"),
    ("aya-expanse-8b_lora_jigsaw_model", "ayaexpanse", "jigsaw", "instruct"),
    ("aya-expanse-8b_lora_detoxdpo_model", "ayaexpanse", "detoxdpo", "instruct"),
    ("aya-expanse-8b_lora_detoxsft_model", "ayaexpanse", "detoxsft", "instruct"),
    ("aya-23-8B", "aya", "base", "instruct"),
    ("aya-23-8B_lora_biasdpo_model", "aya", "biasdpo", "instruct"),
    ("aya-23-8B_lora_panda_model", "aya", "panda", "instruct"),
    ("aya-23-8B_lora_jigsaw_model", "aya", "jigsaw", "instruct"),
    ("aya-23-8B_lora_detoxdpo_model", "aya", "detoxdpo", "instruct"),
    ("aya-23-8B_lora_biassft_model", "aya", "biassft", "instruct"),
    ("aya-23-8B_lora_detoxsft_model", "aya", "detoxsft", "instruct"),
    ("Mistral-7B-v0.3", "mistral0.3", "base", "base"),
    ("Mistral-7B-Instruct-v0.3", "mistral0.3instruct", "base", "instruct"),
    ("Meta-Llama-3-8B", "llama3", "base", "base"),
    ("Meta-Llama-3-8B-Instruct", "llama3instruct", "base", "instruct"),
    ("Meta-Llama-3.1-8B", "llama3.1", "base", "base"),
    ("Meta-Llama-3.1-8B-Instruct", "llama3.1instruct", "base", "instruct"),
    (
        "Meta-Llama-3.1-8B-Instruct_lora_biasdpo_model",
        "llama3.1instruct",
        "biasdpo",
        "instruct",
    ),
    (
        "Meta-Llama-3.1-8B-Instruct_lora_panda_model",
        "llama3.1instruct",
        "panda",
        "instruct",
    ),
    (
        "Meta-Llama-3.1-8B-Instruct_lora_jigsaw_model",
        "llama3.1instruct",
        "jigsaw",
        "instruct",
    ),
    (
        "Meta-Llama-3.1-8B-Instruct_lora_detoxdpo_model",
        "llama3.1instruct",
        "detoxdpo",
        "instruct",
    ),
    (
        "Meta-Llama-3.1-8B-Instruct_lora_biassft_model",
        "llama3.1instruct",
        "biassft",
        "instruct",
    ),
    (
        "Meta-Llama-3.1-8B-Instruct_lora_detoxsft_model",
        "llama3.1instruct",
        "detoxsft",
        "instruct",
    ),
    ("Meta-Llama-3.1-8B_lora_biasdpo_model", "llama3.1", "biasdpo", "base"),
    ("Meta-Llama-3.1-8B_lora_biassft_model", "llama3.1", "biassft", "base"),
    ("Meta-Llama-3.1-8B_lora_panda_model", "llama3.1", "panda", "base"),
    ("Meta-Llama-3.1-8B_lora_jigsaw_model", "llama3.1", "jigsaw", "base"),
    ("Meta-Llama-3.1-8B_lora_detoxdpo_model", "llama3.1", "detoxdpo", "base"),
    ("Meta-Llama-3.1-8B_lora_detoxsft_model", "llama3.1", "detoxsft", "base"),
    ("gemma-2-2b", "gemma2b", "base", "base"),
    ("gemma-2-2b-it", "gemma2binstruct", "base", "instruct"),
    ("gemma-2-9b", "gemma9b", "base", "base"),
    ("gemma-2-9b_lora_biasdpo_model", "gemma9b", "biasdpo", "base"),
    ("gemma-2-9b_lora_biassft_model", "gemma9b", "biassft", "base"),
    ("gemma-2-9b_lora_panda_model", "gemma9b", "panda", "base"),
    ("gemma-2-9b_lora_jigsaw_model", "gemma9b", "jigsaw", "base"),
    ("gemma-2-9b_lora_detoxdpo_model", "gemma9b", "detoxdpo", "base"),
    ("gemma-2-9b_lora_detoxsft_model", "gemma9b", "detoxsft", "base"),
    ("gemma-2-9b-it", "gemma9binstruct", "base", "instruct"),
    ("gemma-2-9b-it_lora_biasdpo_model", "gemma9binstruct", "biasdpo", "instruct"),
    ("gemma-2-9b-it_lora_biassft_model", "gemma9binstruct", "biassft", "instruct"),
    ("gemma-2-9b-it_lora_panda_model", "gemma9binstruct", "panda", "instruct"),
    ("gemma-2-9b-it_lora_jigsaw_model", "gemma9binstruct", "jigsaw", "instruct"),
    ("gemma-2-9b-it_lora_detoxdpo_model", "gemma9binstruct", "detoxdpo", "instruct"),
    ("gemma-2-9b-it_lora_detoxsft_model", "gemma9binstruct", "detoxsft", "instruct"),
]

In [None]:
def compute_biasd_score(context_cond, labels, target_loc, pred):
    """Score one answer for the disambiguated-context bias metric.

    Args:
        context_cond: context condition of the sample; "ambig" samples
            always score 0.
        labels: index of the correct answer.
        target_loc: index of the answer naming the stereotyped target;
            the value 100 is treated as "no usable target" and scores 0.
        pred: processed model answer, compared against 1 and -1.

    Returns:
        1 when the target is the correct answer and ``pred`` is 1,
        -1 when the target is not the correct answer and ``pred`` is -1,
        0 in every other case.
    """
    if context_cond == "ambig":
        return 0

    target_is_correct = target_loc == labels
    if target_is_correct and pred == 1:
        return 1
    # The sentinel check deliberately comes after the +1 case, as before.
    if target_loc == 100:
        return 0
    if not target_is_correct and pred == -1:
        return -1
    return 0

In [None]:
def get_understood_templates(
    dicts,
    languages,
    models,
    all_templates,
    disamb_only=False,
    amb_only=False,
    min_accuracy=0.33,
):
    """Find the templates each model / language answers above a threshold.

    A template ``t`` is identified by ``(t[0], t[1])`` = (subset, q_id). It
    counts as understood by a model in a frame when the share of rows whose
    ``answer_{model}_detected`` equals ``labels`` exceeds ``min_accuracy``.

    Args:
        dicts: list of DataFrames, one per language, in the same order as
            ``languages``. The frames are only read, never modified.
        languages: language labels, matched to ``dicts`` by position.
        models: model names; each frame needs ``answer_{model}_detected``.
        all_templates: iterable of hashable (subset, q_id) tuples.
        disamb_only: only use rows with context_condition "disambig".
        amb_only: only use rows with context_condition "ambig"
            (ignored when ``disamb_only`` is set).
        min_accuracy: strict lower bound on the accuracy. The default 0.33
            is presumably chance level for three answer options - confirm.

    Returns:
        ``(language_templates, model_templates)``.
        ``model_templates[m]`` holds the templates model ``m`` understood
        in every frame. ``language_templates[l]`` holds the templates every
        model understood in the frame of language ``l``.
    """
    if disamb_only:
        condition = "disambig"
    elif amb_only:
        condition = "ambig"
    else:
        condition = None

    language_templates = {l: set(all_templates) for l in languages}
    model_templates = {m: set(all_templates) for m in models}

    for i, frame in enumerate(dicts):
        if condition is not None:
            frame = frame[frame["context_condition"] == condition]

        # Correctness per row, computed once per model instead of once per
        # (model, template) pair.
        is_correct = {
            model: frame["labels"] == frame[f"answer_{model}_detected"]
            for model in models
        }
        understood = {model: set() for model in models}
        for template in all_templates:
            # A single combined mask avoids chained boolean indexing and the
            # "Boolean Series key will be reindexed" warning it triggers.
            in_template = (frame["subset"] == template[0]) & (
                frame["q_id"] == template[1]
            )
            for model in models:
                # An empty selection has a NaN mean, which fails the test.
                if is_correct[model][in_template].mean() > min_accuracy:
                    understood[model].add(template)

        for model in models:
            model_templates[model] &= understood[model]
            language_templates[languages[i]] &= understood[model]
    return language_templates, model_templates

In [None]:
# Subsets reported by the per-subset breakdown, in output order.
_MBBQ_SUBSETS = (
    "Age",
    "Disability_status",
    "Gender_identity",
    "Physical_appearance",
    "SES",
    "Sexual_orientation",
)

# Context condition that each supported score is computed on.
_SCORE_CONTEXTS = {
    "amb accuracy": "ambig",
    "disamb accuracy": "disambig",
    "amb bias": "ambig",
    "disamb bias": "disambig",
}


def _score_frame(frame, models, score, templates):
    """Return a filtered copy of ``frame`` with one score column per model."""
    # Rows whose target location is the sentinel 100 are never scored.
    frame = frame.loc[frame["target_loc"] != 100]
    if isinstance(templates, (set, frozenset)):
        wanted = {(t[0], t[1]) for t in templates}
        keep = np.array(
            [key in wanted for key in zip(frame["subset"], frame["q_id"])],
            dtype=bool,
        )
        frame = frame.loc[keep]
    # .copy() so the new columns land on an independent frame, not a view.
    frame = frame.loc[frame["context_condition"] == _SCORE_CONTEXTS[score]].copy()

    for model in models:
        column = f"{model}_template_score"
        if score in ("amb accuracy", "disamb accuracy"):
            frame[column] = frame[f"answer_{model}_detected"] == frame["labels"]
        elif score == "amb bias":
            frame[column] = frame[f"answer_{model}_processed"]
        else:  # "disamb bias"
            frame[column] = [
                compute_biasd_score(context, label, target, pred)
                for context, label, target, pred in zip(
                    frame["context_condition"],
                    frame["labels"],
                    frame["target_loc"],
                    frame[f"answer_{model}_processed"],
                )
            ]
    return frame


def _summarise_groups(
    final_scores, score_columns, split_by_subset, return_all_results, verbose
):
    """Reduce per-sample score Series (one per group) to means and t-tests."""
    if split_by_subset:
        subset_results = {name: {} for name in final_scores}
        for name, series in final_scores.items():
            table = series.reset_index(drop=False)
            column = score_columns[name]
            subset_data = [
                table[table["subset"] == subset][column].to_list()
                for subset in _MBBQ_SUBSETS
            ]
            if verbose:
                for subset, values in zip(_MBBQ_SUBSETS, subset_data):
                    if not values:
                        print(subset, "nan")
                    else:
                        print(subset, len(values), np.nanmean(values))
                non_empty = [values for values in subset_data if values]
                # Kruskal-Wallis needs at least two groups.
                if len(non_empty) >= 2:
                    print(scipy.stats.kruskal(*non_empty))
            subset_results[name] = [
                (
                    (
                        np.nanmean(values),
                        scipy.stats.ttest_1samp(values, 0).pvalue < 0.05,
                    )
                    if values
                    else (np.nan, False)
                )
                for values in subset_data
            ]
        return subset_results

    results = {}
    for name, series in final_scores.items():
        values = series.tolist()
        mean = np.nanmean(values)
        t_test = scipy.stats.ttest_1samp(values, 0)
        if verbose:
            print(name, len(values), mean)
            print(t_test)
        results[name] = (mean, t_test.pvalue < 0.05)
    if verbose:
        print(scipy.stats.kruskal(*list(final_scores.values())))
    if return_all_results:
        return final_scores
    return results


def get_kobbq_ind_template_friedman(
    dicts,
    languages,
    models,
    score,
    return_all_results=False,
    split_by_subset=False,
    templates=None,
    verbose=True,
):
    """Compute per-sample scores and compare them across languages or models.

    Despite the name, the omnibus test is Kruskal-Wallis
    (``scipy.stats.kruskal``); each group mean is also tested against 0
    with a one-sample t-test.

    Args:
        dicts: list of DataFrames, one per language, in the order of
            ``languages``. Required columns: target_loc, subset, q_id,
            prompt_id, context_condition, labels, plus
            ``answer_{model}_detected`` (accuracy scores) or
            ``answer_{model}_processed`` (bias scores). Inputs are not
            modified.
        languages: language labels. With more than one language the
            comparison is across languages, using ``models[0]`` only.
        models: model names. With at most one language and more than one
            model the comparison is across models, on the first frame.
        score: one of "amb accuracy", "disamb accuracy", "amb bias",
            "disamb bias".
        return_all_results: return the per-sample score Series per group
            instead of the summary (ignored when ``split_by_subset``).
        split_by_subset: return a per-subset breakdown instead.
        templates: optional set of (subset, q_id) tuples; other templates
            are dropped. Values that are not a set are ignored.
        verbose: print group sizes, means and test results.

    Returns:
        * default: ``{group: (mean, p < 0.05)}``
        * ``return_all_results``: ``{group: Series}`` indexed by
          (subset, q_id, prompt_id)
        * ``split_by_subset``: ``{group: [(mean, p < 0.05), ...]}`` with one
          entry per subset in the order Age, Disability_status,
          Gender_identity, Physical_appearance, SES, Sexual_orientation;
          subsets without data give ``(nan, False)``
        * ``None`` when there is at most one language and at most one model.

    Raises:
        ValueError: if ``score`` is not one of the supported values, or a
            model comparison is requested without any language.
    """
    if score not in _SCORE_CONTEXTS:
        raise ValueError(
            f"Unknown score {score!r}; expected one of {sorted(_SCORE_CONTEXTS)}"
        )

    scored = [_score_frame(frame, models, score, templates) for frame in dicts]
    group_keys = ["subset", "q_id", "prompt_id"]

    if len(languages) > 1:
        column = f"{models[0]}_template_score"
        final_scores = {
            language: scored[i].groupby(group_keys)[column].mean()
            for i, language in enumerate(languages)
        }
        return _summarise_groups(
            final_scores,
            {language: column for language in final_scores},
            split_by_subset,
            return_all_results,
            verbose,
        )

    if len(models) > 1:
        if not languages:
            raise ValueError("A model comparison needs exactly one language")
        frame = scored[0]
        # Each model reads its own score column. The previous code reused a
        # stale loop variable here and failed for all but the last model.
        score_columns = {model: f"{model}_template_score" for model in models}
        final_scores = {
            model: frame.groupby(group_keys)[score_columns[model]].mean()
            for model in models
        }
        return _summarise_groups(
            final_scores,
            score_columns,
            split_by_subset,
            return_all_results,
            verbose,
        )

    return None

In [None]:
def print_corr_with_p(df, method="pearson", add_pval=False):
    """Print the correlation matrix of ``df`` annotated with significance.

    Args:
        df: DataFrame whose columns are correlated pairwise.
        method: correlation method passed to ``DataFrame.corr``. P-values
            come from the matching SciPy test for "pearson" and "kendall";
            any other value uses ``spearmanr``.
        add_pval: if True append " p: <value>" (rounded to two decimals) to
            each coefficient; otherwise append one "*" for each of the
            thresholds 0.05, 0.01 and 0.001 that the p-value does not exceed.

    Returns:
        None. The annotated matrix is printed in full.
    """
    rho = df.corr(method=method)
    if method == "pearson":
        corr_method = pearsonr
    elif method == "kendall":
        from scipy.stats import kendalltau

        corr_method = kendalltau
    else:
        corr_method = spearmanr
    # The p-values must come from the same test as the coefficients; the
    # previous code always used Pearson here. DataFrame.corr puts 1.0 on the
    # diagonal for callables, so subtracting the identity sets it to 0.
    pval = df.corr(method=lambda x, y: corr_method(x, y)[1]) - np.eye(*rho.shape)

    def _elementwise(frame, func):
        # DataFrame.map is not available in older pandas; fall back to applymap.
        mapper = frame.map if hasattr(frame, "map") else frame.applymap
        return mapper(func)

    if add_pval:
        p = _elementwise(pval.round(2).astype(str), lambda x: " p: " + x)
    else:
        p = _elementwise(
            pval, lambda x: "".join(["*" for t in [0.05, 0.01, 0.001] if x <= t])
        )
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print(rho.round(2).astype(str) + p)

# StereoSet

In [None]:
# Language codes used in the StereoSet result file names -> display names.
# NOTE(review): "kr" is not the ISO 639-1 code for Korean ("ko"); presumably
# it matches the names of the result files - confirm.
names = {
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "tr": "Turkish",
    "en": "English",
    "kr": "Korean",
}


def dict_func():
    """Return a new, empty dictionary on every call."""
    return dict()

In [None]:
# Collect the intrasentence StereoSet scores of every non-detox checkpoint into
# one long-format table: one row per (checkpoint, language, bias type).
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "language",
        "ss_score",
        "lm_score",
        "bias_type",
    )
}

detox_datasets = ["jigsaw", "detoxdpo", "detoxsft"]
stereoset_bias_types = ["gender", "profession", "race", "religion", "overall"]

for model_name, model, dataset, instruct in file_names:
    # Checkpoints fine-tuned on detoxification data are left out of this table.
    if dataset in detox_datasets:
        continue
    for language in names:
        with open(f"stereoset/{model_name}_{language}_ssresults.pkl", "rb") as infile:
            score = pickle.load(infile)
        for bias_type in stereoset_bias_types:
            type_scores = score["intrasentence"][bias_type]
            row = {
                "model": model,
                "dataset": dataset,
                "instruction": instruct,
                "language": names[language],
                "ss_score": type_scores["SS Score"],
                "lm_score": type_scores["LM Score"],
                "bias_type": bias_type,
            }
            for column, value in row.items():
                data[column].append(value)

df_stereoset = pd.DataFrame(data)

In [None]:
# Display the assembled StereoSet table.
df_stereoset

## SS Score

In [None]:
# English overall SS score per (model, dataset). Each group is expected to hold
# a single row, so sum() just unwraps it. One combined mask replaces the chained
# boolean indexing, which triggered pandas' "Boolean Series key will be
# reindexed" warning.
df_stereoset.loc[
    (df_stereoset["bias_type"] == "overall") & (df_stereoset["language"] == "English")
].groupby(["model", "dataset"])["ss_score"].sum().round(2)

In [None]:
# Mean overall SS score across the non-English languages per (model, dataset).
# One combined mask replaces the chained boolean indexing, which triggered
# pandas' "Boolean Series key will be reindexed" warning.
df_stereoset.loc[
    (df_stereoset["bias_type"] == "overall") & (df_stereoset["language"] != "English")
].groupby(["model", "dataset"])["ss_score"].mean().round(2)

In [None]:
# Population standard deviation (ddof=0) of the overall SS score across the
# non-English languages per (model, dataset). One combined mask replaces the
# chained boolean indexing and its reindexing warning.
df_stereoset.loc[
    (df_stereoset["bias_type"] == "overall") & (df_stereoset["language"] != "English")
].groupby(["model", "dataset"])["ss_score"].std(ddof=0).round(2)

# CrowS-Pairs

In [None]:
# Language codes used in the CrowS-Pairs result file names -> display names.
# This rebinds `names`, replacing the StereoSet mapping defined earlier.
names = {
    "ar": "Arabic",
    "ca": "Catalan",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
    "en": "English",
    "mt": "Maltese",
    "zh": "Chinese",
}

In [None]:
# Collect the CrowS-Pairs scores of every non-detox checkpoint into one
# long-format table: one row per (checkpoint, language, bias type).
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "language",
        "ss_score",
        "n_samples",
        "bias_type",
    )
}

detox_datasets = ["jigsaw", "detoxdpo", "detoxsft"]
crowspairs_bias_types = [
    "race-color",
    "socioeconomic",
    "gender",
    "disability",
    "nationality",
    "sexual-orientation",
    "physical-appearance",
    "religion",
    "age",
]

for model_name, model, dataset, instruct in file_names:
    # Checkpoints fine-tuned on detoxification data are left out of this table.
    if dataset in detox_datasets:
        continue
    for language in names:
        with open(f"crowspairs/{model_name}_{language}_cspresults.pkl", "rb") as infile:
            score = pickle.load(infile)
        for bias_type in crowspairs_bias_types:
            # Each entry is indexed as [0] = score, [1] = number of samples.
            type_result = score[bias_type]
            row = {
                "model": model,
                "dataset": dataset,
                "instruction": instruct,
                "language": names[language],
                "ss_score": type_result[0],
                "n_samples": type_result[1],
                "bias_type": bias_type,
            }
            for column, value in row.items():
                data[column].append(value)

df_crowspairs = pd.DataFrame(data)

In [None]:
# Display the per-bias-type CrowS-Pairs table.
df_crowspairs

In [None]:
# Total number of CrowS-Pairs samples per language, summed over bias types and
# taken from the first (model, dataset, instruction) group that contains the
# language. The previous version recomputed the groupby for every language and
# paired languages with counts by position. "overall" rows are excluded so that
# re-running this cell after they have been appended gives the same counts.
n_samples = (
    df_crowspairs[df_crowspairs["bias_type"] != "overall"]
    .groupby(["model", "dataset", "instruction", "language"])["n_samples"]
    .sum()
    .groupby(level="language")
    .first()
    .to_dict()
)

In [None]:
# Append one "overall" row per (model, dataset, instruction, language): the
# average of the per-bias-type scores weighted by their sample counts.
# Existing "overall" rows are dropped first so re-running the cell neither
# duplicates them nor feeds them back into the weighted average.
crowspairs_per_type = df_crowspairs[df_crowspairs["bias_type"] != "overall"]

overall_scores = (
    crowspairs_per_type.groupby(["model", "dataset", "instruction", "language"])[
        ["ss_score", "n_samples"]
    ]
    .apply(lambda group: np.average(group["ss_score"], weights=group["n_samples"]))
    .reset_index(name="ss_score")
)

new_rows = {
    "model": overall_scores["model"].tolist(),
    "dataset": overall_scores["dataset"].tolist(),
    "instruction": overall_scores["instruction"].tolist(),
    "language": overall_scores["language"].tolist(),
    "n_samples": [n_samples[language] for language in overall_scores["language"]],
    "ss_score": overall_scores["ss_score"].tolist(),
    "bias_type": ["overall"] * len(overall_scores),
}

df_crowspairs = pd.concat(
    [crowspairs_per_type, pd.DataFrame(data=new_rows)], ignore_index=True
)

In [None]:
# Display the CrowS-Pairs table including the appended "overall" rows.
df_crowspairs

## SS Score

In [None]:
# English overall score per (model, dataset). Each group is expected to hold a
# single row, so sum() just unwraps it. One combined mask replaces the chained
# boolean indexing and its "Boolean Series key will be reindexed" warning.
df_crowspairs.loc[
    (df_crowspairs["bias_type"] == "overall")
    & (df_crowspairs["language"] == "English")
].groupby(["model", "dataset"])["ss_score"].sum().round(2)

In [None]:
# Mean overall score across the non-English languages per (model, dataset).
# One combined mask replaces the chained boolean indexing and its warning.
df_crowspairs.loc[
    (df_crowspairs["bias_type"] == "overall")
    & (df_crowspairs["language"] != "English")
].groupby(["model", "dataset"])["ss_score"].mean().round(2)

In [None]:
# Population standard deviation (ddof=0) of the overall score across the
# non-English languages per (model, dataset). One combined mask replaces the
# chained boolean indexing and its warning.
df_crowspairs.loc[
    (df_crowspairs["bias_type"] == "overall")
    & (df_crowspairs["language"] != "English")
].groupby(["model", "dataset"])["ss_score"].std(ddof=0).round(2)

# MBBQ

In [None]:
# Model identifiers as they appear in the MBBQ result file names
# ("mbbq/trial{model}_samples_{lang}.pkl"); fine-tuned variants carry the
# fine-tuning dataset as a suffix. The loading cells below hard-code the first
# entry ("aya-23-8B") as the base frame and iterate over models[1:], so it must
# stay first.
models = [
    "aya-23-8B",
    "aya-23-8Bpanda",
    "aya-23-8Bbiasdpo",
    "aya-23-8Bbiassft",
    "aya-expanse-8b",
    "aya-expanse-8bpanda",
    "aya-expanse-8bbiasdpo",
    "aya-expanse-8bbiassft",
    "Meta-Llama-3-8B-Instruct",
    "Meta-Llama-3.1-8B-Instruct",
    "Meta-Llama-3.1-8B-Instructpanda",
    "Meta-Llama-3.1-8B-Instructbiasdpo",
    "Meta-Llama-3.1-8B-Instructbiassft",
    "Mistral-7B-Instruct-v0.3",
    "gemma-2-9b-it",
    "gemma-2-9b-itpanda",
    "gemma-2-9b-itbiasdpo",
    "gemma-2-9b-itbiassft",
    "gemma-2-2b-it",
]

In [None]:
# Merge every model's English MBBQ answers into one frame for the test samples
# and one for the control samples. The first model's frame is the base; each
# model contributes answer_{model}, answer_{model}_detected and
# answer_{model}_processed. One loop replaces the two copy-pasted halves and the
# hard-coded base model name.
# NOTE: pickle.load can execute arbitrary code; only load result files you
# produced yourself.
english_frames = {}
for kind in ("samples", "control_samples"):
    merged = None
    for model in models:
        with open(f"mbbq/trial{model}_{kind}_en.pkl", "rb") as infile:
            samples = pickle.load(infile)
        if merged is None:
            merged = samples.rename(
                columns={
                    "answer": f"answer_{model}",
                    "answer_detected": f"answer_{model}_detected",
                    "answer_processed": f"answer_{model}_processed",
                }
            )
        else:
            merged[f"answer_{model}"] = samples["answer"]
            merged[f"answer_{model}_detected"] = samples["answer_detected"]
            merged[f"answer_{model}_processed"] = samples["answer_processed"]
    english_frames[kind] = merged

english_samples = english_frames["samples"]
english_control_samples = english_frames["control_samples"]

In [None]:
# Merge every model's Spanish MBBQ answers into one frame for the test samples
# and one for the control samples. The first model's frame is the base; each
# model contributes answer_{model}, answer_{model}_detected and
# answer_{model}_processed. One loop replaces the two copy-pasted halves and the
# hard-coded base model name.
# NOTE: pickle.load can execute arbitrary code; only load result files you
# produced yourself.
spanish_frames = {}
for kind in ("samples", "control_samples"):
    merged = None
    for model in models:
        with open(f"mbbq/trial{model}_{kind}_es.pkl", "rb") as infile:
            samples = pickle.load(infile)
        if merged is None:
            merged = samples.rename(
                columns={
                    "answer": f"answer_{model}",
                    "answer_detected": f"answer_{model}_detected",
                    "answer_processed": f"answer_{model}_processed",
                }
            )
        else:
            merged[f"answer_{model}"] = samples["answer"]
            merged[f"answer_{model}_detected"] = samples["answer_detected"]
            merged[f"answer_{model}_processed"] = samples["answer_processed"]
    spanish_frames[kind] = merged

spanish_samples = spanish_frames["samples"]
spanish_control_samples = spanish_frames["control_samples"]

In [None]:
# Merge every model's Dutch MBBQ answers into one frame for the test samples
# and one for the control samples. The first model's frame is the base; each
# model contributes answer_{model}, answer_{model}_detected and
# answer_{model}_processed. One loop replaces the two copy-pasted halves and the
# hard-coded base model name.
# NOTE: pickle.load can execute arbitrary code; only load result files you
# produced yourself.
dutch_frames = {}
for kind in ("samples", "control_samples"):
    merged = None
    for model in models:
        with open(f"mbbq/trial{model}_{kind}_nl.pkl", "rb") as infile:
            samples = pickle.load(infile)
        if merged is None:
            merged = samples.rename(
                columns={
                    "answer": f"answer_{model}",
                    "answer_detected": f"answer_{model}_detected",
                    "answer_processed": f"answer_{model}_processed",
                }
            )
        else:
            merged[f"answer_{model}"] = samples["answer"]
            merged[f"answer_{model}_detected"] = samples["answer_detected"]
            merged[f"answer_{model}_processed"] = samples["answer_processed"]
    dutch_frames[kind] = merged

dutch_samples = dutch_frames["samples"]
dutch_control_samples = dutch_frames["control_samples"]

In [None]:
# Merge every model's Turkish MBBQ answers into one frame for the test samples
# and one for the control samples. The first model's frame is the base; each
# model contributes answer_{model}, answer_{model}_detected and
# answer_{model}_processed. One loop replaces the two copy-pasted halves and the
# hard-coded base model name.
# NOTE: pickle.load can execute arbitrary code; only load result files you
# produced yourself.
turkish_frames = {}
for kind in ("samples", "control_samples"):
    merged = None
    for model in models:
        with open(f"mbbq/trial{model}_{kind}_tr.pkl", "rb") as infile:
            samples = pickle.load(infile)
        if merged is None:
            merged = samples.rename(
                columns={
                    "answer": f"answer_{model}",
                    "answer_detected": f"answer_{model}_detected",
                    "answer_processed": f"answer_{model}_processed",
                }
            )
        else:
            merged[f"answer_{model}"] = samples["answer"]
            merged[f"answer_{model}_detected"] = samples["answer_detected"]
            merged[f"answer_{model}_processed"] = samples["answer_processed"]
    turkish_frames[kind] = merged

turkish_samples = turkish_frames["samples"]
turkish_control_samples = turkish_frames["control_samples"]

## Selected templates across languages

In [None]:
# Every (subset, q_id) template present in the English control samples.
all_templates = list(
    set(zip(english_control_samples["subset"], english_control_samples["q_id"]))
)

languages = ["English", "Dutch", "Spanish", "Turkish"]

# The control frames must be listed in the same order as `languages`, because
# get_understood_templates pairs them by position. They were previously passed
# as Dutch, English, ..., which swapped the English and Dutch entries of
# `language_templates`.
language_templates, model_templates = get_understood_templates(
    [
        english_control_samples,
        dutch_control_samples,
        spanish_control_samples,
        turkish_control_samples,
    ],
    languages,
    models,
    all_templates,
    disamb_only=True,
)

# Per-model bias scores per language, restricted to the templates that the
# model answered well enough on the control samples in every language.
selec_temp_results = {}
for score in ["amb bias", "disamb bias"]:
    selec_temp_results[score] = {}
    print(score)
    for model in models:
        print(model, len(model_templates[model]))
        results = get_kobbq_ind_template_friedman(
            [english_samples, dutch_samples, spanish_samples, turkish_samples],
            languages,
            [model],
            score=score,
            templates=model_templates[model],
            verbose=False,
        )
        selec_temp_results[score][model] = results

In [None]:
# Same analysis as the overall scores, broken down by MBBQ subset.
template_keys = zip(
    english_control_samples["subset"], english_control_samples["q_id"]
)
all_templates = list(set(template_keys))

languages = ["English", "Dutch", "Spanish", "Turkish"]
sample_frames = [english_samples, dutch_samples, spanish_samples, turkish_samples]

subset_results = {"amb bias": {}, "disamb bias": {}}
for score, per_model in subset_results.items():
    print(score)
    for model in models:
        # Number of templates this model is evaluated on.
        print(model, len(model_templates[model]))
        results = get_kobbq_ind_template_friedman(
            sample_frames,
            languages,
            [model],
            score=score,
            templates=model_templates[model],
            split_by_subset=True,
            verbose=False,
        )
        per_model[model] = results

In [None]:
# Flatten the MBBQ results into one long-format table: one row per
# (model entry, language, context, bias type), where bias type is "overall" or
# one of the six subsets.
data = {
    "model": [],
    "dataset": [],
    "instruction": [],
    "language": [],
    "bias_type": [],
    "mbbq_context": [],
    "bias_score": [],
}

model_map = {
    "aya-23-8B": "aya",
    "aya-expanse-8b": "ayaexpanse",
    "Meta-Llama-3-8B-Instruct": "llama3instruct",
    "Meta-Llama-3.1-8B-Instruct": "llama3.1instruct",
    "Mistral-7B-Instruct-v0.3": "mistral0.3instruct",
    "gemma-2-2b-it": "gemma2binstruct",
    "gemma-2-9b-it": "gemma9binstruct",
}

# Fine-tuning datasets that can be appended to a model name. "biassft" and
# "detoxsft" were missing before, so entries such as "aya-23-8Bbiassft" fell
# through to model_map[...] and raised KeyError.
finetune_suffixes = ("panda", "biasdpo", "biassft", "jigsaw", "detoxdpo", "detoxsft")

# Display names of the subsets, in the order get_kobbq_ind_template_friedman
# returns them with split_by_subset=True.
subset_bias_types = [
    "age",
    "disability",
    "gender",
    "physical-appearance",
    "socioeconomic",
    "sexual-orientation",
]

for entry in selec_temp_results["amb bias"]:
    base_name, dataset = entry, "base"
    for suffix in finetune_suffixes:
        if entry.endswith(suffix):
            base_name, dataset = entry[: -len(suffix)], suffix
            break
    model = model_map[base_name]
    instruct = "instruct"  # every model evaluated on MBBQ is instruction-tuned

    for language in selec_temp_results["amb bias"][entry]:
        for context in ("amb", "disamb"):
            score_key = f"{context} bias"
            # Each result is a (mean, significant) pair; only the mean is kept.
            scored_types = [
                ("overall", selec_temp_results[score_key][entry][language][0])
            ]
            scored_types += [
                (bias_type, subset_results[score_key][entry][language][i][0])
                for i, bias_type in enumerate(subset_bias_types)
            ]
            for bias_type, bias_score in scored_types:
                data["model"].append(model)
                data["dataset"].append(dataset)
                data["instruction"].append(instruct)
                data["language"].append(language)
                data["bias_type"].append(bias_type)
                data["mbbq_context"].append(context)
                data["bias_score"].append(bias_score)

df_mbbq = pd.DataFrame(data=data)

In [None]:
# Display the assembled MBBQ table.
df_mbbq

In [None]:
# English overall bias score in ambiguous contexts per (dataset, model). Each
# group is expected to hold a single row, so sum() just unwraps it. One combined
# mask replaces the chained boolean indexing and its reindexing warning.
df_mbbq.loc[
    (df_mbbq["bias_type"] == "overall")
    & (df_mbbq["mbbq_context"] == "amb")
    & (df_mbbq["language"] == "English")
].groupby(["dataset", "model"])["bias_score"].sum().round(3)

In [None]:
# English, disambiguated-context, overall MBBQ bias score per (dataset, model).
# A single combined mask replaces chained `df[m1][m2][m3]` indexing, which
# triggers pandas' "Boolean Series key will be reindexed" warning.
mbbq_mask = (
    (df_mbbq["bias_type"] == "overall")
    & (df_mbbq["mbbq_context"] == "disamb")
    & (df_mbbq["language"] == "English")
)
# NOTE(review): presumably one row per group, so `sum` just returns the score.
df_mbbq[mbbq_mask].groupby(["dataset", "model"])["bias_score"].sum().round(3)

In [None]:
# Mean overall MBBQ bias score across the non-English languages, ambiguous
# contexts, per (dataset, model).
# A single combined mask replaces chained `df[m1][m2][m3]` indexing, which
# triggers pandas' "Boolean Series key will be reindexed" warning.
mbbq_mask = (
    (df_mbbq["bias_type"] == "overall")
    & (df_mbbq["mbbq_context"] == "amb")
    & (df_mbbq["language"] != "English")
)
df_mbbq[mbbq_mask].groupby(["dataset", "model"])["bias_score"].mean().round(3)

In [None]:
# Population standard deviation (ddof=0) of the overall MBBQ bias score across
# the non-English languages, ambiguous contexts, per (dataset, model).
# A single combined mask replaces chained `df[m1][m2][m3]` indexing, which
# triggers pandas' "Boolean Series key will be reindexed" warning.
mbbq_mask = (
    (df_mbbq["bias_type"] == "overall")
    & (df_mbbq["mbbq_context"] == "amb")
    & (df_mbbq["language"] != "English")
)
df_mbbq[mbbq_mask].groupby(["dataset", "model"])["bias_score"].std(ddof=0).round(3)

In [None]:
# Mean overall MBBQ bias score across the non-English languages, disambiguated
# contexts, per (dataset, model).
# A single combined mask replaces chained `df[m1][m2][m3]` indexing, which
# triggers pandas' "Boolean Series key will be reindexed" warning.
mbbq_mask = (
    (df_mbbq["bias_type"] == "overall")
    & (df_mbbq["mbbq_context"] == "disamb")
    & (df_mbbq["language"] != "English")
)
df_mbbq[mbbq_mask].groupby(["dataset", "model"])["bias_score"].mean().round(3)

In [None]:
# Population standard deviation (ddof=0) of the overall MBBQ bias score across
# the non-English languages, disambiguated contexts, per (dataset, model).
# A single combined mask replaces chained `df[m1][m2][m3]` indexing, which
# triggers pandas' "Boolean Series key will be reindexed" warning.
mbbq_mask = (
    (df_mbbq["bias_type"] == "overall")
    & (df_mbbq["mbbq_context"] == "disamb")
    & (df_mbbq["language"] != "English")
)
df_mbbq[mbbq_mask].groupby(["dataset", "model"])["bias_score"].std(ddof=0).round(3)

# Toxicity

In [None]:
# Two-letter language code -> English language name for the toxicity section.
# Insertion order matters: the loading loop below iterates this mapping.
names = dict(
    ar="Arabic",
    cs="Czech",
    de="German",
    es="Spanish",
    fr="French",
    ko="Korean",
    it="Italian",
    en="English",
    zh="Chinese",
    hi="Hindi",
    id="Indonesian",
    ja="Japanese",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ru="Russian",
    sv="Swedish",
)

In [None]:
# Build one row per (model, language) holding `emt`: the mean over rows of the
# per-row maximum toxicity score (presumably "expected maximum toxicity").
data = {
    column: [] for column in ("model", "dataset", "instruction", "language", "emt")
}

for model_name, model, dataset, instruct in file_names:
    # Runs fine-tuned on the bias datasets are left out of the toxicity table.
    if dataset not in ["biasdpo", "panda", "biassft"]:
        for language in names:
            with open(
                f"toxicity/{model_name}_{language}_rtp-lx_lang.pkl", "rb"
            ) as lang_file:
                right_language = pickle.load(lang_file)
            with open(
                f"toxicity/{model_name}_{language}_rtp-lx_tox_scores.pkl", "rb"
            ) as score_file:
                toxicity_scores = pickle.load(score_file)

            # Only entries flagged 1 in `right_language` take part in the max;
            # rows without any such entry fall back to `initial=0`.
            # NOTE(review): assumes `right_language` is a numpy array that
            # broadcasts against `toxicity_scores` — TODO confirm.
            counts_for_max = right_language == 1
            row_maxima = np.max(
                toxicity_scores, axis=1, where=counts_for_max, initial=0
            )

            data["emt"].append(np.mean(row_maxima))
            data["language"].append(names[language])
            data["instruction"].append(instruct)
            data["dataset"].append(dataset)
            data["model"].append(model)

df_toxicity = pd.DataFrame(data)

In [None]:
# Display the toxicity frame built in the previous cell for a visual check.
df_toxicity

In [None]:
# English `emt` per (model, dataset), rounded to three decimals.
english_toxicity = df_toxicity[df_toxicity["language"] == "English"]
english_emt = english_toxicity.groupby(["model", "dataset"])["emt"]
english_emt.sum().round(3)

In [None]:
# Mean `emt` over the non-English languages per (model, dataset).
non_english_toxicity = df_toxicity[df_toxicity["language"] != "English"]
non_english_emt = non_english_toxicity.groupby(["model", "dataset"])["emt"]
non_english_emt.mean().round(3)

In [None]:
# Population standard deviation (ddof=0) of `emt` over the non-English
# languages per (model, dataset).
non_english_toxicity = df_toxicity[df_toxicity["language"] != "English"]
non_english_emt = non_english_toxicity.groupby(["model", "dataset"])["emt"]
non_english_emt.std(ddof=0).round(3)

# Perplexity

In [None]:
# Two-letter language code -> English language name for the perplexity section.
names = dict(
    ar="Arabic",
    ca="Catalan",
    cs="Czech",
    de="German",
    es="Spanish",
    fr="French",
    ko="Korean",
    it="Italian",
    en="English",
    mt="Maltese",
    tr="Turkish",
    zh="Chinese",
    hi="Hindi",
    id="Indonesian",
    ja="Japanese",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ru="Russian",
    sv="Swedish",
)

In [None]:
# Build one row per (model, language) with a perplexity value:
#   * base models:       median perplexity over the sentences that are not
#                        marked "wrong_lang";
#   * fine-tuned models: median(fine-tuned) - median(base), computed only on
#                        sentences that are valid for BOTH models.
# Languages with fewer valid sentences than `num_sent` are skipped entirely.
data = {
    "model": [],
    "dataset": [],
    "instruction": [],
    "language": [],
    "perplexity": [],
}

for model_name, model, dataset, instruct in file_names:
    with open(f"perplexity/{model_name}_results.pkl", "rb") as infile:
        score = pickle.load(infile)
    # The base-model results are needed for every language of a fine-tuned
    # model, so they are read lazily and at most once per model instead of
    # being unpickled again inside the language loop.
    base_score = None
    for language in score:
        # Minimum number of usable sentences; Maltese ("mt") uses a lower bar.
        num_sent = 65 if language == "mt" else 100
        selected = score[language]["selected"]
        valid_ppl = [ppl for ppl in selected if ppl != "wrong_lang"]
        if len(valid_ppl) < num_sent:
            continue
        if dataset == "base":
            data["perplexity"].append(np.nanmedian(valid_ppl))
        else:
            if base_score is None:
                base_name = model_name.replace(f'_lora_{dataset}_model', '')
                with open(f"perplexity/{base_name}_results.pkl", "rb") as infile:
                    base_score = pickle.load(infile)
            base_selected = base_score[language]["selected"]
            row_ids = {i for i, v in enumerate(selected) if v != "wrong_lang"}
            base_ids = {i for i, v in enumerate(base_selected) if v != "wrong_lang"}
            # Compare both models on exactly the same sentences.
            valid_ids = row_ids.intersection(base_ids)
            if len(valid_ids) < num_sent:
                continue
            data["perplexity"].append(
                np.nanmedian([selected[i] for i in valid_ids])
                - np.nanmedian([base_selected[i] for i in valid_ids])
            )
        data["model"].append(model)
        data["dataset"].append(dataset)
        data["instruction"].append(instruct)
        # NOTE(review): raises KeyError if `score` holds a language code that
        # is missing from `names` — confirm the two always agree.
        data["language"].append(names[language])

df_perplexity = pd.DataFrame(data)

In [None]:
# Display the perplexity frame built in the previous cell for a visual check.
df_perplexity

In [None]:
# Mean perplexity value over the non-English languages per (model, dataset),
# printed in full with one decimal.
non_english_ppl = df_perplexity[df_perplexity["language"] != "English"]
mean_ppl = non_english_ppl.groupby(["model", "dataset"])["perplexity"].mean()
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.float_format", "{:.1f}".format,
):
    print(mean_ppl)

In [None]:
# Population standard deviation (ddof=0) of the perplexity value over the
# non-English languages per (model, dataset), printed in full with one decimal.
non_english_ppl = df_perplexity[df_perplexity["language"] != "English"]
std_ppl = non_english_ppl.groupby(["model", "dataset"])["perplexity"].std(ddof=0)
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.float_format", "{:.1f}".format,
):
    print(std_ppl)

In [None]:
# English perplexity value per (model, dataset), printed in full without
# decimals.
english_ppl = df_perplexity[df_perplexity["language"] == "English"]
summed_ppl = english_ppl.groupby(["model", "dataset"])["perplexity"].sum()
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.float_format", "{:.0f}".format,
):
    print(summed_ppl)

# Global-MMLU

In [None]:
# Two-letter language code -> English language name for the Global-MMLU section.
names = dict(
    ar="Arabic",
    cs="Czech",
    de="German",
    es="Spanish",
    fr="French",
    ko="Korean",
    it="Italian",
    en="English",
    tr="Turkish",
    zh="Chinese",
    hi="Hindi",
    id="Indonesian",
    ja="Japanese",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ru="Russian",
    sv="Swedish",
)

In [None]:
# Build one row per (instruction-tuned model, language) with the Global-MMLU
# accuracy in percent, read from the first result file found under
# `global_mmlu/<run>_<language>/`.

# `os` and `json` are not part of the notebook's import cell shown at the top,
# so they are imported here to keep this cell runnable on a fresh kernel.
import json
import os

data = {
    "model": [],
    "dataset": [],
    "instruction": [],
    "language": [],
    "acc": [],
}

for model_name, model, dataset, instruct in file_names:
    # Only instruction-tuned models are included in this table.
    if instruct != "instruct":
        continue
    # Computed outside the f-string: reusing the same quote character inside
    # an f-string expression is a syntax error before Python 3.12.
    run_name = model_name.split("_model")[0]
    for language in names:
        result_dir = f"global_mmlu/{run_name}_{language}"
        result_path = None
        for dirpath, _dirnames, filenames in os.walk(result_dir):
            if filenames:
                # NOTE(review): `os.walk` gives no ordering guarantee, so with
                # several files in a directory the pick is arbitrary.
                result_path = os.path.join(dirpath, filenames[0])
                break
        if result_path is None:
            # Fail loudly instead of silently reusing the path left over from
            # the previous language (or hitting a NameError on the first one).
            raise FileNotFoundError(
                f"no Global-MMLU result file found under {result_dir!r}"
            )
        with open(result_path) as infile:
            result = json.load(infile)

        data["model"].append(model)
        data["dataset"].append(dataset)
        data["instruction"].append(instruct)
        data["language"].append(names[language])
        data["acc"].append(
            result["results"][f"global_mmlu_full_{language}"]["acc,none"] * 100
        )

df_global_mmlu = pd.DataFrame(data)

In [None]:
# English Global-MMLU accuracy per (model, dataset), printed in full.
english_mmlu = df_global_mmlu[df_global_mmlu["language"] == "English"]
english_acc = english_mmlu.groupby(["model", "dataset"])["acc"].sum().round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(english_acc)

In [None]:
# Mean Global-MMLU accuracy over the non-English languages per
# (model, dataset), printed in full.
non_english_mmlu = df_global_mmlu[df_global_mmlu["language"] != "English"]
mean_acc = non_english_mmlu.groupby(["model", "dataset"])["acc"].mean().round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(mean_acc)

In [None]:
# Population standard deviation (ddof=0) of Global-MMLU accuracy over the
# non-English languages per (model, dataset), printed in full.
non_english_mmlu = df_global_mmlu[df_global_mmlu["language"] != "English"]
std_acc = non_english_mmlu.groupby(["model", "dataset"])["acc"].std(ddof=0).round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(std_acc)

# Language consistency

In [None]:
# Two-letter language code -> English language name for the language
# consistency section.
names = dict(
    ar="Arabic",
    ca="Catalan",
    cs="Czech",
    de="German",
    es="Spanish",
    fr="French",
    ko="Korean",
    it="Italian",
    en="English",
    mt="Maltese",
    tr="Turkish",
    zh="Chinese",
    hi="Hindi",
    id="Indonesian",
    ja="Japanese",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ru="Russian",
    sv="Swedish",
)

In [None]:
# Build one row per (model, language) with the "tatoeba" language-confusion
# scores in percent; `lc_wpr` is NaN when a result entry has no "wpr" key.
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "language",
        "lc_acc",
        "lc_lpr",
        "lc_wpr",
    )
}

for model_name, model, dataset, instruct in file_names:
    with open(f"lang_confusion/{model_name}_results.pkl", "rb") as results_file:
        score = pickle.load(results_file)
    for language in names:
        tatoeba_scores = score[("tatoeba", language)]
        wpr_percent = (
            tatoeba_scores["wpr"] * 100 if "wpr" in tatoeba_scores else np.nan
        )
        data["model"].append(model)
        data["dataset"].append(dataset)
        data["instruction"].append(instruct)
        data["language"].append(names[language])
        data["lc_acc"].append(tatoeba_scores["acc"] * 100)
        data["lc_lpr"].append(tatoeba_scores["lpr"] * 100)
        data["lc_wpr"].append(wpr_percent)

df_lang_confusion = pd.DataFrame(data)

In [None]:
# English `lc_lpr` per (model, dataset), printed in full.
english_lc = df_lang_confusion[df_lang_confusion["language"] == "English"]
english_lpr = english_lc.groupby(["model", "dataset"])["lc_lpr"].sum().round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(english_lpr)

In [None]:
# Mean `lc_lpr` over the non-English languages per (model, dataset), printed
# in full.
non_english_lc = df_lang_confusion[df_lang_confusion["language"] != "English"]
mean_lpr = non_english_lc.groupby(["model", "dataset"])["lc_lpr"].mean().round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(mean_lpr)

In [None]:
# Population standard deviation (ddof=0) of `lc_lpr` over the non-English
# languages per (model, dataset), printed in full.
non_english_lc = df_lang_confusion[df_lang_confusion["language"] != "English"]
std_lpr = non_english_lc.groupby(["model", "dataset"])["lc_lpr"].std(ddof=0).round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(std_lpr)

# Diversity

In [None]:
# Two-letter language code -> English language name for the diversity section.
names = dict(
    ar="Arabic",
    ca="Catalan",
    cs="Czech",
    de="German",
    es="Spanish",
    fr="French",
    ko="Korean",
    it="Italian",
    en="English",
    mt="Maltese",
    tr="Turkish",
    zh="Chinese",
    hi="Hindi",
    id="Indonesian",
    ja="Japanese",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ru="Russian",
    sv="Swedish",
)

In [None]:
# Build one row per (model, language) with a diversity value in percent:
#   * base models:       mean diversity over the sentences that are not
#                        marked "wrong_lang";
#   * fine-tuned models: mean(fine-tuned) - mean(base), computed only on
#                        sentences that are valid for BOTH models.
# Languages with fewer valid sentences than `num_sent` are skipped entirely.
data = {
    "model": [],
    "dataset": [],
    "instruction": [],
    "language": [],
    "div_uni": [],
}

for model_name, model, dataset, instruct in file_names:
    with open(f"diversity/{model_name}_results.pkl", "rb") as infile:
        score = pickle.load(infile)
    # The base-model results are needed for every language of a fine-tuned
    # model, so they are read lazily and at most once per model instead of
    # being unpickled again inside the language loop.
    base_score = None
    for language in names:
        # Minimum number of usable sentences; Maltese ("mt") uses a lower bar.
        num_sent = 65 if language == "mt" else 100
        selected = score[language]["selected"]
        valid_div = [div for div in selected if div != "wrong_lang"]
        if len(valid_div) < num_sent:
            continue
        if dataset == "base":
            data["div_uni"].append(np.mean(valid_div) * 100)
        else:
            if base_score is None:
                base_name = model_name.replace(f"_lora_{dataset}_model", "")
                with open(f"diversity/{base_name}_results.pkl", "rb") as infile:
                    base_score = pickle.load(infile)
            base_selected = base_score[language]["selected"]
            row_ids = {i for i, v in enumerate(selected) if v != "wrong_lang"}
            base_ids = {i for i, v in enumerate(base_selected) if v != "wrong_lang"}
            # Compare both models on exactly the same sentences.
            valid_ids = row_ids.intersection(base_ids)
            if len(valid_ids) < num_sent:
                continue
            data["div_uni"].append(
                (
                    np.mean([selected[i] for i in valid_ids])
                    - np.mean([base_selected[i] for i in valid_ids])
                )
                * 100
            )
        data["model"].append(model)
        data["dataset"].append(dataset)
        data["instruction"].append(instruct)
        data["language"].append(names[language])

df_diversity = pd.DataFrame(data)

In [None]:
# Display the diversity frame built in the previous cell for a visual check.
df_diversity

In [None]:
# Mean `div_uni` over the non-English languages per (model, dataset), printed
# in full.
non_english_div = df_diversity[df_diversity["language"] != "English"]
mean_div = non_english_div.groupby(["model", "dataset"])["div_uni"].mean().round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(mean_div)

In [None]:
# Population standard deviation (ddof=0) of `div_uni` over the non-English
# languages per (model, dataset), printed in full.
non_english_div = df_diversity[df_diversity["language"] != "English"]
std_div = non_english_div.groupby(["model", "dataset"])["div_uni"].std(ddof=0).round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(std_div)

In [None]:
# English `div_uni` per (model, dataset), printed in full.
english_div = df_diversity[df_diversity["language"] == "English"]
summed_div = english_div.groupby(["model", "dataset"])["div_uni"].sum().round(1)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(summed_div)

# Create one large dataframe

In [None]:
# Two-letter language code -> English language name for the combined frame.
names = dict(
    ar="Arabic",
    ca="Catalan",
    cs="Czech",
    de="German",
    es="Spanish",
    fr="French",
    ko="Korean",
    it="Italian",
    en="English",
    mt="Maltese",
    tr="Turkish",
    zh="Chinese",
    hi="Hindi",
    id="Indonesian",
    ja="Japanese",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    ru="Russian",
    sv="Swedish",
)

In [None]:
# Skeleton frame: one row for every (model entry, language) combination, to
# which the individual result frames are merged further down.
key_columns = ("model", "dataset", "instruction", "language")
data = {column: [] for column in key_columns}

for model_name, model, dataset, instruct in file_names:
    for language in names:
        key_values = (model, dataset, instruct, names[language])
        for column, value in zip(key_columns, key_values):
            data[column].append(value)

df_all_results = pd.DataFrame(data)

In [None]:
# Display the skeleton frame built in the previous cell for a visual check.
df_all_results

In [None]:
# Stack the StereoSet, CrowS-Pairs and MBBQ scores into one long frame with a
# `bias_dataset` column naming the source of every row.
#
# The outer merges include the score column in the join key, so rows from the
# two sides normally never match and the merge acts like a concatenation; the
# merge indicator then records which side a row came from.
# NOTE(review): a row that is identical in both inputs would be merged into a
# single row labelled "both", which is not renamed below.
bias_keys = ["model", "dataset", "instruction", "language", "bias_type", "ss_score"]
indicator_labels = {"left_only": "stereoset", "right_only": "crowspairs"}

stereoset_and_crowspairs = df_stereoset.merge(
    df_crowspairs, on=bias_keys, how="outer", indicator="bias_dataset"
)
df_bias_results = stereoset_and_crowspairs.drop(columns=["lm_score", "n_samples"])
df_bias_results["bias_dataset"] = df_bias_results["bias_dataset"].cat.rename_categories(
    indicator_labels
)

# MBBQ stores its score as `bias_score`; align the column name and tag the rows.
df_mbbq_updated = df_mbbq.rename(columns={"bias_score": "ss_score"}).assign(
    bias_dataset="mbbq"
)

df_bias_results = df_bias_results.merge(
    df_mbbq_updated, on=bias_keys + ["bias_dataset"], how="outer"
)

In [None]:
# Display the combined bias frame built in the previous cell for a visual check.
df_bias_results

In [None]:
# Stack diversity, language consistency, fluency and language understanding
# into one long frame: the value goes to `lm_score`, the source to `lm_measure`.
# As the score is part of the join key, each outer merge effectively appends
# the rows of the right-hand frame.
lm_keys = ["model", "dataset", "instruction", "language", "lm_score"]
measure_labels = {"left_only": "diversity", "right_only": "language consistency"}

diversity_scores = df_diversity.rename(columns={"div_uni": "lm_score"})
consistency_scores = df_lang_confusion.drop(columns=["lc_wpr", "lc_acc"]).rename(
    columns={"lc_lpr": "lm_score"}
)
df_lm_results = diversity_scores.merge(
    consistency_scores, on=lm_keys, how="outer", indicator="lm_measure"
)
df_lm_results["lm_measure"] = df_lm_results["lm_measure"].cat.rename_categories(
    measure_labels
)

# Perplexity enters with its sign flipped and is labelled "fluency"
# (presumably so that a higher value means better, like the other measures).
df_perplexity_updated = df_perplexity.rename(columns={"perplexity": "lm_score"}).assign(
    lm_score=lambda frame: frame["lm_score"].mul(-1),
    lm_measure="fluency",
)
df_global_mmlu_updated = df_global_mmlu.rename(columns={"acc": "lm_score"}).assign(
    lm_measure="language understanding"
)

for extra_scores in (df_perplexity_updated, df_global_mmlu_updated):
    df_lm_results = df_lm_results.merge(
        extra_scores, on=lm_keys + ["lm_measure"], how="outer"
    )

In [None]:
# Display the combined language-modelling frame built in the previous cell.
df_lm_results

In [None]:
# Attach the bias, toxicity and language-modelling results to the skeleton
# frame. Bias and LM rows are outer-merged; toxicity is left-merged, so it
# only annotates rows that already exist.
# NOTE: this cell rebinds `df_all_results` from itself, so running it a second
# time without re-running the skeleton cell merges the results in again.
merge_keys = ["model", "dataset", "instruction", "language"]
merge_plan = (
    (df_bias_results, "outer"),
    (df_toxicity, "left"),
    (df_lm_results, "outer"),
)
for results_frame, join_type in merge_plan:
    df_all_results = df_all_results.merge(results_frame, on=merge_keys, how=join_type)

In [None]:
# Display the fully merged results frame for a visual check.
df_all_results

# Figures

In [None]:
# Convert the scores of every fine-tuned row into a change relative to the
# matching base-model row (same model, instruction, language and lm_measure).
#
# NOTE: this loop overwrites `df_all_results` in place, so it is not
# idempotent: running the cell twice subtracts the base value twice.
# It also addresses rows with `.loc[i, ...]` for i in range(len(...)), which
# relies on the frame having the default 0..n-1 index.
for i in range(len(df_all_results)):
    model = df_all_results.loc[i, "model"]
    dataset = df_all_results.loc[i, "dataset"]
    instruction = df_all_results.loc[i, "instruction"]
    language = df_all_results.loc[i, "language"]
    bias_type = df_all_results.loc[i, "bias_type"]
    mbbq_context = df_all_results.loc[i, "mbbq_context"]
    bias_dataset = df_all_results.loc[i, "bias_dataset"]
    lm_measure = df_all_results.loc[i, "lm_measure"]
    # Rows with dataset == "base" are the reference and are never modified.
    if dataset != "base":
        for c in [
            "ss_score",
            "emt",
            "lm_score",
        ]:
            new = df_all_results.loc[i, c]
            # Candidate reference rows: the base entry with the same model,
            # instruction type, language and lm_measure.
            # NOTE(review): if `lm_measure` is NaN the equality test matches
            # nothing and `.item()` below would raise — confirm it cannot occur.
            old = df_all_results[df_all_results["model"] == model][
                df_all_results["dataset"] == "base"
            ][df_all_results["instruction"] == instruction][
                df_all_results["language"] == language
            ][
                df_all_results["lm_measure"] == lm_measure
            ]
            if not pd.isna(bias_type):
                # The row carries a bias result: match bias type and dataset
                # (and the MBBQ context when there is one).
                old = old[df_all_results["bias_type"] == bias_type]
                old = old[df_all_results["bias_dataset"] == bias_dataset]
                if not pd.isna(mbbq_context):
                    old = old[df_all_results["mbbq_context"] == mbbq_context]
            elif (
                len(old) > 1
            ):
                # The row has no bias result but several base rows match:
                # narrow down to a single one, preferring the "overall"
                # CrowS-Pairs row and otherwise the ambiguous MBBQ row.
                old = old[df_all_results["bias_type"] == "overall"]
                if (
                    len(old) > 1
                    and len(old[df_all_results["bias_dataset"] == "crowspairs"]) > 0
                ):
                    old = old[df_all_results["bias_dataset"] == "crowspairs"]
                elif (
                    len(old) > 1
                    and len(old[df_all_results["mbbq_context"] == "amb"]) > 0
                ):
                    old = old[df_all_results["mbbq_context"] == "amb"]
            # Debug output: the selected base row(s) and the key of the row
            # being processed (printed for every row and every score column).
            print(old)
            print(
                model,
                dataset,
                instruction,
                language,
                bias_type,
                bias_dataset,
                mbbq_context,
                lm_measure,
            )
            # `.item()` raises ValueError unless exactly one base row is left.
            old = old[c].item()
            # Fluency and diversity are skipped here: the cells that build
            # them already store fine-tuned values as differences to the base
            # model.
            if c != "lm_score" or lm_measure not in ["fluency", "diversity"]:
                df_all_results.loc[i, c] = new - old

In [None]:
# Display configuration shared by all figures below.

# Raw identifiers used in the result frames -> labels shown in the figures
# (applied with `DataFrame.replace`, so it acts on the values of every column).
to_replace = {
    "panda": "SFT, Panda",
    "biasdpo": "DPO, BiasDPO",
    "biassft": "SFT, BiasDPO",
    "jigsaw": "SFT, Jigsaw",
    "detoxdpo": "DPO, DetoxDPO",
    "detoxsft": "SFT, DetoxDPO",
    "crowspairs": "CrowS-Pairs",
    "stereoset": "StereoSet",
    "gemma9b": "Gemma 2 9B",
    "llama3": "Llama 3",
    "llama3.1": "Llama 3.1",
    "aya": "Aya",
    "ayaexpanse": "Aya Expanse",
    "llama3instruct": "Llama 3 Instruct",
    "llama3.1instruct": "Llama 3.1 Instruct",
    "gemma9binstruct": "Gemma 2 9B IT",
    "diversity": "Diversity",
    "language consistency": "Language\nconsistency",
    "language understanding": "Question-\nanswering",
    "fluency": "Fluency",
    "amb": "Ambiguous",
    "disamb": "Disambiguated",
    "age": "Age",
    "disability": "Disability status",
    "gender": "Gender identity",
    "nationality": "Nationality",
    "physical-appearance": "Physical appearance",
    "profession": "Profession",
    "race-color": "Race",
    "race": "Race",
    "religion": "Religion",
    "sexual-orientation": "Sexual Orientation",
    "socioeconomic": "Socio-economic status",
}
# Frame column names -> axis / legend titles.
columns_to_rename = {
    "ss_score": "Mean absolute\nchange in bias score",
    "bias_dataset": "Evaluation dataset",
    "dataset": "Method",
    "model": "Model",
    "emt": "Mean absolute change\nin toxicity score",
    "lm_measure": "Property",
    "lm_score": "Mean absolute\nchange in score",
    "mbbq_context": "MBBQ context type",
    "bias_type": "Bias Type",
    "language": "Language",
}
# Facet (column) order for the bias-mitigation methods.
col_order = [
    "SFT, Panda",
    "SFT, BiasDPO",
    "DPO, BiasDPO",
]
# Facet (column) order for the detoxification methods.
tox_col_order = [
    "SFT, Jigsaw",
    "SFT, DetoxDPO",
    "DPO, DetoxDPO",
]
# Model order. The Gemma entries use the labels that `to_replace` actually
# produces ("Gemma 2 9B", "Gemma 2 9B IT"); labels that never occur in the
# data would only yield empty positions when passed as an ordering.
row_order = [
    "Gemma 2 9B",
    "Llama 3",
    "Llama 3.1",
    "Aya",
    "Aya Expanse",
    "Gemma 2 9B IT",
    "Llama 3 Instruct",
    "Llama 3.1 Instruct",
]
# Large fonts because the figures are drawn with height=15.
sns.set(font_scale=6, style="white")

## General results

### Non-English, base models

In [None]:
# Figure 1: change in the overall CrowS-Pairs / StereoSet bias score after
# bias mitigation, base models, non-English languages.
#
# All conditions are combined into ONE boolean mask; chaining
# `df[m1][m2]...` makes pandas re-align every later mask and warn that the
# "Boolean Series key will be reindexed".
# `df_all_results` repeats each bias row once per `lm_measure` (result of the
# merge on the four key columns), so a single measure is pinned to avoid
# counting a score several times.
fig1_mask = (
    df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & df_all_results["bias_dataset"].isin(["crowspairs", "stereoset"])
    & (df_all_results["instruction"] == "base")
    & (df_all_results["bias_type"] == "overall")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] != "English")
)
fig1_data = (
    df_all_results[fig1_mask].replace(to_replace).rename(columns=columns_to_rename)
)
plot1 = sns.catplot(
    data=fig1_data,
    col_order=col_order,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in bias score",
    hue="Evaluation dataset",
    col="Method",
    height=15,
    aspect=1,
)
plot1.set_titles("{col_name}")
plot1.figure.savefig("figures/fig1.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 2: change in toxicity score after detoxification, base models,
# non-English languages (rows without a toxicity value are dropped).
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning. A single
# `lm_measure` is pinned because each row is repeated once per measure.
fig2_mask = (
    df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["instruction"] == "base")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] != "English")
    & ~pd.isna(df_all_results["emt"])
)
fig2_data = (
    df_all_results[fig2_mask].replace(to_replace).rename(columns=columns_to_rename)
)
plot2 = sns.catplot(
    data=fig2_data,
    kind="bar",
    x="Model",
    y="Mean absolute change\nin toxicity score",
    col="Method",
    col_order=tox_col_order,
    height=15,
    aspect=1,
)
plot2.set_titles("{col_name}")
plot2.figure.savefig("figures/fig2.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 3a: change in the language-modelling properties after bias
# mitigation, base models, non-English languages.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning.
fig3a_mask = (
    df_all_results["lm_measure"].isin(
        ["diversity", "language consistency", "fluency", "language understanding"]
    )
    & ~df_all_results["model"].isin(["mistral0.3", "mistral0.3instruct"])
    & (df_all_results["instruction"] == "base")
    & df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & (df_all_results["language"] != "English")
)
# Each LM score is repeated once per bias row, so keep one row per
# (model, method, language, measure) before plotting.
fig3a_data = (
    df_all_results[fig3a_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot3 = sns.catplot(
    data=fig3a_data,
    kind="bar",
    col_order=col_order,
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col="Method",
    order=[
        "Gemma 2 9B",
        "Llama 3.1",
    ],
    height=15,
    aspect=1,
)
plot3.set_titles("{col_name}")
plot3.figure.savefig("figures/fig3a.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 3b: change in the language-modelling properties after
# detoxification, base models, non-English languages.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning.
fig3b_mask = (
    df_all_results["lm_measure"].isin(
        [
            "diversity",
            "language consistency",
            "fluency",
        ]
    )
    & ~df_all_results["model"].isin(["mistral0.3", "mistral0.3instruct"])
    & (df_all_results["instruction"] == "base")
    & df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["language"] != "English")
)
# Each LM score is repeated once per bias row, so keep one row per
# (model, method, language, measure) before plotting.
fig3b_data = (
    df_all_results[fig3b_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot3 = sns.catplot(
    data=fig3b_data,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col="Method",
    col_order=tox_col_order,
    order=[
        "Gemma 2 9B",
        "Llama 3.1",
    ],
    height=15,
    aspect=1,
)
plot3.set_titles("{col_name}")
# `bbox_inches="tight"` matches every other figure; without it, elements that
# extend past the figure area (such as an outside legend) can be cut off.
plot3.figure.savefig("figures/fig3b.pdf", dpi=100, bbox_inches="tight")

### Non-English, instruction models

In [None]:
# Figure 4: change in the overall CrowS-Pairs / StereoSet bias score after
# bias mitigation, instruction-tuned models, non-English languages.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning. A single
# `lm_measure` is pinned because each bias row is repeated once per measure.
fig4_mask = (
    df_all_results["dataset"].isin(["biasdpo", "panda", "biassft"])
    & df_all_results["bias_dataset"].isin(["crowspairs", "stereoset"])
    & (df_all_results["instruction"] == "instruct")
    & (df_all_results["bias_type"] == "overall")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] != "English")
)
fig4_data = (
    df_all_results[fig4_mask].replace(to_replace).rename(columns=columns_to_rename)
)
plot4 = sns.catplot(
    data=fig4_data,
    kind="bar",
    order=[
        "Aya",
        "Aya Expanse",
        "Gemma 2 9B IT",
        "Llama 3.1 Instruct",
    ],
    col_order=col_order,
    x="Model",
    y="Mean absolute\nchange in bias score",
    hue="Evaluation dataset",
    col="Method",
    height=15,
    aspect=1,
)
plot4.set_xticklabels(rotation=25)
plot4.set_titles("{col_name}")
plot4.figure.savefig("figures/fig4.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 5: change in toxicity score after detoxification, instruction-tuned
# models, non-English languages (rows without a toxicity value are dropped).
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning. A single
# `lm_measure` is pinned because each row is repeated once per measure.
fig5_mask = (
    df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["instruction"] == "instruct")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] != "English")
    & ~pd.isna(df_all_results["emt"])
)
fig5_data = (
    df_all_results[fig5_mask].replace(to_replace).rename(columns=columns_to_rename)
)
plot5 = sns.catplot(
    data=fig5_data,
    kind="bar",
    x="Model",
    y="Mean absolute change\nin toxicity score",
    col="Method",
    col_order=tox_col_order,
    order=[
        "Aya",
        "Aya Expanse",
        "Gemma 2 9B IT",
        "Llama 3.1 Instruct",
    ],
    height=15,
    aspect=1,
)
plot5.set_xticklabels(rotation=25)
plot5.set_titles("{col_name}")
plot5.figure.savefig("figures/fig5.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 6: change in the overall MBBQ bias score after bias mitigation, split
# by context type, instruction-tuned models, non-English languages.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning.
# NOTE(review): unlike the CrowS-Pairs / StereoSet figures, no single
# `lm_measure` is pinned here, so each MBBQ row appears once per measure —
# confirm this is intended (it affects the error bars).
fig6_mask = (
    (df_all_results["instruction"] == "instruct")
    & (df_all_results["bias_type"] == "overall")
    & (df_all_results["bias_dataset"] == "mbbq")
    & df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & (df_all_results["language"] != "English")
)
fig6_data = (
    df_all_results[fig6_mask]
    .sort_values(by="mbbq_context")
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot6 = sns.catplot(
    data=fig6_data,
    col_order=col_order,
    order=[
        "Aya",
        "Aya Expanse",
        "Gemma 2 9B IT",
        "Llama 3.1 Instruct",
    ],
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in bias score",
    hue="MBBQ context type",
    col="Method",
    legend_out=True,
    height=15,
    aspect=1,
)
plot6.set_xticklabels(rotation=25)
plot6.set_titles("{col_name}")
plot6.figure.savefig("figures/fig6.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 7a: change in the language-modelling properties after bias
# mitigation, instruction-tuned models, non-English languages.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning.
fig7a_mask = (
    df_all_results["lm_measure"].isin(
        [
            "diversity",
            "language consistency",
            "fluency",
            "language understanding"
        ]
    )
    & (df_all_results["instruction"] == "instruct")
    & df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & (df_all_results["language"] != "English")
)
# Each LM score is repeated once per bias row, so keep one row per
# (model, method, language, measure) before plotting.
fig7a_data = (
    df_all_results[fig7a_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot7 = sns.catplot(
    data=fig7a_data,
    col_order=col_order,
    order=[
        "Aya",
        "Aya Expanse",
        "Gemma 2 9B IT",
        "Llama 3.1 Instruct",
    ],
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col="Method",
    height=15,
    aspect=1,
)
plot7.set_xticklabels(rotation=25)
plot7.set_titles("{col_name}")
sns.move_legend(plot7, "upper left", bbox_to_anchor=(0.8, 0.8))
plot7.figure.savefig("figures/fig7a.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 7b: change in the language-modelling properties after
# detoxification, instruction-tuned models, non-English languages.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning.
fig7b_mask = (
    df_all_results["lm_measure"].isin(
        ["diversity", "language consistency", "fluency", "language understanding"]
    )
    & (df_all_results["instruction"] == "instruct")
    & df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["language"] != "English")
)
# Each LM score is repeated once per bias row, so keep one row per
# (model, method, language, measure) before plotting.
fig7b_data = (
    df_all_results[fig7b_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot7 = sns.catplot(
    data=fig7b_data,
    order=[
        "Aya",
        "Aya Expanse",
        "Gemma 2 9B IT",
        "Llama 3.1 Instruct",
    ],
    col_order=tox_col_order,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col="Method",
    height=15,
    aspect=1,
)
plot7.set_xticklabels(rotation=25)
plot7.set_titles("{col_name}")
sns.move_legend(plot7, "upper left", bbox_to_anchor=(0.8, 0.8))
plot7.figure.savefig("figures/fig7b.pdf", bbox_inches="tight", dpi=100)

### English, base models

In [None]:
# Figure 8: change in the overall CrowS-Pairs / StereoSet bias score after
# bias mitigation, base models, English only.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning. A single
# `lm_measure` is pinned because each bias row is repeated once per measure.
fig8_mask = (
    df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & df_all_results["bias_dataset"].isin(["crowspairs", "stereoset"])
    & (df_all_results["instruction"] == "base")
    & (df_all_results["bias_type"] == "overall")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] == "English")
)
fig8_data = (
    df_all_results[fig8_mask]
    .sort_values(by="bias_dataset")
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot8 = sns.catplot(
    data=fig8_data,
    col_order=col_order,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in bias score",
    hue="Evaluation dataset",
    col="Method",
    height=15,
    aspect=1,
)
plot8.set_titles("{col_name}")
plot8.figure.savefig(
    "figures/fig8.pdf",
    dpi=100,
    bbox_inches="tight",
)

In [None]:
# Figure 9: change in toxicity score after detoxification, base models,
# English only.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning. A single
# `lm_measure` is pinned because each row is repeated once per measure.
fig9_mask = (
    df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["instruction"] == "base")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] == "English")
)
fig9_data = (
    df_all_results[fig9_mask].replace(to_replace).rename(columns=columns_to_rename)
)
plot9 = sns.catplot(
    data=fig9_data,
    kind="bar",
    x="Model",
    y="Mean absolute change\nin toxicity score",
    col="Method",
    col_order=tox_col_order,
    height=15,
    aspect=1,
)
plot9.set_titles("{col_name}")
plot9.figure.savefig(
    "figures/fig9.pdf",
    dpi=100,
    bbox_inches="tight",
)

In [None]:
# Figure 10a: change in the language-modelling properties after bias
# mitigation, base models, English only.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning.
fig10a_mask = (
    df_all_results["lm_measure"].isin(
        [
            "diversity",
            "language consistency",
            "fluency",
        ]
    )
    & ~df_all_results["model"].isin(["mistral0.3", "mistral0.3instruct"])
    & (df_all_results["instruction"] == "base")
    & df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & (df_all_results["language"] == "English")
)
# Each LM score is repeated once per bias row, so keep one row per
# (model, method, language, measure) before plotting.
fig10a_data = (
    df_all_results[fig10a_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot10 = sns.catplot(
    data=fig10a_data,
    col_order=col_order,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col="Method",
    order=["Gemma 2 9B", "Llama 3.1"],
    height=15,
    aspect=1,
)
# Tick labels are left as produced by `order`, so the model names match the
# labels from `to_replace` used in the other figures.
plot10.set_titles("{col_name}")
plot10.figure.savefig("figures/fig10a.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 10b: change in the language-modelling properties after
# detoxification, base models, English only.
#
# One combined mask replaces chained `df[m1][m2]...` indexing, which triggers
# pandas' "Boolean Series key will be reindexed" warning.
fig10b_mask = (
    df_all_results["lm_measure"].isin(
        [
            "diversity",
            "language consistency",
            "fluency",
        ]
    )
    & ~df_all_results["model"].isin(["mistral0.3", "mistral0.3instruct"])
    & (df_all_results["instruction"] == "base")
    & df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["language"] == "English")
)
# Each LM score is repeated once per bias row, so keep one row per
# (model, method, language, measure) before plotting.
fig10b_data = (
    df_all_results[fig10b_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot10 = sns.catplot(
    data=fig10b_data,
    col_order=tox_col_order,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col="Method",
    order=["Gemma 2 9B", "Llama 3.1"],
    height=15,
    aspect=1,
)
plot10.set_titles("{col_name}")
plot10.figure.savefig("figures/fig10b.pdf", dpi=100, bbox_inches="tight")

### English, instruction models

In [None]:
# Figure 11: overall bias-score change on CrowS-Pairs and StereoSet for instruct
# models on English, one panel per bias-mitigation method.
_fig11_mask = (
    df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & df_all_results["bias_dataset"].isin(["crowspairs", "stereoset"])
    & (df_all_results["instruction"] == "instruct")
    & (df_all_results["bias_type"] == "overall")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] == "English")
)
_fig11_data = (
    df_all_results[_fig11_mask]
    .sort_values(by="bias_dataset")
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
_fig11_models = ["Aya", "Aya Expanse", "Gemma 2 9B IT", "Llama 3.1 Instruct"]
plot11 = sns.catplot(
    data=_fig11_data,
    col_order=col_order,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in bias score",
    hue="Evaluation dataset",
    col="Method",
    order=_fig11_models,
    height=15,
    aspect=1,
)
plot11.set_titles("{col_name}")
plot11.set_xticklabels(rotation=25)
plot11.figure.savefig("figures/fig11.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 12: toxicity-score change for instruct models on English,
# one panel per detoxification method.
_fig12_mask = (
    df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["instruction"] == "instruct")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["language"] == "English")
)
_fig12_data = (
    df_all_results[_fig12_mask].replace(to_replace).rename(columns=columns_to_rename)
)
_fig12_models = ["Aya", "Aya Expanse", "Gemma 2 9B IT", "Llama 3.1 Instruct"]
plot12 = sns.catplot(
    data=_fig12_data,
    kind="bar",
    x="Model",
    y="Mean absolute change\nin toxicity score",
    col="Method",
    col_order=tox_col_order,
    order=_fig12_models,
    height=15,
    aspect=1,
)
plot12.set_titles("{col_name}")
plot12.set_xticklabels(rotation=25)
plot12.figure.savefig("figures/fig12.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 13: overall MBBQ bias-score change for instruct models on English,
# split by MBBQ context type, one panel per bias-mitigation method.
_fig13_mask = (
    (df_all_results["instruction"] == "instruct")
    & (df_all_results["bias_type"] == "overall")
    & (df_all_results["bias_dataset"] == "mbbq")
    & df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & (df_all_results["language"] == "English")
)
_fig13_data = (
    df_all_results[_fig13_mask].replace(to_replace).rename(columns=columns_to_rename)
)
_fig13_models = ["Aya", "Aya Expanse", "Gemma 2 9B IT", "Llama 3.1 Instruct"]
plot13 = sns.catplot(
    data=_fig13_data,
    kind="bar",
    hue="MBBQ context type",
    x="Model",
    y="Mean absolute\nchange in bias score",
    order=_fig13_models,
    col_order=col_order,
    col="Method",
    height=15,
    aspect=1,
)
plot13.set_titles("{col_name}")
plot13.set_xticklabels(rotation=25)
plot13.figure.savefig("figures/fig13.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 14a: change in the four language-modelling measures for instruct models
# on English after bias mitigation, one panel per method.
_fig14a_measures = [
    "diversity",
    "language consistency",
    "fluency",
    "language understanding",
]
_fig14a_mask = (
    df_all_results["lm_measure"].isin(_fig14a_measures)
    & (df_all_results["instruction"] == "instruct")
    & df_all_results["dataset"].isin(["biasdpo", "biassft", "panda"])
    & (df_all_results["language"] == "English")
)
# Keep the first row of every (model, dataset, instruction, language, measure) group.
_fig14a_data = (
    df_all_results[_fig14a_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot14 = sns.catplot(
    data=_fig14a_data,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col_order=col_order,
    col="Method",
    order=["Aya", "Aya Expanse", "Gemma 2 9B IT", "Llama 3.1 Instruct"],
    height=15,
    aspect=1,
)
plot14.set_titles("{col_name}")
plot14.set_xticklabels(rotation=25)
sns.move_legend(plot14, "upper left", bbox_to_anchor=(0.8, 0.8))
plot14.figure.savefig("figures/fig14a.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Figure 14b: same layout as fig14a, but for the toxicity-mitigation datasets.
_fig14b_measures = [
    "diversity",
    "language consistency",
    "fluency",
    "language understanding",
]
_fig14b_mask = (
    df_all_results["lm_measure"].isin(_fig14b_measures)
    & (df_all_results["instruction"] == "instruct")
    & df_all_results["dataset"].isin(["jigsaw", "detoxsft", "detoxdpo"])
    & (df_all_results["language"] == "English")
)
# Keep the first row of every (model, dataset, instruction, language, measure) group.
_fig14b_data = (
    df_all_results[_fig14b_mask]
    .groupby(
        ["model", "dataset", "instruction", "language", "lm_measure"], as_index=False
    )
    .first()
    .reset_index(drop=True)
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
plot14 = sns.catplot(
    data=_fig14b_data,
    kind="bar",
    x="Model",
    y="Mean absolute\nchange in score",
    hue="Property",
    col="Method",
    col_order=tox_col_order,
    order=["Aya", "Aya Expanse", "Gemma 2 9B IT", "Llama 3.1 Instruct"],
    height=15,
    aspect=1,
)
plot14.set_titles("{col_name}")
plot14.set_xticklabels(rotation=25)
sns.move_legend(plot14, "upper left", bbox_to_anchor=(0.8, 0.8))
plot14.figure.savefig("figures/fig14b.pdf", dpi=100, bbox_inches="tight")

## Bias type analysis

In [None]:
# Per-bias-type StereoSet results for Llama 3.1 Instruct, one strip-plot panel for
# Panda SFT and one for BiasDPO. The large font scale matches the oversized canvas.
sns.set(font_scale=8, style="white")
_bias_ss_mask = (
    (df_all_results["bias_type"] != "overall")
    & (df_all_results["model"] == "llama3.1instruct")
    & (df_all_results["dataset"] != "base")
    & (df_all_results["lm_measure"] == "diversity")
    & df_all_results["dataset"].isin(["biasdpo", "panda"])
    & (df_all_results["bias_dataset"] == "stereoset")
)
_bias_ss_data = (
    df_all_results[_bias_ss_mask]
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
bias_ss = sns.catplot(
    data=_bias_ss_data,
    x="Language",
    y="Mean absolute\nchange in bias score",
    hue="Bias Type",
    col="Method",
    kind="strip",
    size=40,
    height=18,
    aspect=1.6,
    col_order=["SFT, Panda", "DPO, BiasDPO"],
)
bias_ss.set_titles("{col_name}")
bias_ss.refline(y=0, color="black")  # zero line: no change relative to the base model
bias_ss.set_xticklabels(rotation=25)
bias_ss.figure.savefig("figures/bias_ss.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Per-bias-type CrowS-Pairs results for Llama 3.1 Instruct, one swarm-plot panel
# for Panda SFT and one for BiasDPO.
_bias_csp_mask = (
    (df_all_results["bias_type"] != "overall")
    & (df_all_results["model"] == "llama3.1instruct")
    & (df_all_results["dataset"] != "base")
    & (df_all_results["lm_measure"] == "diversity")
    & df_all_results["dataset"].isin(["biasdpo", "panda"])
    & (df_all_results["bias_dataset"] == "crowspairs")
)
_bias_csp_data = (
    df_all_results[_bias_csp_mask]
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
bias_csp = sns.catplot(
    data=_bias_csp_data,
    x="Language",
    y="Mean absolute\nchange in bias score",
    hue="Bias Type",
    col="Method",
    kind="swarm",
    size=40,
    height=18,
    aspect=1.6,
    col_order=["SFT, Panda", "DPO, BiasDPO"],
)
bias_csp.set_titles("{col_name}")
bias_csp.refline(y=0, color="black")  # zero line: no change relative to the base model
bias_csp.set_xticklabels(rotation=30)
bias_csp.figure.savefig("figures/bias_csp.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Per-bias-type MBBQ results (rows with mbbq_context == "amb" only) for
# Llama 3.1 Instruct, one strip-plot panel for Panda SFT and one for BiasDPO.
_bias_mbbq_mask = (
    (df_all_results["bias_type"] != "overall")
    & (df_all_results["model"] == "llama3.1instruct")
    & (df_all_results["dataset"] != "base")
    & (df_all_results["lm_measure"] == "diversity")
    & df_all_results["dataset"].isin(["biasdpo", "panda"])
    & (df_all_results["bias_dataset"] == "mbbq")
    & (df_all_results["mbbq_context"] == "amb")
)
_bias_mbbq_data = (
    df_all_results[_bias_mbbq_mask]
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)
bias_mbbq = sns.catplot(
    data=_bias_mbbq_data,
    x="Language",
    y="Mean absolute\nchange in bias score",
    hue="Bias Type",
    col="Method",
    kind="strip",
    size=40,
    col_order=["SFT, Panda", "DPO, BiasDPO"],
    height=18,
    aspect=1.6,
)
bias_mbbq.set_titles("{col_name}")
bias_mbbq.refline(y=0, color="black")  # zero line: no change relative to the base model
bias_mbbq.set_xticklabels(rotation=25)
bias_mbbq.figure.savefig("figures/bias_mbbq.pdf", dpi=100, bbox_inches="tight")

## Difference across languages

In [None]:
# Overall StereoSet bias-score change per language for the Panda-tuned models.
# English is drawn first in a highlight colour; every other language shares one colour.
sns.set(font_scale=8, style="white")
fig = plt.figure(figsize=(40, 22))
plot_data = (
    df_all_results[
        (df_all_results["bias_type"] == "overall")
        & (df_all_results["dataset"] != "base")
        & (df_all_results["lm_measure"] == "diversity")
        & (df_all_results["dataset"] == "panda")
        & (df_all_results["bias_dataset"] == "stereoset")
    ]
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)

colors = [(0.8666666666666667, 0.5176470588235295, 0.3215686274509804)] + [
    ((0.2980392156862745, 0.4470588235294118, 0.6901960784313725))
    for language in plot_data["Language"].unique()
    if language != "English"
]

# Bar order: English first, then the remaining languages in data order.
lang_ord = ["English"] + [
    lang for lang in plot_data["Language"].unique() if lang != "English"
]

sns.barplot(
    x="Language",
    y="Mean absolute\nchange in bias score",
    data=plot_data,
    palette=colors,
    order=lang_ord,
)
# The tick labels must follow the same order the bars were drawn in (`lang_ord`).
# Labelling with the raw `.unique()` order of the data would attach the wrong
# language name to a bar whenever English is not the first language in the frame.
fig.axes[0].set_xticklabels(
    lang_ord,
    rotation=30,
)
fig.tight_layout()
fig.savefig("figures/bias_lang_ss.pdf", dpi=100)

In [None]:
# Overall CrowS-Pairs bias-score change per language for the Panda-tuned models.
fig = plt.figure(figsize=(40, 22))

_lang_csp_mask = (
    (df_all_results["bias_type"] == "overall")
    & (df_all_results["dataset"] != "base")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["dataset"] == "panda")
    & (df_all_results["bias_dataset"] == "crowspairs")
)
plot_data = (
    df_all_results[_lang_csp_mask]
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)

# English goes first with its own colour; all other languages share a second colour.
_lang_csp_others = [
    name for name in plot_data["Language"].unique() if name != "English"
]
colors = [(0.8666666666666667, 0.5176470588235295, 0.3215686274509804)] + [
    (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
] * len(_lang_csp_others)
lang_ord = ["English"] + _lang_csp_others

sns.barplot(
    data=plot_data,
    x="Language",
    y="Mean absolute\nchange in bias score",
    order=lang_ord,
    palette=colors,
)
fig.axes[0].set_xticklabels(lang_ord, rotation=30)
fig.tight_layout()
fig.savefig("figures/bias_lang_csp.pdf", dpi=100)

In [None]:
# Overall MBBQ bias-score change per language for the Panda-tuned models.
fig = plt.figure(figsize=(40, 22))

_lang_mbbq_mask = (
    (df_all_results["bias_type"] == "overall")
    & (df_all_results["dataset"] != "base")
    & (df_all_results["dataset"] == "panda")
    & (df_all_results["bias_dataset"] == "mbbq")
)
plot_data = (
    df_all_results[_lang_mbbq_mask]
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)

# English goes first with its own colour; all other languages share a second colour.
_lang_mbbq_others = [
    name for name in plot_data["Language"].unique() if name != "English"
]
colors = [(0.8666666666666667, 0.5176470588235295, 0.3215686274509804)] + [
    (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
] * len(_lang_mbbq_others)
lang_ord = ["English"] + _lang_mbbq_others

bias_lang_mbbq = sns.barplot(
    data=plot_data,
    x="Language",
    y="Mean absolute\nchange in bias score",
    order=lang_ord,
    palette=colors,
)
fig.tight_layout()
fig.savefig("figures/bias_lang_mbbq.pdf", dpi=100)

In [None]:
# Toxicity-score change per language for the detoxdpo-tuned models
# (Maltese, Catalan and Turkish rows are excluded).
fig = plt.figure(figsize=(40, 25))

_toxic_lang_mask = (
    (df_all_results["dataset"] != "base")
    & (df_all_results["lm_measure"] == "diversity")
    & (df_all_results["dataset"] == "detoxdpo")
    & ~df_all_results["language"].isin(["Maltese", "Catalan", "Turkish"])
)
plot_data = (
    df_all_results[_toxic_lang_mask]
    .replace(to_replace)
    .rename(columns=columns_to_rename)
)

# English goes first with its own colour; all other languages share a second colour.
_toxic_lang_others = [
    name for name in plot_data["Language"].unique() if name != "English"
]
colors = [(0.8666666666666667, 0.5176470588235295, 0.3215686274509804)] + [
    (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
] * len(_toxic_lang_others)
lang_ord = ["English"] + _toxic_lang_others

toxic_lang = sns.barplot(
    data=plot_data,
    x="Language",
    y="Mean absolute change\nin toxicity score",
    palette=colors,
    order=lang_ord,
)
fig.axes[0].set_xticklabels(lang_ord, rotation=70)
fig.tight_layout()
fig.savefig("figures/toxic_lang.pdf", dpi=100)

# Language features

In [None]:
# Three-letter language codes passed to lang2vec, mapped to the display names
# used in the result frames.
lang_dict = {
    "fra": "French",
    "deu": "German",
    "spa": "Spanish",
    "tur": "Turkish",
    "zho": "Chinese",
    "mlt": "Maltese",
    "ita": "Italian",
    "cat": "Catalan",
    "ara": "Arabic",
    "kor": "Korean",
    "ind": "Indonesian",
    "pol": "Polish",
    "por": "Portuguese",
    "jpn": "Japanese",
    "nld": "Dutch",
    "hin": "Hindi",
    "ces": "Czech",
    "rus": "Russian",
    "swe": "Swedish",
}


def _l2v_to_float(values):
    """Return a lang2vec feature list as a float array, mapping "--" entries to NaN.

    Both the English vector and the per-language vectors go through this one
    converter, so the missing-value marker is handled identically for each
    (writing NaN into a short fixed-width string array would truncate it).
    """
    return np.array(
        [
            np.nan if isinstance(value, str) and value == "--" else float(value)
            for value in values
        ],
        dtype=float,
    )


# One row per (language, feature type): cosine similarity to English, computed
# over the feature dimensions that are present for both languages.
lang_features = {"language": [], "feature_type": [], "cos_sim": []}
for feature_type in [
    "geo",
    "fam",
    "syntax_average",
]:
    features = l2v.get_features(["eng"] + list(lang_dict.keys()), feature_type)
    # The English vector does not depend on the inner loop; convert it once.
    eng_vector = _l2v_to_float(features["eng"])
    for lang in lang_dict:
        lang_vector = _l2v_to_float(features[lang])
        # Drop every dimension that is missing in either vector.
        observed = ~(np.isnan(lang_vector) | np.isnan(eng_vector))
        f_lang = lang_vector[observed]
        f_eng = eng_vector[observed]
        lang_features["language"].append(lang_dict[lang])
        lang_features["feature_type"].append(feature_type)
        lang_features["cos_sim"].append(
            np.dot(f_lang, f_eng) / (np.linalg.norm(f_lang) * np.linalg.norm(f_eng))
        )
df_lang_features = pd.DataFrame(lang_features)

In [None]:
# Inspect the cosine-similarity-to-English table (one row per language and feature type).
df_lang_features

# Subword overlap

In [None]:
# Subword overlap with English: for every model tokenizer and language, the
# multiset Jaccard overlap between the English tokens and the other language's
# tokens, accumulated over index-aligned entries of the two token lists.
overlap_features = {"language": [], "model": [], "instruction": [], "overlap": []}
# Language codes as they appear in the token file names -> display names.
# NOTE: this rebinds `lang_dict`, which the language-features section defines
# with different (three-letter) keys.
lang_dict = {
    "fra_Latn": "French",
    "deu_Latn": "German",
    "spa_Latn": "Spanish",
    "tur_Latn": "Turkish",
    "cat_Latn": "Catalan",
    "ita_Latn": "Italian",
    "mlt_Latn": "Maltese",
    "zho_Hans": "Chinese",
    "kor_Hang": "Korean",
    "arb_Arab": "Arabic",
    "ces_Latn": "Czech",
    "hin_Deva": "Hindi",
    "ind_Latn": "Indonesian",
    "jpn_Jpan": "Japanese",
    "nld_Latn": "Dutch",
    "pol_Latn": "Polish",
    "por_Latn": "Portuguese",
    "rus_Cyrl": "Russian",
    "swe_Latn": "Swedish",
}
for model_name, model, instruct in [
    ("aya-23-8B", "aya", "instruct"),
    ("aya-expanse-8b", "ayaexpanse", "instruct"),
    ("Meta-Llama-3.1-8B", "llama3.1", "base"),
    ("Meta-Llama-3.1-8B-Instruct", "llama3.1instruct", "instruct"),
    ("gemma-2-2b", "gemma2b", "base"),
    ("gemma-2-2b-it", "gemma2binstruct", "instruct"),
    ("gemma-2-9b", "gemma9b", "base"),
    ("gemma-2-9b-it", "gemma9binstruct", "instruct"),
]:
    # NOTE(review): pickle.load executes arbitrary code from the file; these token
    # dumps are presumably produced locally by this project — never point this at
    # untrusted files.
    with open(f"token_overlap/{model_name}_eng_Latn_tokens.pkl", "rb") as infile:
        english_tokens = pickle.load(infile)

    # Iterating the mapping visits the languages in its declaration order.
    for language in lang_dict:
        with open(f"token_overlap/{model_name}_{language}_tokens.pkl", "rb") as infile:
            lang_tokens = pickle.load(infile)
        union = 0
        intersection = 0
        for i in range(len(english_tokens)):
            counter_eng = Counter(english_tokens[i])
            counter_lang = Counter(lang_tokens[i])
            # `&` keeps the minimum count per token (multiset intersection) and
            # `|` keeps the maximum count (multiset union). The union therefore
            # already contains the shared tokens and must not have the
            # intersection added on top of it.
            intersection += sum((counter_eng & counter_lang).values())
            union += sum((counter_eng | counter_lang).values())
        overlap_features["language"].append(lang_dict[language])
        overlap_features["model"].append(model)
        overlap_features["instruction"].append(instruct)
        overlap_features["overlap"].append(intersection / union)

df_overlap = pd.DataFrame(overlap_features)

In [None]:
# Inspect the token-overlap-with-English table (one row per model and language).
df_overlap

# Bilingual sentence retrieval

In [None]:
# Language code -> display name, per evaluation dataset. The codes are the keys
# found in each dataset's bilingual-retrieval results file.
names = {
    "stereoset": dict(
        de="German",
        es="Spanish",
        fr="French",
        tr="Turkish",
        en="English",
        kr="Korean",
    ),
    "crowspairs": dict(
        ar="Arabic",
        ca="Catalan",
        de="German",
        es="Spanish",
        fr="French",
        it="Italian",
        en="English",
        mt="Maltese",
        zh="Chinese",
    ),
    "rtp-lx": dict(
        ar="Arabic",
        cs="Czech",
        de="German",
        es="Spanish",
        fr="French",
        ko="Korean",
        it="Italian",
        en="English",
        zh="Chinese",
        hi="Hindi",
        id="Indonesian",
        ja="Japanese",
        nl="Dutch",
        pl="Polish",
        pt="Portuguese",
        ru="Russian",
        sv="Swedish",
    ),
}

In [None]:
# Collect bilingual sentence-retrieval accuracy into one long-format frame:
# one row per (model, evaluation dataset, language).
bilingual_sent_data = {
    "language": [],
    "model": [],
    "instruction": [],
    "dataset": [],
    "retrieval_acc": [],
}
# Every model is evaluated on the same three datasets.
_retrieval_sets = ("crowspairs", "stereoset", "rtp-lx")
for model_name, model, instruct, datasets in [
    ("aya-23-8B", "aya", "instruct", _retrieval_sets),
    ("aya-expanse-8b", "ayaexpanse", "instruct", _retrieval_sets),
    ("Meta-Llama-3.1-8B-Instruct", "llama3.1instruct", "instruct", _retrieval_sets),
    ("Meta-Llama-3.1-8B", "llama3.1", "base", _retrieval_sets),
    ("gemma-2-9b", "gemma9b", "base", _retrieval_sets),
    ("gemma-2-9b-it", "gemma9binstruct", "instruct", _retrieval_sets),
]:
    for dataset in datasets:
        _results_path = f"bilingual_sent_retrieval/{dataset}_{model_name}_results.pkl"
        with open(_results_path, "rb") as infile:
            scores = pickle.load(infile)
        # `scores` is keyed by the language codes listed in `names[dataset]`.
        for language in scores:
            _row = (
                names[dataset][language],
                model,
                instruct,
                dataset,
                scores[language],
            )
            # Column order of the accumulator matches the tuple order above.
            for _column, _value in zip(bilingual_sent_data, _row):
                bilingual_sent_data[_column].append(_value)

df_bi_sent_acc = pd.DataFrame(bilingual_sent_data)

In [None]:
# Inspect the retrieval-accuracy table (one row per model, dataset and language).
df_bi_sent_acc

# Aya data distribution

In [None]:
# Share of the Aya data per language, as a percentage of `n`.
# NOTE(review): `n` is presumably the total number of samples in the Aya
# collection and the per-language values its sample counts — confirm the source.
n = 513_758_189
lang_samples = {
    "Arabic": 6_641_429,
    "Catalan": 0,
    "Czech": 4_299_946,
    "German": 5_447_064,
    "Spanish": 4_499_536,
    "French": 4_955_862,
    "Korean": 4_161_353,
    "Italian": 4_526_024,
    "English": 17_838_105,
    "Maltese": 0,
    "Turkish": 4_180_274,
    "Chinese": 74_972,
    "Hindi": 4_380_729,
    "Indonesian": 4_166_051,
    "Japanese": 6_813_519,
    "Dutch": 4_340_523,
    "Polish": 4_452_845,
    "Portuguese": 4_407_774,
    "Russian": 4_666_262,
    "Swedish": 0,
}
data = {
    "language": list(lang_samples),
    "perc_aya_data": [(count / n) * 100 for count in lang_samples.values()],
}

aya_dataset = pd.DataFrame(data)

In [None]:
# Inspect the per-language percentage of the Aya data.
aya_dataset

In [None]:
# Show the studied languages ranked by their share of the Aya data, largest first.
aya_dataset.loc[
    lambda frame: frame["language"].isin(
        [
            "Arabic",
            "Catalan",
            "Czech",
            "German",
            "Spanish",
            "French",
            "Korean",
            "Italian",
            "English",
            "Maltese",
            "Turkish",
            "Chinese",
            "Hindi",
            "Indonesian",
            "Japanese",
            "Dutch",
            "Polish",
            "Portuguese",
            "Russian",
            "Swedish",
        ]
    )
].sort_values(by="perc_aya_data", ascending=False)

# CC data distribution

In [None]:
# Common Crawl language statistics. "languages.csv" is the CSV downloaded from
# https://commoncrawl.github.io/cc-crawl-statistics/plots/languages
# and is read from the notebook's working directory.
df_cc_language = pd.read_csv("languages.csv")

In [None]:
# Inspect the raw Common Crawl language statistics.
df_cc_language

In [None]:
# Three-letter codes matched against the Common Crawl `primary_language` column,
# mapped to the display names used in the result frames.
languages = dict(
    ara="Arabic",
    cat="Catalan",
    ces="Czech",
    deu="German",
    spa="Spanish",
    fra="French",
    kor="Korean",
    ita="Italian",
    eng="English",
    mlt="Maltese",
    tur="Turkish",
    zho="Chinese",
    hin="Hindi",
    ind="Indonesian",
    jpn="Japanese",
    nld="Dutch",
    pol="Polish",
    por="Portuguese",
    rus="Russian",
    swe="Swedish",
)

In [None]:
# Percentage of pages per language in one Common Crawl snapshot.
date = "CC-MAIN-2024-30"
_cc_snapshot = df_cc_language[df_cc_language["crawl"] == date]
data = {
    "language": [languages[code] for code in languages],
    # `.item()` requires exactly one matching row per language and raises otherwise.
    "perc_cc_data": [
        _cc_snapshot.loc[
            _cc_snapshot["primary_language"] == code, "%pages/crawl"
        ].item()
        for code in languages
    ],
}

df_cc_perc = pd.DataFrame(data)

In [None]:
# Inspect the per-language Common Crawl page percentages.
df_cc_perc

# Prepare for correlations / regression

## CrowSPairs

In [None]:
# CrowS-Pairs rows for the six studied models, restricted to the base model and
# the Panda / BiasDPO variants, in a deterministic row order.
_csp_model_mask = df_crowspairs["model"].isin(
    [
        "aya",
        "ayaexpanse",
        "gemma9b",
        "gemma9binstruct",
        "llama3.1",
        "llama3.1instruct",
    ]
)
_csp_dataset_mask = df_crowspairs["dataset"].isin(["base", "panda", "biasdpo"])
debiasing_csp = df_crowspairs[_csp_model_mask & _csp_dataset_mask].sort_values(
    ["model", "dataset", "language", "bias_type"], ignore_index=True
)

In [None]:
# Attach each row's baseline: the `ss_score` of the *base* variant of the same
# model for the same language and bias type.
#
# The baseline is joined on its keys rather than tiled by position. Positional
# tiling only lines up when every model contributes the same number of datasets
# as the repeat factor; this frame holds three datasets (base, biasdpo, panda),
# so repeating each model's baseline block four times shifts later models onto
# another model's baseline.
_csp_base_keys = ["model", "language", "bias_type"]
_csp_base_scores = (
    debiasing_csp.loc[
        debiasing_csp["dataset"] == "base", _csp_base_keys + ["ss_score"]
    ]
    .drop_duplicates(subset=_csp_base_keys)  # guarantees a many-to-one join
    .rename(columns={"ss_score": "base_ss_score"})
)
# Dropping a pre-existing column keeps the cell idempotent when it is re-run.
# Rows without a matching base row get NaN instead of a misaligned value.
debiasing_csp = debiasing_csp.drop(columns=["base_ss_score"], errors="ignore").merge(
    _csp_base_scores, how="left", on=_csp_base_keys
)

In [None]:
# Relative change of the bias score with respect to the base model, in percent.
_csp_delta = debiasing_csp["ss_score"] - debiasing_csp["base_ss_score"]
debiasing_csp["bias_mitigation"] = _csp_delta / debiasing_csp["base_ss_score"] * 100

In [None]:
# Keep only fine-tuned variants and non-English languages; the base rows and the
# English rows are not part of the cross-lingual transfer analysis below.
debiasing_csp = debiasing_csp[
    (debiasing_csp["dataset"] != "base") & (debiasing_csp["language"] != "English")
]

In [None]:
# Inspect the CrowS-Pairs bias-mitigation table after filtering.
debiasing_csp

## Toxicity

In [None]:
# Toxicity rows for the six studied models, in a deterministic row order.
detox = df_toxicity.loc[
    lambda frame: frame["model"].isin(
        [
            "aya",
            "ayaexpanse",
            "llama3.1",
            "llama3.1instruct",
            "gemma9b",
            "gemma9binstruct",
        ]
    )
].sort_values(["model", "dataset", "language"], ignore_index=True)

In [None]:
# Number of distinct languages in the toxicity frame.
# NOTE(review): no longer needed for the baseline lookup below; kept because
# later cells may still read `n` — confirm before removing.
n = len(detox["language"].unique())

# Attach each row's baseline: the `emt` value of the *base* variant of the same
# model for the same language. Joining on the keys, instead of tiling the base
# block a fixed four times by position, stays correct when a model is missing a
# dataset or a language; unmatched rows get NaN rather than another row's value.
_detox_base_keys = ["model", "language"]
_detox_base_scores = (
    detox.loc[detox["dataset"] == "base", _detox_base_keys + ["emt"]]
    .drop_duplicates(subset=_detox_base_keys)  # guarantees a many-to-one join
    .rename(columns={"emt": "base_emt"})
)
# Dropping a pre-existing column keeps the cell idempotent when it is re-run.
detox = detox.drop(columns=["base_emt"], errors="ignore").merge(
    _detox_base_scores, how="left", on=_detox_base_keys
)

In [None]:
# Relative change of the toxicity score with respect to the base model, in percent.
_detox_delta = detox["emt"] - detox["base_emt"]
detox["detoxification"] = _detox_delta / detox["base_emt"] * 100

In [None]:
# Keep only fine-tuned variants and non-English languages.
detox = detox[(detox["dataset"] != "base") & (detox["language"] != "English")]

In [None]:
# Inspect the detoxification table after filtering.
detox

# Correlations

## CrowSPairs

### Merge dataframes

In [None]:
# Attach every explanatory variable to the CrowS-Pairs mitigation rows.
# All joins are left joins, so rows without a match keep NaN in the new columns.
# The language-feature table has one row per feature type, so the first join
# produces one output row per (original row, feature type).
debiasing_csp = (
    debiasing_csp.merge(df_lang_features, how="left", on="language")
    .merge(df_overlap, how="left", on=["language", "model", "instruction"])
    .merge(
        # Retrieval accuracy measured on CrowS-Pairs; its `dataset` column is
        # dropped so it does not collide with the fine-tuning `dataset` column.
        df_bi_sent_acc[df_bi_sent_acc["dataset"] == "crowspairs"].drop(
            columns=["dataset"]
        ),
        how="left",
        on=["language", "model", "instruction"],
    )
    .merge(aya_dataset, how="left", on="language")
    .merge(df_cc_perc, how="left", on="language")
)

In [None]:
# Inspect the merged CrowS-Pairs table with all explanatory variables attached.
debiasing_csp

### Correlations

In [None]:
# Spearman correlation between bias mitigation and language similarity to
# English, per model and per lang2vec feature type (Panda, overall bias only).
_csp_panda_overall = debiasing_csp[
    (debiasing_csp["bias_type"] == "overall")
    & (debiasing_csp["dataset"] == "panda")
    & debiasing_csp["model"].isin(
        [
            "aya",
            "ayaexpanse",
            "llama3",
            "llama3.1",
            "gemma9b",
            "llama3instruct",
            "llama3.1instruct",
            "gemma9binstruct",
        ]
    )
]
print_corr_with_p(
    _csp_panda_overall.groupby(["model", "feature_type"])[
        ["bias_mitigation", "cos_sim"]
    ],
    method="spearman",
)

In [None]:
# Spearman correlation between bias mitigation and subword overlap with
# English, per model (Panda, overall bias only).
_csp_panda_overall = debiasing_csp[
    (debiasing_csp["bias_type"] == "overall")
    & (debiasing_csp["dataset"] == "panda")
    & debiasing_csp["model"].isin(
        [
            "aya",
            "ayaexpanse",
            "llama3",
            "llama3.1",
            "gemma9b",
            "llama3instruct",
            "llama3.1instruct",
            "gemma9binstruct",
        ]
    )
]
print_corr_with_p(
    _csp_panda_overall.groupby(["model"])[["bias_mitigation", "overlap"]],
    method="spearman",
)

In [None]:
# Spearman correlation between bilingual retrieval accuracy and bias
# mitigation, per model (Panda, overall bias only).
_csp_panda_overall = debiasing_csp[
    (debiasing_csp["bias_type"] == "overall")
    & (debiasing_csp["dataset"] == "panda")
    & debiasing_csp["model"].isin(
        [
            "aya",
            "ayaexpanse",
            "llama3",
            "llama3.1",
            "gemma9b",
            "llama3instruct",
            "llama3.1instruct",
            "gemma9binstruct",
        ]
    )
]
print_corr_with_p(
    _csp_panda_overall.groupby(["model"])[["retrieval_acc", "bias_mitigation"]],
    method="spearman",
)

In [None]:
# Spearman correlation between the language's share of the Aya data and bias
# mitigation, per model (Panda, overall bias only).
_csp_panda_overall = debiasing_csp[
    (debiasing_csp["bias_type"] == "overall")
    & (debiasing_csp["dataset"] == "panda")
    & debiasing_csp["model"].isin(
        [
            "aya",
            "ayaexpanse",
            "llama3",
            "llama3.1",
            "gemma9b",
            "llama3instruct",
            "llama3.1instruct",
            "gemma9binstruct",
        ]
    )
]
print_corr_with_p(
    _csp_panda_overall.groupby(["model"])[["perc_aya_data", "bias_mitigation"]],
    method="spearman",
)

In [None]:
# Spearman correlation between the language's share of Common Crawl pages and
# bias mitigation, per model (Panda, overall bias only).
_csp_panda_overall = debiasing_csp[
    (debiasing_csp["bias_type"] == "overall")
    & (debiasing_csp["dataset"] == "panda")
    & debiasing_csp["model"].isin(
        [
            "aya",
            "ayaexpanse",
            "llama3",
            "llama3.1",
            "gemma9b",
            "llama3instruct",
            "llama3.1instruct",
            "gemma9binstruct",
        ]
    )
]
print_corr_with_p(
    _csp_panda_overall.groupby(["model"])[["perc_cc_data", "bias_mitigation"]],
    method="spearman",
)

## Toxicity

### Merge dataframes

In [None]:
# Attach every explanatory variable to the detoxification rows.
# All joins are left joins, so rows without a match keep NaN in the new columns.
# The language-feature table has one row per feature type, so the first join
# produces one output row per (original row, feature type).
detox = (
    detox.merge(df_lang_features, how="left", on="language")
    .merge(df_overlap, how="left", on=["language", "model", "instruction"])
    .merge(
        # Retrieval accuracy measured on RTP-LX; its `dataset` column is dropped
        # so it does not collide with the fine-tuning `dataset` column.
        df_bi_sent_acc[df_bi_sent_acc["dataset"] == "rtp-lx"].drop(
            columns=["dataset"]
        ),
        how="left",
        on=["language", "model", "instruction"],
    )
    .merge(aya_dataset, how="left", on="language")
    .merge(df_cc_perc, how="left", on="language")
)

In [None]:
# Inspect the merged detoxification table with all explanatory variables attached.
detox

### Correlations

In [None]:
# Spearman correlation between detoxification and language similarity to
# English, per model and lang2vec feature type (detoxdpo rows only).
_detoxdpo_rows = detox[detox["dataset"] == "detoxdpo"]
print_corr_with_p(
    _detoxdpo_rows.groupby(["model", "feature_type"])[["detoxification", "cos_sim"]],
    method="spearman",
)

In [None]:
# Spearman correlation between detoxification and subword overlap with
# English, per model (detoxdpo rows only).
_detoxdpo_rows = detox[detox["dataset"] == "detoxdpo"]
print_corr_with_p(
    _detoxdpo_rows.groupby(["model"])[["detoxification", "overlap"]],
    method="spearman",
)

In [None]:
# Spearman correlation between bilingual retrieval accuracy and
# detoxification, per model (detoxdpo rows only).
_detoxdpo_rows = detox[detox["dataset"] == "detoxdpo"]
print_corr_with_p(
    _detoxdpo_rows.groupby(["model"])[["retrieval_acc", "detoxification"]],
    method="spearman",
)

In [None]:
# Spearman correlation between the language's share of the Aya data and
# detoxification, per model (detoxdpo rows only).
_detoxdpo_rows = detox[detox["dataset"] == "detoxdpo"]
print_corr_with_p(
    _detoxdpo_rows.groupby(["model"])[["perc_aya_data", "detoxification"]],
    method="spearman",
)

In [None]:
# Spearman correlation between the language's share of Common Crawl pages and
# detoxification, per model (detoxdpo rows only).
_detoxdpo_rows = detox[detox["dataset"] == "detoxdpo"]
print_corr_with_p(
    _detoxdpo_rows.groupby(["model"])[["perc_cc_data", "detoxification"]],
    method="spearman",
)

# Data size analysis

In [None]:
def _size_run(dataset, task, checkpoints):
    """Build the data-size entries for one fine-tuning dataset.

    Each entry is a tuple
    ``(file_prefix, model, dataset, instruction, data_perc, task)``.
    The first entry is the untouched Llama 3.1 Instruct model at 0 %; the rest
    are the LoRA checkpoints at 10 %, 20 %, ..., 100 % of the training data.

    Args:
        dataset: fine-tuning dataset name as it appears in the checkpoint name.
        task: "bias" or "toxicity"; the loaders below filter on this value.
        checkpoints: the ten numeric checkpoint suffixes, ordered from the
            10 % run to the 100 % run.

    Raises:
        ValueError: if ``checkpoints`` does not contain exactly ten suffixes.
    """
    if len(checkpoints) != 10:
        raise ValueError(f"expected 10 checkpoints for {dataset}, got {len(checkpoints)}")
    base_name = "Meta-Llama-3.1-8B-Instruct"
    run = [(base_name, "llama3.1instruct", dataset, "instruct", 0, task)]
    run.extend(
        (
            f"{base_name}_lora_{dataset}_model_{suffix}",
            "llama3.1instruct",
            dataset,
            "instruct",
            data_perc,
            task,
        )
        for data_perc, suffix in zip(range(10, 101, 10), checkpoints)
    )
    return run


# Checkpoints for the data-size analysis. The task label comes from one place
# per dataset, so every entry of a run carries the same task (the 100 % jigsaw
# entry is labelled "toxicity" like the rest of its run).
# The final suffix of a run is not always a multiple of the step
# (23742, 5836), so the suffixes are listed explicitly.
file_names_size = [
    *_size_run(
        "panda",
        "bias",
        [2375, 4750, 7125, 9500, 11875, 14250, 16625, 19000, 21375, 23742],
    ),
    *_size_run(
        "biasdpo",
        "bias",
        [542, 1084, 1626, 2168, 2710, 3252, 3794, 4336, 4878, 5420],
    ),
    *_size_run(
        "jigsaw",
        "toxicity",
        [2375, 4750, 7125, 9500, 11875, 14250, 16625, 19000, 21375, 23742],
    ),
    *_size_run(
        "detoxdpo",
        "toxicity",
        [584, 1168, 1752, 2336, 2920, 3504, 4088, 4672, 5256, 5836],
    ),
]
# Switch the global seaborn theme to a smaller font scale for the data-size
# plots (the earlier figure cells set font_scale=8 for their oversized canvases).
sns.set(font_scale=1.75, style="white")

## StereoSet

In [None]:
names = {
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "tr": "Turkish",
    "en": "English",
    "kr": "Korean",
}


def dict_func():
    """Return a new, empty dictionary on every call."""
    return dict()

In [None]:
# Build one row per (bias-task checkpoint, language, bias type) from the
# pickled StereoSet results, keeping the intrasentence SS and LM scores.
stereoset_bias_types = ("gender", "profession", "race", "religion", "overall")
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "data_perc",
        "language",
        "ss_score",
        "lm_score",
        "bias_type",
    )
}

for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    # Only checkpoints trained for the bias task have StereoSet results here.
    if task == "bias":
        for language, language_name in names.items():
            with open(
                f"stereoset/{model_name}_{language}_ssresults.pkl", "rb"
            ) as infile:
                score = pickle.load(infile)
            for bias_type in stereoset_bias_types:
                intrasentence = score["intrasentence"][bias_type]
                row = {
                    "model": model,
                    "dataset": dataset,
                    "instruction": instruct,
                    "data_perc": data_perc,
                    "language": language_name,
                    "ss_score": intrasentence["SS Score"],
                    "lm_score": intrasentence["LM Score"],
                    "bias_type": bias_type,
                }
                for column, value in row.items():
                    data[column].append(value)

df_stereoset = pd.DataFrame(data)

In [None]:
# Show the assembled StereoSet results for inspection.
df_stereoset

## CrowS-Pairs

In [None]:
# Language codes used in the CrowS-Pairs result file names, mapped to the
# display names stored in the "language" column.
names = {
    "ar": "Arabic",
    "ca": "Catalan",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
    "en": "English",
    "mt": "Maltese",
    "zh": "Chinese",
}

In [None]:
# Build one row per (bias-task checkpoint, language, bias type) from the
# pickled CrowS-Pairs results. Each result entry is indexed positionally:
# element 0 is stored as the score, element 1 as the sample count.
crowspairs_bias_types = (
    "race-color",
    "socioeconomic",
    "gender",
    "disability",
    "nationality",
    "sexual-orientation",
    "physical-appearance",
    "religion",
    "age",
)
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "data_perc",
        "language",
        "ss_score",
        "n_samples",
        "bias_type",
    )
}

for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    # Only checkpoints trained for the bias task have CrowS-Pairs results here.
    if task == "bias":
        for language, language_name in names.items():
            with open(
                f"crowspairs/{model_name}_{language}_cspresults.pkl", "rb"
            ) as infile:
                score = pickle.load(infile)
            for bias_type in crowspairs_bias_types:
                entry = score[bias_type]
                row = {
                    "model": model,
                    "dataset": dataset,
                    "instruction": instruct,
                    "data_perc": data_perc,
                    "language": language_name,
                    "ss_score": entry[0],
                    "n_samples": entry[1],
                    "bias_type": bias_type,
                }
                for column, value in row.items():
                    data[column].append(value)


df_crowspairs = pd.DataFrame(data)

In [None]:
# Total number of CrowS-Pairs samples per language, summed over the bias types.
# The totals are computed once and looked up by language label (taking the
# first checkpoint in sorted group order) instead of re-running the group-by
# for every language and matching results to languages by position.
# Rows already labelled "overall" are excluded so that re-running this cell
# after the aggregate rows have been appended does not double-count samples.
n_samples = (
    df_crowspairs[df_crowspairs["bias_type"] != "overall"]
    .groupby(["model", "dataset", "instruction", "data_perc", "language"])[
        "n_samples"
    ]
    .sum()
    .groupby(level="language")
    .first()
    .to_dict()
)

In [None]:
# Append an "overall" row per (checkpoint, language): the average of the
# per-bias-type scores weighted by the number of samples of each bias type.
crowspairs_group_cols = ["model", "dataset", "instruction", "data_perc", "language"]

# Work from the per-bias-type rows only, so that re-running this cell neither
# averages over nor duplicates previously appended "overall" rows.
crowspairs_per_type = df_crowspairs[df_crowspairs["bias_type"] != "overall"]

# Iterating the groups directly avoids groupby.apply (and the positional
# column label 0 it produces) and also works when there are no rows at all.
overall_rows = pd.DataFrame(
    [
        {
            **dict(zip(crowspairs_group_cols, group_key)),
            # The last grouping key is the language display name.
            "n_samples": n_samples[group_key[-1]],
            "ss_score": np.average(group["ss_score"], weights=group["n_samples"]),
            "bias_type": "overall",
        }
        for group_key, group in crowspairs_per_type.groupby(crowspairs_group_cols)
    ]
)

df_crowspairs = pd.concat([crowspairs_per_type, overall_rows], ignore_index=True)

In [None]:
# Show the CrowS-Pairs results, including the appended "overall" rows.
df_crowspairs

## Bias plots

In [None]:
# Stack the StereoSet and CrowS-Pairs scores into one long frame. Merging on
# every shared column (including the score itself) with how="outer" keeps the
# rows of both frames side by side, and the merge indicator records which
# frame each row came from.
bias_merge_keys = [
    "model",
    "dataset",
    "instruction",
    "data_perc",
    "language",
    "bias_type",
    "ss_score",
]
df_bias_results = pd.merge(
    df_stereoset,
    df_crowspairs,
    on=bias_merge_keys,
    how="outer",
    indicator="bias_dataset",
).drop(columns=["lm_score", "n_samples"])

# Turn the merge indicator into a benchmark label and drop the "both" category.
bias_source = df_bias_results["bias_dataset"].cat.rename_categories(
    {"left_only": "stereoset", "right_only": "crowspairs",}
)
df_bias_results["bias_dataset"] = bias_source.cat.remove_categories(["both"])

df_bias_results

In [None]:
# Collapse every language other than English into a single "Non-English" group.
# Use .loc rather than chained indexing (df[col][mask] = ...): the chained form
# assigns through an intermediate object, which raises SettingWithCopyWarning
# and leaves the frame unchanged under pandas Copy-on-Write.
df_bias_results.loc[df_bias_results["language"] != "English", "language"] = "Non-English"

In [None]:
# Overall bias score against the percentage of training for the "panda"
# checkpoints; line colour distinguishes the benchmark, line style the
# language group.
# Both conditions are combined into one mask built on the full frame, instead
# of applying a full-length mask to an already filtered frame.
panda_overall_mask = (df_bias_results["bias_type"] == "overall") & (
    df_bias_results["dataset"] == "panda"
)

fig = plt.figure(figsize=(7, 5))
sns.lineplot(
    x="Percentage of training",
    y="Bias score",
    data=df_bias_results[panda_overall_mask]
    .replace({"crowspairs": "CrowS-Pairs", "stereoset": "StereoSet"})
    .rename(
        columns={
            "data_perc": "Percentage of training",
            "ss_score": "Bias score",
            "language": "Language",
            "bias_dataset": "Dataset",
        }
    ),
    style="Language",
    hue="Dataset",
)
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_bias_panda.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Overall bias score against the percentage of training for the "biasdpo"
# checkpoints; line colour distinguishes the benchmark, line style the
# language group.
# Both conditions are combined into one mask built on the full frame, instead
# of applying a full-length mask to an already filtered frame.
biasdpo_overall_mask = (df_bias_results["bias_type"] == "overall") & (
    df_bias_results["dataset"] == "biasdpo"
)

fig = plt.figure(figsize=(7,5))
sns.lineplot(
    x="Percentage of training",
    y="Bias score",
    data=df_bias_results[biasdpo_overall_mask]
    .replace({"crowspairs": "CrowS-Pairs", "stereoset": "StereoSet"})
    .rename(
        columns={
            "data_perc": "Percentage of training",
            "ss_score": "Bias score",
            "language": "Language",
            "bias_dataset": "Dataset",
        }
    ),
    style="Language",
    hue="Dataset",
)
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_bias_biasdpo.pdf", dpi=100, bbox_inches="tight")

## Toxicity

In [None]:
# Language codes used in the toxicity (rtp-lx) result file names, mapped to
# the display names stored in the "language" column.
names = {
    "ar": "Arabic",
    "cs": "Czech",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "ko": "Korean",
    "it": "Italian",
    "en": "English",
    "zh": "Chinese",
    "hi": "Hindi",
    "id": "Indonesian",
    "ja": "Japanese",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sv": "Swedish"
}

In [None]:
# Build one row per (toxicity-task checkpoint, language) with its "emt" score.
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "data_perc",
        "language",
        "emt",
        "task",
    )
}

for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    # Only checkpoints trained for the toxicity task are scored here.
    if task == "toxicity":
        for language, language_name in names.items():
            with open(
                f"toxicity/{model_name}_{language}_rtp-lx_lang.pkl", "rb"
            ) as infile:
                right_language = pickle.load(infile)
            with open(
                f"toxicity/{model_name}_{language}_rtp-lx_tox_scores.pkl", "rb"
            ) as infile:
                toxicity_scores = pickle.load(infile)

            # Row-wise maximum over the entries flagged with 1 in
            # right_language (0 when a row has none), averaged over all rows.
            row_maxima = np.max(
                toxicity_scores, axis=1, where=right_language == 1, initial=0
            )
            row = {
                "emt": np.mean(row_maxima),
                "model": model,
                "dataset": dataset,
                "instruction": instruct,
                "data_perc": data_perc,
                "language": language_name,
                "task": task,
            }
            for column, value in row.items():
                data[column].append(value)

df_toxicity = pd.DataFrame(data)

In [None]:
# Collapse every language other than English into a single "Non-English" group.
# Use .loc rather than chained indexing (df[col][mask] = ...): the chained form
# assigns through an intermediate object, which raises SettingWithCopyWarning
# and leaves the frame unchanged under pandas Copy-on-Write.
df_toxicity.loc[df_toxicity["language"] != "English", "language"] = "Non-English"

In [None]:
# Toxicity score against the percentage of training for the "jigsaw"
# checkpoints, one line style per language group.
jigsaw_toxicity = df_toxicity[df_toxicity["dataset"]=="jigsaw"].rename(
    columns={
        "data_perc": "Percentage of training",
        "emt": "Toxicity score",
        "ss_score": "Bias score",
        "language": "Language",
    }
)

fig, ax = plt.subplots(figsize=(7,5))
tox_size = sns.lineplot(
    data=jigsaw_toxicity,
    x="Percentage of training",
    y="Toxicity score",
    style="Language",
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_tox_jigsaw.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Toxicity score against the percentage of training for the "detoxdpo"
# checkpoints, one line style per language group.
detoxdpo_toxicity = df_toxicity[df_toxicity["dataset"] == "detoxdpo"].rename(
    columns={
        "data_perc": "Percentage of training",
        "emt": "Toxicity score",
        "ss_score": "Bias score",
        "language": "Language",
    }
)

fig, ax = plt.subplots(figsize=(7,5))
tox_size = sns.lineplot(
    data=detoxdpo_toxicity,
    x="Percentage of training",
    y="Toxicity score",
    style="Language",
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_tox_detoxdpo.pdf", dpi=100, bbox_inches="tight")

## Perplexity

In [None]:
# Language codes mapped to the display names stored in the "language" column
# of the perplexity results.
names = {
    "ar": "Arabic",
    "ca": "Catalan",
    "cs": "Czech",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "ko": "Korean",
    "it": "Italian",
    "en": "English",
    "mt": "Maltese",
    "tr": "Turkish",
    "zh": "Chinese",
    "hi": "Hindi",
    "id": "Indonesian",
    "ja": "Japanese",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sv": "Swedish"
}

In [None]:
# For every training dataset and language, keep only the sample positions that
# no checkpoint of that dataset marked as "wrong_lang", so that all of its
# checkpoints are later scored on the same samples.
valid_ids = {}
for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    with open(f"perplexity/{model_name}_results.pkl", "rb") as infile:
        score = pickle.load(infile)
    ids_by_language = valid_ids.setdefault(dataset, {})
    for language in score:
        ids = {
            i for i, v in enumerate(score[language]["selected"]) if v != "wrong_lang"
        }
        if language not in ids_by_language:
            ids_by_language[language] = ids
        else:
            # Narrow down to the positions shared with the earlier checkpoints.
            ids_by_language[language] = ids_by_language[language] & ids

In [None]:
# Build one row per (checkpoint, language) holding the NaN-ignoring median of
# the per-sample values over the positions kept in valid_ids.
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "data_perc",
        "language",
        "perplexity",
        "task",
    )
}

for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    with open(f"perplexity/{model_name}_results.pkl", "rb") as infile:
        score = pickle.load(infile)
    for language in score:
        # Minimum number of kept samples required to report a language
        # (a lower threshold applies to "mt").
        num_sent = 65 if language == "mt" else 100
        kept_ids = valid_ids[dataset][language]
        if len(kept_ids) >= num_sent:
            selected = score[language]["selected"]
            row = {
                "perplexity": np.nanmedian([selected[i] for i in kept_ids]),
                "model": model,
                "dataset": dataset,
                "instruction": instruct,
                "data_perc": data_perc,
                "language": names[language],
                "task": task,
            }
            for column, value in row.items():
                data[column].append(value)

df_perplexity = pd.DataFrame(data)

## Language consistency

In [None]:
# Language codes used to look up the language-consistency results, mapped to
# the display names stored in the "language" column.
names = {
    "ar": "Arabic",
    "ca": "Catalan",
    "cs": "Czech",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "ko": "Korean",
    "it": "Italian",
    "en": "English",
    "mt": "Maltese",
    "tr": "Turkish",
    "zh": "Chinese",
    "hi": "Hindi",
    "id": "Indonesian",
    "ja": "Japanese",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sv": "Swedish"
}

In [None]:
# Build one row per (checkpoint, language) with the "lpr" value of the
# ("tatoeba", language) entry, scaled by 100.
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "data_perc",
        "language",
        "lc_lpr",
        "task",
    )
}

for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    with open(f"lang_confusion/{model_name}_results.pkl", "rb") as infile:
        score = pickle.load(infile)
    for language, language_name in names.items():
        row = {
            "model": model,
            "dataset": dataset,
            "instruction": instruct,
            "data_perc": data_perc,
            "language": language_name,
            "lc_lpr": score[("tatoeba", language)]["lpr"]*100,
            "task": task,
        }
        for column, value in row.items():
            data[column].append(value)

df_lang_confusion = pd.DataFrame(data)

In [None]:
# Show the assembled language-consistency results for inspection.
df_lang_confusion

## Diversity

In [None]:
# Language codes used to look up the diversity results, mapped to the display
# names stored in the "language" column.
names = {
    "ar": "Arabic",
    "ca": "Catalan",
    "cs": "Czech",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "ko": "Korean",
    "it": "Italian",
    "en": "English",
    "mt": "Maltese",
    "tr": "Turkish",
    "zh": "Chinese",
    "hi": "Hindi",
    "id": "Indonesian",
    "ja": "Japanese",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sv": "Swedish"
}

In [None]:
# For every training dataset and language, keep only the sample positions that
# no checkpoint of that dataset marked as "wrong_lang", so that all of its
# checkpoints are later scored on the same samples.
valid_ids = {}
for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    with open(f"diversity/{model_name}_results.pkl", "rb") as infile:
        score = pickle.load(infile)
    ids_by_language = valid_ids.setdefault(dataset, {})
    for language in score:
        ids = {
            i for i, v in enumerate(score[language]["selected"]) if v != "wrong_lang"
        }
        if language not in ids_by_language:
            ids_by_language[language] = ids
        else:
            # Narrow down to the positions shared with the earlier checkpoints.
            ids_by_language[language] = ids_by_language[language] & ids

In [None]:
# Build one row per (checkpoint, language) holding the mean of the per-sample
# values over the positions kept in valid_ids, scaled by 100.
data = {
    column: []
    for column in (
        "model",
        "dataset",
        "instruction",
        "data_perc",
        "language",
        "div_uni",
        "task",
    )
}

for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    with open(f"diversity/{model_name}_results.pkl", "rb") as infile:
        score = pickle.load(infile)
    for language, language_name in names.items():
        # Minimum number of kept samples required to report a language
        # (a lower threshold applies to "mt").
        num_sent = 65 if language == "mt" else 100
        kept_ids = valid_ids[dataset][language]
        if len(kept_ids) >= num_sent:
            selected = score[language]["selected"]
            row = {
                "div_uni": np.mean([selected[i] for i in kept_ids]) * 100,
                "model": model,
                "dataset": dataset,
                "instruction": instruct,
                "data_perc": data_perc,
                "language": language_name,
                "task": task,
            }
            for column, value in row.items():
                data[column].append(value)

df_diversity = pd.DataFrame(data)

In [None]:
# Show the assembled diversity results for inspection.
df_diversity

## Global-MMLU

In [None]:
# Language codes used in the Global-MMLU result directory names and result
# keys, mapped to the display names stored in the "language" column.
names = {
    "ar": "Arabic",
    "cs": "Czech",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "ko": "Korean",
    "it": "Italian",
    "en": "English",
    "tr": "Turkish",
    "zh": "Chinese",
    "hi": "Hindi",
    "id": "Indonesian",
    "ja": "Japanese",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sv": "Swedish"
}

In [None]:
# json and os are not among the imports in the notebook's first cell; they are
# imported here so this cell runs on a fresh kernel.
# NOTE(review): move these to the top import cell if they are not already
# imported elsewhere in the notebook.
import json
import os

# Build one row per (checkpoint, language) with the Global-MMLU accuracy
# ("acc,none"), scaled by 100.
data = {
    "model": [],
    "dataset": [],
    "instruction": [],
    "data_perc": [],
    "language": [],
    "acc": [],
    "task": [],
}

for model_name, model, dataset, instruct, data_perc, task in file_names_size:
    for language in names:
        results_dir = f"global_mmlu/{model_name}_{language}"
        # Take the first file of the first directory (in os.walk order) that
        # contains any file.
        for dirpath, dirnames, filenames in os.walk(results_dir):
            if filenames:
                file_name = filenames[0]
                break
        else:
            # Fail loudly instead of silently reusing the path left over from
            # the previous checkpoint/language.
            raise FileNotFoundError(
                f"No Global-MMLU result file found under {results_dir}"
            )
        # Context manager so the result file is closed after parsing.
        with open(f"{dirpath}/{file_name}") as infile:
            result = json.load(infile)

        data["model"].append(model)
        data["dataset"].append(dataset)
        data["instruction"].append(instruct)
        data["data_perc"].append(data_perc)
        data["language"].append(names[language])
        data["task"].append(task)
        data["acc"].append(
            result["results"][f"global_mmlu_full_{language}"]["acc,none"] * 100
        )

df_global_mmlu = pd.DataFrame(data)

## LM Plots

In [None]:
# Stack the four language-modelling measures into one long frame with a shared
# "lm_score" column and an "lm_measure" column naming the measure.
lm_merge_keys = [
    "model",
    "dataset",
    "instruction",
    "data_perc",
    "language",
    "task",
    "lm_score",
]
lm_merge_keys_with_measure = lm_merge_keys + ["lm_measure"]

# Diversity and language consistency: an outer merge on every shared column
# keeps the rows of both frames, and the merge indicator records the origin.
df_lm_results = pd.merge(
    df_diversity.rename(columns={"div_uni": "lm_score"}),
    df_lang_confusion.rename(columns={"lc_lpr": "lm_score"}),
    on=lm_merge_keys,
    how="outer",
    indicator="lm_measure",
)
df_lm_results["lm_measure"] = df_lm_results["lm_measure"].cat.rename_categories(
    {"left_only": "diversity", "right_only": "language consistency"}
)

# Perplexity rows carry their measure label as a regular trailing column.
df_perplexity_updated = df_perplexity.rename(
    columns={"perplexity": "lm_score"}
).assign(lm_measure="perplexity")
df_lm_results = pd.merge(
    df_lm_results,
    df_perplexity_updated,
    on=lm_merge_keys_with_measure,
    how="outer",
)

# Global-MMLU accuracy rows are labelled "language understanding".
df_global_mmlu_updated = df_global_mmlu.rename(columns={"acc": "lm_score"}).assign(
    lm_measure="language understanding"
)
df_lm_results = pd.merge(
    df_lm_results,
    df_global_mmlu_updated,
    on=lm_merge_keys_with_measure,
    how="outer",
)

In [None]:
# Collapse every language other than English into a single "Non-English" group.
# Use .loc rather than chained indexing (df[col][mask] = ...): the chained form
# assigns through an intermediate object, which raises SettingWithCopyWarning
# and leaves the frame unchanged under pandas Copy-on-Write.
df_lm_results.loc[df_lm_results["language"] != "English", "language"] = "Non-English"

In [None]:
# Language-modelling scores against the percentage of training for the "panda"
# checkpoints; line colour distinguishes the measure, line style the language
# group.
lm_panda = (
    df_lm_results[df_lm_results["dataset"] == "panda"]
    .rename(
        columns={
            "data_perc": "Percentage of training",
            "lm_score": "Score",
            "language": "Language",
            "lm_measure": "Property",
        }
    )
    .replace(
        {
            "diversity": "Diversity",
            "perplexity": "Perplexity",
            "language consistency": "Language\nconsistency",
            "language understanding": "Question-\nanswering",
        }
    )
    .sort_values(by="Property")
)

fig, ax = plt.subplots(figsize=(7, 5))
div_size = sns.lineplot(
    data=lm_panda,
    x="Percentage of training",
    y="Score",
    hue="Property",
    style="Language",
    marker="o",
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_lm_panda.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Language-modelling scores against the percentage of training for the
# "biasdpo" checkpoints; line colour distinguishes the measure, line style the
# language group.
lm_biasdpo = (
    df_lm_results[df_lm_results["dataset"] == "biasdpo"]
    .rename(
        columns={
            "data_perc": "Percentage of training",
            "lm_score": "Score",
            "language": "Language",
            "lm_measure": "Property",
        }
    )
    .replace(
        {
            "diversity": "Diversity",
            "perplexity": "Perplexity",
            "language consistency": "Language\nconsistency",
            "language understanding": "Question-\nanswering",
        }
    )
    .sort_values(by="Property")
)

fig, ax = plt.subplots(figsize=(7, 5))
div_size = sns.lineplot(
    data=lm_biasdpo,
    x="Percentage of training",
    y="Score",
    hue="Property",
    style="Language",
    marker="o",
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_lm_biasdpo.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Language-modelling scores against the percentage of training for the
# "jigsaw" checkpoints; line colour distinguishes the measure, line style the
# language group.
lm_jigsaw = (
    df_lm_results[df_lm_results["dataset"] == "jigsaw"]
    .rename(
        columns={
            "data_perc": "Percentage of training",
            "lm_score": "Score",
            "language": "Language",
            "lm_measure": "Property",
        }
    )
    .replace(
        {
            "diversity": "Diversity",
            "perplexity": "Perplexity",
            "language consistency": "Language\nconsistency",
            "language understanding": "Question-\nanswering",
        }
    )
    .sort_values(by="Property")
)

fig, ax = plt.subplots(figsize=(7, 5))
div_size = sns.lineplot(
    data=lm_jigsaw,
    x="Percentage of training",
    y="Score",
    hue="Property",
    style="Language",
    marker="o",
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_lm_jigsaw.pdf", dpi=100, bbox_inches="tight")

In [None]:
# Language-modelling scores against the percentage of training for the
# "detoxdpo" checkpoints; line colour distinguishes the measure, line style
# the language group.
lm_detoxdpo = (
    df_lm_results[df_lm_results["dataset"] == "detoxdpo"]
    .rename(
        columns={
            "data_perc": "Percentage of training",
            "lm_score": "Score",
            "language": "Language",
            "lm_measure": "Property",
        }
    )
    .replace(
        {
            "diversity": "Diversity",
            "perplexity": "Perplexity",
            "language consistency": "Language\nconsistency",
            "language understanding": "Question-\nanswering",
        }
    )
    .sort_values(by="Property")
)

fig, ax = plt.subplots(figsize=(7, 5))
div_size = sns.lineplot(
    data=lm_detoxdpo,
    x="Percentage of training",
    y="Score",
    hue="Property",
    style="Language",
    marker="o",
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.savefig("figures/size_lm_detoxdpo.pdf", dpi=100, bbox_inches="tight")