# Welfare Analysis
Summarise distribution of welfare of different groups, under different sampling schemes.

# Setup

In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
import math
import pickle

# from src.utils.helper_funcs import find_project_root

# PROJECT_ROOT = find_project_root()
# DATA_DIR = PROJECT_ROOT / 'data'
# Directory prefixes used by the cells below: input data files are read from
# DATA_DIR and pickled welfare results are written under RES_DIR. Both end in
# "/" because file names are appended by plain string concatenation.
# NOTE(review): relative paths - presumably resolved against the notebook's
# working directory; confirm before running from elsewhere.
DATA_DIR = "../../data/"
RES_DIR = "../../results/welfare/"

In [2]:
# Input files (JSON Lines), located directly under DATA_DIR.
utterances_path = f"{DATA_DIR}utterances.jsonl"
survey_path = f"{DATA_DIR}survey.jsonl"

In [3]:
### load data
# Both inputs are JSON Lines files (one record per line).
utterances = pd.read_json(utterances_path, lines=True)
survey = pd.read_json(survey_path, lines=True)

### clean data
# Survey columns that are expanded into flat columns below.
nested_columns = [
    "location",
    "religion",
    "ethnicity",
    "order_lm_usecases",
    "order_stated_prefs",
]
# Flatten each nested column, name the new columns "<column>_<subfield>",
# and attach them to the survey frame (row alignment is by index).
for col in nested_columns:
    df_expanded = pd.json_normalize(survey[col]).add_prefix(f"{col}_")
    survey = survey.join(df_expanded)

In [4]:
# clean utterances
# The short model name is whatever follows the last "/" in model_name
# (the whole string when there is no "/").
utterances["short_model_name"] = utterances["model_name"].map(
    lambda full_name: full_name.rsplit("/", 1)[-1]
)

## Prepare data

Key variables are: gender, age, birth_country_region, lm_categorised_ethnicity, score and normalised_score

In [5]:
## merged dataset of utterances

# Attach each rater's survey answers to every utterance row (left join on
# user_id, so utterance rows without a survey match are kept).
merged = utterances.merge(survey, on="user_id", how="left")
# Keep only the first interaction (turn 0) of rows flagged as part of the
# balanced subset. The "_x" suffix is the one pandas' merge gives to the
# left-hand (utterances) copy of a column name present in both frames.
# .copy() makes `merged` an independent frame, so that adding columns to it in
# later cells (e.g. merged["veto"]) does not write into a slice of the
# unfiltered frame and trigger pandas' SettingWithCopyWarning.
merged = merged[
    (merged["turn"] == 0) & (merged["included_in_balanced_subset_x"] == True)
].copy()
# Candidate models: every short model name that survives the filter.
models = merged["short_model_name"].unique().tolist()

In [6]:
# generate veto: 1 if the score is 10 or lower (a score of exactly 10 counts
# as a veto), 0 otherwise. Missing scores compare as False and so give 0.
# assign() returns a new frame instead of writing a column into `merged` in
# place, so this works cleanly even though `merged` was produced by filtering.
merged = merged.assign(veto=(merged["score"] <= 10).astype(int))

### Impute scores with matrix factorisation

Do we impute user-average scores or per-interaction scores? Think more about this.

### generate sampling dictionaries


In [7]:
### generate sampling dictionary

# Row masks over `survey`, built once and combined below.
_is_male = survey["gender"] == "Male"
_not_male = survey["gender"] != "Male"
_is_white = survey["ethnicity_simplified"] == "White"
_not_white = survey["ethnicity_simplified"] != "White"
_is_above_45 = survey["age"].isin(
    ["45-54 years old", "55-64 years old", "65+ years old"]
)
_is_below_45 = ~_is_above_45  # every row whose age bracket is not listed above
_in_us_rep = survey["included_in_US_REP"] == True
_in_uk_rep = survey["included_in_UK_REP"] == True


def _user_ids(mask):
    """Return the user_id values of the survey rows selected by ``mask``."""
    return survey[mask]["user_id"].tolist()


# Maps a sampling-pool name to the list of user ids belonging to that pool.
sampling_dict = {
    # non country demographics
    "white_male": _user_ids(_is_male & _is_white),
    "male": _user_ids(_is_male),
    "nonmale": _user_ids(_not_male),
    "white": _user_ids(_is_white),
    "above_45": _user_ids(_is_above_45),
    "all": survey["user_id"].tolist(),
    # us breakdown
    "us_white": _user_ids(_in_us_rep & _is_white),
    "us_nonwhite": _user_ids(_in_us_rep & _not_white),
    "us_male": _user_ids(_in_us_rep & _is_male),
    "us_nonmale": _user_ids(_in_us_rep & _not_male),
    "us_above_45": _user_ids(_in_us_rep & _is_above_45),
    "us_below_45": _user_ids(_in_us_rep & _is_below_45),
    "us": _user_ids(_in_us_rep),
    # uk breakdown
    "uk_white": _user_ids(_in_uk_rep & _is_white),
    "uk_nonwhite": _user_ids(_in_uk_rep & _not_white),
    "uk_male": _user_ids(_in_uk_rep & _is_male),
    "uk_nonmale": _user_ids(_in_uk_rep & _not_male),
    "uk_above_45": _user_ids(_in_uk_rep & _is_above_45),
    "uk_below_45": _user_ids(_in_uk_rep & _is_below_45),
    "uk": _user_ids(_in_uk_rep),
}

## Define helper functions

### 1) welfare table

Given distribution, compute distribution of welfare (rating on approach response).

In [8]:
# Welfare table for scores
# One row per user_id, one column per short_model_name; each cell is the mean
# of that user's `score` values for that model. User/model pairs with no rows
# are left missing (no fill value).
welfare_table = merged.pivot_table(
    index="user_id",
    columns="short_model_name",
    values="score",
    aggfunc="mean",
    fill_value=None,
)

In [12]:
# Welfare table for choices
# One row per user_id, one column per short_model_name; each cell is the mean
# of that user's `if_chosen` values for that model. User/model pairs with no
# rows are left missing (no fill value).
choice_table = merged.pivot_table(
    index="user_id",
    columns="short_model_name",
    values="if_chosen",
    aggfunc="mean",
    fill_value=None,
)

In [13]:
# Welfare table for vetos
# One row per user_id, one column per short_model_name; each cell is the mean
# of that user's `veto` flags for that model, i.e. the share of the user's
# ratings of the model that were flagged as a veto. User/model pairs with no
# rows are left missing (no fill value).
veto_table = merged.pivot_table(
    index="user_id",
    columns="short_model_name",
    values="veto",
    aggfunc="mean",
    fill_value=None,
)

### 2) functions for generating model selection empirical distributions

First, we will select models based on mean (normalised) score.

Next, we will select models based on pairwise battles. The winner can be chosen by multiple social choice rules, e.g. Elo or rank centrality.

In [9]:
def gen_model_distribution(
    sample_size,
    sampling_pool_ids,
    emp_dist_gran=100,
    method="max_mean_rating",
    welfare_table_=welfare_table,
):
    """
    Generate the empirical sampling distribution of the model chosen by a rater sample.

    Each draw samples ``sample_size`` rater ids (with replacement) from
    ``sampling_pool_ids``, looks up their rows in ``welfare_table_``, scores every
    model in the module-level ``models`` list, and records the top-scoring model.
    Ties are broken uniformly at random. Sampled raters are weighted evenly: the
    table holds one value per rater and model, so a rater who rated a model
    several times contributes the same weight as one who rated it once.

    Args:
        sample_size (int): number of raters drawn per sample
        sampling_pool_ids (list): list of ids to sample from (with replacement)
        emp_dist_gran (int): granularity of the empirical distribution, i.e. the
            number of independent draws (and the length of the returned list)
        method (str): model selection rule, one of
            "max_mean_rating" - highest mean value among the sampled raters,
            "max_pc_chosen"   - highest mean value among the sampled raters
                                (intended for a table of choice indicators),
            "min_pc_veto"     - lowest mean value among the sampled raters
                                (intended for a table of veto indicators)
        welfare_table_ (pd.DataFrame): table indexed by rater id with one column
            per model name; missing entries are NaN
    Returns:
        list: ``emp_dist_gran`` model names, one winner per draw
    Raises:
        ValueError: if ``method`` is not one of the supported rules
    """
    # Score given to a model that none of the sampled raters has rated. It is
    # meant to be the worst score under each rule so that an unrated model
    # cannot beat a rated one.
    # NOTE(review): 1 is presumably the minimum of the rating scale - confirm.
    # For "min_pc_veto" the score is 1 - veto rate, so the worst value is 0
    # (the previous value of 1 made every unrated model a guaranteed winner).
    unrated_score = {
        "max_mean_rating": 1,
        "max_pc_chosen": 0,
        "min_pc_veto": 0,
    }
    if method not in unrated_score:
        # Fail loudly: previously this only printed a message and then returned
        # uniformly random winners.
        raise ValueError(
            "error - please select one method from: "
            "max_mean_rating, max_pc_chosen, min_pc_veto"
        )

    model_dist = []  # model empirical distribution
    for _ in range(emp_dist_gran):
        rater_sample = random.choices(sampling_pool_ids, k=sample_size)
        # One row per sampled rater (repeated ids give repeated rows; ids that
        # are absent from the table give all-NaN rows).
        filtered_data = welfare_table_.reindex(rater_sample)

        scores = []
        for model in models:
            sample_scores = filtered_data[model]
            if np.isnan(sample_scores).all():
                scores.append(unrated_score[method])
                continue
            sample_mean = np.nanmean(sample_scores)
            # A lower veto rate is better, so flip it into a "higher is better" score.
            scores.append(1 - sample_mean if method == "min_pc_veto" else sample_mean)

        # pick winning model: random pick among all models tied for the top score
        best_score = max(scores)
        max_indices = [index for index, value in enumerate(scores) if value >= best_score]
        winner_index = random.choice(max_indices)
        model_dist.append(models[winner_index])

    return model_dist

# Final tables

## Plot helper functions

In [10]:
def gen_welfare_df_full(
    population,
    welfare_metric_table=welfare_table,
    model_choice_rule="max_mean_rating",
    country="us",
    emp_dist_gran_=100,
):
    """
    Compute welfare distributions of one population under several sampling schemes.

    For every sampling scheme, models are drawn with ``gen_model_distribution``
    and each drawn model is converted to the mean of its column in
    ``welfare_metric_table`` restricted to the users in ``sampling_dict[population]``.

    Sampling schemes:
        * "<country>_rep_<n>" for n in 10, 20, 50, 100: n raters sampled from
          ``sampling_dict[country]``;
        * "<country>_male_100", "<country>_white_100", "<country>_above_45_100":
          100 raters sampled from the corresponding ``sampling_dict`` pool.

    Args:
        population (str): key of ``sampling_dict`` naming the users whose welfare
            is measured
        welfare_metric_table (pd.DataFrame): table indexed by user id with one
            column per model; used both to choose models and to measure welfare
        model_choice_rule (str): ``method`` passed to ``gen_model_distribution``
        country (str): key of ``sampling_dict`` and prefix of the scheme names
        emp_dist_gran_ (int): number of draws per sampling scheme
    Returns:
        dict: sampling scheme name -> list of ``emp_dist_gran_`` mean welfare
        values, one per drawn model
    """
    welfare_table_temp = welfare_metric_table.reindex(sampling_dict[population])
    # The mean welfare of a model for this population does not depend on the
    # draw, so compute it once per model instead of once per draw.
    mean_welfare_by_model = {}

    def _welfare_distribution(sample_size, pool_key):
        """Draw models from the given pool and map each to the population's mean welfare."""
        chosen_models = gen_model_distribution(
            sample_size,
            sampling_dict[pool_key],
            emp_dist_gran=emp_dist_gran_,
            method=model_choice_rule,
            welfare_table_=welfare_metric_table,
        )
        for model in chosen_models:
            if model not in mean_welfare_by_model:
                mean_welfare_by_model[model] = welfare_table_temp[model].mean()
        return [mean_welfare_by_model[model] for model in chosen_models]

    dict_res = {}
    # loop through sample sizes
    for n in [10, 20, 50, 100]:
        dict_res[country + "_rep_" + str(n)] = _welfare_distribution(n, country)

    # samples of 100 raters drawn from single demographic subgroups of the country
    for scheme in [country + i_ for i_ in ["_male", "_white", "_above_45"]]:
        dict_res[scheme + "_100"] = _welfare_distribution(100, scheme)

    return dict_res

## Main Figure

In [50]:
n_emp_dist_gran = 1000
random.seed(1)

# One entry per output family: (file name ending, extra keyword arguments for
# gen_welfare_df_full). The rating run relies on the function's defaults; the
# choice run swaps in the choice table and the matching selection rule.
# The runs execute in this order, rating first, then choice.
_welfare_runs = [
    ("_rating.pickle", {}),
    (
        "_choice.pickle",
        {
            "welfare_metric_table": choice_table,
            "model_choice_rule": "max_pc_chosen",
        },
    ),
]

for _file_ending, _extra_kwargs in _welfare_runs:
    for c in ["us", "uk"]:
        for s in ["", "nonmale", "nonwhite", "below_45"]:
            # "" means the whole country sample; otherwise a subgroup of it.
            pop = c if s == "" else c + "_" + s
            output_dict = gen_welfare_df_full(
                population=pop,
                country=c,
                emp_dist_gran_=n_emp_dist_gran,
                **_extra_kwargs,
            )
            print("saving " + pop)
            with open(RES_DIR + "welfare_" + pop + _file_ending, "wb") as file:
                pickle.dump(output_dict, file)

saving us
saving us_nonmale
saving us_nonwhite
saving us_below_45
saving uk
saving uk_nonmale
saving uk_nonwhite
saving uk_below_45
saving us
saving us_nonmale
saving us_nonwhite
saving us_below_45
saving uk
saving uk_nonmale
saving uk_nonwhite
saving uk_below_45
