In [59]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
from cider.validation_metrics.dependencies import (
    convert_threshold_to_percentile, 
    calculate_weighted_spearmanr, 
    calculate_weighted_pearsonr,
    calculate_metrics_binary_valued_consumption,
    calculate_utility,
    calculate_rank_residuals_by_characteristic,
    calculate_demographic_parity_per_characteristic,
    calculate_independence_btwn_proxy_and_characteristic,
    calculate_precision_and_recall_independence_characteristic)
from cider.validation_metrics.schemas import ConsumptionColumn
from cider.validation_metrics.core import (
    compute_auc_roc_precision_recall_with_percentile_grid, 
    compute_utility_grid, 
    calculate_optimal_utility_and_cash_transfer_size_table,
    calculate_rank_residuals_table_by_characteristic,
    calculate_demographic_parity_table_per_characteristic,
    combine_tables_on_characteristic,)
from cider.validation_metrics.plotting import (
    plot_roc_precision_recall_curves,
    plot_utility_values,
    plot_rank_residual_distributions_per_characteristic_value,
    plot_all_fairness_metrics_per_characteristic_value)
import numpy as np
import pandas as pd

In [61]:
# Make synthetic data

n_rows = 1000
n_proxies = 10
synthetic_data = {}

np.random.seed(2)
synthetic_data['household_id'] = range(n_rows)
synthetic_data['groundtruth_consumption'] = np.random.rand(n_rows) * 10
for i in range(10):
    synthetic_data["proxy_consumption"] = synthetic_data['groundtruth_consumption'] + np.random.uniform(size=n_rows, low=0, high=i+5)
synthetic_data['weight'] = np.random.randint(10, 100, size=n_rows)

synthetic_df = pd.DataFrame(synthetic_data)
synthetic_df

Unnamed: 0,household_id,groundtruth_consumption,proxy_consumption,weight
0,0,4.359949,4.992880,75
1,1,0.259262,3.910949,83
2,2,5.496625,17.490485,21
3,3,4.353224,16.202991,53
4,4,4.203678,8.119494,85
...,...,...,...,...
995,995,5.985047,13.115803,89
996,996,3.589201,13.024588,13
997,997,6.803915,8.792285,17
998,998,8.531998,13.013903,24


In [None]:
# Calculate Spearman's R
spearman_r = calculate_weighted_spearmanr(synthetic_df, 3)
print(f"Spearman's R: {spearman_r}")

# Calculate Pearson's R
pearson_r = calculate_weighted_pearsonr(synthetic_df, 3)
print(f"Pearson's R: {pearson_r}")

In [None]:
convert_threshold_to_percentile(2.5, synthetic_df)

In [None]:
calculate_metrics_binary_valued_consumption(synthetic_df, 50, 20)

In [None]:
calculate_utility(
    synthetic_df,
    20.0, 
    ConsumptionColumn.GROUNDTRUTH,
    5000)

In [None]:
fixed_groundtruth_percentile = 40  # Bottom 40% are considered "poor"
auc_roc_precision_recall_df = compute_auc_roc_precision_recall_with_percentile_grid(
    synthetic_df,
    fixed_groundtruth_percentile,
    99)
auc_roc_precision_recall_df

In [None]:
# PLot ROC Curve
fig, ax = plot_roc_precision_recall_curves(auc_roc_precision_recall_df, fixed_groundtruth_percentile)
fig

In [None]:
cash_transfer_at_ubi_rate = 0.1 * synthetic_df['weight'].sum()
utility_grid_df = compute_utility_grid(
    synthetic_df,
    cash_transfer_amount=cash_transfer_at_ubi_rate,
    num_grid_points=99,
    constant_relative_risk_aversion=3.0)
utility_grid_df

In [None]:
optimal_utility_df = calculate_optimal_utility_and_cash_transfer_size_table(
    synthetic_df,
    cash_transfer_amount=cash_transfer_at_ubi_rate,
    num_grid_points=10,
    constant_relative_risk_aversion=3.0
)
optimal_utility_df

In [None]:
fig, ax = plot_utility_values(
    utility_grid_df, 
    optimal_utility_df.loc["proxy_consumption", "optimal_population_percentile"],
    optimal_utility_df.loc["proxy_consumption", "maximum_utility"],
    cash_transfer_at_ubi_rate, 3)
fig

In [None]:
# Add characteristic for fairness analysis
allowed_gender_values = {'male', 'female', 'other'}
synthetic_df_gender = synthetic_df.copy()
synthetic_df_gender['characteristic'] = np.random.choice(list(allowed_gender_values), size=len(synthetic_df_gender))


In [None]:
rank_residual_df = calculate_rank_residuals_by_characteristic(
    synthetic_df_gender)
rank_residual_df

In [None]:
calculate_demographic_parity_per_characteristic(
    synthetic_df_gender,
    threshold_percentile=50)

In [None]:
# Plot rank residual distributions per characteristic value
fig, ax = plot_rank_residual_distributions_per_characteristic_value(rank_residual_df, "Gender")
fig

In [None]:
pivot_independence, results = calculate_independence_btwn_proxy_and_characteristic(
    synthetic_df_gender,
    threshold_percentile=50)
results

In [None]:
precision_per_group, recall_per_group, results = calculate_precision_and_recall_independence_characteristic(
    synthetic_df_gender,
    50,
    50)
results

In [None]:
df, anova_f_statistic, anova_p_value = calculate_rank_residuals_table_by_characteristic(
    synthetic_df_gender)
print(f"ANOVA F-statistic: {anova_f_statistic}, p-value: {anova_p_value}")
df

In [None]:
demographic_table = calculate_demographic_parity_table_per_characteristic(
    synthetic_df_gender,
    50
)
demographic_table

In [None]:
combined_results, statistics = combine_tables_on_characteristic(
    synthetic_df_gender,
    50
)

In [None]:
statistics

In [None]:
fig, ax = plot_all_fairness_metrics_per_characteristic_value(combined_results, statistics, "Gender")
fig