# Evaluate and plot the results of the model

## Load packages and data

In [1]:
import os
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

from pathlib import Path
from itertools import product
# The notebook is run from a sub-directory of the project: expose the parent
# directory and the simulation helper modules on the import path.
project_root = Path.cwd().parents[0]
for extra_path in (str(project_root), f'{project_root}/run-simulation/py-files'):
    sys.path.append(extra_path)
from utils_calibration import calibrate_propensity_score, compute_ipw_estimate, calibration_errors
from utils_dgps import dgp_wrapper
from utils_eval import plot_dist, plot_calibration_metrics, plot_ate_metrics, plot_overlap_ratio, plot_calibration_comparison
from utils_eval import plot_ps_treatment_comparison, plot_overlap_comparison, plot_mirrored_propensity_histogram, evaluate_estimation, add_headers

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

In [4]:
# Treatment Effect:
# Notebook-level settings. Only `n_obs` is forwarded to the DGP in this cell;
# the other names are read by later cells.
n_obs = 1_000_000
overlap = R2_d = share_treated = 0
dim_x = 4

dgp_type = 'sim_v06_nonlinear' # sim_v06_nonlinear
dgp_dict = dict(n_obs=n_obs)

# Draw one sample from the DGP and read off the treatment effect it reports.
data_dict = dgp_wrapper(dgp_type=dgp_type, **dgp_dict)
ate_nl = data_dict['treatment_effect']

print(f'True Treatment Effect, Setting 3 Simulation Nonlinear: {ate_nl}')

True Treatment Effect, Setting 3 Simulation Nonlinear: 1.6233978853619264


In [5]:
# Load the stored simulation results and set the reference values used by the
# evaluation cells below.
dgp_type = 'sim_v06_nonlinear' # sim_v06_nonlinear
df = pd.DataFrame(pd.read_pickle('results/results_nonlinear.pkl'))
theta = ate_nl        # reference effect computed in the previous cell
window_size = 1.5
dim_x = 4

In [6]:
# Add the constant design columns that the grouping and filtering steps below
# expect to find in the results frame.
df = df.assign(R2_d=0, share_treated=0, dim_x=dim_x, overlap=0)

## Basic Overview

In [7]:
# One row per simulation design: average every remaining column over the repetitions.
grouping_columns = [
    "n_obs", "R2_d", "dim_x",
    "learner_g", "learner_m",
    "method", "calib_method", "clipping_threshold",
    "overlap", "share_treated",
]
df.groupby(by=grouping_columns).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,coefs,ses,cover,ci_length,K,rmses,ipw_coefs,plr_coefs,repetition,theta,ece_u,ece_q,ece_u_5,ece_q_5,ece_l2,mce
n_obs,R2_d,dim_x,learner_g,learner_m,method,calib_method,clipping_threshold,overlap,share_treated,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
200,0,4,LGBM,LGBM,alg-1-uncalibrated,uncalibrated,1.000000e-12,0,0,2.139171e+00,9.684803e-01,0.84,3.796373e+00,199.74,0.508948,4.854095e+00,1.511463,49.5,1.128739,0.052723,0.053279,0.045093,0.045430,0.078455,0.388895
200,0,4,LGBM,LGBM,alg-1-uncalibrated,uncalibrated,1.000000e-02,0,0,1.989310e+00,7.868949e-01,0.83,3.084571e+00,198.95,0.508939,4.456523e+00,1.511481,49.5,1.128739,0.052723,0.053277,0.045086,0.045428,0.078452,0.388890
200,0,4,LGBM,LGBM,alg-1-uncalibrated,uncalibrated,1.000000e-01,0,0,1.690416e+00,3.657324e-01,0.75,1.433645e+00,156.61,0.504878,2.470303e+00,1.523574,49.5,1.128739,0.049375,0.050470,0.044918,0.044259,0.073705,0.364696
200,0,4,LGBM,LGBM,alg-2-nested-cross-fitting-calib,isotonic,1.000000e-12,0,0,6.281662e+09,1.238250e+10,0.99,4.853853e+10,32.76,0.490622,3.735922e+10,1.521517,49.5,1.128739,0.050559,0.059159,0.044219,0.049861,0.069924,0.277123
200,0,4,LGBM,LGBM,alg-2-nested-cross-fitting-calib,isotonic,1.000000e-02,0,0,2.197570e+00,1.284783e+00,0.91,5.036258e+00,32.76,0.490244,5.025621e+00,1.523230,49.5,1.128739,0.050423,0.059045,0.044116,0.049774,0.069686,0.276738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000,0,4,RF,RF,alg-5-full-sample-calib,platt,1.000000e-02,0,0,1.610510e+00,4.302421e-02,0.00,1.686518e-01,3999.97,0.469782,1.234869e+00,1.631470,49.5,1.124230,0.008189,0.010039,0.006612,0.008083,0.013060,0.141869
4000,0,4,RF,RF,alg-5-full-sample-calib,platt,1.000000e-01,0,0,1.610510e+00,4.302421e-02,0.00,1.686518e-01,3999.97,0.469782,1.234869e+00,1.631470,49.5,1.124230,0.008189,0.010039,0.006612,0.008083,0.013060,0.141869
4000,0,4,RF,RF,oracle,uncalibrated,1.000000e-12,0,0,1.622003e+00,4.584196e-02,0.00,1.796972e-01,4000.00,0.457204,1.619080e+00,1.657002,49.5,1.124230,0.010898,0.011990,0.009502,0.010341,0.016298,0.178667
4000,0,4,RF,RF,oracle,uncalibrated,1.000000e-02,0,0,1.622003e+00,4.584196e-02,0.00,1.796972e-01,4000.00,0.457204,1.619080e+00,1.657002,49.5,1.124230,0.010898,0.011990,0.009502,0.010341,0.016298,0.178667


## Unique Values

In [8]:
# Sort the results, then list the distinct values of every design column.
df = df.sort_values(by=["n_obs", "dim_x", "clipping_threshold"])
print("\n".join(f"{col}: {df[col].unique()}" for col in grouping_columns))

n_obs: [200 500 1000 2000 4000]
R2_d: [0]
dim_x: [4]
learner_g: ['Linear' 'RF' 'LGBM']
learner_m: ['Logit' 'RF' 'LGBM']
method: ['alg-1-uncalibrated' 'alg-2-nested-cross-fitting-calib'
 'alg-3-cross-fitted-calib' 'alg-4-single-split-calib'
 'alg-5-full-sample-calib' 'oracle']
calib_method: ['uncalibrated' 'isotonic' 'platt' 'ivap']
clipping_threshold: [1.e-12 1.e-02 1.e-01]
overlap: [0]
share_treated: [0]


In [9]:
# Empty long-format container; the melted results are appended to it below.
result_columns = [
    'n_obs', 'R2_d', 'dim_x', 'learner_g', 'learner_m', 'method', 'calib_method',
    'clipping_threshold', 'rmses', 'K', 'ece_u', 'ece_q', 'ece_u_5', 'ece_q_5',
    'ece_l2', 'mce', 'procedure', 'estimate', 'overlap', 'share_treated',
]
df_results = pd.DataFrame(columns=result_columns)

In [10]:
# Rename the estimate columns to the procedure labels used in the figures.
df = df.rename(columns={"coefs": "IRM", "ipw_coefs": "IPW", "plr_coefs": "PLR"})

# Identifier columns kept as-is when reshaping to long format.
grouping_columns = ["n_obs", "dim_x", "R2_d", "learner_g", "learner_m", "method", "calib_method", "clipping_threshold",
                    "rmses", "K","ece_u","ece_q","ece_u_5","ece_q_5","ece_l2","mce", "overlap", "share_treated"]

# Reshape the rows of each calibration method to long format (one row per
# procedure and estimate). The frames are collected first and concatenated
# once, instead of growing `df_results` inside the loop.
# The previous version iterated over `enumerate(...)`, so `.isin` received an
# `(index, name)` tuple and only matched the right rows by accident.
melted_frames = []
for calib_method in ["uncalibrated", "isotonic", "ivap", "oracle", "platt"]:
    # filter DataFrame
    df_new = df[df["calib_method"] == calib_method]

    # melt DataFrame for plotting
    df_new = df_new.melt(
        id_vars=grouping_columns,
        value_vars=["IRM", "IPW", "PLR"],
        var_name="procedure",
        value_name="estimate")
    melted_frames.append(df_new)

df_results = pd.concat([df_results, *melted_frames], ignore_index=True)

In [11]:
# Short display labels for the algorithm names ...
method_mapping = {
    "alg-1-uncalibrated": "Alg-1-uncalib",
    "alg-2-nested-cross-fitting-calib": "Alg-2-nested-cf",
    "alg-3-cross-fitted-calib": "Alg-3-cf",
    "alg-4-single-split-calib": "Alg-4-single-split",
    "alg-5-full-sample-calib": "Alg-5-full-sample",
    "oracle": "Oracle"
}
# ... and for the calibration methods.
calib_mapping = {
    "uncalibrated": "Uncalib",
    "isotonic": "Iso",
    "platt": "Platt",
    "ivap": "IVAP"}

for column, mapping in (("method", method_mapping), ("calib_method", calib_mapping)):
    df_results[column] = df_results[column].replace(mapping)

# Combined "<method>-<calib_method>" label, placed as the second column.
method_labels = df_results[['method', 'calib_method']].agg('-'.join, axis=1)
df_results.insert(1, "Method", method_labels)

In [12]:
# fixed values
overlap = 0
share_treated = 0
R2_d = 0

n_obs = 2000
clipping_threshold = 0.01
learner_g = "LGBM"
learner_m = "LGBM"

n_obs_list = [500, 2000, 4000]

overlap_list = [0]
R2_d_list = [0]

clipping_thresholds = [1e-12, 0.01, 0.1]
# Lists rather than sets: the iteration order of a set of strings can change
# between interpreter sessions, and the later cells already use lists here.
learner_dict_g = ['Linear', 'LGBM', 'RF']
learner_dict_m = ['Logit', 'LGBM', 'RF']


# Filter calibration methods
calib_methods=["Uncalib", "Iso"]#, "ivap", "oracle", "platt"]
# Copy after filtering so the column assignment below works on an independent frame.
df = df_results[df_results["calib_method"].isin(calib_methods)].copy()

# Shorten the two labels whose calibration suffix is redundant.
Method_mapping = {
    "Oracle-Uncalib": "Oracle",
    "Alg-1-uncalib-Uncalib": "Alg-1-Uncalib"}
df.loc[:, "Method"] = df["Method"].replace(Method_mapping)

df= df.sort_values(by=["procedure","Method"])

## Distribution of Estimates

In [13]:
# Arguments shared by every distribution plot.
common_params = {'df': df, 'theta': theta, 'window_size': window_size}

# Every condition that can be held fixed. Each plot varies exactly one of
# them and keeps the rest fixed, in this key order.
all_conditions = {
    'n_obs': n_obs,
    'dim_x': dim_x,
    'learner_g': learner_g,
    'learner_m': learner_m,
    'clipping_threshold': clipping_threshold,
    'overlap': overlap,
    'R2_d': R2_d,
    'share_treated': share_treated,
}
fixed_for = {
    varying: {key: value for key, value in all_conditions.items() if key != varying}
    for varying in ('learner_m', 'learner_g', 'n_obs', 'clipping_threshold')
}

# 1) Learner_m plot
plot_dist(
    directory='results/figures_nl/learner_m/',
    varying_col='learner_m',
    fixed_conditions=fixed_for['learner_m'],
    **common_params
)

# 2) Learner_g plot
plot_dist(
    directory='results/figures_nl/learner_g/',
    varying_col='learner_g',
    fixed_conditions=fixed_for['learner_g'],
    **common_params
)

# 3) n_obs plot (restricted to the sample sizes in n_obs_list)
plot_dist(
    directory='results/figures_nl/n_obs/',
    varying_col='n_obs',
    fixed_conditions=fixed_for['n_obs'],
    varying_filter=n_obs_list,
    filename_zero_fmt='05d',
    **common_params
)

# 4) Clipping threshold plot
plot_dist(
    directory='results/figures_nl/clipping_threshold/',
    varying_col='clipping_threshold',
    fixed_conditions=fixed_for['clipping_threshold'],
    **common_params
)

## Graphs: Treatment and Calibration Errors by Sample Size and Dimension

In [14]:
# Start again from the full long-format results: add the estimation bias and
# shorten the two labels whose calibration suffix is redundant.
Method_mapping = {
    "Oracle-Uncalib": "Oracle",
    "Alg-1-uncalib-Uncalib": "Alg-1-Uncalib"}
df = df_results.assign(
    bias=df_results["estimate"] - theta,
    Method=df_results["Method"].replace(Method_mapping),
)

groupby_cols = ['n_obs', 'dim_x', 'R2_d', 'learner_g',
                'learner_m', 'clipping_threshold',
                'procedure', 'Method', 'overlap',
                'share_treated']

# Average each calibration-error column within every design cell.
mean_cols = ['ece_q', 'ece_q_5', 'ece_u', 'ece_u_5', 'ece_l2', 'mce']
mean_results = df.groupby(groupby_cols)[mean_cols].mean().reset_index()

# Calibration-error frame with readable column names: `ece_q` is renamed,
# the other metrics are added as extra labelled columns next to the raw ones.
calib_errors = mean_results.rename(columns={"ece_q": "ECE Quantile (b=10)"})
extra_metric_labels = {
    'ece_q_5': 'ECE Quantile (b=5)',
    'ece_u': 'ECE Uniform (b=10)',
    'ece_u_5': 'ECE Uniform (b=5)',
    'ece_l2': 'L2-CE Uniform (b=10)',
    'mce': 'MCE Uniform (b=10)',
}
for raw_name, label in extra_metric_labels.items():
    calib_errors[label] = mean_results[raw_name]

# Replace clipping_threshold values for readability
clip_labels = {
    1.e-12: 'Unclipped',
    1.e-2: 'Clipped_0.01',
    1.e-1: 'Clipped_0.1'
}
calib_errors['clipping_threshold'] = calib_errors['clipping_threshold'].replace(clip_labels)

# Combined "<Method>-<clipping label>" key, then a stable row order.
calib_errors['Method_Clip'] = calib_errors[['Method', 'clipping_threshold']].agg('-'.join, axis=1)
calib_errors = calib_errors.sort_values(
    ["Method_Clip", "n_obs", "R2_d", "overlap", "share_treated", "dim_x", "learner_g", "learner_m"]
)

#### Plot Calibration Errors

In [15]:
# fixed values
learner_g = "LGBM"
clipping = 0.1
R2_d_list = [0]
R2_d = 0
share_treated = 0
overlap = 0

overlap_list = [0]
n_obs_list = [200,500,1000,2000,4000]
clipping_thresholds = [0.01,0.1]
# A list rather than a set: the iteration order of a set of strings can change
# between interpreter sessions, and this collection is handed to the plotting
# helper in the next cell. Same order as the list used for the ATE metrics.
learner_dict_m = ['Logit', 'LGBM', 'RF']

In [16]:
# Configuration for metrics: the y-axis label is the column name with
# "(b=" spelled out as "(bins=".
metric_columns = [
    'ECE Uniform (b=5)',
    'ECE Uniform (b=10)',
    'ECE Quantile (b=5)',
    'ECE Quantile (b=10)',
    'L2-CE Uniform (b=10)',
    'MCE Uniform (b=10)',
]
METRICS_CONFIG = [
    {'column': column, 'ylabel': column.replace('(b=', '(bins=')}
    for column in metric_columns
]

# Uncalibrated baseline plus the four isotonic variants, all unclipped.
iso_algorithms = ('Alg-2-nested-cf', 'Alg-3-cf', 'Alg-4-single-split', 'Alg-5-full-sample')
methods = ['Alg-1-Uncalib-Unclipped'] + [f'{alg}-Iso-Unclipped' for alg in iso_algorithms]

# Only the (effectively) unclipped threshold is plotted here.
clipping_thresholds = [1e-12]

plot_calibration_metrics(
    directory='results/figures_nl/calib_errors/',
    data=calib_errors,
    n_obs_list=n_obs_list,
    dim_x=dim_x,
    learner_g=learner_g,
    clipping_thresholds=clipping_thresholds,
    learner_dict_m=learner_dict_m,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    metrics_config=METRICS_CONFIG,
    palette_colors=['#018571', "#252525", "#091e75", "#662506", "#850157", "#5c5c5c"],
    panel_labels=True,
    methods=methods
)

### Plot treatment effect metrics

In [17]:
# Evaluate the estimates of every design cell against the reference effect.
eval_group_cols = ['n_obs', 'dim_x', 'R2_d', 'learner_g', 'learner_m',
                   'clipping_threshold', 'procedure', 'Method',
                   'overlap', 'share_treated']
df_eval = (
    df.groupby(eval_group_cols)["estimate"]
    .apply(lambda x: evaluate_estimation(x, theta=theta))
    .reset_index()
)

# The second-to-last column after reset_index is renamed to "Metrics"
# (presumably the metric names returned by evaluate_estimation - TODO confirm).
column_to_rename = df_eval.columns[-2]
df_eval = df_eval.rename(columns={column_to_rename: "Metrics"})

In [18]:
n_obs_list = [200, 500, 1000, 2000, 4000]
dim_x_list = [4]
share_treated = 0
R2_d_list = [0]
overlap_list = [0]

clipping_thresholds = [1e-12, 0.01, 0.1]
learner_dict_g = ['Linear', 'LGBM', 'RF']
learner_dict_m = ['Logit', 'LGBM', 'RF']
procedure_list = ['IPW', 'IRM', 'PLR']

# Empty template; kept first in the concat below so its columns come first.
results = pd.DataFrame(columns=['MAE', 'Mean Bias', 'RMSE', 'Std. dev.', 'n_obs', 'dim_x',
                                'R2_d', 'procedure', 'Method', 'overlap', 'share_treated'])

# One metrics table per parameter combination. The loop variables carry a
# `_i` suffix so the loop does not overwrite the notebook-level settings
# (`n_obs`, `clipping_threshold`, `learner_g`, `learner_m`, `dim_x`, `overlap`).
# The tables are collected and concatenated once after the loop.
result_frames = [results]
for n_obs_i, clip_i, learner_g_i, learner_m_i, procedure_i, dim_x_i, overlap_i in product(
    n_obs_list, clipping_thresholds, learner_dict_g, learner_dict_m, procedure_list, dim_x_list, overlap_list):

    # Filter data based on current parameter combination
    # (R2_d and share_treated are the fixed notebook-level values).
    df_eval_new = df_eval[
        (df_eval['n_obs'] == n_obs_i) &
        (df_eval['clipping_threshold'] == clip_i) &
        (df_eval['learner_m'] == learner_m_i) &
        (df_eval['procedure'] == procedure_i) &
        (df_eval['dim_x'] == dim_x_i) &
        (df_eval['R2_d'] == R2_d) &
        (df_eval['learner_g'] == learner_g_i) &
        (df_eval["overlap"] == overlap_i) &
        (df_eval["share_treated"] == share_treated)
    ]

    # Pivot to one row per Method with the metrics as columns (first four
    # kept), then attach the parameter combination as ordinary columns.
    df_eval_new = df_eval_new.pivot_table('estimate', ['Method'], 'Metrics').iloc[:, 0:4]
    df_eval_new = df_eval_new.assign(n_obs=n_obs_i, learner_g=learner_g_i, learner_m=learner_m_i,
                                     procedure=procedure_i, dim_x=dim_x_i, R2_d=R2_d,
                                     clipping_threshold=clip_i, overlap=overlap_i,
                                     share_treated=share_treated, Method=df_eval_new.index)
    result_frames.append(df_eval_new)

results = pd.concat(result_frames, ignore_index=True)

# Replace clipping thresholds
results['clipping_threshold'] = results['clipping_threshold'].replace(
    {1.e-12: 'Unclipped', 0.01: 'Clipped_0.01', 0.1: 'Clipped_0.1'})

# Add 'Method_Clip' column
results.insert(1, "Method_Clip", results[['Method', 'clipping_threshold']].agg('-'.join, axis=1))
# For correct matches in legend
results = results.sort_values(["Method_Clip","n_obs","R2_d","overlap","share_treated","dim_x","learner_g","learner_m"])

In [40]:
# Plot styling
pal = ['#018571', "#252525","#091e75", "#662506", "#850157", "#5c5c5c"]
sns.set_theme(style="whitegrid", context="paper")

# Fixed design values for the treatment-effect plots
share_treated = overlap = 0
R2_d_list = [0]
learner_g = "LGBM"

# Sample sizes and clipping thresholds
n_obs_list = [200, 500, 1000, 2000, 4000]
clipping_thresholds = [0.01, 0.1]
clipping_threshold = 0.01

In [41]:
procedures = ['IPW', 'IRM', 'PLR']
metrics = ['RMSE', 'MAE', 'Std. dev.']

# Configuration for different plot types: (method key, legend label) pairs.
# The keys presumably match the `Method_Clip` values of `results` - TODO confirm.
config_1_pairs = [
    (f'Alg-1-Uncalib-Clipped_{clipping_threshold}', 'Alg-1-clipped'),
    ('Alg-1-Uncalib-Unclipped', 'Alg-1-uncalib'),
    (f'Alg-3-cf-Iso-Clipped_{clipping_threshold}', 'Alg-3-cf-Iso-Clipped'),
    ('Alg-3-cf-Iso-Unclipped', 'Alg-3-cf-Iso'),
    (f'Alg-3-cf-IVAP-Clipped_{clipping_threshold}', 'Alg-3-cf-IVAP-Clipped'),
    ('Alg-3-cf-Platt-Unclipped', 'Alg-3-cf-Platt'),
]
CONFIG_1 = {
    'methods': [method_key for method_key, legend_label in config_1_pairs],
    'labels': [legend_label for method_key, legend_label in config_1_pairs],
}

# First plot type
plot_ate_metrics(
    directory_base='results/figures_nl/ate_errors_alg_3_',
    data=results,
    procedures=procedures,
    metrics=metrics,
    methods_config=CONFIG_1,
    learner_m_list=['LGBM'],
    clipping_threshold=clipping_threshold,
    n_obs_list=n_obs_list,
    dim_x=dim_x,
    learner_g=learner_g,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    palette_colors=['#018571', "#252525", "#091e75", "#662506", "#850157", "#5c5c5c"],
    fig_size=(14, 12),
    style_kwargs=dict(
        title_size=18,
        label_size=16,
        tick_size=12,
        panel_label_size=18,
        marker_size=12,
    ),
    add_headers_func=add_headers
)

In [42]:
# Second plot type: one variant per algorithm, as (method key, legend label) pairs.
config_2_pairs = [
    (f'Alg-1-Uncalib-Clipped_{clipping_threshold}', 'Alg-1-clipped'),
    (f'Alg-2-nested-cf-Iso-Clipped_{clipping_threshold}', 'Alg-2-nested-cf-Iso-Clipped'),
    ('Alg-3-cf-Iso-Unclipped', 'Alg-3-cf-Iso'),
    (f'Alg-4-single-split-Iso-Clipped_{clipping_threshold}', 'Alg-4-single-split-Iso-Clipped'),
    ('Alg-5-full-sample-Iso-Unclipped', 'Alg-5-full-sample-Iso'),
]
CONFIG_2 = {
    'methods': [method_key for method_key, legend_label in config_2_pairs],
    'labels': [legend_label for method_key, legend_label in config_2_pairs],
}

plot_ate_metrics(
    directory_base='results/figures_nl/ate_errors_',
    data=results,
    procedures=procedures,
    metrics=metrics,
    methods_config=CONFIG_2,
    learner_m_list=['LGBM'],
    clipping_threshold=clipping_threshold,
    n_obs_list=n_obs_list,
    dim_x=dim_x,
    learner_g=learner_g,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    palette_colors=['#018571', "#252525", "#091e75", "#662506", "#850157"],
    fig_size=(14, 12),
    style_kwargs=dict(
        title_size=18,
        label_size=16,
        tick_size=12,
        panel_label_size=18,
        marker_size=12,
    ),
    add_headers_func=add_headers
)

## Calibrated Propensity scores

In [7]:
# Treatment Effect:
# Settings for the propensity-score illustrations; only `n_obs` is passed to the DGP.
n_obs = 2000
overlap = 0
R2_d = 0
share_treated = 0
dim_x = 4

dgp_dict = {
    'n_obs':n_obs}

dgp_type = 'sim_v06_nonlinear'
data_dict = dgp_wrapper(dgp_type=dgp_type, **dgp_dict)

# NOTE(review): this hard-coded value differs from the effect printed at the
# top of the notebook (1.6233978853619264, stored in `ate_nl`) - confirm which
# value is intended.
theta = 1.62543

# Unpack the simulated sample.
treatment =  data_dict['treatment']
x =  data_dict['covariates']
m_0 =  data_dict['propensity_score']
outcome = data_dict['outcome']


In [8]:
# Fit models
# The stochastic learners are seeded with the same seed the fold splitter
# below uses (42), so the out-of-fold predictions can be reproduced on re-run.
log_model = LogisticRegression()
lgbm_model = LGBMClassifier(verbose=-1, random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Out-of-fold (5-fold) predicted probabilities of treatment for each learner.
m_hat_log = cross_val_predict(log_model, x, treatment, cv=5, method="predict_proba")[:, 1]
m_hat_lgbm = cross_val_predict(lgbm_model, x, treatment, cv=5, method="predict_proba")[:, 1]
m_hat_rf = cross_val_predict(rf_model, x, treatment, cv=5, method="predict_proba")[:, 1]

In [9]:
# (algorithm, calibration method) pairs: the uncalibrated baseline, every
# calibrating algorithm with each of the three calibrators, then the oracle.
calibrating_algorithms = (
    'alg-2-nested-cross-fitting-calib',
    'alg-3-cross-fitted-calib',
    'alg-4-single-split-calib',
    'alg-5-full-sample-calib',
)
calibrators = ('isotonic', 'platt', 'ivap')

calib_methods = (
    [('alg-1-uncalibrated', 'uncalibrated')]
    + [(algorithm, calibrator) for algorithm in calibrating_algorithms for calibrator in calibrators]
    + [('oracle', 'uncalibrated')]
)

In [10]:
# Cross-fitted propensity predictions to calibrate, keyed by their column name.
propensity_score_dict = {
    # 'm_0': m_0,
    'm_hat_log': m_hat_log,
    'm_hat_lgbm': m_hat_lgbm,
    'm_hat_rf': m_hat_rf
}
clipping_thresholds = [1e-12, 0.01]

# Base frame that is repeated once per (learner, threshold, method) combination.
df_propensities = pd.DataFrame({
    'treatment': treatment,
    'outcome': outcome,
    'm_0': m_0,
    'm_hat_log': m_hat_log,
    'm_hat_lgbm': m_hat_lgbm,    
    'm_hat_rf': m_hat_rf,
    'x_1':x["x1"]})

# Fixed, seeded folds shared by every calibration call.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
smpls = list(kf.split(X=np.zeros(len(treatment)), y=treatment))

# Collect one labelled copy of the base frame per combination and concatenate
# once at the end, instead of growing the frame inside the loop.
calibrated_frames = []
for m_hat_name, m_hat in propensity_score_dict.items():
    print(f"Calibrating {m_hat_name}")
    for clipping_threshold in clipping_thresholds:
        for method, calib_method in calib_methods:
            # calibration includes clipping
            # NOTE(review): `learner_m` is the logistic model for every
            # propensity estimate, including the LGBM and RF ones - confirm
            # this is intended for algorithms that refit the learner.
            m_hat_calib = calibrate_propensity_score(
                propensity_score=m_hat,
                treatment=treatment,
                calib_method=calib_method,
                method = method,
                clipping_threshold=clipping_threshold,
                covariates=x,
                learner_m=log_model,
                # Pass the propensity score from the DGP; the previous code
                # passed the scalar treatment effect `theta` here.
                true_propensity_score=m_0,
                smpls = smpls
                )

            calibrated_frames.append(
                df_propensities.assign(
                    calib_method=calib_method,
                    method=method,
                    clipping_threshold=clipping_threshold,
                    m_hat_name=m_hat_name,
                    ps=m_hat_calib,
                )
            )

# Fall back to an empty frame when there is nothing to concatenate.
df_ps_calibrated = (
    pd.concat(calibrated_frames, ignore_index=True) if calibrated_frames else pd.DataFrame()
)


Calibrating m_hat_log
Calibrating m_hat_lgbm
Calibrating m_hat_rf


In [11]:
# Value-to-label mappings for the clipping threshold, algorithm and calibrator columns.
clip_suffixes = {1.e-12: '', 0.01: '-Clipped'}
method_mapping = {
    "alg-1-uncalibrated": "Alg-1-uncalib",
    "alg-2-nested-cross-fitting-calib": "Alg-2-nested-cf",
    "alg-3-cross-fitted-calib": "Alg-3-cf",
    "alg-4-single-split-calib": "Alg-4-single-split",
    "alg-5-full-sample-calib": "Alg-5-full-sample",
    "oracle": "Oracle"
}
calib_mapping = {
    "uncalibrated": "Uncalib",
    "isotonic": "Iso",
    "platt": "Platt",
    "ivap": "IVAP"}

for column, mapping in (
    ('clipping_threshold', clip_suffixes),
    ('method', method_mapping),
    ('calib_method', calib_mapping),
):
    df_ps_calibrated[column] = df_ps_calibrated[column].replace(mapping)

# Combined "<method>-<calib_method>" label, with the two redundant ones shortened.
df_ps_calibrated.insert(1, "Method", df_ps_calibrated[['method', 'calib_method']].agg('-'.join, axis=1))
Method_mapping = {
    "Oracle-Uncalib": "Oracle",
    "Alg-1-uncalib-Uncalib": "Alg-1-Uncalib"}
df_ps_calibrated["Method"] = df_ps_calibrated["Method"].replace(Method_mapping)

# Add 'Method_Clip' column (Method followed directly by the clipping suffix)
df_ps_calibrated.insert(1, "Method_Clip", df_ps_calibrated[['Method', 'clipping_threshold']].agg(''.join, axis=1))

# Readable learner names, stored in a column called "Learner".
for raw_name, learner_label in (('m_hat_log', 'Logistic'), ('m_hat_lgbm', 'LGBM'), ('m_hat_rf', 'RF')):
    df_ps_calibrated['m_hat_name'] = df_ps_calibrated['m_hat_name'].str.replace(raw_name, learner_label)
df_ps_calibrated = df_ps_calibrated.rename(columns={"m_hat_name": "Learner"})

m_hats = ['Logistic', 'LGBM', 'RF']
df_ps_calibrated = df_ps_calibrated[df_ps_calibrated["Learner"].isin(m_hats)]

Method_clip_mapping = {
    "Alg-1-Uncalib-Clipped": "Alg-1-Clipped"}
df_ps_calibrated["Method_Clip"] = df_ps_calibrated["Method_Clip"].replace(Method_clip_mapping)

#### Isotonic Regression

In [12]:
# Threshold used for the propensity-score plots in this and the following cells.
clipping_threshold = 1e-12

# Colours and transparencies of the comparison plot.
comparison_style = dict(
    scatter_color='#023eff',
    ps_color='#662506',
    true_score_color='#018571',
    alpha_scatter=0.4,
    alpha_line=0.5,
)

plot_calibration_comparison(
    directory='results/figures_nl/ps_calib/',
    df=df_ps_calibrated,
    methods=['Alg-1-Uncalib'],
    n_obs=n_obs,
    dim_x=dim_x,
    clipping_threshold=clipping_threshold,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    **comparison_style
)

In [13]:
# Restore matplotlib text settings: normal font weight, no LaTeX rendering,
# and the library's default font family.
plt.rcParams.update({
    'font.weight': 'normal',
    "text.usetex": False,
    "font.family": plt.rcParamsDefault["font.family"],
})

In [14]:
# Uncalibrated baseline followed by the isotonic variant of algorithms 2-5.
iso_methods = ['Alg-1-Uncalib'] + [
    f'{algorithm}-Iso'
    for algorithm in ('Alg-2-nested-cf', 'Alg-3-cf', 'Alg-4-single-split', 'Alg-5-full-sample')
]

plot_ps_treatment_comparison(
    directory='results/figures_nl/ps_treatment_iso/',
    df=df_ps_calibrated,
    methods=iso_methods,
    n_obs=n_obs,
    dim_x=dim_x,
    clipping_threshold=clipping_threshold,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    plot_fn=plot_overlap_ratio
)

#### IVAP

In [15]:
# Uncalibrated baseline followed by the IVAP variant of algorithms 2-5.
ivap_methods = ['Alg-1-Uncalib'] + [
    f'{algorithm}-IVAP'
    for algorithm in ('Alg-2-nested-cf', 'Alg-3-cf', 'Alg-4-single-split', 'Alg-5-full-sample')
]

plot_ps_treatment_comparison(
    directory='results/figures_nl/ps_treatment_ivap/',
    df=df_ps_calibrated,
    methods=ivap_methods,
    n_obs=n_obs,
    dim_x=dim_x,
    clipping_threshold=clipping_threshold,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    plot_fn=plot_overlap_ratio
)

#### Platt

In [16]:
# Uncalibrated baseline followed by the Platt variant of algorithms 2-5.
platt_methods = ['Alg-1-Uncalib'] + [
    f'{algorithm}-Platt'
    for algorithm in ('Alg-2-nested-cf', 'Alg-3-cf', 'Alg-4-single-split', 'Alg-5-full-sample')
]

plot_ps_treatment_comparison(
    directory='results/figures_nl/ps_treatment_platt/',
    df=df_ps_calibrated,
    methods=platt_methods,
    n_obs=n_obs,
    dim_x=dim_x,
    clipping_threshold=clipping_threshold,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    plot_fn=plot_overlap_ratio
)

### Plot Propensity Scores by Treatment

In [17]:
# IRM
# Draw a fresh sample from the nonlinear DGP for the overlap plots.
n_obs = 2000
dgp_type = "sim_v06_nonlinear"
dgp_dict = dict(n_obs=n_obs)

data_dict = dgp_wrapper(dgp_type=dgp_type, **dgp_dict)

# Unpack the entries used by the following cells.
treatment, outcome, m_0, x = (
    data_dict[key] for key in ('treatment', 'outcome', 'propensity_score', 'covariates')
)

In [18]:
# Design values passed to the plotting helper alongside the data.
histogram_design = dict(
    n_obs=2000,
    dim_x=4,
    clipping_threshold=1e-12,
    R2_d=0.0,
    overlap=0.0,
    share_treated=0.0,
)

plot_mirrored_propensity_histogram(
    treatment=treatment,
    m_0=m_0,
    directory='results/figures_nl/ps_treatment/',
    panel_label=None,  # Omit for no panel label
    bins=50,
    **histogram_design
)

In [19]:
# Collect the simulated sample and its first two covariates in one frame.
covariates = data_dict['covariates']
df = pd.DataFrame({
    "treatment": data_dict['treatment'],
    "outcome": data_dict['outcome'],
    "propensity_score": data_dict['propensity_score'],
    "x_1": covariates["x1"],
    "x_2": covariates["x2"],
})

# Medians of the two covariates
median_x1 = np.median(df['x_1'])
median_x2 = np.median(df['x_2'])

# Boolean subgroup indicators: True where the covariate exceeds its median
df['x1'] = df['x_1'].gt(median_x1)
df['x2'] = df['x_2'].gt(median_x2)

# Frame holding only the two subgroup indicators
df_subgroups = df[['x1', 'x2']]

In [20]:
# Subgroup indicators for the first plot
df_subgroups = df[['x1', 'x2']]

# Arguments shared by both overlap plots.
overlap_plot_kwargs = dict(
    df=df,
    directory='results/figures_nl/ps_treatment_2/',
    ps_col='propensity_score',
    treatment_col='treatment',
    metric='ratio',
    n_obs=n_obs,
    dim_x=dim_x,
    clipping_threshold=1e-12,
    R2_d=R2_d,
    overlap=overlap,
    share_treated=share_treated,
    style_kwargs={
        'colors': {'Treated': '#87CEEB', 'Control': '#FA8072'},
        'dash_color': '#333333'
    },
)

# With the median-split subgroups
plot_overlap_comparison(subgroups=df_subgroups, **overlap_plot_kwargs)

# Without subgroups and without panel labels
plot_overlap_comparison(subgroups=None, panel_labels=None, **overlap_plot_kwargs)