# $\pi$ - Strata proportions estimation

In [1]:
from typing import Dict, List, Optional, Tuple

import pandas as pd
from numpy import random

from consts import default_random_seed
from sample_generation import create_sample
from strata import Strata

# Seed NumPy's global random generator with the seed imported from consts.
# NOTE(review): presumably create_sample draws from this generator, which would
# make the printed results below reproducible — confirm.
random.seed(default_random_seed)

In [2]:
def bound_strata_proportions(df: pd.DataFrame) -> Dict[Strata, Tuple[float, float]]:
    """Bound the proportion of each stratum from the ``t`` and ``D_obs`` columns.

    Two observed quantities drive the computation: the share of rows with
    ``D_obs == 0`` among rows with ``t == 1`` and among rows with ``t == 0``.
    The H proportion is bounded first; the AS, P and D proportions are each
    linear in the H proportion, so their bounds follow by plugging in the H
    bounds.

    Args:
        df: Sample with columns ``t`` and ``D_obs``. It must contain at least
            one row with ``t == 1`` and at least one row with ``t == 0``.

    Returns:
        A mapping from each of ``Strata.H``, ``Strata.AS``, ``Strata.P`` and
        ``Strata.D`` to a ``(lower, upper)`` pair of floats, expressed as
        fractions (not percentages).

    Raises:
        ZeroDivisionError: If ``df`` has no rows with ``t == 1`` or no rows
            with ``t == 0``.
    """
    in_t1 = df["t"] == 1
    in_t0 = df["t"] == 0
    d_is_0 = df["D_obs"] == 0

    # Cast to built-in int so an empty arm raises ZeroDivisionError instead of
    # silently producing NaN/inf through NumPy integer division.
    n_t1 = int(in_t1.sum())
    n_t0 = int(in_t0.sum())
    p_t1_d0 = int((d_is_0 & in_t1).sum()) / n_t1
    p_t0_d0 = int((d_is_0 & in_t0).sum()) / n_t0

    # The range of pi_h that keeps all four proportions below non-negative.
    # 0.0 (not 0) so the lower bound is always a float, as the return type says.
    pi_h_lower = max(0.0, p_t0_d0 - p_t1_d0)
    pi_h_upper = min(p_t0_d0, 1 - p_t1_d0)

    # pi_as decreases as pi_h grows, so the bounds swap roles.
    pi_as_lower = p_t0_d0 - pi_h_upper
    pi_as_upper = p_t0_d0 - pi_h_lower

    # pi_p increases with pi_h.
    pi_p_lower = p_t1_d0 - p_t0_d0 + pi_h_lower
    pi_p_upper = p_t1_d0 - p_t0_d0 + pi_h_upper

    # pi_d decreases as pi_h grows, so the bounds swap roles.
    pi_d_lower = 1 - p_t1_d0 - pi_h_upper
    pi_d_upper = 1 - p_t1_d0 - pi_h_lower

    return {
        Strata.H: (pi_h_lower, pi_h_upper),
        Strata.AS: (pi_as_lower, pi_as_upper),
        Strata.P: (pi_p_lower, pi_p_upper),
        Strata.D: (pi_d_lower, pi_d_upper),
    }


In [3]:
def check_strata_for_different_beta(beta_d_list: Optional[List[List[float]]] = None):
    """Print each stratum's share in a generated sample next to its bounds.

    For every vector in ``beta_d_list`` a sample is created with
    ``create_sample(beta_d=...)`` and bounds are computed with
    ``bound_strata_proportions``. For each stratum two lines are printed:

    * its share of the whole sample, the bounds (both as percentages rounded
      to two decimals) and a check mark or cross telling whether the rounded
      share lies within the rounded bounds;
    * its count and share inside the ``t == 1`` rows and inside the ``t == 0``
      rows, plus the difference between the two shares relative to the
      ``t == 1`` share (``'None'`` when that share is zero).

    Args:
        beta_d_list: Coefficient vectors to try, one sample per vector. When
            omitted, five built-in vectors are used.

    Raises:
        ZeroDivisionError: If a generated sample is empty, or has no rows
            with ``t == 1`` or no rows with ``t == 0``.
    """
    if beta_d_list is None:
        # Built on every call so the default list is never shared between calls.
        beta_d_list = [[0.0, 0.0, 0.0], [-2.0, -2.0, 1.0], [0.0, 5.0, 0.0],
                       [0.0, 10.0, 0.0], [0.0, 3.7, 0.0]]

    for beta_d in beta_d_list:
        print(f"\nFor beta_d={beta_d}:")
        sample_for_bounds = create_sample(beta_d=beta_d)

        # Arm masks are reused for every stratum below.
        in_t1 = sample_for_bounds.t == 1
        in_t0 = sample_for_bounds.t == 0
        # Built-in ints keep the divisions below raising ZeroDivisionError on
        # an empty arm instead of yielding NaN/inf.
        t1 = int(in_t1.sum())
        t0 = int(in_t0.sum())
        sample_size = sample_for_bounds.shape[0]
        print(f"\t{t1} got T=1 ({round(100*t1/sample_size,2)}%), {t0} got T=0 ({round(100*t0/sample_size,2)}%)")
        prprtn_bounds = bound_strata_proportions(sample_for_bounds)

        for stratum, bounds in prprtn_bounds.items():
            in_stratum = sample_for_bounds.stratum == stratum.name
            stratum_size = int(in_stratum.sum())
            stratum_size_t1 = int((in_stratum & in_t1).sum())
            stratum_size_t0 = int((in_stratum & in_t0).sum())
            true_pi = round(100*stratum_size/sample_size, 2)
            lower_bound = round(100*bounds[0], 2)
            upper_bound = round(100*bounds[1], 2)

            # The comparison is done on the rounded percentages that are printed.
            within_bounds = "✔" if lower_bound <= true_pi <= upper_bound else "✘"

            # Difference between the arms' shares, relative to the t == 1 share.
            share_t1 = stratum_size_t1/t1
            share_t0 = stratum_size_t0/t0
            relative_diff = round(100*(share_t1-share_t0)/share_t1, 2) if share_t1 else 'None'

            print(f"\tStratum {stratum.name} real value is: {true_pi}% ({stratum_size} samples), and it is bounded by: [{lower_bound}%, {upper_bound}%]   {within_bounds}")
            print(f"\tIn T=1, {stratum_size_t1} are {stratum.name} ({round(100*stratum_size_t1/t1,2)}%). In T=0, {stratum_size_t0} are {stratum.name} ({round(100*stratum_size_t0/t0,2)}%). {relative_diff} % difference")

In [4]:
# Run the comparison with the default beta_d_list; the report is printed to stdout.
check_strata_for_different_beta()


For beta_d=[0.0, 0.0, 0.0]:
	5064 got T=1 (50.64%), 4936 got T=0 (49.36%)
	Stratum H real value is: 24.91% (2491 samples), and it is bounded by: [0%, 48.56%]   ✔
	In T=1, 1287 are H (25.41%). In T=0, 1204 are H (24.39%). 4.02 % difference
	Stratum AS real value is: 24.87% (2487 samples), and it is bounded by: [0.0%, 48.56%]   ✔
	In T=1, 1294 are AS (25.55%). In T=0, 1193 are AS (24.17%). 5.41 % difference
	Stratum P real value is: 25.13% (2513 samples), and it is bounded by: [2.07%, 50.63%]   ✔
	In T=1, 1270 are P (25.08%). In T=0, 1243 are P (25.18%). -0.41 % difference
	Stratum D real value is: 25.09% (2509 samples), and it is bounded by: [0.81%, 49.37%]   ✔
	In T=1, 1213 are D (23.95%). In T=0, 1296 are D (26.26%). -9.61 % difference

For beta_d=[-2.0, -2.0, 1.0]:
	5064 got T=1 (50.64%), 4936 got T=0 (49.36%)
	Stratum H real value is: 1.77% (177 samples), and it is bounded by: [0%, 1.86%]   ✔
	In T=1, 81 are H (1.6%). In T=0, 96 are H (1.94%). -21.59 % difference
	Stratum AS real v

It is interesting to see the results for extreme distributions (for example, with $\beta_D=[0,10,0]$).
