In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neighbors import NearestNeighbors
from econml.dml import LinearDML
import warnings
from tqdm import tqdm

# Silence every warning category for the rest of the session so the simulation
# output stays readable (e.g., convergence warnings from high-dimensional logistic regression).
warnings.simplefilter('ignore')

## Nonlinear data generation in a rare-disease setting

In [2]:
# Data generation function (nonlinear confounding)
def generate_rare_disease_data(n=500, p=100, setting='continuous', seed=123):
    """Simulate one confounded dataset with a nonlinear nuisance function.

    The data-generating process is:

    * ``X``: ``n`` x ``p`` matrix of independent standard-normal covariates.
    * ``g(X) = 0.5*X0*X1 + 0.4*X2**2 + 0.3*X3 + 0.2*X4`` (only columns 0-4 matter).
    * ``T ~ Bernoulli(sigmoid(-0.5 + g(X)))``.
    * ``Y = 2000 + 1000*T + 1000*g(X) + Normal(0, 200)``.

    Parameters
    ----------
    n : int
        Number of observations.
    p : int
        Number of covariates; must be at least 5 because columns 0-4 are used.
    setting : str
        Accepted for call-site compatibility; it is not used by this function.
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* legacy random state as a side effect.

    Returns
    -------
    tuple
        ``(X, T, Y, TRUE_EFFECT)`` where ``TRUE_EFFECT`` is the constant
        treatment effect (1000.0) used to build ``Y``.

    Raises
    ------
    ValueError
        If ``p`` is smaller than 5.
    """
    if p < 5:
        # Fail early with a clear message instead of an IndexError on X[:, k].
        raise ValueError(f"p must be at least 5 (columns 0-4 drive confounding); got p={p}")

    np.random.seed(seed)

    # 1. Generate covariates (n rows, p columns)
    X = np.random.normal(0, 1, size=(n, p))

    # 2. Define the "true" confounding mechanism (the nuisance function)
    # Interaction: X0 * X1
    # Non-linear: X2 squared
    # Linear: X3, X4
    nuisance_term = 0.5 * X[:, 0] * X[:, 1] + 0.4 * (X[:, 2] ** 2) + 0.3 * X[:, 3] + 0.2 * X[:, 4]

    # 3. Treatment assignment (propensity): P(T=1 | X) depends on the nuisance term
    logit_p = -0.5 + nuisance_term
    prob_t = 1 / (1 + np.exp(-logit_p))
    T = np.random.binomial(1, prob_t)
    TRUE_EFFECT = 1000.0  # True causal effect ($1000)

    # 4. Outcome depends on T, the nuisance term (confounding), and noise.
    # The nuisance term is scaled by 1000 so the confounding is strong enough
    # to visibly bias unadjusted comparisons.
    confounding_effect = 1000 * nuisance_term
    Y = 2000 + (TRUE_EFFECT * T) + confounding_effect + np.random.normal(0, 200, n)

    return X, T, Y, TRUE_EFFECT

In [3]:
class CausalEstimators:
    """Run several treatment-effect estimators on a single dataset.

    Available estimators:

    1. Naive (unadjusted difference in means)
    2. Regression adjustment (linear outcome model)
    3. IPTW (inverse probability of treatment weighting)
    4. PSM (1:1 nearest-neighbour propensity score matching)
    5. DML (double machine learning via ``econml.dml.LinearDML``)

    Parameters
    ----------
    X : array-like
        Covariate matrix, one row per unit.
    T : array-like
        Binary treatment indicator coded 0/1.
    Y : array-like
        Outcome values.
    outcome_type : str
        Stored on the instance; none of the estimators currently read it.
    """

    def __init__(self, X, T, Y, outcome_type='continuous'):
        # Coerce to ndarrays so that .reshape and positional indexing used by the
        # estimators work for lists and pandas objects as well as for arrays.
        self.X = np.asarray(X)
        self.T = np.asarray(T)
        self.Y = np.asarray(Y)
        self.outcome_type = outcome_type
        # Create a DataFrame for easier handling in naive methods
        self.df = pd.DataFrame(self.X)
        self.df['T'] = self.T
        self.df['Y'] = self.Y

    def _propensity_scores(self):
        """Fit a logistic propensity model on X and return P(T=1 | X) per unit."""
        ps_model = LogisticRegression(solver='liblinear', max_iter=2000)
        ps_model.fit(self.X, self.T)
        return ps_model.predict_proba(self.X)[:, 1]

    def run_naive(self):
        """Return mean(Y | T=1) - mean(Y | T=0) with no covariate adjustment."""
        treated = self.df[self.df['T'] == 1]['Y'].mean()
        control = self.df[self.df['T'] == 0]['Y'].mean()
        return treated - control

    def run_regression_adjustment(self):
        """Return the coefficient on T from an OLS fit of Y on [T, X]."""
        # Combine T and X into one feature matrix
        features = np.column_stack((self.T, self.X))
        model = LinearRegression()
        model.fit(features, self.Y)
        # The coefficient of T (index 0) is the additive effect
        return model.coef_[0]

    def run_iptw(self):
        """Return the inverse-probability-weighted treated-vs-control contrast.

        Propensity scores are clipped to [0.05, 0.95] before weighting. The
        estimate is the slope of a weighted regression of Y on T alone (with
        intercept), i.e. the difference of the weighted group means.
        """
        ps = self._propensity_scores()

        # Clip to prevent division by zero or extreme weights (common in small N)
        ps = np.clip(ps, 0.05, 0.95)
        # Calculate weights: 1/PS for treated, 1/(1-PS) for control
        weights = np.where(self.T == 1, 1 / ps, 1 / (1 - ps))
        model = LinearRegression()
        model.fit(self.T.reshape(-1, 1), self.Y, sample_weight=weights)
        return model.coef_[0]

    def run_psm(self):
        """Return the 1:1 nearest-neighbour propensity-score-matched contrast.

        Each treated unit is matched to the control unit with the closest
        propensity score; a control may be reused for several treated units.
        The result is mean(Y of treated) - mean(Y of their matched controls),
        so the average is taken over treated units only.

        Returns ``np.nan`` when either treatment arm is empty.
        """
        treated_idx = np.where(self.T == 1)[0]
        control_idx = np.where(self.T == 0)[0]

        # Safety check for a missing arm. This must run BEFORE the propensity
        # model is fitted: the classifier cannot be fitted on a single class,
        # so a guard placed after the fit would never be reached.
        if len(control_idx) == 0 or len(treated_idx) == 0:
            return np.nan

        ps = self._propensity_scores()

        # Match each treated unit to nearest control unit based on PS
        nbrs = NearestNeighbors(n_neighbors=1).fit(ps[control_idx].reshape(-1, 1))
        distances, indices = nbrs.kneighbors(ps[treated_idx].reshape(-1, 1))
        matched_control_idx = control_idx[indices.flatten()]

        # Difference in Means of matched pairs (Risk Difference for binary)
        return np.mean(self.Y[treated_idx]) - np.mean(self.Y[matched_control_idx])

    def run_dml(self):
        """Return the mean of the per-unit effects estimated by ``LinearDML``.

        Nuisance models are gradient-boosted trees for both the outcome and the
        treatment, cross-fitted with 3 folds.
        """
        # Use shallow trees (depth 2) to prevent overfitting on small N=500
        y_model = GradientBoostingRegressor(n_estimators=50, max_depth=2,
                                            random_state=42)
        t_model = GradientBoostingClassifier(n_estimators=50, max_depth=2,
                                             random_state=42)

        # Set cv=3 for small sample size (5-fold might be too thin)
        est = LinearDML(model_y=y_model,
                        model_t=t_model,
                        discrete_treatment=True,
                        cv=3,
                        random_state=42)
        # NOTE(review): all covariates are passed as X (effect modifiers) rather
        # than W (controls) -- confirm this is the intended specification.
        est.fit(self.Y, self.T, X=self.X)
        # Return Average Treatment Effect
        return est.effect(self.X).mean()


In [4]:
# --- Smoke test: run every estimator once on a single continuous-outcome dataset ---
X, T, Y, true_eff = generate_rare_disease_data(n=500, p=100, setting='continuous', seed=1)
sim = CausalEstimators(X, T, Y, outcome_type='continuous')

# Table header
print("Scenario A: Continuous Outcome (e.g., Total Pay)")
print("True Causal Effect: {:.2f}".format(true_eff))
print(65 * "-")
print("{:<30} | {:<15} | {:<10}".format('Method', 'Estimate', 'Bias'))
print(65 * "-")

# Estimators are evaluated in the order they are listed here
results_continuous = {
    "Naive (Unadjusted)": sim.run_naive(),
    "Regression (Linear):": sim.run_regression_adjustment(),
    "IPTW (Logistic PS)": sim.run_iptw(),
    "PSM (1:1 Nearest Neighbor)": sim.run_psm(),
    "DML (Gradient Boosting)": sim.run_dml(),
}

# One row per estimator: point estimate and its signed deviation from the truth
for label, estimate in results_continuous.items():
    print("{:<30} | {:<15.2f} | {:<10.2f}".format(label, estimate, estimate - true_eff))

print("\n" + 65 * "=" + "\n")

Scenario A: Continuous Outcome (e.g., Total Pay)
True Causal Effect: 1000.00
-----------------------------------------------------------------
Method                         | Estimate        | Bias      
-----------------------------------------------------------------
Naive (Unadjusted)             | 1537.29         | 537.29    
Regression (Linear):           | 1458.03         | 458.03    
IPTW (Logistic PS)             | 1441.00         | 441.00    
PSM (1:1 Nearest Neighbor)     | 1355.20         | 355.20    
DML (Gradient Boosting)        | 1244.95         | 244.95    




## Baseline simulation with only 10 covariates

In [5]:
# --- Baseline scenario: nonlinear data-generating process with only 10 covariates ---
def run_baseline_simulation(n_epochs=100, n=500, p=10):
    """Repeat the five-estimator comparison on freshly simulated nonlinear data.

    For each epoch a dataset is drawn with ``generate_rare_disease_data`` using
    ``seed=epoch``, and every method of ``CausalEstimators`` is run on it.

    Parameters
    ----------
    n_epochs : int
        Number of simulated datasets (replications).
    n : int
        Sample size of each dataset (default 500).
    p : int
        Number of covariates in each dataset (default 10).

    Returns
    -------
    pandas.DataFrame
        Long-format results, one row per (epoch, method), with columns:

        * ``epoch``: replication index, also used as the data seed.
        * ``method``: estimator label.
        * ``estimate``: point estimate of the treatment effect.
        * ``bias``: ``abs(estimate - true effect)``. This is the absolute
          error of the replication, so its mean across epochs is a mean
          absolute error rather than the statistical bias.
        * ``signed_error``: ``estimate - true effect``; its mean across
          epochs is the Monte Carlo estimate of the bias.
    """
    results_list = []

    print(f"Starting simulation with {n_epochs} epochs...")

    for epoch in tqdm(range(n_epochs)):
        # Draw one dataset; seeding with the epoch makes every replication reproducible.
        X, T, Y, true_eff = generate_rare_disease_data(n=n, p=p, setting='continuous', seed=epoch)
        sim = CausalEstimators(X, T, Y, outcome_type='continuous')

        # Run the five estimators (evaluated in the order listed).
        methods = {
            "Naive (Unadjusted)": sim.run_naive(),
            "Regression (Linear):": sim.run_regression_adjustment(),
            "IPTW (Logistic PS)": sim.run_iptw(),
            "PSM (1:1 Nearest Neighbor)": sim.run_psm(),
            "DML (Gradient Boosting)": sim.run_dml(),
        }

        # Store one "tidy" (long-format) row per method.
        for method_name, value in methods.items():
            signed_error = value - true_eff
            results_list.append({
                "epoch": epoch,
                "method": method_name,
                "estimate": value,
                "bias": abs(signed_error),
                "signed_error": signed_error,
            })

    # Build the DataFrame once from the accumulated records.
    return pd.DataFrame(results_list)


In [6]:
# Run the baseline scenario for 1000 replications (the progress bar reports the runtime)
df = run_baseline_simulation(1000)

Starting simulation with 1000 epochs...


100%|██████████| 1000/1000 [04:26<00:00,  3.75it/s]


In [7]:
# Per-method mean and standard deviation of the point estimates (baseline simulation)
summary = (
    df.groupby('method')['estimate']
    .agg(['mean', 'std'])
    .reset_index()
)
print("\nSimulation Estimates Summary:", summary, sep="\n")


Simulation Estimates Summary:
                       method         mean        std
0     DML (Gradient Boosting)  1227.974213  56.936613
1          IPTW (Logistic PS)  1467.023480  70.355294
2          Naive (Unadjusted)  1570.636797  76.395820
3  PSM (1:1 Nearest Neighbor)  1460.048294  86.533939
4        Regression (Linear):  1463.164108  69.237099


In [8]:
# Per-method mean and standard deviation of the 'bias' column (baseline simulation).
# NOTE: 'bias' is stored as |estimate - true effect| per run, so its mean is a
# mean absolute error rather than a signed bias.
summary = (
    df.groupby('method')['bias']
    .agg(['mean', 'std'])
    .reset_index()
)
print("\nSimulation Bias Summary:", summary, sep="\n")


Simulation Bias Summary:
                       method        mean        std
0     DML (Gradient Boosting)  227.974213  56.936613
1          IPTW (Logistic PS)  467.023480  70.355294
2          Naive (Unadjusted)  570.636797  76.395820
3  PSM (1:1 Nearest Neighbor)  460.048294  86.533939
4        Regression (Linear):  463.164108  69.237099


## Linear data generation in a rare-disease setting (the alternative scenario)

In [9]:
def generate_linear_data(n=500, p=100, setting='continuous', seed=123):
    """Simulate one confounded dataset with a purely linear nuisance function.

    The data-generating process is:

    * ``X``: ``n`` x ``p`` matrix of independent standard-normal covariates.
    * ``g(X) = 0.5*X0 + 0.2*X1 + 0.4*X2 - 0.3*X3 - 0.4*X4 - 0.1*X5``
      (only columns 0-5 matter).
    * ``T ~ Bernoulli(sigmoid(-0.5 + g(X)))``.
    * ``Y = 2000 + 1000*T + 1000*g(X) + Normal(0, 200)``.

    Parameters
    ----------
    n : int
        Number of observations.
    p : int
        Number of covariates; must be at least 6 because columns 0-5 are used.
    setting : str
        Accepted for call-site compatibility; it is not used by this function.
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* legacy random state as a side effect.

    Returns
    -------
    tuple
        ``(X, T, Y, TRUE_EFFECT)`` where ``TRUE_EFFECT`` is the constant
        treatment effect (1000.0) used to build ``Y``.

    Raises
    ------
    ValueError
        If ``p`` is smaller than 6.
    """
    if p < 6:
        # Fail early with a clear message instead of an IndexError on X[:, k].
        raise ValueError(f"p must be at least 6 (columns 0-5 drive confounding); got p={p}")

    np.random.seed(seed)

    # 1. Generate covariates (n rows, p columns)
    X = np.random.normal(0, 1, size=(n, p))

    # 2. Define the "true" confounding mechanism (the nuisance function).
    # Every term is linear; only the first 6 variables (indices 0-5) matter.
    nuisance_term = (0.5 * X[:, 0] + 0.2 * X[:, 1] + 0.4 * X[:, 2]
                     - 0.3 * X[:, 3] - 0.4 * X[:, 4] - 0.1 * X[:, 5])

    # 3. Treatment assignment (propensity): P(T=1 | X) depends on the nuisance term
    logit_p = -0.5 + nuisance_term
    prob_t = 1 / (1 + np.exp(-logit_p))
    T = np.random.binomial(1, prob_t)
    TRUE_EFFECT = 1000.0  # True causal effect ($1000)

    # 4. Outcome depends on T, the nuisance term (confounding), and noise.
    # The nuisance term is scaled by 1000 so the confounding is strong enough
    # to visibly bias unadjusted comparisons.
    confounding_effect = 1000 * nuisance_term
    Y = 2000 + (TRUE_EFFECT * T) + confounding_effect + np.random.normal(0, 200, n)

    return X, T, Y, TRUE_EFFECT

In [10]:
def run_linear_simulation(n_epochs=100, n=500, p=100):
    """Repeat the five-estimator comparison on freshly simulated linear data.

    For each epoch a dataset is drawn with ``generate_linear_data`` using
    ``seed=epoch``, and every method of ``CausalEstimators`` is run on it.

    Parameters
    ----------
    n_epochs : int
        Number of simulated datasets (replications).
    n : int
        Sample size of each dataset (default 500).
    p : int
        Number of covariates in each dataset (default 100).

    Returns
    -------
    pandas.DataFrame
        Long-format results, one row per (epoch, method), with columns:

        * ``epoch``: replication index, also used as the data seed.
        * ``method``: estimator label.
        * ``estimate``: point estimate of the treatment effect.
        * ``bias``: ``abs(estimate - true effect)``. This is the absolute
          error of the replication, so its mean across epochs is a mean
          absolute error rather than the statistical bias.
        * ``signed_error``: ``estimate - true effect``; its mean across
          epochs is the Monte Carlo estimate of the bias.
    """
    results_list = []

    print(f"Starting simulation with {n_epochs} epochs...")

    for epoch in tqdm(range(n_epochs)):
        # Draw one dataset; seeding with the epoch makes every replication reproducible.
        X, T, Y, TRUE_EFFECT = generate_linear_data(n=n, p=p, setting='continuous', seed=epoch)
        sim = CausalEstimators(X, T, Y, outcome_type='continuous')

        # Run the five estimators (evaluated in the order listed).
        methods = {
            "Naive (Unadjusted)": sim.run_naive(),
            "Regression (Linear):": sim.run_regression_adjustment(),
            "IPTW (Logistic PS)": sim.run_iptw(),
            "PSM (1:1 Nearest Neighbor)": sim.run_psm(),
            "DML (Gradient Boosting)": sim.run_dml(),
        }

        # Store one "tidy" (long-format) row per method.
        for method_name, value in methods.items():
            signed_error = value - TRUE_EFFECT
            results_list.append({
                "epoch": epoch,
                "method": method_name,
                "estimate": value,
                "bias": abs(signed_error),
                "signed_error": signed_error,
            })

    # Build the DataFrame once from the accumulated records.
    return pd.DataFrame(results_list)


In [11]:
# Run the linear-DGP scenario for 1000 replications (the progress bar reports the runtime)
df = run_linear_simulation(1000)

Starting simulation with 1000 epochs...


100%|██████████| 1000/1000 [22:48<00:00,  1.37s/it]


In [12]:
# Per-method mean and standard deviation of the point estimates (linear simulation)
summary = (
    df.groupby('method')['estimate']
    .agg(['mean', 'std'])
    .reset_index()
)
print("\nSimulation Estimates Summary:", summary, sep="\n")


Simulation Estimates Summary:
                       method         mean         std
0     DML (Gradient Boosting)  1125.843271   39.865842
1          IPTW (Logistic PS)  1071.292682   63.457977
2          Naive (Unadjusted)  1621.361501   74.798941
3  PSM (1:1 Nearest Neighbor)  1011.265999  108.758924
4        Regression (Linear):   999.706382   22.266702


In [13]:
# Per-method mean and standard deviation of the 'bias' column (linear simulation).
# NOTE: 'bias' is stored as |estimate - true effect| per run, so its mean is a
# mean absolute error rather than a signed bias.
summary = (
    df.groupby('method')['bias']
    .agg(['mean', 'std'])
    .reset_index()
)
print("\nSimulation Bias Summary:", summary, sep="\n")


Simulation Bias Summary:
                       method        mean        std
0     DML (Gradient Boosting)  125.879505  39.751168
1          IPTW (Logistic PS)   79.898673  52.196369
2          Naive (Unadjusted)  621.361501  74.798941
3  PSM (1:1 Nearest Neighbor)   84.617135  69.197736
4        Regression (Linear):   17.545173  13.702229


## Main Outcome with 100 covariates

In [14]:
# --- Main scenario: nonlinear data-generating process with 100 covariates ---
def run_simulation(n_epochs=100, n=500, p=100):
    """Repeat the five-estimator comparison on high-dimensional nonlinear data.

    For each epoch a dataset is drawn with ``generate_rare_disease_data`` using
    ``seed=epoch``, and every method of ``CausalEstimators`` is run on it.

    Parameters
    ----------
    n_epochs : int
        Number of simulated datasets (replications).
    n : int
        Sample size of each dataset (default 500).
    p : int
        Number of covariates in each dataset (default 100).

    Returns
    -------
    pandas.DataFrame
        Long-format results, one row per (epoch, method), with columns:

        * ``epoch``: replication index, also used as the data seed.
        * ``method``: estimator label.
        * ``estimate``: point estimate of the treatment effect.
        * ``bias``: ``abs(estimate - true effect)``. This is the absolute
          error of the replication, so its mean across epochs is a mean
          absolute error rather than the statistical bias.
        * ``signed_error``: ``estimate - true effect``; its mean across
          epochs is the Monte Carlo estimate of the bias.
    """
    results_list = []

    print(f"Starting simulation with {n_epochs} epochs...")

    for epoch in tqdm(range(n_epochs)):
        # Draw one dataset; seeding with the epoch makes every replication reproducible.
        X, T, Y, true_eff = generate_rare_disease_data(n=n, p=p, setting='continuous', seed=epoch)
        sim = CausalEstimators(X, T, Y, outcome_type='continuous')

        # Run the five estimators (evaluated in the order listed).
        methods = {
            "Naive (Unadjusted)": sim.run_naive(),
            "Regression (Linear):": sim.run_regression_adjustment(),
            "IPTW (Logistic PS)": sim.run_iptw(),
            "PSM (1:1 Nearest Neighbor)": sim.run_psm(),
            "DML (Gradient Boosting)": sim.run_dml(),
        }

        # Store one "tidy" (long-format) row per method.
        for method_name, value in methods.items():
            signed_error = value - true_eff
            results_list.append({
                "epoch": epoch,
                "method": method_name,
                "estimate": value,
                "bias": abs(signed_error),
                "signed_error": signed_error,
            })

    # Build the DataFrame once from the accumulated records.
    return pd.DataFrame(results_list)

In [15]:
# Run the main (100-covariate) scenario for 1000 replications (the progress bar reports the runtime)
df = run_simulation(1000)

Starting simulation with 1000 epochs...


100%|██████████| 1000/1000 [23:34<00:00,  1.41s/it]


In [16]:
# Per-method mean and standard deviation of the point estimates (main simulation)
summary = (
    df.groupby('method')['estimate']
    .agg(['mean', 'std'])
    .reset_index()
)
print("\nSimulation Estimates Summary:", summary, sep="\n")


Simulation Estimates Summary:
                       method         mean         std
0     DML (Gradient Boosting)  1274.520450   59.966134
1          IPTW (Logistic PS)  1471.734148   85.462450
2          Naive (Unadjusted)  1569.734357   73.435230
3  PSM (1:1 Nearest Neighbor)  1466.742563  110.328137
4        Regression (Linear):  1463.448216   78.343686


In [17]:
# Per-method mean and standard deviation of the 'bias' column (main simulation).
# NOTE: 'bias' is stored as |estimate - true effect| per run, so its mean is a
# mean absolute error rather than a signed bias.
summary = (
    df.groupby('method')['bias']
    .agg(['mean', 'std'])
    .reset_index()
)
print("\nSimulation Bias Summary:", summary, sep="\n")


Simulation Bias Summary:
                       method        mean         std
0     DML (Gradient Boosting)  274.520450   59.966134
1          IPTW (Logistic PS)  471.734148   85.462450
2          Naive (Unadjusted)  569.734357   73.435230
3  PSM (1:1 Nearest Neighbor)  466.742563  110.328137
4        Regression (Linear):  463.448216   78.343686
