In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Generate potential outcomes
def potential_outcomes(X, mu0_fun, mu1_fun):
    """Simulate binary potential outcomes for the control and treated arms.

    For every row of ``X`` two independent standard-normal noise terms are
    drawn (control first, then treated) and added to the corresponding
    response function. The noisy responses are then rounded and clipped to
    the interval [0, 1].

    Parameters
    ----------
    X : iterable of feature vectors
        Each element is handed unchanged to ``mu0_fun`` and ``mu1_fun``.
    mu0_fun, mu1_fun : callable
        Response functions for the control and the treated arm.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Y0, Y1)`` with one entry per row of ``X``.
    """
    def _binarize(values):
        # Round to the nearest integer, then squash into the [0, 1] range.
        return np.clip(np.round(values), 0.0, 1.0)

    control_raw, treated_raw = [], []
    for row in X:
        # Draw order matters: control noise first, treated noise second, so
        # the global NumPy RNG stream is consumed in a fixed sequence.
        noise_control = np.random.normal(loc=0, scale=1)
        noise_treated = np.random.normal(loc=0, scale=1)
        control_raw.append(mu0_fun(row) + noise_control)
        treated_raw.append(mu1_fun(row) + noise_treated)

    return _binarize(control_raw), _binarize(treated_raw)

In [3]:
# Generate feature vectors (MIMIC-IV simulation)
def generate_feature_vectors(N, feature_names):
    """Draw a random ``N x len(feature_names)`` feature matrix.

    The column named ``"sex"`` is sampled from Bernoulli(0.5); every other
    column is sampled uniformly from [0, 1). Columns are filled in the order
    given by ``feature_names``.

    Parameters
    ----------
    N : int
        Number of rows (samples) to generate.
    feature_names : sequence of str
        One name per column; only the name ``"sex"`` is treated specially.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(N, len(feature_names))``.
    """
    features = np.zeros((N, len(feature_names)))
    for col, name in enumerate(feature_names):
        is_binary = name == "sex"
        # Only the selected branch is evaluated, so exactly one draw of
        # length N is taken from the global RNG per column.
        features[:, col] = (
            np.random.binomial(1, 0.5, N)
            if is_binary
            else np.random.uniform(0, 1, N)
        )
    return features

# Propensity score function (based on confounders)
def propensity_score(x, con_indices, beta):
    """Logistic propensity score of a single sample.

    Parameters
    ----------
    x : numpy.ndarray
        Feature vector of one sample; indexed with ``con_indices``.
    con_indices : sequence of int
        Positions in ``x`` of the features that drive treatment assignment.
    beta : array-like
        Coefficients, one per entry of ``con_indices``.

    Returns
    -------
    float
        ``1 / (1 + exp(-<x[con_indices], beta>))``.
    """
    linear_term = np.dot(x[con_indices], beta)
    decay = np.exp(-linear_term)
    return 1 / (1 + decay)


# Response functions: Uses approach of simulation 6 with other confounders added:
def mu0(x):
    """Control-arm response: a fixed linear combination of features of ``x``.

    ``x`` is indexed by position (0, 2, 3, 4, 8, 14, 17, 19, 20).
    NOTE(review): index 17 is used here while ``confounder_indices`` in the
    setup cell lists 15 instead - confirm this difference is intended.
    """
    return (
        2*x[0] - 1*x[2] + 0.4*x[3] - x[4] - 0.1*x[8]
        + 0.9*x[14] + 0.4*x[17] + 0.2*x[19] + x[20] - 1
    )


def mu1(x):
    """Treated-arm response: ``mu0(x)`` shifted down by ``2 * x[18]``."""
    # Other features will also influence the outcome
    return mu0(x) - 2*x[18]

In [4]:
# Simulate data for a single experiment
def generate_data(N, feature_names, con_indices, mu0_fun, mu1_fun, treated_frac):
    """Simulate one confounded data set with an unbalanced treatment.

    Steps, in this order (the order fixes how the global RNG is consumed):
    features, propensity coefficients, potential outcomes, treatment.

    Parameters
    ----------
    N : int
        Number of samples.
    feature_names : sequence of str
        Column names forwarded to ``generate_feature_vectors``.
    con_indices : sequence of int
        Feature positions that enter the propensity score.
    mu0_fun, mu1_fun : callable
        Response functions forwarded to ``potential_outcomes``.
    treated_frac : float
        Target mean of the propensity scores before clipping.

    Returns
    -------
    tuple
        ``(X, T, Y0, Y1, Y)`` where ``Y`` equals ``Y1`` for treated rows and
        ``Y0`` otherwise.
    """
    features = generate_feature_vectors(N, feature_names)
    coefficients = np.random.normal(0, 1, len(con_indices))

    raw_scores = []
    for row in features:
        raw_scores.append(propensity_score(row, con_indices, coefficients))
    raw_scores = np.array(raw_scores)

    # Adjusting e to achieve desired propensity score (e.g., 10% treated uses
    # treated_fraction 0.1). The rescaled mean equals treated_frac; clipping
    # afterwards can only lower it, when a rescaled score exceeds 1.
    scale = treated_frac / np.mean(raw_scores)
    propensities = np.clip(scale * raw_scores, 0, 1)

    Y0, Y1 = potential_outcomes(features, mu0_fun, mu1_fun)
    treatment = np.random.binomial(1, propensities, size=N)
    observed = treatment * Y1 + (1 - treatment) * Y0
    return features, treatment, Y0, Y1, observed

In [5]:
def make_dataframe(X, T, Y0, Y1, Y, columns):
    """Bundle simulated features, treatment and outcomes into one DataFrame.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix; its columns are labelled with ``columns``.
    T, Y0, Y1, Y : array-like
        Treatment indicator, both potential outcomes and the observed outcome.
    columns : sequence of str
        Names for the columns of ``X``.

    Returns
    -------
    pandas.DataFrame
        The feature columns followed by ``T``, ``Y0``, ``Y1``, ``Y`` and
        ``cate`` (computed as ``Y1 - Y0``).
    """
    return pd.DataFrame(X, columns=columns).assign(
        T=T,
        Y0=Y0,
        Y1=Y1,
        Y=Y,
        # Evaluated after Y0/Y1 have been attached, so it reads those columns.
        cate=lambda frame: frame['Y1'] - frame['Y0'],
    )

In [6]:
# Calculate the MSE
def calculate_mse(test_df):
    """Mean squared error between the ``cate`` and ``pred_cate`` columns.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain the columns ``'cate'`` and ``'pred_cate'``.

    Returns
    -------
    float
        Result of ``sklearn.metrics.mean_squared_error`` on the two columns.
    """
    reference_effect = test_df['cate']
    estimated_effect = test_df['pred_cate']
    return mean_squared_error(reference_effect, estimated_effect)

In [7]:
from Simulation.X_learner_confounder import X_learner_lgbm
from Simulation.T_learner import T_learner_lgbm
from Simulation.S_learner import S_learner_lgbm


# Perform multiple experiments with different sample sizes
def perform_experiments(N, X_sim, L_sim, con_indices, mu0_fun, mu1_fun, treated_frac):
    """Run one simulation per sample size and score the three meta-learners.

    For each entry of ``N`` a fresh data set is simulated, split 70/30 into
    train and test, and passed to the S-, T- and X-learner. Each learner is
    expected to return a ``(train, test)`` pair; only the test element is
    scored with ``calculate_mse``.

    Parameters
    ----------
    N : iterable of int
        Sample sizes to simulate.
    X_sim : sequence of str
        Names of all simulated features.
    L_sim : sequence of str
        Feature names handed to the learners as adjustment set.
    con_indices, mu0_fun, mu1_fun, treated_frac
        Forwarded unchanged to ``generate_data``.

    Returns
    -------
    tuple of list
        ``(s_mse_list, t_mse_list, x_mse_list)``, one MSE per sample size.
    """
    mse_s, mse_t, mse_x = [], [], []

    for sample_size in N:
        simulated = generate_data(
            sample_size, X_sim, con_indices, mu0_fun, mu1_fun, treated_frac
        )
        frame = make_dataframe(*simulated, X_sim)
        train_part, test_part = train_test_split(
            frame, test_size=0.3, random_state=None
        )

        # Get CATE estimates (for S-, T- and X-learner); the train-side
        # outputs are not used here.
        _, s_test = S_learner_lgbm(train_part, test_part, L_sim, 'T', 'Y')
        _, t_test = T_learner_lgbm(train_part, test_part, L_sim, 'T', 'Y')
        _, x_test = X_learner_lgbm(train_part, test_part, L_sim, X_sim, 'T', 'Y')

        # Calculate MSE
        mse_s.append(calculate_mse(s_test))
        mse_t.append(calculate_mse(t_test))
        mse_x.append(calculate_mse(x_test))

    return mse_s, mse_t, mse_x

def iterate_experiments(N, num_exp, X_sim, L_sim, con_indices, mu0_fun, mu1_fun, treated_frac):
    """Repeat ``perform_experiments`` ``num_exp`` times and collect the MSEs.

    Parameters
    ----------
    N : iterable of int
        Sample sizes, forwarded to ``perform_experiments``.
    num_exp : int
        Number of repetitions.
    X_sim, L_sim, con_indices, mu0_fun, mu1_fun, treated_frac
        Forwarded unchanged to ``perform_experiments``.

    Returns
    -------
    tuple of list
        ``(s_mse_tot, t_mse_tot, x_mse_tot)``; each is a list with one entry
        per repetition, and each entry is the per-sample-size MSE list of
        that repetition.
    """
    runs = [
        perform_experiments(N, X_sim, L_sim, con_indices, mu0_fun, mu1_fun, treated_frac)
        for _ in range(num_exp)
    ]

    # Split the per-run (s, t, x) triples into one list per learner.
    s_all = [run[0] for run in runs]
    t_all = [run[1] for run in runs]
    x_all = [run[2] for run in runs]
    return s_all, t_all, x_all

### Hyperparameter Tuning

In [13]:
# Simulation setup MIMIC-like data (unbalanced + confounded)
N_list = [300, 1000, 3000, 6000, 10000]  # sample sizes to simulate
num_experiments = 5                      # repetitions per sample size
# Positions in X (below) of the features that drive treatment assignment:
# age, weight, height, pf_ratio, driving_pressure, urea, fio2, peep, plateau_pressure
confounder_indices = [0, 2, 3, 4, 8, 14, 15, 19, 20]
treated_fraction = 0.10                  # target mean propensity score

# Names of all simulated features; the position in this list is the column index.
X = [
    "age", "sex", "weight", "height", "pf_ratio", "po2", "pco2", "ph",
    "driving_pressure", "lung_compliance", "map", "bilirubin", "creatinine",
    "platelets", "urea", "fio2", "hco3", "heart_rate", "minute_volume", "peep",
    "plateau_pressure", "respiratory_rate", "syst_blood_pressure",
    "diastolic_blood_pressure",
]

# Pretending this is our list of confounders.
# NOTE(review): "urea" (index 14 in confounder_indices) is not listed here -
# confirm the omission is intended.
L = [
    "age", "weight", "height", "pf_ratio",
    "driving_pressure", "fio2", "peep", "plateau_pressure",
]

In [14]:
# LGBM Regressor as model
# Seed NumPy's global RNG right before the run so that re-executing this cell
# reproduces the same simulated data and the same train/test splits (the
# split uses random_state=None and therefore falls back to this global RNG).
# Any randomness inside the learners themselves is configured in the
# Simulation package and is not controlled here.
SEED = 42
np.random.seed(SEED)

s_mse_total, t_mse_total, x_mse_total = iterate_experiments(
    N_list, num_experiments, X, L, confounder_indices, mu0, mu1, treated_fraction
)

# Average over the repetitions (axis 0) -> one mean MSE per sample size.
s_mse_lgbm = np.mean(s_mse_total, axis=0)
t_mse_lgbm = np.mean(t_mse_total, axis=0)
x_mse_lgbm = np.mean(x_mse_total, axis=0)

In [15]:
from matplotlib import pyplot as plt

# Plotting the average MSE for different num of samples
mse_curves = (
    ('S-learner', s_mse_lgbm),
    ('T-learner', t_mse_lgbm),
    ('X-learner', x_mse_lgbm),
)
for learner_label, curve in mse_curves:
    # One line per learner, all against the same sample-size axis.
    plt.plot(N_list, curve, marker='o', label=learner_label)

plt.xlabel('Number of samples')
plt.ylabel('MSE')
plt.title('MIMIC Simulation: LGBM Regressor')
plt.legend()
# Save before show(): the figure is written to the working directory.
plt.savefig("30_5_2_20.png")
plt.show()

In [16]:
# Print the averaged MSE per sample size for each learner.
print("LGBM tuning:")
for heading, values in (
    ("S-learner: ", s_mse_lgbm),
    ("T-learner: ", t_mse_lgbm),
    ("X-learner: ", x_mse_lgbm),
):
    print(heading)
    print(values)