#### Schätzung kausaler Effekte mittels DoubleML mit linearer Regression als Outcome-Modell sowie logistischer Regression, Random Forest und Gradient Boosting als Propensity-Score-Modell. Zusätzlich wird untersucht, welche Auswirkungen die Discarding- und die Truncation-Strategie auf die Schätzleistung von DoubleML haben.

In [None]:
# Notwendige Bibliotheken und Funktionen importieren
import pandas as pd
import numpy as np
import sys, os
# hier den Pfad zu der Funktion pscore_discard aus functions.py einfügen
sys.path.append(os.path.abspath("."))
from DGP import propensity_eq, potential_outcome_eq, make_irm_data
from functions import pscore_discard
from doubleml import DoubleMLIRM, DoubleMLData
from doubleml.utils.resampling import DoubleMLResampling
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import clone
from joblib import Parallel, delayed
import warnings
# Alle Warnungen unterdrücken
warnings.filterwarnings("ignore")

### Truncation

In [None]:
# Seed NumPy's global RNG in the notebook process for reproducibility.
# Note: run_truncation re-seeds the global RNG with its repetition index on
# every call, so each repetition is driven by its own seed.
np.random.seed(1234)

# Funktion für die Parallelisierung der Simulationen erstellen
def run_truncation(i, n_obs, R2_d, R2_y, true_theta, ml_g, ml_m, score):
    """Run one simulation repetition with propensity-score truncation.

    Seeds NumPy's global RNG with ``i``, draws one data set via
    ``make_irm_data`` and fits a ``DoubleMLIRM`` model once for each
    truncation threshold in ``(0.001, 0.01, 0.05, 0.1)``.

    Args:
        i: Repetition index; also used as the RNG seed.
        n_obs: Number of observations; forwarded to ``make_irm_data`` and
            used as the denominator of the share statistics.
        R2_d: Forwarded to ``make_irm_data``.
        R2_y: Forwarded to ``make_irm_data``.
        true_theta: Forwarded to ``make_irm_data`` as ``theta``.
        ml_g: Outcome learner handed to ``DoubleMLIRM``.
        ml_m: Propensity-score learner handed to ``DoubleMLIRM``.
        score: ``"ATTE"`` selects the oracle effect on the treated; any
            other value selects the oracle average effect.

    Returns:
        A list with one summary DataFrame per threshold. Each summary is
        extended by the threshold, a learner label, truncation shares,
        nuisance losses and the oracle effect.

    Note:
        ``dim_x`` is read from module scope, and the covariate columns are
        hard-coded as ``x_0`` and ``x_1``.
    """
    # Seed first: everything below that draws random numbers depends on it.
    np.random.seed(i)
    df_dict = make_irm_data(n_obs=n_obs, dim_x=dim_x, theta=true_theta, R2_d=R2_d, R2_y=R2_y)
    observed = df_dict['df']
    data = DoubleMLData(observed, 'Y', 'D', ['x_0', 'x_1'])
    orcl_df = df_dict['df_orcl']
    # True propensity scores from the oracle frame.
    trueps = orcl_df['ps']
    n_orcl = trueps.shape[0]

    # Quantities that do not depend on the truncation threshold.
    learner_label = str(ml_g) + "," + str(ml_m)
    # No observation is removed by truncation, so this share is the same for
    # the population and for the adjusted sample.
    share_treated = ((observed['D'] == 1).sum() / n_obs) * 100
    is_control = orcl_df['D'] == 0
    is_treated = orcl_df['D'] == 1
    orcl_data_g0 = orcl_df[is_control]
    orcl_data_g1 = orcl_df[is_treated]
    if score == "ATTE":
        oracle = np.mean(orcl_df.loc[is_treated, "Y_1"] - orcl_df.loc[is_treated, "Y_0"])
    else:  # ATE
        oracle = np.mean(orcl_df.Y_1 - orcl_df.Y_0)

    results = []
    for trim_value in (0.001, 0.01, 0.05, 0.1):
        upper = 1 - trim_value

        dml_obj = DoubleMLIRM(data, ml_g, ml_m, trimming_threshold=trim_value, score=score)
        dml_obj.fit()

        predictions = dml_obj.predictions
        predicted_ps = predictions['ml_m'].flatten()
        g0_hat = predictions['ml_g0'].flatten()
        g1_hat = predictions['ml_g1'].flatten()
        nuisance_loss = dml_obj.nuisance_loss

        dml_summary = dml_obj.summary
        dml_summary['trim_value'] = trim_value
        dml_summary['learner'] = learner_label
        # Share (in %) of estimated scores sitting exactly on the upper / lower bound.
        dml_summary['share_trimmed_top'] = ((predicted_ps == upper).sum() / n_obs) * 100
        dml_summary['share_trimmed_bottom'] = ((predicted_ps == trim_value).sum() / n_obs) * 100
        dml_summary['share_treated'] = share_treated
        # Share (in %) of true scores at or beyond the bounds.
        dml_summary['share_trim_orcl_top'] = ((trueps >= upper).sum() / n_orcl) * 100
        dml_summary['share_trim_orcl_bottom'] = ((trueps <= trim_value).sum() / n_orcl) * 100

        # Nuisance losses as reported by DoubleML.
        dml_summary['loss_ml_g0'] = nuisance_loss['ml_g0'][0][0]
        dml_summary['loss_ml_g1'] = nuisance_loss['ml_g1'][0][0]
        dml_summary['loss_ml_m'] = nuisance_loss['ml_m'][0][0]

        # RMSE of the outcome predictions within each treatment arm.
        resid_g0 = orcl_data_g0['Y'] - g0_hat[orcl_data_g0.index]
        resid_g1 = orcl_data_g1['Y'] - g1_hat[orcl_data_g1.index]
        dml_summary['loss_g0'] = np.sqrt(np.mean(resid_g0 ** 2)).round(6)
        dml_summary['loss_g1'] = np.sqrt(np.mean(resid_g1 ** 2)).round(6)
        # Log loss of the propensity predictions.
        log_lik = orcl_df['D'] * np.log(predicted_ps) + (1 - orcl_df['D']) * np.log(1 - predicted_ps)
        dml_summary['loss_m'] = -np.mean(log_lik).round(6)

        dml_summary["oracle"] = oracle

        results.append(dml_summary)
    return results

# Simulation settings
R2_d = 0.8
R2_y = 0.8
dim_x = 2
n_obs = 1000
num_repetitions = 1000
true_theta = 0
# One linear outcome model paired with each propensity-score learner.
learner_list = [
    {"ml_g": LinearRegression(), "ml_m": LogisticRegressionCV()},
    {"ml_g": LinearRegression(), "ml_m": RandomForestClassifier()},
    {"ml_g": LinearRegression(), "ml_m": GradientBoostingClassifier()},
]
score_list = ["ATTE", "ATE"]

# Raw per-repetition results, keyed by the learner's position in learner_list.
trunc_dict_atte1, trunc_dict_ate1 = {}, {}

# Run all repetitions in parallel for every learner / score combination.
for i_learners, learners in enumerate(learner_list):
    for score in score_list:
        jobs = (
            delayed(run_truncation)(
                i, n_obs, R2_d, R2_y, true_theta,
                ml_g=clone(learners['ml_g']),
                ml_m=clone(learners['ml_m']),
                score=score,
            )
            for i in range(num_repetitions)
        )
        results = Parallel(n_jobs=-1)(jobs)
        target = trunc_dict_atte1 if score == "ATTE" else trunc_dict_ate1
        target[i_learners] = results

In [None]:
# Compute bias / absolute bias per fit and flag whether the reference value
# lies inside the confidence interval.
trunc_dict_atte2 = {}
trunc_dict_ate2 = {}

for i_learners, learners in enumerate(learner_list):
    for res in [trunc_dict_atte1, trunc_dict_ate1]:
        is_atte = res is trunc_dict_atte1
        dfs = res[i_learners]
        # Stack the summaries of each repetition, then stack all repetitions.
        combined_dfs = [pd.concat(inner_list, ignore_index=True) for inner_list in dfs]
        result_df = pd.concat(combined_dfs, ignore_index=True)

        # ATTE results are compared with the per-sample oracle value,
        # ATE results with the true parameter.
        reference = result_df["oracle"] if is_atte else true_theta

        deviation = result_df["coef"] - reference
        result_df["bias"] = deviation
        result_df["abs_bias"] = np.abs(deviation)

        lower, upper = result_df["2.5 %"], result_df["97.5 %"]
        covered = (reference >= lower) & (reference <= upper)
        result_df["in_ci"] = np.where(covered, 1, 0)

        # Store the enriched frame
        (trunc_dict_atte2 if is_atte else trunc_dict_ate2)[i_learners] = result_df

In [7]:
# Per threshold: mean absolute bias, estimator variance, MSE and coverage.
trunc_dict_atte3 = {}
trunc_dict_ate3 = {}

for i_learners, learners in enumerate(learner_list):
    for res in [trunc_dict_atte2, trunc_dict_ate2]:
        result_df = res[i_learners]

        for level in result_df["trim_value"].unique():
            at_level = result_df["trim_value"] == level
            group = result_df.loc[at_level]
            metrics = {
                "mean_abs_bias": group["abs_bias"].mean(),
                "var": group["coef"].var(),
                "MSE": group["bias"].pow(2).mean(),
                "coverage": group["in_ci"].sum() / group.shape[0],
            }
            # Broadcast each aggregate onto every row of this threshold
            # (the frame is modified in place).
            for name, value in metrics.items():
                result_df.loc[at_level, name] = value

        target = trunc_dict_atte3 if res is trunc_dict_atte2 else trunc_dict_ate3
        target[i_learners] = result_df

In [8]:
# Stack the per-learner frames and write one CSV file per score.
trunc_atte, trunc_ate = (
    pd.concat(per_learner, ignore_index=True)
    for per_learner in (trunc_dict_atte3, trunc_dict_ate3)
)
for frame, csv_name in (
    (trunc_atte, "trunc_atte_linear+logit_rf_gb.csv"),
    (trunc_ate, "trunc_ate_linear+logit_rf_gb.csv"),
):
    frame.to_csv(csv_name, index=False)

### Discarding

In [None]:
# Seed NumPy's global RNG in the notebook process for reproducibility.
# Note: run_discarding re-seeds the global RNG with its repetition index on
# every call, so each repetition is driven by its own seed.
np.random.seed(1234)
# Funktion für die Parallelisierung der Simulationen
def run_discarding(i, n_obs, R2_d, R2_y, true_theta, ml_g, ml_m, score):
    """Run one simulation repetition with propensity-score discarding.

    Seeds NumPy's global RNG with ``i``, draws one data set via
    ``make_irm_data``, estimates cross-fitted propensity scores once and
    then, for each threshold in ``[0.001, 0.01, 0.05, 0.1]``, calls
    ``pscore_discard`` and fits ``DoubleMLIRM`` on the returned sample with
    the returned propensity scores supplied as external predictions.

    Args:
        i: Repetition index; also used as the RNG seed.
        n_obs: Number of observations to generate.
        R2_d: Forwarded to ``make_irm_data``.
        R2_y: Forwarded to ``make_irm_data``.
        true_theta: Forwarded to ``make_irm_data`` as ``theta``.
        ml_g: Outcome learner handed to ``DoubleMLIRM``.
        ml_m: Propensity-score learner, used for the cross-fitted scores and
            handed to ``DoubleMLIRM``.
        score: ``"ATTE"`` selects the oracle effect on the treated; any
            other value selects the oracle average effect.

    Returns:
        A list with one summary DataFrame per threshold. Each summary is
        extended by the threshold, a learner label, treated / discarded
        shares, nuisance losses and the oracle effect of the kept sample.

    Note:
        ``dim_x`` is read from module scope. ``pscore_discard`` is assumed to
        return ``data_trimmed`` with the original row labels, because the
        oracle frame is sub-selected via ``data_trimmed.index`` (confirm
        against functions.py).
    """
    results_discard = []
    # Seed first: everything below that draws random numbers depends on it.
    np.random.seed(i)
    df_dict = make_irm_data(n_obs=n_obs, dim_x=dim_x, theta=true_theta, R2_d=R2_d, R2_y=R2_y)
    data = df_dict['df']
    # True propensity scores from the oracle frame.
    trueps = df_dict['df_orcl']['ps']
    # Draw the cross-fitting folds once.
    resampling_obj = DoubleMLResampling(n_folds=5, n_rep=1, n_obs=n_obs, stratify=data.D)
    smpls = resampling_obj.split_samples()

    # Estimate the propensity scores on exactly the folds drawn above.
    # Passing the splitter object instead would make cross_val_predict call
    # split() a second time; with an unseeded splitter that yields a
    # different partition than `smpls`, so the propensity scores and the
    # outcome models would be cross-fitted on inconsistent folds.
    pscore_est = cross_val_predict(ml_m, data.drop(columns=["Y", "D"]), data.D,
                                   method='predict_proba', cv=smpls[0])[:, 1]

    # Quantities that do not depend on the discarding threshold.
    learner_label = str(ml_g) + "," + str(ml_m)
    share_treated_pop = ((data['D'] == 1).sum() / data.shape[0]) * 100

    for trim_value in [0.001, 0.01, 0.05, 0.1]:
        # Discard observations based on their estimated propensity score and
        # the threshold; the helper also returns the matching sample splits.
        smpls_new, data_trimmed, pscore_trimmed = pscore_discard(data, pscore_est, smpls, trim_value)

        # Build the DoubleML data object from the kept observations.
        dml_data = DoubleMLData.from_arrays(x=data_trimmed.drop(columns=["Y", "D"]),
                                            y=data_trimmed["Y"],
                                            d=data_trimmed["D"])
        # A tiny trimming threshold is passed here; the sample has already
        # been reduced by pscore_discard above.
        dml_obj = DoubleMLIRM(dml_data, ml_g, ml_m, trimming_threshold=1e-12, draw_sample_splitting=False, score=score)
        dml_obj.set_sample_splitting(smpls_new)
        dml_obj.fit(external_predictions={"d": {"ml_m": pscore_trimmed}})
        dml_summary = dml_obj.summary
        dml_summary["trim_value"] = trim_value
        dml_summary['learner'] = learner_label
        # Share (in %) of treated observations in the full sample.
        dml_summary['share_treated_pop'] = share_treated_pop
        # Share (in %) of treated observations after discarding.
        dml_summary["share_treated_sample"] = ((data_trimmed['D'] == 1).sum() / data_trimmed.shape[0]) * 100
        # Share (in %) of estimated scores at or above 1 - trim_value.
        dml_summary['share_trimmed_top'] = (np.where(pscore_est >= (1 - trim_value), 1, 0).sum() / pscore_est.shape[0]) * 100
        # Share (in %) of estimated scores at or below trim_value.
        dml_summary['share_trimmed_bottom'] = (np.where(pscore_est <= trim_value, 1, 0).sum() / pscore_est.shape[0]) * 100
        # The same shares computed from the true propensity scores.
        dml_summary['share_trim_orcl_top'] = (np.where(trueps >= (1 - trim_value), 1, 0).sum() / trueps.shape[0]) * 100
        dml_summary['share_trim_orcl_bottom'] = (np.where(trueps <= trim_value, 1, 0).sum() / trueps.shape[0]) * 100

        # Oracle rows of the kept sub-sample, re-indexed 0..n-1 so that they
        # line up positionally with the DoubleML prediction arrays.
        df_orcl = df_dict["df_orcl"].loc[data_trimmed.index].reset_index(drop=True)
        predicted_ps = pscore_trimmed.flatten()

        # Nuisance losses as reported by DoubleML.
        dml_summary['loss_ml_g0'] = dml_obj.nuisance_loss['ml_g0'][0][0]
        dml_summary['loss_ml_g1'] = dml_obj.nuisance_loss['ml_g1'][0][0]
        dml_summary['loss_ml_m'] = dml_obj.nuisance_loss['ml_m'][0][0]

        # RMSE of the outcome predictions within each treatment arm and
        # log loss of the propensity scores.
        orcl_data_g0 = df_orcl[df_orcl['D'] == 0]
        orcl_data_g1 = df_orcl[df_orcl['D'] == 1]
        dml_summary['loss_g0'] = np.sqrt(np.mean((orcl_data_g0['Y'] - dml_obj.predictions['ml_g0'].flatten()[orcl_data_g0.index])**2)).round(6)
        dml_summary['loss_g1'] = np.sqrt(np.mean((orcl_data_g1['Y'] - dml_obj.predictions['ml_g1'].flatten()[orcl_data_g1.index])**2)).round(6)
        dml_summary['loss_m'] = -np.mean(df_orcl['D'] * np.log(predicted_ps) + (1 - df_orcl['D']) * np.log(1 - predicted_ps)).round(6)

        # Oracle effect on the kept sub-sample.
        if score == "ATTE":
            dml_summary["oracle"] = np.mean(df_orcl.loc[df_orcl.D == 1, "Y_1"] - df_orcl.loc[df_orcl.D == 1, "Y_0"])
        else:
            dml_summary["oracle"] = np.mean(df_orcl["Y_1"] - df_orcl["Y_0"])

        results_discard.append(dml_summary)
    return results_discard

# Simulation settings
R2_d = 0.8
R2_y = 0.8
dim_x = 2
n_obs = 1000
num_repetitions = 1000
true_theta = 0
# One linear outcome model paired with each propensity-score learner.
learner_list = [
    {"ml_g": LinearRegression(), "ml_m": LogisticRegressionCV()},
    {"ml_g": LinearRegression(), "ml_m": RandomForestClassifier()},
    {"ml_g": LinearRegression(), "ml_m": GradientBoostingClassifier()},
]
score_list = ["ATTE", "ATE"]

# Raw per-repetition results, keyed by the learner's position in learner_list.
disc_dict_atte1, disc_dict_ate1 = {}, {}

# Run all repetitions in parallel for every learner / score combination.
for i_learners, learners in enumerate(learner_list):
    for score in score_list:
        jobs = (
            delayed(run_discarding)(
                i, n_obs, R2_d, R2_y, true_theta,
                ml_g=clone(learners['ml_g']),
                ml_m=clone(learners['ml_m']),
                score=score,
            )
            for i in range(num_repetitions)
        )
        results_discard = Parallel(n_jobs=-1)(jobs)
        target = disc_dict_atte1 if score == "ATTE" else disc_dict_ate1
        target[i_learners] = results_discard

In [None]:
# Compute bias / absolute bias per fit and flag whether the reference value
# lies inside the confidence interval.
disc_dict_atte2 = {}
disc_dict_ate2 = {}

for i_learners, learners in enumerate(learner_list):
    for res in [disc_dict_atte1, disc_dict_ate1]:
        is_atte = res is disc_dict_atte1
        dfs = res[i_learners]
        # Stack the summaries of each repetition, then stack all repetitions.
        combined_dfs = [pd.concat(inner_list, ignore_index=True) for inner_list in dfs]
        result_df = pd.concat(combined_dfs, ignore_index=True)

        # ATTE results are compared with the per-sample oracle value,
        # ATE results with the true parameter.
        reference = result_df['oracle'] if is_atte else true_theta

        deviation = result_df['coef'] - reference
        result_df['bias'] = deviation
        result_df['abs_bias'] = np.abs(deviation)

        lower, upper = result_df["2.5 %"], result_df["97.5 %"]
        covered = (reference >= lower) & (reference <= upper)
        result_df["in_ci"] = np.where(covered, 1, 0)

        # Store the enriched frame
        (disc_dict_atte2 if is_atte else disc_dict_ate2)[i_learners] = result_df

In [None]:
# Per threshold: mean absolute bias, estimator variance, MSE and coverage.
disc_dict_atte3 = {}
disc_dict_ate3 = {}

for i_learners, learners in enumerate(learner_list):
    for res in [disc_dict_atte2, disc_dict_ate2]:
        df = res[i_learners]

        for level in df["trim_value"].unique():
            at_level = df["trim_value"] == level
            group = df.loc[at_level]
            metrics = {
                "mean_abs_bias": group["abs_bias"].mean(),
                "var": group["coef"].var(),
                "MSE": group["bias"].pow(2).mean(),
                "coverage": group["in_ci"].sum() / group.shape[0],
            }
            # Broadcast each aggregate onto every row of this threshold
            # (the frame is modified in place).
            for name, value in metrics.items():
                df.loc[at_level, name] = value

        target = disc_dict_atte3 if res is disc_dict_atte2 else disc_dict_ate3
        target[i_learners] = df

In [None]:
# Stack the per-learner frames and write one CSV file per score.
disc_atte, disc_ate = (
    pd.concat(per_learner, ignore_index=True)
    for per_learner in (disc_dict_atte3, disc_dict_ate3)
)
for frame, csv_name in (
    (disc_atte, "disc_atte_linear+logit_rf_gb.csv"),
    (disc_ate, "disc_ate_linear+logit_rf_gb.csv"),
):
    frame.to_csv(csv_name, index=False)
