In [None]:
''' 
    This jupyter notebook generates the partial dependence plots in the paper.

    This includes: Figures 8, 9, 10, and 11


    Note: This file requires installing the causalml package. Because the T-Learner in the EconML library does not offer an implementation that computes feature importance, the CausalML library was used for this purpose.
    However, due to installation issues and conflicts with Conda, the library was installed from source in a newly created virtual environment. This created the issue
    of conflicting environments, as all other scripts use another virtual environment with a more recent Python version.
    To circumvent this problem, all the simulated data generated in this notebook is saved in .pkl files. Then, in the 'TLearner_feat_imp.py' file, the simulated data .pkl files are
    unpacked and the T-Learner is estimated. The results are finally saved in a .npy file.

    Since TLearner_feat_imp.py runs in another environment, the subprocess package was used to reconcile both virtual environments.

    Make sure to provide the correct environment where the CausalML package is installed so that the program can run properly. 
    
    See https://causalml.readthedocs.io/en/latest/installation.html to install the library and https://github.com/uber/causalml/issues/678 to troubleshoot potential installation
    issues. 
'''


In [14]:
import pandas as pd
import numpy as np
from Gen_data import SimulationStudy
import Methods_all_sample as method
from Analysis_new import get_split
import subprocess
import pickle
import feature_importance

In [15]:
#Persist data in a .pkl file so that both the EconML methods and the causalml T-learner read the same data
def save_to_pkl(data_name, data):
    """Pickle ``data`` into the file ``<data_name>.pkl``.

    Args:
        data_name: Target file name (or path) without the ``.pkl`` suffix;
            the suffix is appended here.
        data: Any picklable object.

    An existing file with the same name is overwritten (mode ``'wb'``).
    """
    target_name = f'{data_name}.pkl'
    with open(target_name, 'wb') as out_stream:
        # Pickler(...).dump is what pickle.dump does under the hood.
        pickle.Pickler(out_stream).dump(data)

In [16]:
#Generate simulation and split into train and test
def get_sim(function: str, seed: int = 220924) -> tuple[dict, dict]:
    """Simulate one dataset and package its train/test split into two dictionaries.

    Args:
        function: Passed to ``SimulationStudy`` as ``non_linear``; the notebook
            uses ``'linear'`` and ``'quadratic'``.
        seed: Seed forwarded to ``SimulationStudy``. Defaults to the value that
            was previously hard-coded, so existing calls behave as before.

    Returns:
        A pair ``(df_dict_TLearner, df_dict)``:

        * ``df_dict_TLearner`` holds ``train_df``, ``X_train``, ``Y_train``,
          ``T_train``, ``X_test`` and ``T_test``.
        * ``df_dict`` holds ``Y_train``, ``T_train``, ``X_train``, ``X_test``,
          ``T_test``, ``true_cate_train`` and ``true_cate_test`` -- exactly the
          keys that ``get_data`` reads.
    """
    sim: SimulationStudy = SimulationStudy(p=35, mean_correlation=0.5, cor_variance=0.2, n=4000,
                                           no_feat_cate=3, non_linear=function, seed=seed)
    simulation = sim.create_dataset()

    # get_split returns ten values; the test frame and test outcome are not
    # needed by either dictionary, hence the underscore-prefixed names.
    (train_df, _test_df, X_train, Y_train, T_train,
     X_test, T_test, _Y_test, true_cate_train, true_cate_test) = get_split(simulation)

    df_dict_TLearner = {'train_df': train_df, 'X_train': X_train, 'Y_train': Y_train, 'T_train': T_train,
                        'X_test': X_test, 'T_test': T_test}

    df_dict = {'Y_train': Y_train, 'T_train': T_train, 'X_train': X_train, 'X_test': X_test, 'T_test': T_test,
               'true_cate_train': true_cate_train, 'true_cate_test': true_cate_test}

    return df_dict_TLearner, df_dict



In [17]:
#Save every entry of a simulation dictionary through the save_to_pkl function
def save_simulation(sim_type: dict):
    """Write each entry of ``sim_type`` to its own pickle file.

    Args:
        sim_type: Mapping from a data name to the object to store. Each key is
            handed to ``save_to_pkl`` as the file name (without ``.pkl``) and
            each value as the payload. Despite the parameter name this is a
            dictionary, not a string: ``.items()`` is called on it.
    """
    for data_name, data in sim_type.items():
        save_to_pkl(data_name, data)


In [18]:
#Extract the necessary data from the dictionary
def get_data(dict: dict) -> tuple:
    """Pull the seven entries used by the estimators out of a simulation dictionary.

    Args:
        dict: Mapping that contains at least the keys listed below. The
            parameter name shadows the builtin inside this function; it is kept
            so that keyword callers (``get_data(dict=...)``) continue to work.

    Returns:
        A 7-tuple, in this order: ``Y_train``, ``T_train``, ``X_train``,
        ``X_test``, ``T_test``, ``true_cate_train``, ``true_cate_test``.

    Raises:
        KeyError: If any of the seven keys is missing.
    """
    ordered_keys = ('Y_train', 'T_train', 'X_train', 'X_test', 'T_test',
                    'true_cate_train', 'true_cate_test')
    return tuple(dict[key] for key in ordered_keys)


In [19]:
#Run TLearner_feat_imp.py with the interpreter of the environment where causalml is installed
def run_TLearner_causalml(env = r'C:\Users\joaov\anaconda3\envs\causalml-py38\python.exe'):
    """Execute ``TLearner_feat_imp.py`` with another Python interpreter.

    Args:
        env: Path to the Python executable of the environment in which the
            causalml package is installed. The default is a machine-specific
            path; pass your own interpreter path when running elsewhere.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status. Without this check a failed run would go unnoticed and
            later steps could read output left over from a previous run.
        FileNotFoundError: If ``env`` does not point to an existing executable.

    The script name is relative, so it is resolved against the current working
    directory.
    """
    subprocess.run([env, 'TLearner_feat_imp.py'], check=True)

In [20]:
#Get the partial dependence plots from each estimator
def get_plots(data_dict: dict, estimator, setting: str):
    """Fit the requested estimator and draw its plots for one simulation setting.

    Args:
        data_dict: Simulation dictionary that is unpacked with ``get_data``.
        estimator: ``'T-Learner'``, ``'GRF'`` or ``'CF DML'`` select the
            corresponding branch; any other value (the notebook passes
            ``'OLS'``) falls through to the OLS estimator.
        setting: Forwarded unchanged to the plotting helpers.

    Returns:
        None. The plotting helpers in ``feature_importance`` are called for
        their side effects.
    """
    Y_train, T_train, X_train, X_test, T_test, true_cate_train, true_cate_test = get_data(data_dict)

    if estimator == 'T-Learner':
        # Only the fitted estimator is needed; the other four outputs are discarded.
        est, _, _, _, _ = method.TLearner_estimator(Y_train, T_train, X_train, X_test, T_test,
                                                    true_cate_train, true_cate_test)

        # The importances are read from a .npy file in the working directory --
        # presumably the one written by TLearner_feat_imp.py (confirm there).
        # The array wraps a single object (.item()); its entry under key 1 is
        # indexed by the features to plot.
        stored_importance = np.load('feat_importance_TLearner.npy', allow_pickle=True)
        ranked_feats = stored_importance.item()[1].index

        feature_importance.partial_dependence_plots_ML(X_test, ranked_feats, est = est, setting=setting)
        return

    if estimator == 'GRF':
        fit_estimator = method.GRF_estimator
    elif estimator == 'CF DML':
        fit_estimator = method.CF_DML
    else:
        # Fallback branch: OLS plots use coefficients and confidence intervals
        # instead of a fitted estimator object.
        conf_int, coeff, _, _, _, _ = method.ols_estimator(Y_train=Y_train, T_train=T_train, X_train=X_train,
                                                           X_test=X_test, T_test=T_test,
                                                           true_cate_train=true_cate_train,
                                                           true_cate_test=true_cate_test, ci=True)
        feature_importance.plot_OLS(X_test, coeff, conf_int, setting=setting)
        return

    # GRF and CF DML share the same call signature and post-processing.
    est, raw_importance, _, _, _, _ = fit_estimator(Y_train, T_train, X_train, X_test, T_test,
                                                    true_cate_train, true_cate_test)
    selected_feats = feature_importance.get_important_feats(X_test, raw_importance)
    feature_importance.partial_dependence_plots_ML(X_test, selected_feats, est = est, setting=setting)
        


        


Linear Setting

In [21]:
# Seed NumPy's global RNG before simulating the linear setting
np.random.seed(220924)
df_linear_T, sim_lin = get_sim('linear')
# The pickled data is what the external T-Learner script reads
save_simulation(df_linear_T)
run_TLearner_causalml() #make sure to set env=path where the causalml package is installed
# Same estimator order as before: OLS, T-Learner, GRF, CF DML
for estimator_name in ('OLS', 'T-Learner', 'GRF', 'CF DML'):
    get_plots(sim_lin, estimator_name, setting='linear')

Quadratic Setting

In [22]:
# Seed NumPy's global RNG before simulating the quadratic setting
np.random.seed(220924)
df_quadratic_T, sim_quad = get_sim('quadratic')
# The pickled data is what the external T-Learner script reads
save_simulation(df_quadratic_T)
run_TLearner_causalml() #make sure to set env=path where the causalml package is installed
# Same estimator order as before: OLS, T-Learner, GRF, CF DML
for estimator_name in ('OLS', 'T-Learner', 'GRF', 'CF DML'):
    get_plots(sim_quad, estimator_name, setting='quadratic')

Check that both libraries deliver the exact same CATEs

In [23]:
#cate_t_causalml= np.load('causalml_cate_tlearner.npy', allow_pickle=True)

In [24]:
#plt.figure(figsize=(8, 6))
#plt.hist([estimated_cate_test, cate_t_causalml.reshape(1,-1).flatten()], bins=30, alpha=0.5, label=['Econ ML', 'CausalML'])
#plt.xlabel('CATE Estimates')
#plt.ylabel('Frequency')
#plt.title('Overlap of CATE Estimates from EconML and CausalML')
#plt.legend(loc='upper right')
#plt.show()