In [1]:
# Imports
import pandas as pd
import numpy as np
import doubleml as dml
from doubleml.datasets import fetch_401K
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.base import clone

import torch
import os
import time
from functools import partial

In [2]:
# Set up basic model: Specify variables for data-backend
features_base = ['age', 'inc', 'educ', 'fsize', 'marr',
                 'twoearn', 'db', 'pira', 'hown']

# Initialize DoubleMLData (data-backend of DoubleML)
Data = fetch_401K(return_type='DataFrame')
data_dml_base = dml.DoubleMLData(Data,
                                 y_col='net_tfa',
                                 d_cols='e401',
                                 x_cols=features_base)

In [3]:
# Random Forest (IRM)
randomForest = RandomForestRegressor(n_estimators=1000,max_features=3, max_depth=3, min_samples_leaf=3)
randomForest_class = RandomForestClassifier(n_estimators=1000,max_features=3, max_depth=3, min_samples_leaf=3)

np.random.seed(1)
dml_irm_forest = dml.DoubleMLIRM(data_dml_base,
                                 ml_g = randomForest,
                                 ml_m = randomForest_class,
                                 trimming_threshold = 0.01,
                                 n_folds = 5,
                                 score = "ATTE",
                                 n_rep = 100)

dml_irm_forest.fit(store_predictions=True)
forest_summary = dml_irm_forest.summary

print(forest_summary)

              coef      std err         t         P>|t|       2.5 %  \
e401  11618.083022  1437.423322  8.082576  6.341287e-16  8800.78508   

            97.5 %  
e401  14435.380964  


In [4]:
# Random Forest (PLR)
learner = RandomForestRegressor(n_estimators=1000,max_features=3, max_depth=3, min_samples_leaf=3)

ml_l = clone(learner)

ml_m = clone(learner)

np.random.seed(1)

dml_plr_obj = dml.DoubleMLPLR(data_dml_base, ml_l, ml_m,n_folds = 5,n_rep = 100)

print(dml_plr_obj.fit())


------------------ Data summary      ------------------
Outcome variable: net_tfa
Treatment variable(s): ['e401']
Covariates: ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown']
Instrument variable(s): None
No. Observations: 9915

------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2

------------------ Machine learner   ------------------
Learner ml_l: RandomForestRegressor(max_depth=3, max_features=3, min_samples_leaf=3,
                      n_estimators=1000)
Learner ml_m: RandomForestRegressor(max_depth=3, max_features=3, min_samples_leaf=3,
                      n_estimators=1000)
Out-of-sample Performance:
Learner ml_l RMSE: [[56267.89713491]
 [56020.51386144]
 [55680.80664099]
 [55977.74999314]
 [55956.8797679 ]
 [56092.29973621]
 [56491.43887508]
 [56422.88652047]
 [56195.37525486]
 [56505.37197798]
 [56806.95644952]
 [56228.58443031]
 [56328.310315  ]
 [55940.85433287]
 [55881.26906432]
 [55557.525051