# AutoDoubleML - Automated Double Machine Learning
This is the first trial proof of concept notebook for the AutoDoubleML project.

The example follows the example of analysing the the impact of 401(k) on financial wealth from [Chernozhukov et al. (2018)](https://arxiv.org/abs/1608.00060).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

from doubleml import DoubleMLData, DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM
import doubleml as dml

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

## Data preparation

In [2]:
from doubleml.datasets import fetch_401K
data = fetch_401K(return_type='DataFrame')
data.head()

Unnamed: 0,nifa,net_tfa,tw,age,inc,fsize,educ,db,marr,twoearn,e401,p401,pira,hown
0,0.0,0.0,4500.0,47,6765.0,2,8,0,0,0,0,0,0,1
1,6215.0,1015.0,22390.0,36,28452.0,1,16,0,0,0,0,0,0,1
2,0.0,-2000.0,-2000.0,37,3300.0,6,12,1,0,0,0,0,0,0
3,15000.0,15000.0,155000.0,58,52590.0,2,16,0,1,1,0,0,0,1
4,0.0,0.0,58000.0,32,21804.0,1,11,0,0,0,0,0,0,1


## Data Backend

In [3]:
# Set up basic model: Specify variables for data-backend
features_base = ['age', 'inc', 'educ', 'fsize', 'marr',
                 'twoearn', 'db', 'pira', 'hown']

# Initialize DoubleMLData (data-backend of DoubleML)
data_dml_base = dml.DoubleMLData(data,
                                 y_col='net_tfa',
                                 d_cols='e401',
                                 x_cols=features_base)
print(data_dml_base)


------------------ Data summary      ------------------
Outcome variable: net_tfa
Treatment variable(s): ['e401']
Covariates: ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown']
Instrument variable(s): None
No. Observations: 9915

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9915 entries, 0 to 9914
Columns: 14 entries, nifa to hown
dtypes: float32(4), int8(10)
memory usage: 251.9 KB



### Make pipline

In [4]:
randomForest1 = make_pipeline(StandardScaler(), RandomForestRegressor())
randomForest2 = make_pipeline(StandardScaler(), RandomForestRegressor())
dml_plr_base = dml.DoubleMLPLR(data_dml_base, randomForest1, randomForest2, n_folds=3)

# Fit the model
dml_plr_base.fit()
dml_plr_base.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
e401,9961.255586,1355.560753,7.348439,2.005349e-13,7304.405331,12618.10584


# The AutoSKLearn example

In [5]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
import autosklearn.regression

In [11]:
x_train, x_test, y_train, y_test = train_test_split(data_dml_base.data[features_base], data_dml_base.data['net_tfa'], test_size=0.33, random_state=42)
x_train

Unnamed: 0,age,inc,educ,fsize,marr,twoearn,db,pira,hown
3844,33,19200.0,17,1,0,0,0,0,0
4280,25,34794.0,12,3,1,1,1,0,1
1221,36,65505.0,17,4,1,1,1,0,1
4906,32,27960.0,12,1,0,0,0,0,0
2548,60,35277.0,12,3,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
5734,55,36552.0,14,5,1,0,0,1,1
5191,52,28776.0,12,5,1,1,0,0,1
5390,29,28800.0,14,1,0,0,1,0,0
860,62,22974.0,12,2,1,1,0,0,1


In [17]:
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=600,
    max_models_on_disc=5,
    memory_limit = 223372036854775807,
    ensemble_size = 3,
)
automl.fit(x_train, y_train, dataset_name="diabetes")

  automl = autosklearn.regression.AutoSklearnRegressor(


[ERROR] [2023-05-06 16:12:10,375:Client-AutoML(1):diabetes] (' Dummy prediction failed with run state StatusType.CRASHED and additional output: {\'error\': \'Result queue is empty\', \'exit_status\': "<class \'pynisher.limit_function_call.AnythingException\'>", \'subprocess_stdout\': \'\', \'subprocess_stderr\': \'Process pynisher function call:\\nTraceback (most recent call last):\\n  File "/Users/joepaul/opt/anaconda3/envs/AutoDoubleML/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap\\n    self.run()\\n  File "/Users/joepaul/opt/anaconda3/envs/AutoDoubleML/lib/python3.11/multiprocessing/process.py", line 108, in run\\n    self._target(*self._args, **self._kwargs)\\n  File "/Users/joepaul/opt/anaconda3/envs/AutoDoubleML/lib/python3.11/site-packages/pynisher/limit_function_call.py", line 108, in subprocess_func\\n    resource.setrlimit(resource.RLIMIT_AS, (mem_in_b, mem_in_b))\\nOverflowError: Python int too large to convert to C long\\n\', \'exitcode\': 1, \'configu

ValueError: (' Dummy prediction failed with run state StatusType.CRASHED and additional output: {\'error\': \'Result queue is empty\', \'exit_status\': "<class \'pynisher.limit_function_call.AnythingException\'>", \'subprocess_stdout\': \'\', \'subprocess_stderr\': \'Process pynisher function call:\\nTraceback (most recent call last):\\n  File "/Users/joepaul/opt/anaconda3/envs/AutoDoubleML/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap\\n    self.run()\\n  File "/Users/joepaul/opt/anaconda3/envs/AutoDoubleML/lib/python3.11/multiprocessing/process.py", line 108, in run\\n    self._target(*self._args, **self._kwargs)\\n  File "/Users/joepaul/opt/anaconda3/envs/AutoDoubleML/lib/python3.11/site-packages/pynisher/limit_function_call.py", line 108, in subprocess_func\\n    resource.setrlimit(resource.RLIMIT_AS, (mem_in_b, mem_in_b))\\nOverflowError: Python int too large to convert to C long\\n\', \'exitcode\': 1, \'configuration_origin\': \'DUMMY\'}.',)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (4057355053.py, line 1)