In [21]:
import numpy as np

from doubleml.datasets import fetch_401K
from doubleml import DoubleMLData, DoubleMLIIVM

from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier, XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings that may matter
# (e.g. deprecations); narrow it to specific categories if any should stay visible.
warnings.filterwarnings("ignore")

# Apply the seaborn theme once. The original called set_theme() twice; the
# second call re-applied context, style and palette, making the first redundant.
sns.set_theme(font_scale=1.5)

# Read the palette only after the theme is active so `colors` holds the
# theme's palette rather than matplotlib's pre-theme colour cycle.
colors = sns.color_palette()

# White grid background with all four axis spines hidden.
sns.set_style('whitegrid', {'axes.spines.top': False,
                            'axes.spines.bottom': False,
                            'axes.spines.left': False,
                            'axes.spines.right': False})

# Default figure size (width, height) in inches for all subsequent plots.
plt.rcParams['figure.figsize'] = 10., 7.5

In [3]:
# Fetch the 401(k) dataset via doubleml.datasets.fetch_401K, requesting a
# DataFrame; later cells read the columns net_tfa, p401, e401 and the covariates.
data = fetch_401K(return_type='DataFrame')

In [7]:
# Raw difference in mean net_tfa between the e401 = 1 and e401 = 0 groups
# (no covariate adjustment): group means first, then row-wise difference.
data.groupby('e401')[['net_tfa']].mean().diff()

Unnamed: 0_level_0,net_tfa
e401,Unnamed: 1_level_1
0,
1,19559.34375


In [8]:
# Raw difference in mean net_tfa between the p401 = 1 and p401 = 0 groups
# (no covariate adjustment): group means first, then row-wise difference.
data.groupby('p401')[['net_tfa']].mean().diff()

Unnamed: 0_level_0,net_tfa
p401,Unnamed: 1_level_1
0,
1,27371.582031


In [10]:
# Columns handed to the DoubleML data container as covariates (x_cols).
x_cols = [
    'age', 'inc', 'educ', 'fsize', 'marr',
    'twoearn', 'db', 'pira', 'hown',
]

# Wrap the DataFrame and assign column roles: net_tfa is the outcome (y_col),
# p401 the treatment (d_cols), e401 the instrument (z_cols).
data_dml = DoubleMLData(
    data,
    y_col='net_tfa',
    d_cols='p401',
    z_cols='e401',
    x_cols=x_cols,
)

In [28]:
# Seed NumPy's global RNG before constructing the model. DoubleML draws its
# cross-fitting sample split at construction time from this global state, so
# without a seed the estimate changes on every Restart & Run All even though
# the forests themselves have a fixed random_state.
np.random.seed(42)

# Random-forest learners for the nuisance functions; random_state pins the
# forests' own randomness.
regressor = RandomForestRegressor(n_estimators=500, random_state=42)
classifier = RandomForestClassifier(n_estimators=500, random_state=42)

# Interactive IV model: a regressor for the outcome nuisance (ml_g) and the
# same classifier specification for both binary nuisances (ml_m, ml_r).
# Keyword arguments make the learner-to-role mapping explicit.
model = DoubleMLIIVM(data_dml, ml_g=regressor, ml_m=classifier, ml_r=classifier)

In [29]:
# Fit the model and keep its summary table; fit() hands back the model
# object, so the summary can be read off the same expression.
summary = model.fit().summary

In [30]:
# Display the summary table stored by the fitting cell as this cell's output.
summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
p401,11205.615212,2180.338598,5.139392,2.756285e-07,6932.230086,15479.000337
