# Double ML - modeling

`df_mix`

## 0. setup

In [93]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import LabelEncoder
import sklearn
import os
from matplotlib.pyplot import hist
import scipy.stats as stats
import math
import statsmodels.api as sm

In [94]:
# set random seed for numpy
RANDOM_SEED=42
np.random.seed(RANDOM_SEED)

In [95]:
def find_p(estimate, std):
    z_value = estimate / std
    p_value = stats.norm.sf(abs(z_value))*2
    return round(estimate, 4), round(std, 4), round(p_value, 4)

In [96]:
def label_encode_column(df, column):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    return df

## 1. functions

### 1.1 Specify Nuisance Function Models

The next step is to specify models for 

*   $\mu(z)=\mathbb{E}(Y|x)$
*   $m(x) = P(A=1|x)$

In [97]:
# make a function that returns a sklearn model for later use in k-folding
def make_mu_model():
  #return KNeighborsClassifier(n_neighbors=300)
  return RandomForestRegressor(random_state=RANDOM_SEED, n_estimators=300, max_depth=None)
  #return RandomForestClassifier(n_estimators=100, max_depth=5)

# specify a model for m(z,x)
def make_m_model():
  #return LogisticRegression(max_iter=1000, warm_start=True, random_state=RANDOM_SEED)
  return RandomForestClassifier(n_estimators=200, max_depth=None)

### 1.2 Functions that use cross fitting to get predicted $\hat{\mu}$, $\hat{m}$, $\hat{p}$ for each unit

In [98]:
# helper functions to implement the cross fitting

def m_k_fold_fit_and_predict(make_model, X:pd.DataFrame, A:np.array, n_splits:int):
    """
    Implements K fold cross-fitting for the model predicting the outcome Y. 
    That is, 
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns two arrays containing the predictions for all units untreated, all units treated  

    Args:
    model: function that returns sklearn model (that implements fit and either predict_prob or predict)
    X: dataframe of variables to adjust for
    A: array of treatments
    n_splits: number of splits to use
    """
    predictions = np.full_like(A, np.nan, dtype=float)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    for train_index, test_index in kf.split(X, A):
      X_train = X.loc[train_index]
      A_train = A[train_index]

      m = make_model()
      m.fit(X_train, A_train)
      predictions[test_index] = m.predict_proba(X.loc[test_index])[:,1]

    assert np.isnan(predictions).sum() == 0  # Ensure no predictions are NaN
    return predictions

def mu_k_fold_fit_and_predict(make_model, X:pd.DataFrame, y:np.array, n_splits:int, output_type:str):
    """
    Implements K fold cross-fitting for the model predicting the outcome Y. 
    That is, 
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns two arrays containing the predictions for all units untreated, all units treated  

    Args:
    model: function that returns sklearn model (that implements fit and either predict_prob or predict)
    X: dataframe of variables to adjust for
    y: array of outcomes
    n_splits: number of splits to use
    output_type: type of outcome, "binary" or "continuous"

    """
    predictions = np.full_like(y, np.nan, dtype=float)
    if output_type == 'binary':
      kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    elif output_type == 'continuous':
      kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    
    for train_index, test_index in kf.split(X, y):
      X_train = X.loc[train_index]
      y_train = y.loc[train_index]
      mu = make_model()
      mu.fit(X_train, y_train)

      if output_type =='binary':
        predictions[test_index] = mu.predict_proba(X.loc[test_index])[:, 1]
      elif output_type == 'continuous':
        predictions[test_index] = mu.predict(X.loc[test_index])

    assert np.isnan(predictions).sum() == 0
    return predictions

### 1.3 ATE

In [99]:
def ate_estimator(X, y, n_splits=5):
    """
    Fit a logistic regression model as a proxy for a probit model to predict the binary outcome.
    Args:
    X: DataFrame of predictors, including treatment and control variables.
    y: Binary outcome variable.
    n_splits: Number of splits for cross-validation.

    Returns:
    A fitted LogisticRegression model.
    """
    # Initialize cross-validator
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    predictions = np.zeros(len(y))  # Initialize predictions array
    
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train = y[train_index]

        # Fit logistic regression as a proxy for probit
        model = LogisticRegression(max_iter=1000, solver='liblinear')
        model.fit(X_train, y_train)

        # Predict probabilities for the positive class
        predictions[test_index] = model.predict_proba(X_test)[:, 1]

    return model


### 1.4 Run a trial

In [100]:
def run(df, outcome_l, treatment_l, block_l, fe, stationary_c):
    df_1 = df[outcome_l + treatment_l + block_l].dropna()

    outcome = df_1[outcome_l].reset_index(drop=True).squeeze()
    treatment = df_1[treatment_l].reset_index(drop=True).squeeze()
    block = df_1[block_l].reset_index(drop=True).squeeze()

    # Combining treatment and block variables for modeling
    X = pd.concat([treatment.reset_index(drop=True), block.reset_index(drop=True)], axis=1)
    y = outcome

    model = ate_estimator(X, y)  
    tau_hat = model.coef_[0].mean()  # Estimated effect
    std_hat = model.coef_[0].std()  # Standard error
    tau_hat, std_hat, p = find_p(tau_hat, std_hat) # p-value
    return outcome_l[0], treatment_l[0], tau_hat, std_hat, p, fe, stationary_c


## 2. Analysis

### 2.1 `df_mix`

In [101]:
# read in the dataframe
df = pd.read_csv('df_mix.csv')

PermissionError: [Errno 1] Operation not permitted: 'df_mix.csv'

In [None]:
# in order to run random forest with categorical variable
df = label_encode_column(df, 'country')

In [None]:
res = pd.DataFrame(columns=['outcome', 'treatment', 'tau_hat', 'std_hat', 'p_val', 'fixed_effects', 'stationary_controls'])

In [None]:
df.columns

Index(['country', 't', 'onset2COWCS', 'decade', 'democracy', 'logmountain',
       'ethnic_fractionalization', 'religion_fractionalization',
       'language_fractionalization', 'leg_british', 'opec', 'logpop_M_diff',
       'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned',
       'oilreserves_full', 'oilreserves', 'oilreserves_public', '1', '2', '6',
       '10', '19'],
      dtype='object')

In [None]:
def run_all(df, outcome_l, treatment_l, block_fe_l, block_sta_l, block_other_l):
    '''
    For a given treatment i.e. sector.
    '''
    res = pd.DataFrame(columns=['outcome', 'treatment', 'tau_hat', 'std_hat', 'p_val', 'fixed_effects', 'stationary_controls'])

    for fe in [True, False]:
        block_l = block_other_l[:]  # Initialize block_l with a copy of block_other_l
        if fe:
            block_l += block_fe_l

        for sta in [True, False]:
            if sta:
                block_l += block_sta_l

            res_row = run(df, outcome_l, treatment_l, block_l, fe, sta)
            print(res_row)
            res.loc[len(res)] = list(res_row)
    return res


Treatment: oilreserves

In [None]:
oilreserves_res = run_all(df, 
        outcome_l = ['onset2COWCS'], 
        treatment_l = ['oilreserves'], 
        block_fe_l = ['country', 't'], 
        block_sta_l = ['logmountain', 'ethnic_fractionalization', 'religion_fractionalization',
                       'language_fractionalization'], 
        block_other_l = ['democracy', 'logpop_M_diff', 'logpopdens_diff', 
                         'ecgrowth_demeaned'])

('onset2COWCS', 'oilreserves', -0.0193, 0.0981, 0.844, True, True)
('onset2COWCS', 'oilreserves', -0.0193, 0.0981, 0.844, True, False)
('onset2COWCS', 'oilreserves', -0.0479, 0.2312, 0.8359, False, True)
('onset2COWCS', 'oilreserves', -0.0479, 0.2312, 0.8359, False, False)


In [None]:
oilreserves_res

Unnamed: 0,outcome,treatment,tau_hat,std_hat,p_val,fixed_effects,stationary_controls
0,onset2COWCS,oilreserves,-0.0193,0.0981,0.844,True,True
1,onset2COWCS,oilreserves,-0.0193,0.0981,0.844,True,False
2,onset2COWCS,oilreserves,-0.0479,0.2312,0.8359,False,True
3,onset2COWCS,oilreserves,-0.0479,0.2312,0.8359,False,False


Treatment: oilreserves_full

In [None]:
oilreserves_full_res = run_all(df, 
        outcome_l = ['onset2COWCS'], 
        treatment_l = ['oilreserves_full'], 
        block_fe_l = ['country', 't'], 
        block_sta_l = ['logmountain', 'ethnic_fractionalization', 'religion_fractionalization',
                       'language_fractionalization', 'leg_british', 'opec'], 
        block_other_l = ['democracy', 'logpop_M_diff', 'logpopdens_diff', 
                         'logoutreg_diff', 'ecgrowth_demeaned'])

('onset2COWCS', 'oilreserves_full', -0.0002, 0.0005, 0.772, True, True)
('onset2COWCS', 'oilreserves_full', -0.0002, 0.0005, 0.772, True, False)
('onset2COWCS', 'oilreserves_full', -0.0, 0.0, 0.7548, False, True)
('onset2COWCS', 'oilreserves_full', -0.0, 0.0, 0.7548, False, False)


In [None]:
oilreserves_full_res


Unnamed: 0,outcome,treatment,tau_hat,std_hat,p_val,fixed_effects,stationary_controls
0,onset2COWCS,oilreserves_full,-0.0002,0.0005,0.772,True,True
1,onset2COWCS,oilreserves_full,-0.0002,0.0005,0.772,True,False
2,onset2COWCS,oilreserves_full,-0.0,0.0,0.7548,False,True
3,onset2COWCS,oilreserves_full,-0.0,0.0,0.7548,False,False


Treatment: oilreserves_public

In [None]:
oilreserves_public_res = run_all(df, 
        outcome_l = ['onset2COWCS'], 
        treatment_l = ['oilreserves_public'], 
        block_fe_l = ['country', 't'], 
        block_sta_l = ['logmountain', 'ethnic_fractionalization', 'religion_fractionalization',
                       'language_fractionalization', 'leg_british', 'opec'], 
        block_other_l = ['democracy', 'logpop_M_diff', 'logpopdens_diff', 
                         'logoutreg_diff', 'ecgrowth_demeaned'])

('onset2COWCS', 'oilreserves_public', 0.0003, 0.0039, 0.934, True, True)
('onset2COWCS', 'oilreserves_public', 0.0003, 0.0039, 0.934, True, False)
('onset2COWCS', 'oilreserves_public', -0.009, 0.1568, 0.9542, False, True)
('onset2COWCS', 'oilreserves_public', -0.009, 0.1568, 0.9542, False, False)


In [None]:
oilreserves_public_res

Unnamed: 0,outcome,treatment,tau_hat,std_hat,p_val,fixed_effects,stationary_controls
0,onset2COWCS,oilreserves_public,0.0003,0.0039,0.934,True,True
1,onset2COWCS,oilreserves_public,0.0003,0.0039,0.934,True,False
2,onset2COWCS,oilreserves_public,-0.009,0.1568,0.9542,False,True
3,onset2COWCS,oilreserves_public,-0.009,0.1568,0.9542,False,False


Final Result:

In [None]:
dfs = [oilreserves_res, oilreserves_full_res, oilreserves_public_res]
stacked_df = pd.concat(dfs)
final_res = stacked_df.reset_index(drop=True)

In [None]:
final_res.insert(0, 'gvc_type', 'mix')
final_res

Unnamed: 0,gvc_type,outcome,treatment,tau_hat,std_hat,p_val,fixed_effects,stationary_controls
0,mix,onset2COWCS,oilreserves,-0.0193,0.0981,0.844,True,True
1,mix,onset2COWCS,oilreserves,-0.0193,0.0981,0.844,True,False
2,mix,onset2COWCS,oilreserves,-0.0479,0.2312,0.8359,False,True
3,mix,onset2COWCS,oilreserves,-0.0479,0.2312,0.8359,False,False
4,mix,onset2COWCS,oilreserves_full,-0.0002,0.0005,0.772,True,True
5,mix,onset2COWCS,oilreserves_full,-0.0002,0.0005,0.772,True,False
6,mix,onset2COWCS,oilreserves_full,-0.0,0.0,0.7548,False,True
7,mix,onset2COWCS,oilreserves_full,-0.0,0.0,0.7548,False,False
8,mix,onset2COWCS,oilreserves_public,0.0003,0.0039,0.934,True,True
9,mix,onset2COWCS,oilreserves_public,0.0003,0.0039,0.934,True,False


In [None]:
final_res.to_csv('mix_res.csv', index=False)

PermissionError: [Errno 1] Operation not permitted: 'mix_res.csv'

## Appendix:

For estimating the local average treatment effect under the monotone instrument assumption, there is a double-machine learning approach that works with generic supervised learning approaches. Here, we want an estimator $\hat{\tau}^{\mathrm{LATE}}$ for the parameter
$$
\tau^{\mathrm{LATE}}=\frac{\mathbb{E}[\mathbb{E}[Y \mid X, Z=1]-\mathbb{E}[Y \mid X, Z=0]]}{\mathbb{E}[\mathrm{P}(A=1 \mid X, Z=1)-\mathrm{P}(A=1 \mid X, Z=0)]}
$$
To define the estimator, it's convenient to introduce some additional notation. First, we define the nuisance functions:
$$
\begin{aligned}
\mu(z, x) & =\mathbb{E}[Y \mid z, x] \\
m(z, x) & =\mathrm{P}(A=1 \mid x, z) \\
p(x) & =\mathrm{P}(Z=1 \mid x) .
\end{aligned}
$$
We also define the score $\phi$ by:
$$
\begin{aligned}
& \phi_{Z \rightarrow Y}(\mathbf{X} ; \mu, p) \triangleq \mu(1, X)-\mu(0, X)+\frac{Z(Y-\mu(1, X))}{p(X)}-\frac{(1-Z)(Y-\mu(0, X))}{1-p(X)} \\
& \phi_{Z \rightarrow A}(\mathbf{X} ; m, p) \triangleq m(1, X)-m(0, X)+\frac{Z(A-m(1, X))}{p(X)}-\frac{(1-Z)(A-m(0, X))}{1-p(X)} \\
& \phi(\mathbf{X} ; \mu, m, p, \tau) \triangleq \phi_{Z \rightarrow Y}(\mathbf{X} ; \mu, p)-\phi_{Z \rightarrow A}(\mathbf{X} ; m, p) \times \tau
\end{aligned}
$$
Then, the estimator is defined by a two stage procedure:
1. Fit models $\hat{\mu}, \hat{m}, \hat{p}$ for each of $\mu, m, p$ (using supervised machine learning).
2. Define $\hat{\tau}^{\mathrm{LATE}}$ as the solution to $\frac{1}{n} \sum_i \phi\left(\mathbf{X}_i ; \hat{\mu}, \hat{m}, \hat{p}, \hat{\tau}^{\mathrm{LATE}}\right)=0$. That is,
$$
\hat{\tau}^{\mathrm{LATE}}=\frac{\frac{1}{n} \sum_i \phi_{Z \rightarrow Y}\left(\mathbf{X}_i ; \hat{\mu}, \hat{p}\right)}{\frac{1}{n} \sum_i \phi_{Z \rightarrow A}\left(\mathbf{X}_i ; \hat{m}, \hat{p}\right)}
$$
It may help intuitions to notice that the double machine learning estimator of the LATE is effectively the double machine learning estimator of of the average treatment effect of $Z$ on $Y$ divided by the double machine learning estimator of the average treatment effect of $Z$ on $A$.
The nuisance functions can be estimated by:
1. fit a model $\hat{\mu}$ that predicts $Y$ from $Z, X$ by minimizing mean square error
2. fit a model $\hat{m}$ that predicts $A$ from $Z, X$ by minimizing mean cross-entropy
3. fit a model $\hat{p}$ that predicts $Z$ from $X$ by minimizing mean cross-entropy.