In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import LabelEncoder
import sklearn
import os
from matplotlib.pyplot import hist
import scipy.stats as stats
import math
import statsmodels.api as sm


In [2]:
# read in the dataframe
df = pd.read_csv('df_bp.csv')

In [3]:
df1 = pd.read_csv('../../data/GVC_data/transportIV_file.csv')
df1 = df1.loc[:, ['country', 't', 'trans_outp_p']]

df = pd.merge(df, df1, on=['country', 't'])

In [4]:
print(df.dtypes)

country                        object
t                               int64
onset2COWCS                   float64
milexp_pergdpSIPRI            float64
decade                        float64
democracy                     float64
logmountain                   float64
ethnic_fractionalization      float64
religion_fractionalization    float64
language_fractionalization    float64
leg_british                   float64
opec                          float64
logpop_M_diff                 float64
logpopdens_diff               float64
logoutreg_diff                float64
ecgrowth_demeaned             float64
treat_agri                    float64
treat_mine                    float64
treat_fuel                    float64
treat_metal                   float64
iv_transport                  float64
iv_agri                       float64
iv_mine                       float64
iv_fuel                       float64
iv_metal                      float64
trans_outp_p                  float64
dtype: objec

In [5]:
num_countries = df['country'].nunique()
print(f"There are {num_countries} unique countries in the dataset.")

There are 181 unique countries in the dataset.


Without Country Fixed effect

In [6]:
import pandas as pd
import numpy as np
import doubleml as dml
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

# Define the machine learning model for the treatment effect (ml_m) and outcome (ml_l)
learner = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=1111)
ml_m = clone(learner)  # Machine learning model for the treatment effect
ml_l = clone(learner)  # Machine learning model for the outcome

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
import doubleml as dml

def run_plr(df, outcome_l, treatment_l, block_l, fe, stationary_c):
    # Select the relevant columns and drop NA
    df_subset = df[outcome_l + treatment_l + block_l].dropna()

    # Prepare the data for DoubleML
    obj_dml_data = dml.DoubleMLData(df_subset, y_col=outcome_l[0], d_cols=treatment_l[0], x_cols=block_l)

    # Define the machine learning models
    learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=1111)
    ml_m = clone(learner)
    ml_l = clone(learner)

    # Set up the PLR model
    dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=10)

    # Fit the PLR model and get the summary
    plr_results = dml_plr.fit()
    summary = plr_results.summary  # Access summary as an attribute, not a method

    return summary

In [8]:
def run_all(df, outcome_l, treatment_l, block_fe_l, block_sta_l, block_other_l):
    # Adjusted the columns to store summaries in a list since they're DataFrames now
    summaries = []

    for fe in [True, False]:
        block_l = block_other_l.copy()
        if fe:
            # Exclude 'country' from fixed effects
            block_l += [b for b in block_fe_l if b != 'country']
        for sta in [True, False]:
            if sta:
                block_l += block_sta_l
            summary = run_plr(df, outcome_l, treatment_l, block_l, fe, sta)
            summaries.append(summary)  # Append the summary DataFrame to the list

    return summaries  # Return the list of summary DataFrames

In [9]:
plr_summaries = run_all(df, 
                        outcome_l=['milexp_pergdpSIPRI'], 
                        treatment_l=['treat_fuel'],
                        block_fe_l=['country', 't'],  # 'country' will be ignored in the fixed effects
                        block_sta_l=['logmountain', 'ethnic_fractionalization', 'religion_fractionalization', 'language_fractionalization'], 
                        block_other_l=['democracy', 'logpopdens_diff', 'ecgrowth_demeaned'])

In [10]:
print(plr_summaries)

[                  coef     std err         t     P>|t|       2.5 %      97.5 %
treat_fuel -240.037501  318.343405 -0.754021  0.450837 -863.979109  383.904107,                   coef     std err         t     P>|t|       2.5 %      97.5 %
treat_fuel -229.189675  306.105725 -0.748727  0.454022 -829.145872  370.766521,                   coef     std err         t     P>|t|       2.5 %      97.5 %
treat_fuel -224.742989  303.956382 -0.739392  0.459669 -820.486551  371.000573,                   coef     std err         t     P>|t|       2.5 %      97.5 %
treat_fuel -231.500539  318.825707 -0.726104  0.467775 -856.387442  393.386364]


In [11]:
import pandas as pd

# Assuming plr_summaries is a list of DataFrames as shown in your print output
# We'll add a multi-index or additional columns to indicate whether fixed effects or stationary controls were used

# Define the configurations corresponding to your summaries for clarity
configurations = [
    {'fixed_effects': True, 'stationary_controls': True},
    {'fixed_effects': True, 'stationary_controls': False},
    {'fixed_effects': False, 'stationary_controls': True},
    {'fixed_effects': False, 'stationary_controls': False},
]

# Add a multi-index or new columns to each DataFrame to indicate the configuration
for i, summary in enumerate(plr_summaries):
    summary['fixed_effects'] = configurations[i]['fixed_effects']
    summary['stationary_controls'] = configurations[i]['stationary_controls']

# Concatenate all the DataFrames into one
combined_summary = pd.concat(plr_summaries, axis=0)

# Reset index if you want a clean index
combined_summary.reset_index(drop=True, inplace=True)

# Now you can print or display the combined summary
print(combined_summary)


         coef     std err         t     P>|t|       2.5 %      97.5 %  \
0 -240.037501  318.343405 -0.754021  0.450837 -863.979109  383.904107   
1 -229.189675  306.105725 -0.748727  0.454022 -829.145872  370.766521   
2 -224.742989  303.956382 -0.739392  0.459669 -820.486551  371.000573   
3 -231.500539  318.825707 -0.726104  0.467775 -856.387442  393.386364   

   fixed_effects  stationary_controls  
0           True                 True  
1           True                False  
2          False                 True  
3          False                False  


With country fix effect

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
import doubleml as dml

def run_plr(df, outcome_l, treatment_l, block_l, fe, stationary_c):
    # If fixed effects include 'country', apply one-hot encoding
    if fe and 'country' in block_l:
        df = pd.get_dummies(df, columns=['country'])
        block_l = [col for col in df.columns if col not in outcome_l + treatment_l]  # Update block_l to include new dummy variables

    # Select the relevant columns and drop NA
    df_subset = df[outcome_l + treatment_l + block_l].dropna()

    # Prepare the data for DoubleML
    obj_dml_data = dml.DoubleMLData(df_subset, y_col=outcome_l[0], d_cols=treatment_l[0], x_cols=block_l)

    # Define the machine learning models
    learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=1111)
    ml_m = clone(learner)
    ml_l = clone(learner)

    # Set up the PLR model
    dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=10)

    # Fit the PLR model and get the summary
    plr_results = dml_plr.fit()
    summary = plr_results.summary

    return summary

def run_all(df, outcome_l, treatment_l, block_fe_l, block_sta_l, block_other_l):
    # Adjusted the columns to store summaries in a list since they're DataFrames now
    summaries = []

    for fe in [True, False]:
        block_l = block_other_l.copy()
        if fe:
            block_l += block_fe_l  # Keep 'country' in block_fe_l to include country fixed effects
        for sta in [True, False]:
            if sta:
                block_l += block_sta_l
            summary = run_plr(df, outcome_l, treatment_l, block_l, fe, sta)
            summaries.append(summary)  # Append the summary DataFrame to the list

    return summaries

# Example usage
plr_summaries = run_all(df, 
                        outcome_l=['milexp_pergdpSIPRI'], 
                        treatment_l=['treat_fuel'],
                        block_fe_l=['country', 't'],  # Include 'country' for fixed effects
                        block_sta_l=['logmountain', 'ethnic_fractionalization', 'religion_fractionalization', 'language_fractionalization'], 
                        block_other_l=['democracy', 'logpopdens_diff', 'ecgrowth_demeaned'])


In [14]:
print(plr_summaries)

[                 coef     std err        t     P>|t|        2.5 %      97.5 %
treat_fuel -376.22691  424.942065 -0.88536  0.375962 -1209.098053  456.644234,                   coef     std err         t     P>|t|        2.5 %  \
treat_fuel -394.212356  433.524858 -0.909319  0.363182 -1243.905464   

                97.5 %  
treat_fuel  455.480752  ,                   coef     std err         t     P>|t|       2.5 %      97.5 %
treat_fuel -215.177394  296.924381 -0.724688  0.468644 -797.138486  366.783699,                   coef     std err         t     P>|t|       2.5 %      97.5 %
treat_fuel -242.925546  321.886843 -0.754692  0.450434 -873.812166  387.961074]


In [15]:
import pandas as pd

# Define the configurations corresponding to your summaries for clarity
configurations = [
    {'fixed_effects': True, 'stationary_controls': True},
    {'fixed_effects': True, 'stationary_controls': False},
    {'fixed_effects': False, 'stationary_controls': True},
    {'fixed_effects': False, 'stationary_controls': False},
]

# Add a multi-index or new columns to each DataFrame to indicate the configuration
for i, summary in enumerate(plr_summaries):
    summary['fixed_effects'] = configurations[i]['fixed_effects']
    summary['stationary_controls'] = configurations[i]['stationary_controls']

# Concatenate all the DataFrames into one
combined_summary = pd.concat(plr_summaries, axis=0)

# Reset index to obtain a clean index
combined_summary.reset_index(drop=True, inplace=True)
rounded_combsum = round(combined_summary, 4)

print(rounded_combsum)


       coef   std err       t   P>|t|      2.5 %    97.5 %  fixed_effects  \
0 -376.2269  424.9421 -0.8854  0.3760 -1209.0981  456.6442           True   
1 -394.2124  433.5249 -0.9093  0.3632 -1243.9055  455.4808           True   
2 -215.1774  296.9244 -0.7247  0.4686  -797.1385  366.7837          False   
3 -242.9255  321.8868 -0.7547  0.4504  -873.8122  387.9611          False   

   stationary_controls  
0                 True  
1                False  
2                 True  
3                False  


In [16]:
dfs = [rounded_combsum]
stacked_df = pd.concat(dfs)
final_res = stacked_df.reset_index(drop=True)
final_res.insert(0, 'gvc_type', 'backward')
final_res
final_res.to_csv('backward_res.csv', index=False)