In [12]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import LabelEncoder
import sklearn
import os
from matplotlib.pyplot import hist
import scipy.stats as stats
import math
import statsmodels.api as sm


In [13]:
# read in the dataframe
df = pd.read_csv('df_mix.csv')

In [14]:
df1 = pd.read_csv('../../data/GVC_data/transportIV_file.csv')
df1 = df1.loc[:, ['country', 't', 'trans_outp_p']]

df = pd.merge(df, df1, on=['country', 't'])

In [15]:
print(df.dtypes)

country                        object
t                               int64
onset2COWCS                   float64
milexp_pergdpSIPRI            float64
oilreserves                   float64
decade                        float64
democracy                     float64
logmountain                   float64
ethnic_fractionalization      float64
religion_fractionalization    float64
language_fractionalization    float64
leg_british                   float64
opec                          float64
logpop_M_diff                 float64
logpopdens_diff               float64
logoutreg_diff                float64
ecgrowth_demeaned             float64
treat_agri                    float64
treat_mine                    float64
treat_fuel                    float64
treat_metal                   float64
iv_transport                  float64
iv_agri                       float64
iv_mine                       float64
iv_fuel                       float64
iv_metal                      float64
trans_outp_p

In [16]:
num_countries = df['country'].nunique()
print(f"There are {num_countries} unique countries in the dataset.")

There are 181 unique countries in the dataset.


Without Country Fixed effect

In [17]:
import pandas as pd
import numpy as np
import doubleml as dml
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

# Define the machine learning model for the treatment effect (ml_m) and outcome (ml_l)
learner = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=1111)
ml_m = clone(learner)  # Machine learning model for the treatment effect
ml_l = clone(learner)  # Machine learning model for the outcome

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
import doubleml as dml

def run_plr(df, outcome_l, treatment_l, block_l, fe, stationary_c):
    # Select the relevant columns and drop NA
    df_subset = df[outcome_l + treatment_l + block_l].dropna()

    # Prepare the data for DoubleML
    obj_dml_data = dml.DoubleMLData(df_subset, y_col=outcome_l[0], d_cols=treatment_l[0], x_cols=block_l)

    # Define the machine learning models
    learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=1111)
    ml_m = clone(learner)
    ml_l = clone(learner)

    # Set up the PLR model
    dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=10)

    # Fit the PLR model and get the summary
    plr_results = dml_plr.fit()
    summary = plr_results.summary  # Access summary as an attribute, not a method

    return summary

In [19]:
def run_all(df, outcome_l, treatment_l, block_fe_l, block_sta_l, block_other_l):
    # Adjusted the columns to store summaries in a list since they're DataFrames now
    summaries = []

    for fe in [True, False]:
        block_l = block_other_l.copy()
        if fe:
            # Exclude 'country' from fixed effects
            block_l += [b for b in block_fe_l if b != 'country']
        for sta in [True, False]:
            if sta:
                block_l += block_sta_l
            summary = run_plr(df, outcome_l, treatment_l, block_l, fe, sta)
            summaries.append(summary)  # Append the summary DataFrame to the list

    return summaries  # Return the list of summary DataFrames

In [20]:
plr_summaries = run_all(df, 
                        outcome_l=['milexp_pergdpSIPRI'], 
                        treatment_l=['oilreserves'],
                        block_fe_l=['country', 't'],  # 'country' will be ignored in the fixed effects
                        block_sta_l=['logmountain', 'ethnic_fractionalization', 'religion_fractionalization', 'language_fractionalization'], 
                        block_other_l=['democracy', 'logpopdens_diff', 'ecgrowth_demeaned'])

In [21]:
print(plr_summaries)

[                 coef   std err         t     P>|t|     2.5 %    97.5 %
oilreserves  0.154346  0.106056  1.455329  0.145578 -0.053519  0.362212,                  coef   std err        t     P>|t|     2.5 %  97.5 %
oilreserves  0.172184  0.123123  1.39847  0.161972 -0.069133  0.4135,                  coef   std err       t     P>|t|     2.5 %    97.5 %
oilreserves  0.189674  0.136191  1.3927  0.163711 -0.077257  0.456604,                  coef   std err         t     P>|t|     2.5 %    97.5 %
oilreserves  0.186518  0.136839  1.363048  0.172867 -0.081681  0.454717]


In [22]:
import pandas as pd

configurations = [
    {'fixed_effects': True, 'stationary_controls': True},
    {'fixed_effects': True, 'stationary_controls': False},
    {'fixed_effects': False, 'stationary_controls': True},
    {'fixed_effects': False, 'stationary_controls': False},
]

# Add a multi-index or new columns to each DataFrame to indicate the configuration
for i, summary in enumerate(plr_summaries):
    summary['fixed_effects'] = configurations[i]['fixed_effects']
    summary['stationary_controls'] = configurations[i]['stationary_controls']

# Concatenate all the DataFrames into one
combined_summary = pd.concat(plr_summaries, axis=0)

combined_summary.reset_index(drop=True, inplace=True)

print(combined_summary)


       coef   std err         t     P>|t|     2.5 %    97.5 %  fixed_effects  \
0  0.154346  0.106056  1.455329  0.145578 -0.053519  0.362212           True   
1  0.172184  0.123123  1.398470  0.161972 -0.069133  0.413500           True   
2  0.189674  0.136191  1.392700  0.163711 -0.077257  0.456604          False   
3  0.186518  0.136839  1.363048  0.172867 -0.081681  0.454717          False   

   stationary_controls  
0                 True  
1                False  
2                 True  
3                False  


With country fix effect

In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
import doubleml as dml

def run_plr(df, outcome_l, treatment_l, block_l, fe, stationary_c):
    # If fixed effects include 'country', apply one-hot encoding
    if fe and 'country' in block_l:
        df = pd.get_dummies(df, columns=['country'])
        block_l = [col for col in df.columns if col not in outcome_l + treatment_l]  # Update block_l to include new dummy variables

    # Select the relevant columns and drop NA
    df_subset = df[outcome_l + treatment_l + block_l].dropna()

    # Prepare the data for DoubleML
    obj_dml_data = dml.DoubleMLData(df_subset, y_col=outcome_l[0], d_cols=treatment_l[0], x_cols=block_l)

    # Define the machine learning models
    learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=1111)
    ml_m = clone(learner)
    ml_l = clone(learner)

    # Set up the PLR model
    dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=10)

    # Fit the PLR model and get the summary
    plr_results = dml_plr.fit()
    summary = plr_results.summary

    return summary

def run_all(df, outcome_l, treatment_l, block_fe_l, block_sta_l, block_other_l):
    # Adjusted the columns to store summaries in a list since they're DataFrames now
    summaries = []

    for fe in [True, False]:
        block_l = block_other_l.copy()
        if fe:
            block_l += block_fe_l  # Keep 'country' in block_fe_l to include country fixed effects
        for sta in [True, False]:
            if sta:
                block_l += block_sta_l
            summary = run_plr(df, outcome_l, treatment_l, block_l, fe, sta)
            summaries.append(summary)  # Append the summary DataFrame to the list

    return summaries

# Example usage
plr_summaries = run_all(df, 
                        outcome_l=['milexp_pergdpSIPRI'], 
                        treatment_l=['treat_fuel'],
                        block_fe_l=['country', 't'],  # Include 'country' for fixed effects
                        block_sta_l=['logmountain', 'ethnic_fractionalization', 'religion_fractionalization', 'language_fractionalization'], 
                        block_other_l=['democracy', 'logpopdens_diff', 'ecgrowth_demeaned'])


In [24]:
print(plr_summaries)

[                 coef    std err       t     P>|t|      2.5 %     97.5 %
treat_fuel  31.531322  33.320624  0.9463  0.343995 -33.775902  96.838546,                  coef    std err         t     P>|t|      2.5 %     97.5 %
treat_fuel  30.478633  32.792154  0.929449  0.352656 -33.792807  94.750074,                  coef    std err         t     P>|t|      2.5 %     97.5 %
treat_fuel  11.963633  13.951129  0.857539  0.391147 -15.380077  39.307343,                 coef   std err         t     P>|t|      2.5 %   97.5 %
treat_fuel  7.762125  13.00737  0.596748  0.550675 -17.731851  33.2561]


In [25]:
import pandas as pd

configurations = [
    {'fixed_effects': True, 'stationary_controls': True},
    {'fixed_effects': True, 'stationary_controls': False},
    {'fixed_effects': False, 'stationary_controls': True},
    {'fixed_effects': False, 'stationary_controls': False},
]

# Add a multi-index or new columns to each DataFrame to indicate the configuration
for i, summary in enumerate(plr_summaries):
    summary['fixed_effects'] = configurations[i]['fixed_effects']
    summary['stationary_controls'] = configurations[i]['stationary_controls']

# Concatenate all the DataFrames into one
combined_summary = pd.concat(plr_summaries, axis=0)

# # Reset index to obtain a clean index
combined_summary.reset_index(drop=True, inplace=True)
rounded_combsum = round(combined_summary, 4)

print(rounded_combsum)


      coef  std err       t   P>|t|    2.5 %   97.5 %  fixed_effects  \
0  31.5313  33.3206  0.9463  0.3440 -33.7759  96.8385           True   
1  30.4786  32.7922  0.9294  0.3527 -33.7928  94.7501           True   
2  11.9636  13.9511  0.8575  0.3911 -15.3801  39.3073          False   
3   7.7621  13.0074  0.5967  0.5507 -17.7319  33.2561          False   

   stationary_controls  
0                 True  
1                False  
2                 True  
3                False  


In [26]:
dfs = [rounded_combsum]
stacked_df = pd.concat(dfs)
final_res = stacked_df.reset_index(drop=True)
final_res.insert(0, 'gvc_type', 'mix')
final_res
final_res.to_csv('oilreserves_res.csv', index=False)