In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import LabelEncoder
import sklearn
import os
from matplotlib.pyplot import hist
import scipy.stats as stats
import math
import statsmodels.api as sm

In [6]:
# read in the dataframe
df = pd.read_csv('df_mix.csv')

In [24]:
df1 = pd.read_csv('../../data/GVC_data/transportIV_file.csv')
df1 = df1.loc[:, ['country', 't', 'trans_outp_p']]

df = pd.merge(df, df1, on=['country', 't'])

In [25]:
print(df.dtypes)

country                        object
t                               int64
onset2COWCS                   float64
milexp_pergdpSIPRI            float64
decade                        float64
democracy                     float64
logmountain                   float64
ethnic_fractionalization      float64
religion_fractionalization    float64
language_fractionalization    float64
leg_british                   float64
opec                          float64
logpop_M_diff                 float64
logpopdens_diff               float64
logoutreg_diff                float64
ecgrowth_demeaned             float64
treat_agri                    float64
treat_mine                    float64
treat_fuel                    float64
treat_metal                   float64
iv_transport                  float64
iv_agri                       float64
iv_mine                       float64
iv_fuel                       float64
iv_metal                      float64
trans_outp_p_x                float64
trans_outp_p

In [23]:
num_countries = df['country'].nunique()
print(f"There are {num_countries} unique countries in the dataset.")

There are 116 unique countries in the dataset.


Without Country Fixed effect

In [26]:
import pandas as pd
import numpy as np
import doubleml as dml
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

# Define the machine learning model for the treatment effect (ml_m) and outcome (ml_l)
learner = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=1111)
ml_m = clone(learner)  # Machine learning model for the treatment effect
ml_l = clone(learner)  # Machine learning model for the outcome

In [60]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
import doubleml as dml

def run_plr(df, outcome_l, treatment_l, block_l, fe, stationary_c):
    # Select the relevant columns and drop NA
    df_subset = df[outcome_l + treatment_l + block_l].dropna()

    # Prepare the data for DoubleML
    obj_dml_data = dml.DoubleMLData(df_subset, y_col=outcome_l[0], d_cols=treatment_l[0], x_cols=block_l)

    # Define the machine learning models
    learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=1111)
    ml_m = clone(learner)
    ml_l = clone(learner)

    # Set up the PLR model
    dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=10)

    # Fit the PLR model and get the summary
    plr_results = dml_plr.fit()
    summary = plr_results.summary  # Access summary as an attribute, not a method

    return summary

In [61]:
def run_all(df, outcome_l, treatment_l, block_fe_l, block_sta_l, block_other_l):
    # Adjusted the columns to store summaries in a list since they're DataFrames now
    summaries = []

    for fe in [True, False]:
        block_l = block_other_l.copy()
        if fe:
            # Exclude 'country' from fixed effects
            block_l += [b for b in block_fe_l if b != 'country']
        for sta in [True, False]:
            if sta:
                block_l += block_sta_l
            summary = run_plr(df, outcome_l, treatment_l, block_l, fe, sta)
            summaries.append(summary)  # Append the summary DataFrame to the list

    return summaries  # Return the list of summary DataFrames

In [62]:
plr_summaries = run_all(df, 
                        outcome_l=['milexp_pergdpSIPRI'], 
                        treatment_l=['treat_fuel'],
                        block_fe_l=['country', 't'],  # 'country' will be ignored in the fixed effects
                        block_sta_l=['logmountain', 'ethnic_fractionalization', 'religion_fractionalization', 'language_fractionalization'], 
                        block_other_l=['democracy', 'logpopdens_diff', 'ecgrowth_demeaned'])

In [63]:
print(plr_summaries)

[               coef   std err         t   P>|t|     2.5 %     97.5 %
treat_fuel  5.51942  7.686223  0.718093  0.4727 -9.545301  20.584141,                  coef    std err         t     P>|t|     2.5 %     97.5 %
treat_fuel  25.148928  18.776181  1.339406  0.180439 -11.65171  61.949566,                  coef    std err         t     P>|t|      2.5 %     97.5 %
treat_fuel  11.632668  13.366769  0.870268  0.384154 -14.565718  37.831054,                  coef    std err         t     P>|t|      2.5 %     97.5 %
treat_fuel  12.361575  13.032301  0.948534  0.342858 -13.181266  37.904415]


In [64]:
import pandas as pd

# Assuming plr_summaries is a list of DataFrames as shown in your print output
# We'll add a multi-index or additional columns to indicate whether fixed effects or stationary controls were used

# Define the configurations corresponding to your summaries for clarity
configurations = [
    {'fixed_effects': True, 'stationary_controls': True},
    {'fixed_effects': True, 'stationary_controls': False},
    {'fixed_effects': False, 'stationary_controls': True},
    {'fixed_effects': False, 'stationary_controls': False},
]

# Add a multi-index or new columns to each DataFrame to indicate the configuration
for i, summary in enumerate(plr_summaries):
    summary['fixed_effects'] = configurations[i]['fixed_effects']
    summary['stationary_controls'] = configurations[i]['stationary_controls']

# Concatenate all the DataFrames into one
combined_summary = pd.concat(plr_summaries, axis=0)

# Reset index if you want a clean index
combined_summary.reset_index(drop=True, inplace=True)

# Now you can print or display the combined summary
print(combined_summary)


        coef    std err         t     P>|t|      2.5 %     97.5 %  \
0   5.519420   7.686223  0.718093  0.472700  -9.545301  20.584141   
1  25.148928  18.776181  1.339406  0.180439 -11.651710  61.949566   
2  11.632668  13.366769  0.870268  0.384154 -14.565718  37.831054   
3  12.361575  13.032301  0.948534  0.342858 -13.181266  37.904415   

   fixed_effects  stationary_controls  
0           True                 True  
1           True                False  
2          False                 True  
3          False                False  


With country fix effect

In [65]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
import doubleml as dml

def run_plr(df, outcome_l, treatment_l, block_l, fe, stationary_c):
    # If fixed effects include 'country', apply one-hot encoding
    if fe and 'country' in block_fe_l:
        df = pd.get_dummies(df, columns=['country'])
        block_l = [col for col in df.columns if col not in outcome_l + treatment_l]  # Update block_l to include new dummy variables

    # Select the relevant columns and drop NA
    df_subset = df[outcome_l + treatment_l + block_l].dropna()

    # Prepare the data for DoubleML
    obj_dml_data = dml.DoubleMLData(df_subset, y_col=outcome_l[0], d_cols=treatment_l[0], x_cols=block_l)

    # Define the machine learning models
    learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=1111)
    ml_m = clone(learner)
    ml_l = clone(learner)

    # Set up the PLR model
    dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=10)

    # Fit the PLR model and get the summary
    plr_results = dml_plr.fit()
    summary = plr_results.summary

    return summary

def run_all(df, outcome_l, treatment_l, block_fe_l, block_sta_l, block_other_l):
    # Adjusted the columns to store summaries in a list since they're DataFrames now
    summaries = []

    for fe in [True, False]:
        block_l = block_other_l.copy()
        if fe:
            block_l += block_fe_l  # Keep 'country' in block_fe_l to include country fixed effects
        for sta in [True, False]:
            if sta:
                block_l += block_sta_l
            summary = run_plr(df, outcome_l, treatment_l, block_l, fe, sta)
            summaries.append(summary)  # Append the summary DataFrame to the list

    return summaries

# Example usage
plr_summaries = run_all(df, 
                        outcome_l=['milexp_pergdpSIPRI'], 
                        treatment_l=['treat_fuel'],
                        block_fe_l=['country', 't'],  # Include 'country' for fixed effects
                        block_sta_l=['logmountain', 'ethnic_fractionalization', 'religion_fractionalization', 'language_fractionalization'], 
                        block_other_l=['democracy', 'logpopdens_diff', 'ecgrowth_demeaned'])


In [66]:
print(plr_summaries)

[                 coef    std err         t     P>|t|      2.5 %     97.5 %
treat_fuel  31.099312  33.435277  0.930135  0.352301 -34.432628  96.631251,                  coef    std err        t     P>|t|     2.5 %      97.5 %
treat_fuel  33.984366  34.917354  0.97328  0.330414 -34.45239  102.421122,                  coef    std err        t     P>|t|     2.5 %     97.5 %
treat_fuel  13.366071  13.948859  0.95822  0.337952 -13.97319  40.705333,                  coef    std err         t     P>|t|      2.5 %     97.5 %
treat_fuel  13.702961  14.950962  0.916527  0.359391 -15.600387  43.006309]


In [67]:
import pandas as pd

# Assuming plr_summaries is a list of DataFrames as shown in your print output
# We'll add a multi-index or additional columns to indicate whether fixed effects or stationary controls were used

# Define the configurations corresponding to your summaries for clarity
configurations = [
    {'fixed_effects': True, 'stationary_controls': True},
    {'fixed_effects': True, 'stationary_controls': False},
    {'fixed_effects': False, 'stationary_controls': True},
    {'fixed_effects': False, 'stationary_controls': False},
]

# Add a multi-index or new columns to each DataFrame to indicate the configuration
for i, summary in enumerate(plr_summaries):
    summary['fixed_effects'] = configurations[i]['fixed_effects']
    summary['stationary_controls'] = configurations[i]['stationary_controls']

# Concatenate all the DataFrames into one
combined_summary = pd.concat(plr_summaries, axis=0)

# Reset index if you want a clean index
combined_summary.reset_index(drop=True, inplace=True)

# Now you can print or display the combined summary
print(combined_summary)


        coef    std err         t     P>|t|      2.5 %      97.5 %  \
0  31.099312  33.435277  0.930135  0.352301 -34.432628   96.631251   
1  33.984366  34.917354  0.973280  0.330414 -34.452390  102.421122   
2  13.366071  13.948859  0.958220  0.337952 -13.973190   40.705333   
3  13.702961  14.950962  0.916527  0.359391 -15.600387   43.006309   

   fixed_effects  stationary_controls  
0           True                 True  
1           True                False  
2          False                 True  
3          False                False  
