In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_name):
    """Read a CSV file into a DataFrame and replace every missing value with 0.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with every NaN replaced by 0.
    """
    # No working-directory handling is needed here: changing to the directory
    # returned by os.getcwd() would leave the process exactly where it is.
    # NOTE(review): zero-filling turns missing observations into real data
    # points for any regression run on this frame -- confirm this is intended
    # rather than dropping incomplete rows.
    return pd.read_csv(file_name).fillna(0)

def perform_iv_first_stage(data, instruments):
    """Fit the first-stage OLS of ``ln_export_area`` on the instruments.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ln_export_area`` and every column named in
        ``instruments``. A ``predicted_ln_export_area`` column is written
        onto this same frame (the input is modified, not copied).
    instruments : list of str
        Names of the columns used as regressors.

    Returns
    -------
    tuple
        ``(first_stage, data)``: the fitted statsmodels results object and
        the input frame carrying the new fitted-value column.
    """
    # Regressors are the instrument columns plus an intercept column.
    design = sm.add_constant(data[instruments])
    fitted_model = sm.OLS(data['ln_export_area'], design).fit()
    # In-sample predictions are stored on the frame under a fixed column name.
    data['predicted_ln_export_area'] = fitted_model.predict(design)
    return fitted_model, data

def perform_iv_second_stage(data, dependent_var):
    """Regress one outcome column on the first-stage fitted values via OLS.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``predicted_ln_export_area`` and ``dependent_var``.
    dependent_var : str
        Name of the outcome column.

    Returns
    -------
    The fitted statsmodels OLS results object.

    NOTE(review): this is a plain OLS on fitted values, so the reported
    standard errors are those of that OLS and are not adjusted for the first
    stage having been estimated -- confirm that is acceptable for the
    analysis, or switch to a dedicated 2SLS estimator.
    """
    regressors = sm.add_constant(data[['predicted_ln_export_area']])
    outcome = data[dependent_var]
    return sm.OLS(outcome, regressors).fit()

def run_analysis(entry_filename, output_filename='IV_analysis_results.csv',
                 years=range(1990, 2022)):
    """Run the two-step IV regressions and save one row of estimates per year.

    The first stage is fitted once on the four distance instruments. For each
    year whose ``LNYR<year>`` column exists in the data, a second-stage
    regression is fitted, its summary is printed, and its coefficients and
    standard errors are collected. The collected rows are written to
    ``output_filename`` with columns ``Year``, ``Coefficients_<term>`` and
    ``Standard Errors_<term>``.

    Parameters
    ----------
    entry_filename : str
        Path of the input CSV. If it does not exist, a message is printed
        and nothing else happens.
    output_filename : str, optional
        Path of the CSV to write. Defaults to ``'IV_analysis_results.csv'``.
    years : iterable of int, optional
        Years to analyse. Defaults to 1990 through 2021 inclusive.

    Returns
    -------
    None
    """
    if not os.path.exists(entry_filename):
        # Say why nothing was produced instead of returning silently.
        print(f'Input file not found, analysis skipped: {entry_filename}')
        return

    data = load_data(entry_filename)
    instruments = ['atlantic_distance_minimum', 'indian_distance_minimum',
                   'saharan_distance_minimum', 'red_sea_distance_minimum']

    # The first stage does not depend on the outcome year, so it is fitted
    # once; only the frame with the fitted-value column is needed below.
    _, updated_data = perform_iv_first_stage(data, instruments)

    results = []  # One flat record per analysed year
    for year in years:
        dependent_var = f'LNYR{year}'
        if dependent_var not in updated_data.columns:
            continue
        iv_second_stage_results = perform_iv_second_stage(updated_data, dependent_var)

        # Build the flat CSV row directly: Year, then every coefficient,
        # then every standard error, each keyed by its regression term.
        record = {'Year': year}
        record.update({f'Coefficients_{term}': value
                       for term, value in iv_second_stage_results.params.items()})
        record.update({f'Standard Errors_{term}': value
                       for term, value in iv_second_stage_results.bse.items()})
        results.append(record)

        print(f'Results for {year}:')
        print(iv_second_stage_results.summary())

    if not results:
        # With no matching outcome column there is nothing to tabulate.
        print(f'No LNYR<year> outcome columns found in {entry_filename}; '
              'no results written.')
        return

    # Save to CSV
    pd.DataFrame(results).to_csv(output_filename, index=False)
def main():
    """Run the IV analysis on the replication dataset CSV."""
    run_analysis("mergedReplicationData3.csv")

# Entry point: runs the analysis when this code is executed directly. The
# captured output after this cell shows the guard also passes when the cell
# is run in the notebook.
if __name__ == "__main__":
    main()


Results for 1990:
                            OLS Regression Results                            
Dep. Variable:               LNYR1990   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.257
Method:                 Least Squares   F-statistic:                     18.68
Date:                Mon, 29 Apr 2024   Prob (F-statistic):           7.35e-05
Time:                        13:27:05   Log-Likelihood:                -63.104
No. Observations:                  52   AIC:                             130.2
Df Residuals:                      50   BIC:                             134.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const 