## REGRESSION WESTERN HEMISPHERE COUNTRIES

In [5]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [6]:
# Read the Western Hemisphere dataset and keep only the rows that have no
# missing value in any column.
data = pd.read_csv("../data/Western.csv").dropna()

data

Unnamed: 0.1,Unnamed: 0,Country,Exports_2014,Exports_2015,Exports_2016,Exports_2017,Exports_2018,Exports_2019,Exports_2020,Exports_2021,...,Inflation_2022,Openness_2014,Openness_2015,Openness_2016,Openness_2017,Openness_2018,Openness_2019,Openness_2020,Openness_2021,Openness_2022
0,92,Antigua and Barbuda,639.494224,312.665067,401.824439,354.139517,398.855517,395.618983,318.994758,559.532983,...,7.531078,50.307554,28.882081,22.660917,19.360042,19.308903,74.418618,67.936212,77.053276,75.778859
1,93,"Bahamas, The",3464.844049,4212.66805,1682.287335,1798.833767,1647.35121,1718.519338,1302.363903,1837.899516,...,5.605406,34.459404,25.274607,13.572217,11.280912,11.988846,11.486461,11.561923,12.003463,14.273302
2,94,Bolivia,6210.470534,5252.533893,5789.972995,6119.115524,5514.947306,5510.798658,4150.479494,5501.111611,...,1.746329,22.499611,17.393036,15.044058,15.162923,14.366274,14.541226,13.379201,16.034487,19.785565
3,95,Brazil,112917.04797,78544.784437,63598.955234,74939.859019,87388.715406,84667.893021,77379.344817,117507.803061,...,9.280106,5.175495,5.640289,5.133649,5.401262,6.583768,6.497868,7.935549,9.895149,9.876588
4,96,Chile,32765.558016,29081.412664,28406.419836,33111.11368,38318.763284,33307.088628,30817.870982,51331.262648,...,11.643867,14.031695,13.138365,12.046528,12.546322,13.77027,13.351384,14.571255,17.03997,19.538105
5,97,Colombia,23817.976163,20314.929856,18937.380116,20291.056219,22090.830133,22890.206901,20202.47405,30716.723465,...,10.177231,6.595191,6.41881,5.873923,6.174761,6.699427,6.730717,6.759955,8.460828,9.54377
6,98,Costa Rica,4772.501032,4867.409535,5216.178952,5154.647414,5603.088311,4850.154454,4929.724844,7197.093466,...,8.274775,18.42673,8.553228,8.185696,8.108482,8.24169,7.286439,7.886558,10.525414,12.198309
7,99,Dominican Rep.,5427.2106,5254.82692,5385.614783,5418.609125,5994.974108,6007.600628,5623.128171,8725.839876,...,8.811092,5.017336,4.867213,4.587781,4.461125,4.522849,4.474621,4.534951,5.814796,5.588157
8,100,Ecuador,10322.487544,8232.765958,6970.335941,9067.308389,10753.272241,10141.467844,8291.396681,12735.639431,...,3.46617,10.779758,8.579542,7.474784,8.788975,10.146427,10.309863,9.682683,12.516476,13.081102
9,101,El Salvador,3922.69774,4032.359157,3968.190714,4184.576229,4681.393124,4776.438677,4766.628478,6859.529459,...,7.198616,13.763181,14.019418,13.326889,14.14138,14.673503,13.932606,15.037498,17.825683,17.826747


In [7]:
# Remove the leftover index column and the country label; the later cells
# only read the per-year indicator columns.
# NOTE(review): the displayed frame also lists an 'Unnamed: 0.1' column that
# stays in place here -- confirm whether it should be dropped as well.
data = data.drop(columns=['Unnamed: 0', 'Country'])

In [8]:
# Log-transform every indicator for every year as log(1 + x).
# log(1 + x) is only defined for x > -1. Values at or below -1 are masked to
# NaN *before* the transform, so the transform neither emits a RuntimeWarning
# nor produces -inf (which dropna would not remove and which would make
# LinearRegression.fit raise). Masked rows are excluded from that year's fit
# by the dropna below.
for year in range(2014, 2023):
    for indicator in ('GDP', 'Exports', 'Imports', 'Inflation', 'Openness'):
        raw = data[f'{indicator}_{year}']
        data[f'log_{indicator}_{year}'] = np.log1p(raw.where(raw > -1))

# Fit one linear regression per year: log GDP explained by the log of
# exports, imports, inflation and openness for that same year.
for year in range(2014, 2023):
    # Single source of truth for the model columns, so the NA filter always
    # covers every column that is actually fed to the model.
    features = [
        f'log_Exports_{year}',
        f'log_Imports_{year}',
        f'log_Inflation_{year}',
        f'log_Openness_{year}',
    ]
    target = f'log_GDP_{year}'
    cols = features + [target]

    # Drop rows with a missing value in any feature or in the target
    data_year = data.dropna(subset=cols)

    # Independent and dependent variables
    X = data_year[features]
    Y = data_year[target]

    # Train and test split (fixed seed keeps the split reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Prediction and evaluation on the held-out rows
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Year: {year}")
    print("Coefficients:", model.coef_)
    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R²):", r2)
    print()


Year: 2014
Coefficients: [ 0.61029621  0.32703565  0.0738126  -1.12841108]
Mean Squared Error (MSE): 0.014805181317217595
R-squared (R²): 0.9926121924428901

Year: 2015
Coefficients: [ 0.67622483  0.28681329  0.08668167 -1.09421108]
Mean Squared Error (MSE): 0.0074503586298305625
R-squared (R²): 0.9985401576129547

Year: 2016
Coefficients: [ 0.70362362  0.28046026  0.07713515 -1.12024953]
Mean Squared Error (MSE): 0.0035784829071208606
R-squared (R²): 0.9980840249413139

Year: 2017
Coefficients: [ 0.73913577  0.26983607  0.11159505 -1.07316806]
Mean Squared Error (MSE): 0.008029396620617602
R-squared (R²): 0.9957409838465272

Year: 2018
Coefficients: [ 0.704978    0.29163426  0.0516342  -1.01868895]
Mean Squared Error (MSE): 0.01328372770824561
R-squared (R²): 0.9927742634874119

Year: 2019
Coefficients: [ 0.6510659   0.3247583   0.04836615 -1.11461743]
Mean Squared Error (MSE): 0.08805857780812346
R-squared (R²): 0.9514091510128406

Year: 2020
Coefficients: [ 0.53348943  0.42660537 -0

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
# Plot predicted vs. real log GDP on the held-out rows, one figure per year.
# The split and the fit are rebuilt for each year inside this loop; reusing
# the y_test / y_pred left over from the regression cell would draw the last
# year's points under every title. random_state=42 makes the split
# deterministic, so these are the same held-out rows the regression cell
# evaluated, provided it filtered the same rows.
for year in range(2014, 2023):
    feature_cols = [
        f'log_Exports_{year}',
        f'log_Imports_{year}',
        f'log_Inflation_{year}',
        f'log_Openness_{year}',
    ]
    target_col = f'log_GDP_{year}'
    plot_frame = data.dropna(subset=feature_cols + [target_col])

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        plot_frame[feature_cols], plot_frame[target_col], test_size=0.2, random_state=42
    )
    y_holdout_pred = LinearRegression().fit(X_fit, y_fit).predict(X_holdout)

    fig, ax = plt.subplots()
    ax.scatter(y_holdout, y_holdout_pred, alpha=0.5)
    # Dashed diagonal marks a perfect prediction (predicted == real)
    lo, hi = y_holdout.min(), y_holdout.max()
    ax.plot([lo, hi], [lo, hi], 'k--', lw=2)
    ax.set_xlabel('real values')
    ax.set_ylabel('Prediction')
    ax.set_title(f'Prediction vs values for {year}')
    plt.show()
    # Release the figure so nine open figures do not accumulate in memory
    plt.close(fig)