## REGRESSION: SUB-SAHARAN AFRICAN COUNTRIES


In [2]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# Load the country-level dataset; the rendered frame below shows one row per
# country and one column per indicator/year (e.g. Exports_2014).
AFRICA_CSV_PATH = "../data/Africa.csv"
data = pd.read_csv(AFRICA_CSV_PATH)
# Bare trailing expression so the notebook renders the loaded frame.
data

Unnamed: 0.1,Unnamed: 0,Country,Exports_2014,Exports_2015,Exports_2016,Exports_2017,Exports_2018,Exports_2019,Exports_2020,Exports_2021,...,Inflation_2022,Openness_2014,Openness_2015,Openness_2016,Openness_2017,Openness_2018,Openness_2019,Openness_2020,Openness_2021,Openness_2022
0,58,Benin,6463.367729,5112.72404,4240.799727,4941.110388,4714.593172,4295.443983,2806.917193,3477.593185,...,1.350779,32.171955,27.26399,22.563862,25.037455,21.342108,20.413266,14.291073,16.585501,21.165547
1,59,Botswana,6128.255905,5609.635585,4859.160825,5062.24625,5018.671646,5270.414738,4461.924534,5926.120761,...,11.665567,28.01478,28.107501,27.304461,28.731112,24.453541,24.555431,22.362458,25.589657,24.414008
2,60,Burkina Faso,2034.519468,1551.551663,1668.039543,2035.283518,2048.712181,2020.250821,2039.167666,2424.768628,...,14.290235,9.246632,9.501812,8.485036,11.319689,10.243829,10.061071,8.474246,9.334062,10.113093
3,61,Burundi,350.779022,265.44152,333.806357,358.952479,368.045099,453.005774,566.648588,638.96119,...,18.800879,11.064783,7.653037,9.008162,8.804849,10.209184,12.56098,13.310777,13.196638,13.449368
4,62,Cameroon,4725.154602,3809.781919,3340.500629,3721.507262,3968.638606,3791.801524,3896.046494,5615.95645,...,6.247677,9.802586,10.029875,7.830845,8.078727,8.427376,8.651649,7.607302,10.023815,11.093151
5,63,Central African Rep.,110.407391,125.680993,132.07283,133.911919,187.426652,206.140991,198.290077,217.575956,...,5.583167,4.458239,5.559359,6.613282,5.81612,6.322394,6.101664,6.655842,9.186439,9.472433
6,64,Chad,645.186274,427.716654,317.967612,364.364661,511.329742,630.429539,624.513448,741.871705,...,5.788027,3.543913,4.745836,4.192165,4.483992,5.917988,7.441206,7.092171,8.153947,13.463698
7,65,"Congo, Rep. of",3787.816307,2569.481219,1740.437421,1368.874229,1343.014904,1288.563672,1463.340197,1933.820867,...,3.043443,31.391537,29.567624,27.179336,34.010163,39.38863,40.871664,33.521539,39.094969,50.240176
8,66,Côte d'Ivoire,6367.004431,5741.064051,5557.017046,6150.388677,7111.625413,7007.826379,7094.601912,9998.51957,...,5.276167,11.46734,10.900173,10.034409,10.661682,10.616008,10.60061,10.393159,11.83923,13.716699
9,67,"Equatorial Guinea, Rep. of",949.623736,614.941541,449.27763,329.672559,373.105489,263.424979,327.891226,313.96698,...,4.786717,15.608051,12.32893,11.001556,11.815048,14.282271,15.320627,10.189398,11.334884,17.429922


In [4]:
# Drop the non-numeric country label and the two "Unnamed" columns, which look
# like leftover row indices (presumably from earlier CSV round-trips - confirm).
# None of them is used by the regressions below.
# errors='ignore' keeps the cell re-runnable: `data` is reassigned here, so a
# second execution would otherwise raise KeyError on the already-dropped labels.
data = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'Country'], errors='ignore')

In [5]:
# Logarithmic transformation: add a log(1 + x) column for every indicator/year.
for year in range(2014, 2023):
    for indicator in ('GDP', 'Exports', 'Imports', 'Inflation', 'Openness'):
        raw = data[f'{indicator}_{year}']
        # log(1 + x) is undefined for x <= -1. Mask such values to NaN
        # explicitly instead of letting np.log emit RuntimeWarnings and
        # produce NaN/-inf; the affected rows are dropped per year below.
        data[f'log_{indicator}_{year}'] = np.log(raw.where(raw > -1) + 1)

# Analyse each year: fit log GDP on log exports, imports, inflation and openness.
# NOTE(review): the fitted coefficients (about +0.5, +0.5, 0, -1) and R² near 1
# suggest Openness may itself be derived from trade and GDP, which would make
# this fit close to an accounting identity - confirm how Openness is defined.
for year in range(2014, 2023):
    feature_cols = [
        f'log_Exports_{year}',
        f'log_Imports_{year}',
        f'log_Inflation_{year}',
        f'log_Openness_{year}',
    ]
    target_col = f'log_GDP_{year}'
    cols = feature_cols + [target_col]

    # Drop rows with a missing value in ANY column used by the model. Openness
    # must be part of the subset: LinearRegression rejects NaN inputs.
    data_year = data.dropna(subset=cols)

    # Independent and dependent variables
    X = data_year[feature_cols]
    Y = data_year[target_col]

    # Train and test split (fixed seed so the reported metrics are reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Prediction and evaluation on the held-out countries
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Year: {year}")
    print("Coefficients:", model.coef_)
    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R²):", r2)
    print()


Year: 2014
Coefficients: [ 0.58207225  0.40208672  0.00881651 -1.03195332]
Mean Squared Error (MSE): 0.0390262724397679
R-squared (R²): 0.9854589282704045

Year: 2015
Coefficients: [ 0.62696663  0.33865683  0.00355764 -1.0457531 ]
Mean Squared Error (MSE): 0.026408498912774494
R-squared (R²): 0.941031948958744

Year: 2016
Coefficients: [ 0.57204362  0.40995307 -0.01521191 -1.04890666]
Mean Squared Error (MSE): 0.016617907424254175
R-squared (R²): 0.9653044589604423

Year: 2017
Coefficients: [ 0.45952435  0.50482092  0.03067298 -0.95823586]
Mean Squared Error (MSE): 0.026713488297489995
R-squared (R²): 0.9559445471829237

Year: 2018
Coefficients: [ 0.469256    0.4730476   0.06073406 -0.90635606]
Mean Squared Error (MSE): 0.042243169172222574
R-squared (R²): 0.9278273831170931

Year: 2019
Coefficients: [ 0.41438002  0.51517112  0.01672774 -1.03057558]
Mean Squared Error (MSE): 0.011720620576239462
R-squared (R²): 0.9832552721441853

Year: 2020
Coefficients: [ 0.49481249  0.46850269 -0.00

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
# Predicted vs. real log GDP on the held-out countries, one figure per year.
# The split and fit are recomputed inside the loop: reusing the notebook-level
# y_test / y_pred would draw the last fitted year's data under every title.
# Same test_size and random_state as the regression cell, so each figure
# corresponds to the metrics printed for that year.
for year in range(2014, 2023):
    feature_cols = [
        f'log_Exports_{year}',
        f'log_Imports_{year}',
        f'log_Inflation_{year}',
        f'log_Openness_{year}',
    ]
    target_col = f'log_GDP_{year}'

    # Keep only rows complete in every column the model needs.
    data_year = data.dropna(subset=feature_cols + [target_col])
    X_train_year, X_test_year, y_train_year, y_test_year = train_test_split(
        data_year[feature_cols], data_year[target_col], test_size=0.2, random_state=42
    )
    y_pred_year = LinearRegression().fit(X_train_year, y_train_year).predict(X_test_year)

    fig = plt.figure()
    plt.scatter(y_test_year, y_pred_year, alpha=0.5)
    # Dashed diagonal = perfect prediction.
    plt.plot(
        [y_test_year.min(), y_test_year.max()],
        [y_test_year.min(), y_test_year.max()],
        'k--',
        lw=2,
    )
    plt.xlabel('real values')
    plt.ylabel('Prediction')
    plt.title(f'Prediction vs values for {year}')
    plt.show()
    # Release the figure so nine open figures do not accumulate in the kernel.
    plt.close(fig)