In [5]:
import numpy as np
import pandas as pd

# Load the combined dataset. `index_col=[0, 1]` turns the first two CSV
# columns into a two-level row index (rendered as 'Country Code' and 'Year'
# in the output below), so `df.loc[<code>]` later selects one country's rows.
df = pd.read_csv('combined_data.csv', index_col=[0, 1])

In [6]:
# Show the frame as loaded; the leading rows contain NaN in several columns.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Inflation,Population,GDP,Export,Import
Country Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABW,1980,,59909.0,,,
ABW,1981,,60563.0,,,
ABW,1982,,61276.0,,,
ABW,1983,,62228.0,,,
ABW,1984,,62901.0,,,
...,...,...,...,...,...,...
ZWE,2017,0.893962,14812482.0,5.107466e+10,107.151887,83.660837
ZWE,2018,10.618866,15034452.0,3.415607e+10,124.909506,105.579357
ZWE,2019,255.304991,15271368.0,2.571741e+10,131.425343,79.585840
ZWE,2020,557.201817,15526888.0,2.686794e+10,135.325610,82.633621


In [7]:
# Keep only rows with no missing value in any column (dropna's default
# how='any'). This rebinds `df`, so the unfiltered frame is no longer
# reachable without re-running the load cell.
df = df.dropna()

In [8]:
# Show the frame again after the incomplete rows were dropped.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Inflation,Population,GDP,Export,Import
Country Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABW,1986,1.073966,59931.0,4.055866e+08,105.022708,47.432669
ABW,1987,3.643045,59159.0,4.877095e+08,115.503652,58.364675
ABW,1988,3.121868,59331.0,5.966480e+08,135.066427,83.150974
ABW,1989,3.991628,60443.0,6.955307e+08,101.349267,95.561922
ABW,1990,5.836688,62753.0,7.648045e+08,123.596991,132.571551
...,...,...,...,...,...,...
ZWE,2017,0.893962,14812482.0,5.107466e+10,107.151887,83.660837
ZWE,2018,10.618866,15034452.0,3.415607e+10,124.909506,105.579357
ZWE,2019,255.304991,15271368.0,2.571741e+10,131.425343,79.585840
ZWE,2020,557.201817,15526888.0,2.686794e+10,135.325610,82.633621


In [10]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import os
import joblib
from sklearn.preprocessing import StandardScaler

# --- Configuration ---------------------------------------------------------
# Every tunable value lives here so the loop below contains no magic numbers
# and the feature list is declared exactly once.
MODEL_DIR = "models"      # one fitted Ridge model per country is written here
SCALER_DIR = "scalers"    # the matching StandardScaler for each model
FEATURES = ['Population', 'Inflation', 'Import', 'Export']
TARGET = 'GDP'
MIN_SAMPLES = 20          # countries with fewer complete rows are skipped
TEST_SIZE = 0.2
RANDOM_STATE = 42
RIDGE_ALPHA = 0.01

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(SCALER_DIR, exist_ok=True)

# One summary record per fitted country; turned into a DataFrame at the end.
results = []

countries = df.index.get_level_values('Country Code').unique()

for country in countries:
    # Selecting on the first index level leaves the rows of a single country.
    country_data = df.loc[country]

    if country_data.shape[0] < MIN_SAMPLES:
        print(f"Skipping {country}: fewer than {MIN_SAMPLES} samples.")
        continue

    X = country_data[FEATURES]
    y = country_data[TARGET]

    # NOTE(review): train_test_split shuffles by default, so test rows are
    # drawn from anywhere in the country's history rather than held out from
    # the end. If the goal is forecasting, confirm a random split is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    # Fit the scaler on the training rows only, then reuse it for the test
    # rows so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Pearson correlation between Export and the target over ALL of the
    # country's rows (train and test together).
    export_gdp_corr = country_data[['Export', TARGET]].corr().iloc[0, 1]

    model = Ridge(alpha=RIDGE_ALPHA)
    model.fit(X_train_scaled, y_train)

    # Persist model and scaler together: predictions require inputs to be
    # transformed with the same scaler the model was trained on.
    joblib.dump(model, f'{MODEL_DIR}/{country}_model.pkl')
    joblib.dump(scaler, f'{SCALER_DIR}/{country}_scaler.pkl')

    # Coefficients are expressed in standardized-feature units because the
    # model was fitted on the scaled matrix.
    coefficients = model.coef_
    r2_test = model.score(X_test_scaled, y_test)  # R^2 on the held-out rows

    # Coefficient columns are derived from FEATURES so the labels can never
    # drift out of step with the column order used to build X.
    results.append({
        'Country': country,
        **{f'{name}_Coeff': coef for name, coef in zip(FEATURES, coefficients)},
        'Test_Score': r2_test,
        'Export_GDP_Correlation': export_gdp_corr
    })


coefficients_df = pd.DataFrame(results)
coefficients_df.to_csv('ridge_regression_per_country.csv', index=False)
print("Models and scalers saved for each country.")


Skipping AFG: fewer than 20 samples.
Skipping ARE: fewer than 20 samples.
Skipping BIH: fewer than 20 samples.
Skipping COM: fewer than 20 samples.
Skipping CUW: fewer than 20 samples.
Skipping CYM: fewer than 20 samples.
Skipping GIN: fewer than 20 samples.
Skipping IDN: fewer than 20 samples.
Skipping KIR: fewer than 20 samples.
Skipping LBN: fewer than 20 samples.
Skipping MOZ: fewer than 20 samples.
Skipping NAM: fewer than 20 samples.
Skipping NCL: fewer than 20 samples.
Skipping NRU: fewer than 20 samples.
Skipping SDN: fewer than 20 samples.
Skipping SLE: fewer than 20 samples.
Skipping SRB: fewer than 20 samples.
Skipping TJK: fewer than 20 samples.
Skipping UZB: fewer than 20 samples.
Skipping ZWE: fewer than 20 samples.
Models and scalers saved for each country.
