In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Data.csv', index_col = [0,1])

In [3]:
df_firstSector = data.drop(columns = ['Second_sector_GDP', 'Third_sector_GDP', 'total_GDP', 'total_Investment', 'Second_sector_Inv', 'Third_sector_Inv'])

In [4]:
df_firstSector

Unnamed: 0_level_0,Unnamed: 1_level_0,ED,Expenditure,Irate,Population,Tax,Trade,First_sector_GDP,First_sector_Inv
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austria,1994,849432.583,55.486122,6.8167,7928746.0,43.985937,-3065.82896,3907.513358,2457.712241
Austria,1995,882550.058,56.000000,7.1067,7943489.0,43.100000,-2194.40000,3953.600000,2359.800000
Austria,1996,961798.352,56.000000,5.1067,7953067.0,44.600000,-2963.70000,3979.300000,2571.100000
Austria,1997,950097.615,52.700000,4.4192,7964966.0,45.300000,-1524.50000,4124.600000,2852.500000
Austria,1998,971582.690,52.600000,3.9058,7971116.0,45.300000,-439.00000,4287.400000,2600.700000
...,...,...,...,...,...,...,...,...,...
Spain,2019,3641677.147,42.000000,-0.2173,46918951.0,35.200000,37422.00000,34628.000000,6551.000000
Spain,2020,3288511.875,51.400000,-0.3057,47318050.0,37.300000,17118.00000,33937.000000,5951.000000
Spain,2021,3542616.185,49.500000,-0.4906,47400798.0,38.100000,12226.00000,36942.000000,6444.000000
Spain,2022,3472923.261,46.400000,1.0997,47486843.0,37.500000,12125.00000,31335.000000,6035.000000


In [5]:
df = df_firstSector.copy()

Index(['ED', 'Expenditure', 'Irate', 'Population', 'Tax', 'Trade',
       'First_sector_GDP', 'First_sector_Inv'],
      dtype='object')

In [9]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import os
import joblib
from sklearn.preprocessing import StandardScaler

os.makedirs("models", exist_ok = True)
os.makedirs("scalers", exist_ok = True)

results = []

countries = df.index.get_level_values('Country Code').unique()

for country in countries:
    country_data = df.loc[country]
    
    
    X = country_data['ED', 'Expenditure', 'Irate', 'Population', 'Tax', 'Trade', 'First_sector_Inv']
    y = country_data['First_sector_GDP']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)  
    X_test_scaled = scaler.transform(X_test)       
    
    model = Ridge()
    model.fit(X_train_scaled, y_train)
    
    joblib.dump(model, f'models/{country}_model.pkl')
    joblib.dump(scaler, f'scalers/{country}_scaler.pkl')
  
    coefficients = model.coef_
    r2_test = model.score(X_test_scaled, y_test)

    results.append({
        'Country': country,
        'ED_Coeff': coefficients[0],
        'Expenditure_Coeff': coefficients[1],
        'Irate_Coeff': coefficients[2],
        'Population_Coeff': coefficients[3],
        'Tax_Coeff': coefficients[3],
        'Trade_Coeff': coefficients[3],
        'First_sector_Inv': coefficients[3],
        'Test_Score': r2_test,
    })


coefficients_df = pd.DataFrame(results)
coefficients_df.to_csv('ridge_regression_per_country.csv', index=False)
print("Models and scalers saved for each country.")


KeyError: 'Level Country Code not found'