# 4. Splitting

In [None]:
# Determine the first and last year covered by the dataset
year_values = df_efi['index_year']
earliest_year, latest_year = year_values.min(), year_values.max()

print(f"The datset reaches from {earliest_year} to {latest_year}.")

The datset reaches from 1995 to 2025.


In [None]:
def temporal_split(df, year_column, split_year):
    """Split a DataFrame chronologically into a training and a test part.

    Rows whose year lies before ``split_year`` form the training set; rows
    from ``split_year`` onwards form the test set. Both parts are returned
    sorted by year; rows sharing a year keep their original relative order.

    Args:
        df: DataFrame to split. It is left unmodified.
        year_column: Name of the column holding each row's year; it is parsed
            with ``pd.to_datetime(..., format='%Y')``.
        split_year: First year that belongs to the test set.

    Returns:
        Tuple ``(train_data, test_data)`` of new DataFrames.
    """
    # Private helper column: keeps the caller's frame untouched and cannot
    # clobber a real column that happens to be called 'datetime'.
    helper_col = '__temporal_split_date__'

    # Convert year to datetime objects (beginning of the year) on a copy
    df_sorted = df.assign(
        **{helper_col: pd.to_datetime(df[year_column], format='%Y')}
    ).sort_values(helper_col, kind='stable')  # stable => reproducible tie order

    # Define the split date as the beginning of the split year
    split_date = pd.to_datetime(split_year, format='%Y')

    # Both comparisons are spelled out so rows with a missing date (NaT)
    # end up in neither part, rather than silently landing in the test set.
    before_split = df_sorted[helper_col] < split_date
    from_split_on = df_sorted[helper_col] >= split_date

    train_data = df_sorted[before_split].drop(columns=[helper_col])
    test_data = df_sorted[from_split_on].drop(columns=[helper_col])

    return train_data, test_data

# Split df_efi chronologically: everything before `split_year` is training data
split_year = 2020
train_df_efi, test_df_efi = temporal_split(df_efi.copy(), 'index_year', split_year)

# Report the year range covered by each part
train_years = train_df_efi['index_year']
test_years = test_df_efi['index_year']
print(f"Temporal split of df_efi at the beginning of {split_year}:")
print(f"Training period: {train_years.min()} to {train_years.max()}")
print(f"Test period: {test_years.min()} to {test_years.max()}")


# Preview the first rows of each part, each preceded by its caption
for caption, split_part in (
    ("Train DataFrame Head:", train_df_efi),
    ("Test DataFrame Head:", test_df_efi),
):
    display(caption)
    display(split_part.head())

Temporal split of df_efi at the beginning of 2020:
Training period: 1995 to 2019
Test period: 2020 to 2025


'Train DataFrame Head:'

Unnamed: 0,country,index_year,overall_score,property_rights,government_integrity,judicial_effectiveness,tax_burden,government_spending,fiscal_health,business_freedom,labor_freedom,monetary_freedom,trade_freedom,investment_freedom,financial_freedom,overall_score_without_monetary_freedom
5704,zimbabwe,1995,48.5,50.0,30.0,,50.1,65.3,,55.0,,54.7,51.8,30.0,50.0,47.775
2718,laos,1995,,,,,,,,,,,,,,
4449,sierra leone,1995,49.8,50.0,10.0,,51.7,91.5,,70.0,,49.9,45.0,50.0,30.0,49.775
973,chad,1995,,,,,,,,,,,,,,
942,central african republic,1995,,,,,,,,,,,,,,


'Test DataFrame Head:'

Unnamed: 0,country,index_year,overall_score,property_rights,government_integrity,judicial_effectiveness,tax_burden,government_spending,fiscal_health,business_freedom,labor_freedom,monetary_freedom,trade_freedom,investment_freedom,financial_freedom,overall_score_without_monetary_freedom
2586,kiribati,2020,45.2,47.5,24.3,34.2,72.4,0.0,99.0,40.6,62.9,82.2,23.8,25.0,30.0,41.790909
4083,romania,2020,69.7,72.5,55.1,56.1,90.3,70.4,85.6,58.6,63.0,78.1,86.4,70.0,50.0,68.909091
1072,comoros,2020,53.7,43.1,23.8,31.7,63.6,73.6,85.0,47.8,60.3,81.7,59.2,45.0,30.0,51.190909
3711,north macedonia,2020,69.5,62.5,42.2,42.7,91.5,71.0,87.7,80.6,67.0,77.7,86.2,65.0,60.0,68.763636
3463,namibia,2020,60.9,58.9,46.0,58.5,65.7,58.1,28.7,65.6,84.9,75.9,83.2,65.0,40.0,59.509091


In [None]:
# Row counts of the full dataset and of each split
total_rows = len(df_efi)
train_rows, test_rows = len(train_df_efi), len(test_df_efi)

# Share of each split, expressed as a percentage of the original dataset
train_percentage = train_rows / total_rows * 100
test_percentage = test_rows / total_rows * 100

print(f"Total rows in the original dataset: {total_rows}")
print(f"Number of rows in the training set: {train_rows} ({train_percentage:.2f}%)")
print(f"Number of rows in the test set: {test_rows} ({test_percentage:.2f}%)")

# The split counts as "close" when both shares lie within 5 points of 80/20
if abs(train_percentage - 80) >= 5 or abs(test_percentage - 20) >= 5:
    print("\nThe split deviates from 80% training and 20% test. A temporal split is based on a specific point in time, not a percentage, which can result in different sizes.")
else:
    print("\nThe split is close to 80% training and 20% test.")

Total rows in the original dataset: 5704
Number of rows in the training set: 4598 (80.61%)
Number of rows in the test set: 1106 (19.39%)

The split is close to 80% training and 20% test.


> Distribution Balance

## Imputation

In [None]:
# Target column, plus the feature columns used to explain it
dependent_variable = dependent_variable_new  # Use the new dependent variable
# Normalise each raw column name to snake_case once, then drop identifiers,
# the original overall score and monetary freedom from the feature list.
independent_variables = [
    normalized_name
    for normalized_name in (col.lower().replace(' ', '_') for col in column_names)
    if normalized_name not in ('country', 'index_year', 'overall_score', 'monetary_freedom')
]


# Median-impute the features per country, then drop rows that are still incomplete
def impute_and_clean(df, dependent_var, independent_vars):
    """Impute features with per-country medians and drop incomplete rows.

    Steps:
      1. Coerce ``dependent_var`` to numeric (unparseable values become NaN).
      2. For every column of ``independent_vars`` present in ``df``, fill
         missing values with the median of that column within the same
         ``'country'`` group.
      3. Drop rows where ``dependent_var`` or any of those present feature
         columns is still missing.

    Args:
        df: Input DataFrame; must contain a ``'country'`` column and
            ``dependent_var``. It is left unmodified.
        dependent_var: Name of the target column.
        independent_vars: Names of the feature columns. Names that are not
            columns of ``df`` are ignored.

    Returns:
        A new DataFrame holding the imputed, complete rows.
    """
    # Work on a private copy so the caller's DataFrame is never modified
    df = df.copy()

    # Ensure the dependent variable is numeric
    df[dependent_var] = pd.to_numeric(df[dependent_var], errors='coerce')

    # Use the function's own argument (not a notebook-level global) and keep
    # only columns that exist, so imputation and row dropping stay consistent.
    present_vars = [col for col in independent_vars if col in df.columns]

    # Impute missing values using the median of each country for each feature
    for col in present_vars:
        df[col] = df.groupby('country')[col].transform(lambda x: x.fillna(x.median()))

    # Drop rows still missing the target or any feature (e.g. a country with
    # no observed value at all for a feature has no median to fill with).
    return df.dropna(subset=[dependent_var] + present_vars)

# Impute and clean each split on its own copy: training set first, then test set
train_df_imputed = impute_and_clean(train_df_efi.copy(), dependent_variable, independent_variables)
test_df_imputed = impute_and_clean(test_df_efi.copy(), dependent_variable, independent_variables)

# For each split show the first rows, followed by the per-column count of missing values
for head_caption, missing_caption, imputed_part in (
    (
        "Training DataFrame after imputation and dropping rows with missing Overall Score (excluding Monetary Freedom):",
        "\nMissing values per column in training set:",
        train_df_imputed,
    ),
    (
        "\nTest DataFrame after imputation and dropping rows with missing Overall Score (excluding Monetary Freedom):",
        "\nMissing values per column in test set:",
        test_df_imputed,
    ),
):
    print(head_caption)
    display(imputed_part.head())
    print(missing_caption)
    display(imputed_part.isnull().sum())

Training DataFrame after imputation and dropping rows with missing Overall Score (excluding Monetary Freedom):


Unnamed: 0,country,index_year,overall_score,property_rights,government_integrity,judicial_effectiveness,tax_burden,government_spending,fiscal_health,business_freedom,labor_freedom,monetary_freedom,trade_freedom,investment_freedom,financial_freedom,overall_score_without_monetary_freedom
5704,zimbabwe,1995,48.5,50.0,30.0,26.1,50.1,65.3,59.1,55.0,38.8,54.7,51.8,30.0,50.0,47.775
4449,sierra leone,1995,49.8,50.0,10.0,29.6,51.7,91.5,65.9,70.0,32.7,49.9,45.0,50.0,30.0,49.775
4015,portugal,1995,62.4,70.0,70.0,68.9,60.5,36.0,46.0,70.0,41.5,77.7,77.8,50.0,50.0,60.5375
3674,nigeria,1995,47.3,50.0,50.0,34.3,75.4,0.0,80.9,55.0,80.6,50.2,45.0,50.0,50.0,46.925
5611,vietnam,1995,41.7,10.0,10.0,36.3,68.8,87.1,27.3,40.0,65.5,55.2,44.6,30.0,30.0,40.0625



Missing values per column in training set:


Unnamed: 0,0
country,0
index_year,0
overall_score,36
property_rights,0
government_integrity,0
judicial_effectiveness,0
tax_burden,0
government_spending,0
fiscal_health,0
business_freedom,0



Test DataFrame after imputation and dropping rows with missing Overall Score (excluding Monetary Freedom):


Unnamed: 0,country,index_year,overall_score,property_rights,government_integrity,judicial_effectiveness,tax_burden,government_spending,fiscal_health,business_freedom,labor_freedom,monetary_freedom,trade_freedom,investment_freedom,financial_freedom,overall_score_without_monetary_freedom
2586,kiribati,2020,45.2,47.5,24.3,34.2,72.4,0.0,99.0,40.6,62.9,82.2,23.8,25.0,30.0,41.790909
4083,romania,2020,69.7,72.5,55.1,56.1,90.3,70.4,85.6,58.6,63.0,78.1,86.4,70.0,50.0,68.909091
1072,comoros,2020,53.7,43.1,23.8,31.7,63.6,73.6,85.0,47.8,60.3,81.7,59.2,45.0,30.0,51.190909
3711,north macedonia,2020,69.5,62.5,42.2,42.7,91.5,71.0,87.7,80.6,67.0,77.7,86.2,65.0,60.0,68.763636
3463,namibia,2020,60.9,58.9,46.0,58.5,65.7,58.1,28.7,65.6,84.9,75.9,83.2,65.0,40.0,59.509091



Missing values per column in test set:


Unnamed: 0,0
country,0
index_year,0
overall_score,5
property_rights,0
government_integrity,0
judicial_effectiveness,0
tax_burden,0
government_spending,0
fiscal_health,0
business_freedom,0


## Preprocessing Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np

# Build the feature matrices (X) and target vectors (y) for both splits.
# The imputed frames are used; imputation was applied to the training and
# test sets separately, after the temporal split.

# Feature names in dataframe column format, without 'monetary_freedom'
independent_variables_for_model = [
    feature_name
    for feature_name in (col.lower().replace(' ', '_') for col in independent_variables)
    if feature_name != 'monetary_freedom'
]

X_train, y_train = train_df_imputed[independent_variables_for_model], train_df_imputed[dependent_variable]
X_test, y_test = test_df_imputed[independent_variables_for_model], test_df_imputed[dependent_variable]

# Preprocessing + modelling pipeline: standardise the features, then fit a
# linear regression.
# PowerTransformer and SelectKBest were removed due to errors in cross-validation folds
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])