In [3]:
import pandas as pd
import numpy as np

# Full World Bank series names mapped to the short column names used downstream.
_METRIC_SHORT_NAMES = {
    'Unemployment, total (% of total labor force) (modeled ILO estimate)': 'Unemployment',
    'GDP growth (annual %)': 'GDP_growth',
    'Government expenditure on education, total (% of GDP)': 'Education_expenditure',
}

# Identifier columns and metric columns of the reshaped output, in output order.
_OUTPUT_ID_COLUMNS = ['Country Name', 'Year', 'country_type']
_OUTPUT_METRIC_COLUMNS = ['GDP_growth', 'Education_expenditure', 'Unemployment']

# variables
key_metrics = list(_METRIC_SHORT_NAMES)
years_columns = [f"{year} [YR{year}]" for year in range(1965, 2024)]


# Reshape and clean data
def process_and_pivot_data(data):
    """Reshape one wide export into a tidy country/year table.

    Rows whose 'Series Name' is one of ``key_metrics`` are melted from one
    column per year into long form, the values are converted to numbers, and
    the result is pivoted so that each metric becomes its own column.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table with the columns 'Country Name', 'country_type' and
        'Series Name', plus year columns named like ``"2001 [YR2001]"``.
        Year columns listed in ``years_columns`` but absent from ``data``
        are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per (country, year, country type) that has at least one
        numeric metric value, with the columns 'Country Name', 'Year',
        'country_type', 'GDP_growth', 'Education_expenditure' and
        'Unemployment'. A metric with no usable values in ``data`` is still
        present as an all-NaN column, so frames built from different files
        always share the same schema.

    Raises
    ------
    KeyError
        If a required identifier column is missing, or if none of the
        expected year columns are present in ``data``.
    """
    # Only melt the year columns this export actually contains.
    present_years = [col for col in years_columns if col in data.columns]
    if not present_years:
        raise KeyError(
            "None of the expected year columns (e.g. "
            f"{years_columns[0]!r}) are present in the input data"
        )

    selected_rows = data[data['Series Name'].isin(key_metrics)]
    long_data = selected_rows.melt(
        id_vars=['Country Name', 'country_type', 'Series Name'],
        value_vars=present_years,
        var_name='Year',
        value_name='Value',
    )

    # "2001 [YR2001]" -> 2001; expand=False keeps the result a Series.
    long_data['Year'] = (
        long_data['Year'].str.extract(r'(\d{4})', expand=False).astype(int)
    )

    # errors='coerce' turns placeholders such as '..' into NaN.
    long_data['Value'] = pd.to_numeric(long_data['Value'], errors='coerce')

    # Pivot table: create separate columns for each metric
    pivoted = long_data.pivot_table(
        index=_OUTPUT_ID_COLUMNS,
        columns='Series Name',
        values='Value',
    ).reset_index()
    pivoted = pivoted.rename(columns=_METRIC_SHORT_NAMES)

    # pivot_table omits a metric column when it has no non-NaN values at all;
    # reindexing restores it as NaN so callers can rely on a fixed schema.
    return pivoted.reindex(columns=_OUTPUT_ID_COLUMNS + _OUTPUT_METRIC_COLUMNS)


# One exported CSV per income group; the two lists are paired by position.
file_paths = [
    '94086144-6ad8-4b75-ac26-1b60a764018a_Data.csv',
    '3fd493b6-dfe0-4afd-b296-1b5892e64ba8_Data.csv',
    '579772b1-602f-4cf6-ac0b-1bb1289918f8_Data.csv',
    '5026bb4a-f8ac-4ee9-860c-f83d47a7aded_Data.csv',
]
country_types = ['low_income', 'lower_middle', 'upper_middle', 'high_income']

# Load every export, tag its rows with the income group, and reshape it.
processed_datasets = []
for file_path, country_type in zip(file_paths, country_types):
    data = pd.read_csv(file_path).assign(country_type=country_type)
    processed_data = process_and_pivot_data(data)
    processed_datasets.append(processed_data)

# Stack the per-group tables into a single frame with a fresh 0..n-1 index.
final_data = pd.concat(processed_datasets, ignore_index=True)

numeric_columns = ["GDP_growth", "Education_expenditure", "Unemployment"]


def _interpolate_linear(series):
    """Fill gaps in one country's metric series by linear interpolation."""
    return series.interpolate(method="linear")


# Interpolate each metric separately within every country, so values are
# never blended across two different countries.
# NOTE(review): with pandas' default settings, interpolate() also appears to
# carry the last observed value forward over trailing gaps -- confirm that
# this is the intended treatment for recent years.
for col in numeric_columns:
    final_data[col] = final_data.groupby("Country Name")[col].transform(
        _interpolate_linear
    )

# Discard rows that still lack any of the three metrics after interpolation.
final_data = final_data.dropna(subset=numeric_columns)

# Report the remaining missing values per column.
missing_per_column = final_data.isna().sum()
print("Missing Values After Interpolation:")
print(missing_per_column)

# Uncomment to persist the cleaned table:
#final_cleaned_file_path = 'final_cleaned_data3.csv'
#final_data.to_csv(final_cleaned_file_path, index=False)

print("Final Data Shape:", final_data.shape)
final_data.head()


Missing Values After Interpolation:
Series Name
Country Name             0
Year                     0
country_type             0
GDP_growth               0
Education_expenditure    0
Unemployment             0
dtype: int64
Final Data Shape: (5301, 6)


Series Name,Country Name,Year,country_type,GDP_growth,Education_expenditure,Unemployment
18,Afghanistan,2001,low_income,-9.431974,3.761392,7.958
19,Afghanistan,2002,low_income,28.600001,3.946065,7.939
20,Afghanistan,2003,low_income,8.832278,4.130739,7.922
21,Afghanistan,2004,low_income,1.414118,4.315413,7.914
22,Afghanistan,2005,low_income,11.229715,4.500087,7.914
