In [6]:
## Data Cleaning and Preprocessing

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def clean_dataset(data, country_type_label):
    
    """
    The clean_dataset function takes a dataset and 
    a country type label as inputs,filters the dataset
    to include only specific economic indicators, cleans
    the data by dropping irrelevant or missing columns,
    interpolates missing values for years where applicable,
    and finally returns a cleaned dataset with an additional 
    column for the country type.

    Parameters:
    data: A pandas DataFrame containing raw data with 
    economic indicators for various countries and years.
    
    country_type_label: A string that labels the dataset 
    based on the income level of the country (e.g., 
    'low_income', 'lower_middle', 'upper_middle', 'high_income').

    """
    #variables we are interested in
    indicators = [
        'GDP growth (annual %)',
        'Government expenditure on education, total (% of GDP)',
        'Unemployment, total (% of total labor force) (modeled ILO estimate)'
    ]
    # filter to only include specific economic indicators we are interested in
    cleaned_data = data[data['Series Name'].isin(indicators)]
    
    cleaned_data = cleaned_data.dropna(subset=['Country Name', \
                                    'Country Code', 'Series Name'])

    #has too many missing values to be useful
    columns_to_drop = ['1960 [YR1960]', '1961 [YR1961]', '1962 [YR1962]', \
                       '1963 [YR1963]', '1964 [YR1964]']
    cleaned_data = cleaned_data.drop(columns=columns_to_drop, errors='ignore')

    
    year_columns = [col for col in cleaned_data.columns \
                    if col.split()[0].isdigit()]
    cleaned_data[year_columns] =\
    cleaned_data[year_columns].replace("..", pd.NA).apply(pd.to_numeric, errors='coerce')

    #linear interpolation to fill in NA
    cleaned_data[year_columns] =\
    cleaned_data[year_columns].interpolate(method='linear', axis=0)
    #drop any remaining NA rows
    cleaned_data = cleaned_data.dropna()
    #add in a col to label each country's income level
    cleaned_data['country_type'] = country_type_label
    return cleaned_data

#the four data source each with different income levels of countries
file_paths = [
    '94086144-6ad8-4b75-ac26-1b60a764018a_Data.csv',  
    '3fd493b6-dfe0-4afd-b296-1b5892e64ba8_Data.csv',  
    '579772b1-602f-4cf6-ac0b-1bb1289918f8_Data.csv',  
    '5026bb4a-f8ac-4ee9-860c-f83d47a7aded_Data.csv' 
]

country_types = ['low_income', 'lower_middle', 'upper_middle', 'high_income']

cleaned_datasets = []

#cleaning each file and merging in to one 
for file_path, country_type in zip(file_paths, country_types):
    data = pd.read_csv(file_path)
    cleaned_data = clean_dataset(data, country_type)
    cleaned_datasets.append(cleaned_data)

merged_data = pd.concat(cleaned_datasets, ignore_index=True)
merged_file_path = 'final_cleaned_data.csv'
#merged_data.to_csv(merged_file_path, index=False)

print(merged_data.shape)
merged_data.head()


(623, 64)


Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1965 [YR1965],1966 [YR1966],1967 [YR1967],1968 [YR1968],1969 [YR1969],1970 [YR1970],...,2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023],country_type
0,Burkina Faso,BFA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.764545,0.537613,8.823315,3.070774,2.026081,0.11702,...,3.921229,5.957977,6.203489,6.604569,5.889205,2.010773,6.939155,1.777915,2.96018,low_income
1,Burkina Faso,BFA,"Government expenditure on education, total (% ...",SE.XPD.TOTL.GD.ZS,3.832105,1.896073,10.489383,1.947888,0.864207,7.18657,...,3.67009,3.656988,5.62847,5.48861,5.407,5.151,5.224,5.277,1.94509,low_income
2,Burundi,BDI,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,3.899666,3.254533,12.155451,0.825002,-0.297667,14.25612,...,1.444,1.356,1.258,1.157,1.053,1.03,1.11,0.919,0.93,low_income
3,Burundi,BDI,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.967226,4.612993,13.821519,-0.297884,-1.459541,21.325671,...,-3.9,-0.600001,0.500001,1.609935,1.812565,0.327157,3.1,1.848999,2.700001,low_income
4,Burundi,BDI,"Government expenditure on education, total (% ...",SE.XPD.TOTL.GD.ZS,2.960677,3.290257,10.773217,0.269449,1.39082,14.994741,...,6.37134,4.692,4.76202,5.07865,5.347,5.322,4.871,4.821,4.493,low_income
