In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def _is_year_column(col):
    """Return True when a column label starts with an all-digit token, e.g. '1965 [YR1965]'."""
    if not isinstance(col, str):
        return False
    tokens = col.split()
    # An empty or whitespace-only label has no tokens; treat it as "not a year"
    # instead of raising IndexError.
    return bool(tokens) and tokens[0].isdigit()


def clean_dataset(data, country_type_label):
    """
    Filter, clean and label one wide-format indicator table.

    Steps, in order:
      1. keep only rows whose 'Series Name' is one of the three indicators below;
      2. drop rows missing 'Country Name', 'Country Code' or 'Series Name';
      3. drop the 1960-1964 year columns (if present);
      4. convert every year column to numeric, turning non-numeric cells
         such as ".." into NaN;
      5. linearly interpolate missing values across the years of each row;
      6. drop every row that still contains a missing value;
      7. add a 'country_type' column holding `country_type_label`.

    Parameters:
    data: A pandas DataFrame containing raw data with economic indicators
    for various countries and years. It must have a 'Series Name' column;
    year columns are recognised by a label whose first whitespace-separated
    token is all digits (e.g. '1965 [YR1965]'). `data` itself is not modified.

    country_type_label: A string that labels the dataset based on the income
    level of the country (e.g., 'low_income', 'lower_middle',
    'upper_middle', 'high_income').

    Returns:
    A new pandas DataFrame with no missing values and an added
    'country_type' column.
    """
    # variables we are interested in
    indicators = [
        'GDP growth (annual %)',
        'Government expenditure on education, total (% of GDP)',
        'Unemployment, total (% of total labor force) (modeled ILO estimate)'
    ]
    required_columns = ['Country Name', 'Country Code', 'Series Name']
    # has too many missing values to be useful
    columns_to_drop = ['1960 [YR1960]', '1961 [YR1961]', '1962 [YR1962]',
                       '1963 [YR1963]', '1964 [YR1964]']

    cleaned_data = (
        data[data['Series Name'].isin(indicators)]
        .dropna(subset=required_columns)
        .drop(columns=columns_to_drop, errors='ignore')
        .copy()  # work on an independent frame so the caller's data is never touched
    )

    year_columns = [col for col in cleaned_data.columns if _is_year_column(col)]

    # Skip the numeric work when there is nothing to convert: interpolating an
    # empty / all-object frame can raise depending on the pandas version.
    if year_columns and not cleaned_data.empty:
        # errors='coerce' already maps placeholders like ".." to NaN,
        # so no separate replace step is needed.
        numeric_years = cleaned_data[year_columns].apply(pd.to_numeric,
                                                         errors='coerce')
        # axis=1 interpolates along the years *within* each country/indicator
        # row. (axis=0 would fill a gap from the neighbouring rows, i.e. from
        # other countries or other indicators.)
        cleaned_data[year_columns] = numeric_years.interpolate(method='linear',
                                                               axis=1)

    # Drop any remaining NA rows. With pandas' default forward direction, gaps
    # before a row's first observation are not filled, so such rows are removed.
    # NOTE(review): this can discard many rows for series that start late;
    # confirm that this is the intended policy.
    cleaned_data = cleaned_data.dropna()

    # add in a col to label each country's income level
    return cleaned_data.assign(country_type=country_type_label)

In [None]:
# The four data sources, one CSV per country income level. The order of this
# list must line up with `country_types` below: the two lists are paired
# position-by-position with zip().
file_paths = [
    '94086144-6ad8-4b75-ac26-1b60a764018a_Data.csv',  
    '3fd493b6-dfe0-4afd-b296-1b5892e64ba8_Data.csv',  
    '579772b1-602f-4cf6-ac0b-1bb1289918f8_Data.csv',  
    '5026bb4a-f8ac-4ee9-860c-f83d47a7aded_Data.csv' 
]

country_types = ['low_income', 'lower_middle', 'upper_middle', 'high_income']

cleaned_datasets = []

# Clean each file with its income-level label and collect the results so they
# can be merged into one frame.
for file_path, country_type in zip(file_paths, country_types):
    data = pd.read_csv(file_path)
    cleaned_data = clean_dataset(data, country_type)
    cleaned_datasets.append(cleaned_data)

# ignore_index=True discards the per-file row labels and renumbers the rows.
merged_data = pd.concat(cleaned_datasets, ignore_index=True)
# NOTE(review): this path is assigned but never used in this cell; confirm
# whether the merged frame was meant to be written to it.
merged_file_path = 'final_cleaned_data.csv'

print(merged_data.shape)
merged_data

We first melt `merged_data` so that each row pairs a country/economic-indicator combination with a single year.

In [None]:
# Columns that identify a row of the wide table and are carried through the melt.
id_vars = [
    'Country Name', 'Country Code', 'Series Name', 'Series Code',
    'country_type'
]

# One column per year from 1965 through 2023, labelled like '1965 [YR1965]'.
year_cols = ['{0} [YR{0}]'.format(year) for year in range(1965, 2024)]

# Row key of the reshaped table: one row per country and year.
index = ['Country Name', 'Country Code', 'country_type', 'Year']

# Reshape in a single chain:
#   melt        -> long format, with the year label in 'Year' and the reading in 'value'
#   pivot       -> one column per economic indicator ('Series Name')
#   rename_axis -> remove the leftover 'Series Name' label from the column axis
#   reset_index -> turn the row key back into ordinary columns
# Inspiration from: https://stackoverflow.com/questions/28337117/how-to-pivot-a-dataframe-in-pandas
analysis_df = (
    merged_data
    .melt(id_vars=id_vars, value_vars=year_cols,
          var_name='Year', value_name='value')
    .pivot(index=index, columns='Series Name', values='value')
    .rename_axis(columns=None)
    .reset_index()
)
analysis_df

We then pivot so that each economic indicator gets its own column, associating each indicator's value with the country and year in which it occurred.