In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

In [None]:
def clean_dataset(data, country_type_label):
    """
    Filter a raw indicator table down to the economic indicators of
    interest, convert its year columns to numbers, fill gaps by
    interpolating across years, and label every row with a country type.

    Each row of the input is one (country, indicator) series and each
    year column is one observation of that series, so interpolation is
    done along the row (across years). Interpolating down a column would
    mix values that belong to different countries and indicators.

    Parameters:
    data: A pandas DataFrame containing raw data with
    economic indicators for various countries and years. It must have
    'Country Name', 'Country Code' and 'Series Name' columns; year
    columns are recognised by a label whose first whitespace-separated
    token is all digits (e.g. '1965 [YR1965]'). The input frame is not
    modified.

    country_type_label: A string that labels the dataset
    based on the income level of the country (e.g.,
    'low_income', 'lower_middle', 'upper_middle', 'high_income').

    Returns:
    A new DataFrame holding only the selected indicators, with numeric
    year columns and an added 'country_type' column. Gaps that lie
    between two observed years of the same series are linearly
    interpolated. Years before the first or after the last observation
    of a series stay NaN (no extrapolation), and a series with no
    observations at all is dropped.
    """
    #variables we are interested in
    indicators = [
        'GDP growth (annual %)',
        'Government expenditure on education, total (% of GDP)',
        'Unemployment, total (% of total labor force) (modeled ILO estimate)'
    ]

    #has too many missing values to be useful
    columns_to_drop = ['1960 [YR1960]', '1961 [YR1961]', '1962 [YR1962]',
                       '1963 [YR1963]', '1964 [YR1964]']

    # Keep only the indicators of interest and rows with full identifying
    # information. The trailing .copy() makes the result independent of
    # `data`, so the column assignments below never write into a slice of
    # the caller's frame.
    cleaned_data = (
        data[data['Series Name'].isin(indicators)]
        .dropna(subset=['Country Name', 'Country Code', 'Series Name'])
        .drop(columns=columns_to_drop, errors='ignore')
        .copy()
    )

    def _is_year_column(col):
        # A year column starts with an all-digit token; empty or
        # non-string labels are simply not year columns.
        tokens = col.split() if isinstance(col, str) else []
        return bool(tokens) and tokens[0].isdigit()

    year_columns = [col for col in cleaned_data.columns if _is_year_column(col)]

    if year_columns and not cleaned_data.empty:
        # errors='coerce' turns every non-numeric placeholder (such as "..")
        # into NaN, so no separate replace step is needed.
        yearly_values = cleaned_data[year_columns].apply(pd.to_numeric, errors='coerce')

        # Interpolate along each row (axis=1), i.e. between neighbouring
        # years of the same country/indicator. limit_area='inside' fills
        # only gaps surrounded by real observations.
        cleaned_data[year_columns] = yearly_values.interpolate(
            method='linear', axis=1, limit_area='inside')

        # Drop series that carry no data at all. Partially observed series
        # are kept; their remaining NaN years can be dropped per
        # (country, year) once the table is reshaped to long form.
        cleaned_data = cleaned_data.dropna(subset=year_columns, how='all')

    #add in a col to label each country's income level
    return cleaned_data.assign(country_type=country_type_label)

In [None]:
# One source file per income level; each file is paired with its label
sources_by_income_level = [
    ('low_income', '94086144-6ad8-4b75-ac26-1b60a764018a_Data.csv'),
    ('lower_middle', '3fd493b6-dfe0-4afd-b296-1b5892e64ba8_Data.csv'),
    ('upper_middle', '579772b1-602f-4cf6-ac0b-1bb1289918f8_Data.csv'),
    ('high_income', '5026bb4a-f8ac-4ee9-860c-f83d47a7aded_Data.csv'),
]
file_paths = [path for _, path in sources_by_income_level]
country_types = [level for level, _ in sources_by_income_level]

# Clean every file with its income-level label, then stack the results
# into a single frame with a fresh row index
cleaned_datasets = []
for country_type, file_path in sources_by_income_level:
    data = pd.read_csv(file_path)
    cleaned_data = clean_dataset(data, country_type)
    cleaned_datasets.append(cleaned_data)

merged_data = pd.concat(cleaned_datasets, ignore_index=True)

## Data Preprocessing

To prepare the data for analysis, we want to convert the data into a form that is easy to analyze and rename columns so that they can be used as variables. Optimally, we'd like to have `Country Name`, `Year`, and the economic indicators `GDP %`, `Government Education Expenditure %`, and `Unemployment %` all in one row. To accomplish this, we do the following:

1. Rename the year columns so they are easier to reference in the future
2. Combine the years into one column instead of having them as columns using a melt transform
3. Separate the economic indicators into columns of their own using a pivot transform
4. Cast data types into their appropriate types

We first rename the year columns.

In [None]:
# Map each raw year header, e.g. "1965 [YR1965]", to the bare year "1965".
# Covers 1965 through 2023 (range() excludes its end point, hence 2024).
yr_col_mappings = {f"{yr} [YR{yr}]": str(yr) for yr in range(1965, 2024)}

# rename() ignores mapping keys that are not present, so this cell can be
# re-run safely after the columns have already been renamed.
merged_data = merged_data.rename(columns=yr_col_mappings)
merged_data

## Data Transformation

Then, we melt the data to have a year attached to each country/economic indicator combo. Subsequently, we pivot to put each economic indicator into a column of its own.

In [None]:
# Columns that identify a series and stay fixed while the years are melted
id_vars = [
    'Country Name', 'Country Code', 'Series Name', 'Series Code',
    'country_type'
]
# Year column labels '1965' ... '2023'
year_cols = [str(yr) for yr in range(1965, 2024)]
# One output row per country / income class / year
index = ['Country Name', 'Country Code', 'country_type', 'Year']
# Short snake_case names for the identifier and indicator columns
col_name_mappings = {
    'Country Name': 'country_name',
    'Country Code': 'country_code',
    'country_type': 'income_class',
    'Year': 'year',
    'GDP growth (annual %)': 'gdp_growth_pct',
    'Government expenditure on education, total (% of GDP)': 'govt_exp_edu_pct',
    'Unemployment, total (% of total labor force) (modeled ILO estimate)': 'unemp_pct',
}

analysis_df = (
    merged_data
    # wide -> long: one row per series and year
    .melt(id_vars=id_vars, value_vars=year_cols,
          var_name='Year', value_name='value')
    # long -> wide on the indicator: one column per economic indicator
    .pivot(index=index, columns='Series Name', values='value')
    # pivoting leaves a named column axis and a multi-level row index;
    # flatten both back into ordinary columns
    # Inspiration from : https://stackoverflow.com/questions/28337117/how-to-pivot-a-dataframe-in-pandas
    .rename_axis(columns=None)
    .reset_index()
    .rename(columns=col_name_mappings)
)
analysis_df

Finally, we check that all types are appropriate for their data.

In [None]:
# 'year' still holds the string column labels produced by the melt,
# so cast it to an integer type before inspecting the dtypes
analysis_df = analysis_df.astype({'year': 'int64'})
analysis_df.dtypes

## Hypothesis Test

We conduct our second hypothesis test. We first drop the GDP growth indicator, as it is not relevant for the analysis.

In [None]:
# Income classes grouped into the two broader development categories
developing_country_catgrs = [
    'low_income',
    'lower_middle',
]
developed_country_catgrs = [
    'upper_middle',
    'high_income',
]

In [None]:
# drop the gdp_growth_pct because it is not relevant for this analysis
# for dropping one column: https://stackoverflow.com/questions/29763620/how-to-select-all-columns-except-one-in-pandas
# errors='ignore' keeps this cell re-runnable: analysis_df is reassigned
# here, so on a second run the column is already gone and a plain drop
# would raise KeyError.
analysis_df = analysis_df.drop(columns='gdp_growth_pct', errors='ignore')

We also drop rows that have NaN in any of their columns to avoid any errors.

In [None]:
# Keep only rows in which every column has a value, then renumber the
# rows from zero so the index is contiguous again
analysis_df = analysis_df.dropna(axis=0, how='any').reset_index(drop=True)
analysis_df

We create dummy variables for each category except `high_income`, as this is our reference variable. These dummy variables then get added to our analysis dataframe for the regression.

In [None]:
# Create one 0/1 dummy column per income class and add them to a new df.
# 'high_income' is the reference level, so it gets no column of its own.
# The non-reference columns are listed explicitly instead of relying on
# drop_first=True, which removes whichever label happens to sort first and
# would silently pick a different reference if a class were missing or
# renamed. reindex() also guarantees that all three columns exist
# (filled with 0) even if a class has no rows left.
income_dummy_cols = ['low_income', 'lower_middle', 'upper_middle']
income_dummies = (
    pd.get_dummies(analysis_df['income_class'])
    .reindex(columns=income_dummy_cols, fill_value=0)
    .astype('int32')
)
analysis_with_dummies = pd.concat([analysis_df, income_dummies], axis=1)
analysis_with_dummies

In [None]:
# Pairwise correlation between education spending and unemployment
analysis_df.loc[:, ['govt_exp_edu_pct', 'unemp_pct']].corr()

In [None]:
# Scatter of education spending (x) against unemployment (y); the column
# names are looked up in analysis_df through the data= argument
plt.scatter('govt_exp_edu_pct', 'unemp_pct',
            data=analysis_df, marker="o")
plt.xlabel("Government expenditure on education, total (% of GDP)")
plt.ylabel("Unemployment, total (% of total labor force)")
plt.show()

We represent the model as `govt_exp_edu_pct` $\sim$ `unemp_pct` + `low_income` + `lower_middle` + `upper_middle` + `low_income` * `unemp_pct` + `lower_middle` * `unemp_pct` + `upper_middle` * `unemp_pct`. Because `high_income` is the reference category, it has no dummy or interaction term of its own; its effect is captured by the intercept and the `unemp_pct` coefficient. To add interaction variables, we multiply the columns manually and add them to the analysis dataframe.

In [None]:
# Add one interaction column per non-reference income class
# (dummy * unemp_pct) so it can be used in the regression
for income_level in ('low_income', 'lower_middle', 'upper_middle'):
    analysis_with_dummies[f'{income_level}_x_unemp_pct'] = (
        analysis_with_dummies[income_level] * analysis_with_dummies['unemp_pct']
    )
analysis_with_dummies

This is the Linear Regression. So exciting!

In [None]:
# Regressors: unemployment, the income-class dummies, and their
# interactions with unemployment
input_cols = [
    'unemp_pct',
    'low_income', 'lower_middle', 'upper_middle',
    'low_income_x_unemp_pct', 'lower_middle_x_unemp_pct',
    'upper_middle_x_unemp_pct',
]

# Fit education spending on the regressors above
hyp2_model = LinearRegression().fit(
    analysis_with_dummies[input_cols],
    analysis_with_dummies['govt_exp_edu_pct'],
)

# Raw coefficient vector first, then one rounded line per regressor
print(hyp2_model.coef_)
for col_name, coef in zip(input_cols, hyp2_model.coef_):
    print(f"The model's {col_name} coefficient is {round(coef, 2)}")

In [None]:
# normalize inputs to see if that makes a difference
# Source: HW4 B11
def Normalizer(df_cols):
    """Fit a StandardScaler on df_cols and return df_cols transformed by it."""
    scaler = preprocessing.StandardScaler()
    scaler.fit(df_cols)
    standardized = scaler.transform(df_cols)
    return standardized
# Refit the same regression on standardized inputs.
# Note: this rebinds hyp2_model, replacing the model fitted on raw inputs.
hyp2_model = LinearRegression().fit(
    Normalizer(analysis_with_dummies[input_cols]),
    analysis_with_dummies['govt_exp_edu_pct'],
)

# Raw coefficient vector first, then one rounded line per regressor
print(hyp2_model.coef_)
for col_name, coef in zip(input_cols, hyp2_model.coef_):
    print(f"The model's {col_name} coefficient is {round(coef, 2)}")