In [21]:
import pandas as pd
import uuid
from sklearn.impute import SimpleImputer, KNNImputer
import json
import warnings
# Suppress every FutureWarning raised from here on.
warnings.simplefilter('ignore', FutureWarning)

# Mapping that getData passes to DataFrame.rename(columns=...) before writing.
with open('keys.json') as keys_file:
    column_keys = json.load(keys_file)

# Raw statistics table; getData filters it on 'Country Code' and 'Indicator Code'.
data = pd.read_csv('./data/HNP_StatsData.csv')

In [6]:
# Country codes to export; getData compares each one against the
# 'Country Code' column of the raw frame.
countries = [
  'CAN', 'USA', 'MEX',
  'IND', 'CHN', 'SDN',
  'BGD', 'BRA', 'NER',
]

# Indicator codes selected for the 'Education' seed table
# (matched against the 'Indicator Code' column in getData).
education_keys = [
  'SE.ADT.LITR.ZS', 'SE.ADT.LITR.FE.ZS', 'SE.ADT.LITR.MA.ZS',
  'SE.PRM.ENRR', 'SE.PRM.ENRR.FE', 'SE.PRM.ENRR.MA',
  'SE.SEC.ENRR', 'SE.SEC.ENRR.FE', 'SE.SEC.ENRR.MA',
  'SE.XPD.TOTL.GD.ZS',
  'SE.PRM.CMPT.FE.ZS', 'SE.PRM.CMPT.MA.ZS', 'SE.PRM.CMPT.ZS',
]

# Indicator codes selected for the 'Health' seed table
# (matched against the 'Indicator Code' column in getData).
health_keys = [
  'SH.DTH.COMM.ZS', 'SH.DTH.NCOM.ZS',
  'SH.XPD.CHEX.GD.ZS',
  'SH.MED.BEDS.ZS',
  'SH.SGR.PROC.P5',
  'SH.STA.OWAD.ZS', 'SH.STA.OWGH.ME.ZS',
  'SH.DYN.AIDS',
  'SH.HIV.INCD', 'SH.HIV.0014', 'SH.HIV.INCD.14',
  'SH.STA.DIAB.ZS',
  'SH.UHC.SRVS.CV.XD',
  'SH.MED.NUMW.P3',
]

# Indicator codes selected for the 'Quality_of_life' seed table.
# getData matches these with an exact `isin` test against the
# 'Indicator Code' column, so a code with stray whitespace never matches
# and its indicator is silently left out of the output.
quality_of_life_keys = [
  'SH.STA.BASS.ZS',
  'SH.STA.SMSS.ZS',
  'SH.STA.WASH.P5',
  'SH.H2O.BASW.ZS',
  'SH.H2O.SMDW.ZS',
  'SH.STA.HYGN.ZS',
  'SL.UEM.TOTL.MA.ZS',
  'SL.UEM.TOTL.FE.ZS',
  'SH.MMR.WAGE.ZS',
  'SH.STA.ODFC.ZS',
  'SL.TLF.TOTL.FE.ZS',
  'SH.STA.AIRP.P5',
  'SH.STA.BRTC.ZS',  # trailing space removed so the exact match can succeed
]

# Indicator codes selected for the 'Population' seed table.
# getData matches these with an exact `isin` test against the
# 'Indicator Code' column, so a code with stray whitespace never matches
# and its indicator is silently left out of the output.
population_keys = [
  'SP.POP.TOTL',
  'SP.DYN.LE00.MA.IN',
  'SP.DYN.LE00.FE.IN',  # trailing space removed so the exact match can succeed
  'SP.POP.GROW',  # trailing space removed so the exact match can succeed
  'SP.DYN.LE00.IN',
  'SM.POP.NETM',
  'SP.RUR.TOTL',
  'SP.RUR.TOTL.ZG',
  'SI.POV.RUHC',
  'SP.URB.TOTL',
  'SP.URB.GROW',
  'SI.POV.URHC',
  'SP.POP.TOTL.MA.ZS',
  'SP.POP.TOTL.FE.ZS',
  'SI.POV.NAHC',
]

# Columns kept from the raw frame: the four identifier columns followed by
# one string-labelled column per year from 2005 through 2020 inclusive.
columns = [
  'Country Name',
  'Country Code',
  'Indicator Name',
  'Indicator Code',
] + [str(year) for year in range(2005, 2021)]

In [25]:
def getData(columns, keys, countries, tableName):
  """Build the imputed seed CSV for one table and write it to disk.

  For each country code, the rows of the module-level ``data`` frame whose
  'Indicator Code' is in ``keys`` are selected, reshaped so that every year
  is a row and every indicator name is a column, and the remaining gaps are
  filled with a ``KNNImputer`` fitted on that country's rows only. The
  per-country frames are stacked, given a running integer ``key`` column,
  renamed through the module-level ``column_keys`` mapping and written to
  ``./seed_data/{tableName}_seed.csv``.

  Args:
    columns: Column labels to keep from ``data``. Must include
      'Country Name', 'Country Code', 'Indicator Name' and 'Indicator Code';
      every other label is treated as a year column.
    keys: Indicator codes to select (exact match on 'Indicator Code').
    countries: Country codes to process (exact match on 'Country Code').
    tableName: Name used as the prefix of the output CSV file.

  Returns:
    None. The result is only written to disk.
  """
  id_columns = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
  # Loop-invariant: which rows of the raw frame carry a requested indicator.
  indicator_mask = data['Indicator Code'].isin(keys)

  imputed_frames = []
  for country in countries:
    # Build the complete row mask on `data` itself instead of applying a
    # mask derived from `data` to an already-filtered frame.
    current_info = data.loc[(data['Country Code'] == country) & indicator_mask, columns]
    # convert Year into its own column
    current_info = current_info.melt(id_vars=id_columns, var_name="Year", value_name="Value")
    # Keyword form: the positional `axis` argument of drop() was removed in pandas 2.0.
    current_info = current_info.drop(columns='Indicator Code')
    # Pivot so that each indicator name becomes its own column. The index
    # list fixes the order of the leading columns after reset_index(), so no
    # reordering of 'Year' is needed beforehand.
    current_info = current_info.pivot_table(
      values='Value',
      index=['Country Name', 'Country Code', 'Year'],
      columns='Indicator Name',
    ).reset_index()

    # The first three columns are the identifiers; the rest are indicator values.
    id_part = current_info.iloc[:, :3]
    numeric_info = current_info.iloc[:, 3:]
    imputed_values = KNNImputer().fit_transform(numeric_info)
    imputed_data = pd.DataFrame(imputed_values, columns=numeric_info.columns, index=numeric_info.index)
    imputed_frames.append(pd.concat([id_part, imputed_data], axis=1, join="inner"))

  # Concatenate once instead of growing the frame on every iteration;
  # pd.concat rejects an empty list, so fall back to an empty frame.
  selected_data_imputed = pd.concat(imputed_frames) if imputed_frames else pd.DataFrame()
  selected_data_imputed.insert(0, 'key', range(len(selected_data_imputed.index)))
  selected_data_imputed = selected_data_imputed.rename(columns=column_keys)
  selected_data_imputed.to_csv(path_or_buf=f'./seed_data/{tableName}_seed.csv', index=False)
# Seed table name -> indicator codes to export for it. The name is also
# passed to getData as the prefix of the CSV file it writes.
tables = {
  'Education': education_keys,
  'Health': health_keys,
  'Quality_of_life': quality_of_life_keys,
  'Population': population_keys,
}

# Write one seed CSV per table.
for key, indicator_codes in tables.items():
  getData(columns, indicator_codes, countries, key)