In [36]:
import pandas as pd
import uuid
from sklearn.impute import SimpleImputer, KNNImputer

data = pd.read_csv('./data/HNP_StatsData.csv')

In [116]:
countries = [
  'CAN',
  'USA',
  'MEX',
  'IND',
  'CHN',
  'SDN',
  'BGD',
  'BRA',
  'NER'
]

education_keys = [
'SE.ADT.LITR.ZS',
'SE.ADT.LITR.FE.ZS',
'SE.ADT.LITR.MA.ZS',
'SE.PRM.ENRR',
'SE.PRM.ENRR.FE',
'SE.PRM.ENRR.MA',
'SE.SEC.ENRR',
'SE.SEC.ENRR.FE',
'SE.SEC.ENRR.MA',
'SE.XPD.TOTL.GD.ZS',
'SE.PRM.CMPT.FE.ZS',
'SE.PRM.CMPT.MA.ZS',
'SE.PRM.CMPT.ZS',
]

health_keys = [
  'SH.DTH.COMM.ZS',
  'SH.DTH.NCOM.ZS',
  'SH.XPD.CHEX.GD.ZS',
  'SH.MED.BEDS.ZS',
  'SH.SGR.PROC.P5',
  'SH.STA.OWAD.ZS',
  'SH.STA.OWGH.ME.ZS',
  'SH.DYN.AIDS',
  'SH.HIV.INCD',
  'SH.HIV.0014',
  'SH.HIV.INCD.14',
  'SH.STA.DIAB.ZS',
  'SH.UHC.SRVS.CV.XD',
  'SH.MED.NUMW.P3'
]

quality_of_life_keys = [
  'SH.STA.BASS.ZS',
  'SH.STA.SMSS.ZS',
  'SH.STA.WASH.P5',
  'SH.H2O.BASW.ZS',
  'SH.H2O.SMDW.ZS',
  'SH.STA.HYGN.ZS',
  'SL.UEM.TOTL.MA.ZS',
  'SL.UEM.TOTL.FE.ZS',
  'SH.MMR.WAGE.ZS',
  'SI.POV.NAHC',
  'SH.STA.ODFC.ZS',
  'SL.TLF.TOTL.FE.ZS',
  'SH.STA.AIRP.P5',
  'SH.STA.BRTC.ZS ',
]

population_keys = [
'SP.POP.TOTL',
'SP.DYN.LE00.MA.IN',
'SP.DYN.LE00.FE.IN ',
'SP.POP.GROW ',
'SP.DYN.LE00.IN',
'SM.POP.NETM',
'SP.RUR.TOTL',
'SP.RUR.TOTL.ZG',
'SI.POV.RUHC',
'SP.URB.TOTL',
'SP.URB.GROW',
'SI.POV.URHC',
'SP.POP.TOTL.MA.ZS',
'SP.POP.TOTL.FE.ZS',
]

columns = [
  'Country Name',
  'Country Code',
  'Indicator Name',
  'Indicator Code',
  '2005',
  '2006',
  '2007',
  '2008',
  '2009',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '2015',
  '2016',
  '2017',
  '2018',
  '2019',
  '2020'
]

In [124]:
def getData(columns, keys, countries, tableName):
  selected_data = pd.DataFrame()
  selected_data_imputed = pd.DataFrame()
  for country in countries:
    current_info = data.loc[data['Country Code'] == country]
    current_info = current_info.loc[data['Indicator Code'].isin(keys)]
    current_info = current_info[columns]
    cols = current_info.columns.tolist()
    # convert Year into its own column
    current_info = current_info.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], var_name= "Year", value_name="Value")
    cols = current_info.columns.tolist()
    cols.insert(2, cols.pop(cols.index("Year")))
    current_info = current_info.reindex(columns= cols) #Reposition the Year column into index 2
    current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
    current_info = current_info.pivot_table('Value', ['Country Name', 'Country Code', 'Year'], 'Indicator Name').reset_index() #Pivot the table so that Each value in indicator Name is its own column


    cols = current_info.columns.tolist()
    numeric_info = current_info.iloc[:,3:]
    imputer = KNNImputer()
    imputer.fit(numeric_info)
    sample_incomplete_rows = numeric_info[numeric_info.isnull().any(axis=1)].head()
    input_x = imputer.transform(numeric_info)
    imputed_data = pd.DataFrame(input_x, columns=numeric_info.columns, index=numeric_info.index)
    result = pd.concat([current_info.iloc[:,:3], imputed_data], axis=1, join="inner")

    # current_info.to_csv(path_or_buf=f'./seed_data/{country}_{tableName}finished.csv', columns=cols)
    selected_data_imputed = pd.concat((selected_data_imputed, result[cols]))
    selected_data = pd.concat((selected_data, current_info))
  selected_data_imputed.insert(0, 'key', [uuid.uuid4() for _ in range(len(selected_data_imputed.index))])
  selected_data.insert(0, 'key', [uuid.uuid4() for _ in range(len(selected_data.index))])
  cols.insert(0,'key')
  print(selected_data_imputed.head())
  selected_data_imputed.to_csv(path_or_buf=f'./seed_data/{tableName}_seed.csv', columns=cols, index=False)
  selected_data.to_csv(path_or_buf=f'./seed_data/{tableName}_Unimputed.csv', columns=cols, index=False)
tables = {
  'Education': education_keys,
  'Health' : health_keys,
  'Quality_of_life' : quality_of_life_keys,
  'Population': population_keys
  }
for key in tables:
  getData(columns, tables[key], countries, key)

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.dr

Indicator Name                                   key Country Name  \
0               c5e255e6-4661-4e4d-8a7e-b142a02645cc       Canada   
1               fac2d6e1-c4a4-4699-888c-3d38fec22ed3       Canada   
2               5d14c322-25c6-4107-87d7-5a9af69e69f9       Canada   
3               9b1ceca1-96fb-4612-bd7f-3d2b7711e4cd       Canada   
4               9dcaf955-fc40-472e-8f2b-cb20c06d7d9c       Canada   

Indicator Name Country Code  Year  \
0                       CAN  2005   
1                       CAN  2006   
2                       CAN  2007   
3                       CAN  2008   
4                       CAN  2009   

Indicator Name  Public spending on education, total (% of GDP)  \
0                                                     4.765880   
1                                                     4.871072   
2                                                     4.766410   
3                                                     4.626120   
4                               

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column


Indicator Name                                   key Country Name  \
0               9d4a0054-bfcb-406f-88e5-15e515b57d63       Canada   
1               17b53cae-4039-41b1-9666-964bdb7669ea       Canada   
2               1608c23f-0cb0-42e4-a2c2-35a9408a0d76       Canada   
3               40c1ab8d-7388-466f-8905-71f768b4a447       Canada   
4               d16db803-a869-4f6f-8dc1-5fce42fa64ba       Canada   

Indicator Name Country Code  Year  \
0                       CAN  2005   
1                       CAN  2006   
2                       CAN  2007   
3                       CAN  2008   
4                       CAN  2009   

Indicator Name  Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)  \
0                                                        5.190839                                                       
1                                                        5.190839                                                       

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column


Indicator Name                                   key Country Name  \
0               4237c133-2dc8-4793-b9b6-a902b126e2fa       Canada   
1               ad595369-7895-4961-b2ef-981c67106db4       Canada   
2               734cf587-de39-44a6-bb85-9351c1327312       Canada   
3               a869ca09-e4dc-4634-bd32-94193318069e       Canada   
4               5a06e7c7-fc83-4848-955e-ea60a824cc2e       Canada   

Indicator Name Country Code  Year  \
0                       CAN  2005   
1                       CAN  2006   
2                       CAN  2007   
3                       CAN  2008   
4                       CAN  2009   

Indicator Name  Labor force, female (% of total labor force)  \
0                                                  46.419241   
1                                                  46.674059   
2                                                  46.855960   
3                                                  46.800794   
4                                         

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column


Indicator Name                                   key Country Name  \
0               014eb68d-7a82-4154-a000-b31e788fcf1e       Canada   
1               d200dc1b-515d-4f5a-89e7-50ddda2b1510       Canada   
2               1ca0a143-f070-4377-8fdb-e39788eb1c0c       Canada   
3               86bf64ef-9f39-456e-8286-7e3776807409       Canada   
4               a7d6e8a9-962e-431b-82f2-7873d62ddcad       Canada   

Indicator Name Country Code  Year  Life expectancy at birth, male (years)  \
0                       CAN  2005                                    77.9   
1                       CAN  2006                                    78.1   
2                       CAN  2007                                    78.3   
3                       CAN  2008                                    78.5   
4                       CAN  2009                                    78.8   

Indicator Name  Life expectancy at birth, total (years)  Net migration  \
0                                             80

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
