In [10]:
import pandas as pd
import uuid
from sklearn.impute import SimpleImputer, KNNImputer

data = pd.read_csv('./data/HNP_StatsData.csv')

In [11]:
countries = [
  'CAN',
  'USA',
  'MEX',
  'IND',
  'CHN',
  'SDN',
  'BGD',
  'BRA',
  'NER'
]

education_keys = [
'SE.ADT.LITR.ZS',
'SE.ADT.LITR.FE.ZS',
'SE.ADT.LITR.MA.ZS',
'SE.PRM.ENRR',
'SE.PRM.ENRR.FE',
'SE.PRM.ENRR.MA',
'SE.SEC.ENRR',
'SE.SEC.ENRR.FE',
'SE.SEC.ENRR.MA',
'SE.XPD.TOTL.GD.ZS',
'SE.PRM.CMPT.FE.ZS',
'SE.PRM.CMPT.MA.ZS',
'SE.PRM.CMPT.ZS',
]

health_keys = [
  'SH.DTH.COMM.ZS',
  'SH.DTH.NCOM.ZS',
  'SH.XPD.CHEX.GD.ZS',
  'SH.MED.BEDS.ZS',
  'SH.IMM.MEAS',
  'SH.STA.OWGH.ME.ZS',
  'SH.STA.OWGH.ME.ZS',
  'SH.TBS.INCD',
  'SH.ANM.CHLD.ZS',
  'SH.IMM.POL3',
  'SH.IMM.IDPT',
  'SH.STA.DIAB.ZS',
  'SH.UHC.SRVS.CV.XD',
  'SH.MED.NUMW.P3'
]

quality_of_life_keys = [
  'SH.STA.BASS.ZS',
  'SH.STA.SMSS.ZS',
  'SH.STA.WASH.P5',
  'SH.H2O.BASW.ZS',
  'SH.H2O.SMDW.ZS',
  'SL.TLF.TOTL.IN',
  'SL.UEM.TOTL.MA.ZS',
  'SL.UEM.TOTL.FE.ZS',
  'SH.MMR.WAGE.ZS',
  'SI.POV.NAHC',
  'SH.STA.ODFC.ZS',
  'SL.TLF.TOTL.FE.ZS',
  'SH.STA.AIRP.P5',
  'SH.STA.BRTC.ZS ',
]

population_keys = [
'SP.POP.TOTL',
'SP.DYN.LE00.MA.IN',
'SP.DYN.LE00.FE.IN ',
'SP.POP.GROW ',
'SP.DYN.LE00.IN',
'SM.POP.NETM',
'SP.RUR.TOTL',
'SP.RUR.TOTL.ZG',
'SI.POV.RUHC',
'SP.URB.TOTL',
'SP.URB.GROW',
'SI.POV.URHC',
'SP.POP.TOTL.MA.ZS',
'SP.POP.TOTL.FE.ZS',
]

columns = [
  'Country Name',
  'Country Code',
  'Indicator Name',
  'Indicator Code',
  '2005',
  '2006',
  '2007',
  '2008',
  '2009',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '2015',
  '2016',
  '2017',
  '2018',
  '2019',
  '2020'
]

In [12]:
def getData(columns, keys, countries, tableName):
  selected_data = pd.DataFrame()
  selected_data_imputed = pd.DataFrame()
  for country in countries:
    current_info = data.loc[data['Country Code'] == country]
    current_info = current_info.loc[data['Indicator Code'].isin(keys)]
    current_info = current_info[columns]
    cols = current_info.columns.tolist()
    # convert Year into its own column
    current_info = current_info.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], var_name= "Year", value_name="Value")
    cols = current_info.columns.tolist()
    cols.insert(2, cols.pop(cols.index("Year")))
    current_info = current_info.reindex(columns= cols) #Reposition the Year column into index 2
    current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
    current_info = current_info.pivot_table('Value', ['Country Name', 'Country Code', 'Year'], 'Indicator Name').reset_index() #Pivot the table so that Each value in indicator Name is its own column


    cols = current_info.columns.tolist()
    numeric_info = current_info.iloc[:,3:]
    imputer = KNNImputer()
    imputer.fit(numeric_info)
    sample_incomplete_rows = numeric_info[numeric_info.isnull().any(axis=1)].head()
    input_x = imputer.transform(numeric_info)
    imputed_data = pd.DataFrame(input_x, columns=numeric_info.columns, index=numeric_info.index)
    result = pd.concat([current_info.iloc[:,:3], imputed_data], axis=1, join="inner")

    # current_info.to_csv(path_or_buf=f'./seed_data/{country}_{tableName}finished.csv', columns=cols)
    selected_data_imputed = pd.concat((selected_data_imputed, result[cols]))
    selected_data = pd.concat((selected_data, current_info))
  selected_data_imputed.insert(0, 'key', [uuid.uuid4() for _ in range(len(selected_data_imputed.index))])
  selected_data.insert(0, 'key', [uuid.uuid4() for _ in range(len(selected_data.index))])
  cols.insert(0,'key')
  print(selected_data_imputed.head())
  selected_data_imputed.to_csv(path_or_buf=f'./seed_data/{tableName}_seed.csv', columns=cols, index=False)
  selected_data.to_csv(path_or_buf=f'./seed_data/{tableName}_Unimputed.csv', columns=cols, index=False)
tables = {
  'Education': education_keys,
  'Health' : health_keys,
  'Quality_of_life' : quality_of_life_keys,
  'Population': population_keys
  }
for key in tables:
  getData(columns, tables[key], countries, key)

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column


Indicator Name                                   key Country Name  \
0               109072bc-22bd-4c5e-85d0-1130989f1411       Canada   
1               733b4c72-d680-4b86-b01f-3709d9e9402a       Canada   
2               5cd83dcb-843e-4cb8-bcb8-ad86a43d4b1c       Canada   
3               d5e1bde7-a296-41f1-b8db-8ca43eaf5c85       Canada   
4               49bcf17c-456d-4121-a5fc-42d09ddf4ba9       Canada   

Indicator Name Country Code  Year  \
0                       CAN  2005   
1                       CAN  2006   
2                       CAN  2007   
3                       CAN  2008   
4                       CAN  2009   

Indicator Name  Literacy rate, adult female (% of females ages 15 and above)  \
0                                                            99.0              
1                                                            99.0              
2                                                            99.0              
3                                         

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.dr

Indicator Name                                   key Country Name  \
0               10c70f8d-8879-4d21-85a0-fc1387b95d0b       Canada   
1               a105543e-7b4a-4675-aa9c-6cdedf9d279a       Canada   
2               e782e295-579b-42a0-b208-9029b47d5ffd       Canada   
3               155b20ae-d4b4-46af-bb56-ed9e5a7897a9       Canada   
4               070064fa-241d-47c6-b196-ad1d76ffd8f0       Canada   

Indicator Name Country Code  Year  \
0                       CAN  2005   
1                       CAN  2006   
2                       CAN  2007   
3                       CAN  2008   
4                       CAN  2009   

Indicator Name  Labor force, female (% of total labor force)  \
0                                                  46.419241   
1                                                  46.674059   
2                                                  46.855960   
3                                                  46.800794   
4                                         

  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
