In [10]:
import pandas as pd
import uuid
from sklearn.impute import SimpleImputer, KNNImputer

countries = pd.read_csv('./data/HNP_StatsCountry.csv')
countrySeries = pd.read_csv('./data/HNP_StatsCountry-Series.csv')
data = pd.read_csv('./data/HNP_StatsData.csv')
footNote = pd.read_csv('./data/HNP_StatsFootNote.csv')
seriesTime = pd.read_csv('./data/HNP_StatsSeries-Time.csv')
series = pd.read_csv('./data/HNP_StatsSeries.csv')
catastrophes = pd.read_csv('./data/emdat_public_2022_01_31.csv')
electoralConflict = pd.read_csv('./data/ucdp-prio-acd-211.csv')

fullData = [
  countries,
  countrySeries,
  data,
  footNote,
  seriesTime,
  series,
  catastrophes,
  electoralConflict
]

  catastrophes = pd.read_csv('./data/emdat_public_2022_01_31.csv')


In [3]:
def getAllColumns():
  for frame in fullData:
    print('----------------')
    for col in frame.columns:
      print(col)
  print('----------------')
print(getAllColumns())

----------------
Country Code
Short Name
Table Name
Long Name
2-alpha code
Currency Unit
Special Notes
Region
Income Group
WB-2 code
National accounts base year
National accounts reference year
SNA price valuation
Lending category
Other groups
System of National Accounts
Alternative conversion factor
PPP survey year
Balance of Payments Manual in use
External debt Reporting status
System of trade
Government Accounting concept
IMF data dissemination standard
Latest population census
Latest household survey
Source of most recent Income and expenditure data
Vital registration complete
Latest agricultural census
Latest industrial data
Latest trade data
Unnamed: 30
----------------
CountryCode
SeriesCode
DESCRIPTION
Unnamed: 3
----------------
Country Name
Country Code
Indicator Name
Indicator Code
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1

In [4]:
def getTopics():
  column_values = series[["Topic"]].values.ravel()
  unique_values =  pd.unique(column_values)
  print(unique_values)

print(getTopics())

['Public Sector: Policy & institutions' 'Background: Economy'
 'Background: Education' 'Non-communicable diseases' 'Nutrition'
 'HIV/AIDS' 'Population dynamics: Other demographic indicators'
 'Cause of deaths' 'Population dynamics: Other demographic Indicators'
 'Reproductive health' 'Water and sanitation' nan 'Immunization'
 'Medical resources and usage' 'Infectious diseases' 'Health financing'
 'Background: Poverty' 'Background: Labor force' 'Population dynamics'
 'Background' 'Population dynamics: Population by age group'
 'Population dynamics: Population']
None


In [5]:
def getStats():
  for col in data.columns:
    print(col)

#print(data['Indicator Name'].unique())
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
  print(series[['Series Code', 'Indicator Name']])


              Series Code                                     Indicator Name
0             HD.HCI.OVRL              Human capital index (HCI) (scale 0-1)
1          HD.HCI.OVRL.FE      Human capital index (HCI), female (scale 0-1)
2          HD.HCI.OVRL.LB  Human capital index (HCI), lower bound (scale ...
3       HD.HCI.OVRL.LB.FE  Human capital index (HCI), female, lower bound...
4       HD.HCI.OVRL.LB.MA  Human capital index (HCI), male, lower bound (...
5          HD.HCI.OVRL.MA        Human capital index (HCI), male (scale 0-1)
6          HD.HCI.OVRL.UB  Human capital index (HCI), upper bound (scale ...
7       HD.HCI.OVRL.UB.FE  Human capital index (HCI), female, upper bound...
8       HD.HCI.OVRL.UB.MA  Human capital index (HCI), male, upper bound (...
9          NY.GNP.PCAP.CD         GNI per capita, Atlas method (current US$)
10   SE.ADT.1524.LT.FM.ZS  Literacy rate, youth (ages 15-24), gender pari...
11   SE.ADT.1524.LT.MA.ZS  Literacy rate, youth male (% of males ages 15-24)

In [6]:
def getIndicators(topics):
  healthStats = series.loc[series['Topic'].isin(topics)]
  column_values = healthStats[["Indicator Name"]].values.ravel()
  unique_values =  pd.unique(column_values)
  print(unique_values)

healthIndicatorTopics = ['Non-communicable diseases', 'Nutrition', 'HIV/AIDS', 'Reproductive health', 'Water and sanitation', 'Immunization', 'Medical resources and usage' 'Infectious diseases' 'Health financing', 'Background: Poverty']

#getIndicators(healthIndicatorTopics)

# for col in electoralViolence.columns:
#    print(col)

In [7]:
countries = [
  'CAN',
  'USA',
  'MEX',
  'IND',
  'CHN',
  'SDN',
  'BGD',
  'BRA',
  'NER'
]

education_keys = [
'SE.ADT.LITR.ZS',
'SE.ADT.LITR.FE.ZS',
'SE.ADT.LITR.MA.ZS',
'SE.PRM.ENRR',
'SE.PRM.ENRR.FE',
'SE.PRM.ENRR.MA',
'SE.SEC.ENRR',
'SE.SEC.ENRR.FE',
'SE.SEC.ENRR.MA',
'SE.XPD.TOTL.GD.ZS',
'SE.PRM.CMPT.FE.ZS',
'SE.PRM.CMPT.MA.ZS',
'SE.PRM.CMPT.ZS',
]

health_keys = [
  'SH.DTH.COMM.ZS',
  'SH.DTH.NCOM.ZS',
  'SH.XPD.CHEX.GD.ZS',
  'SH.MED.BEDS.ZS',
  'SH.SGR.PROC.P5',
  'SH.STA.OWAD.ZS',
  'SH.STA.OWGH.ME.ZS',
  'SH.DYN.AIDS',
  'SH.HIV.INCD',
  'SH.HIV.0014',
  'SH.HIV.INCD.14',
  'SH.STA.DIAB.ZS',
  'SH.UHC.SRVS.CV.XD',
  'SH.MED.NUMW.P3'
]

quality_of_life_keys = [
  'SH.STA.BASS.ZS',
  'SH.STA.SMSS.ZS',
  'SH.STA.WASH.P5',
  'SH.H2O.BASW.ZS',
  'SH.H2O.SMDW.ZS',
  'SH.STA.HYGN.ZS',
  'SL.UEM.TOTL.MA.ZS',
  'SL.UEM.TOTL.FE.ZS',
  'SH.MMR.WAGE.ZS',
  'SI.POV.NAHC',
  'SH.STA.ODFC.ZS',
  'SL.TLF.TOTL.FE.ZS',
  'SH.STA.AIRP.P5',
  'SH.STA.BRTC.ZS ',
]

population_keys = [
'SP.POP.TOTL',
'SP.DYN.LE00.MA.IN',
'SP.DYN.LE00.FE.IN ',
'SP.POP.GROW ',
'SP.DYN.LE00.IN',
'SM.POP.NETM',
'SP.RUR.TOTL',
'SP.RUR.TOTL.ZG',
'SI.POV.RUHC',
'SP.URB.TOTL',
'SP.URB.GROW',
'SI.POV.URHC',
'SP.POP.TOTL.MA.ZS',
'SP.POP.TOTL.FE.ZS',
]



columns = [
  'Country Name',
  'Country Code',
  'Indicator Name',
  'Indicator Code',
  '2005',
  '2006',
  '2007',
  '2008',
  '2009',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '2015',
  '2016',
  '2017',
  '2018',
  '2019',
  '2020'
]

In [19]:
# def getData(columns, keys, countries, tableName):
#   selected_data = pd.DataFrame(columns=columns)
#   for country in countries:
#     current_info = data.loc[data['Country Code'] == country]
#     current_info = current_info.loc[data['Indicator Code'].isin(keys)]
#     selected_data = pd.concat((selected_data, current_info[columns]))

#   cols = selected_data.columns.tolist()
#   selected_data.to_csv(path_or_buf=f'./seed_data/{tableName}_Unprepared_seed.csv', columns=cols)
#   # convert Year into its own column
#   selected_data = selected_data.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], var_name= "Year", value_name="Value")

#   cols = selected_data.columns.tolist()
#   cols.insert(2, cols.pop(cols.index("Year")))
  
#   selected_data = selected_data.reindex(columns= cols) #Reposition the Year column into index 2
#   selected_data = selected_data.drop('Indicator Code', 1) #Drop the Indicator Code column

#   selected_data = selected_data.pivot_table('Value', ['Country Name', 'Country Code', 'Year'], 'Indicator Name') #Pivot the table so that Each value in indicator Name is its own column
#   selected_data.insert(0, 'key', [uuid.uuid4() for _ in range(len(selected_data.index))])
#   cols = selected_data.columns.tolist()
#   print(cols)
  
#   selected_data.to_csv(path_or_buf=f'./seed_data/{tableName}_seed.csv', columns=cols)

# tables = {
#   'Education': education_keys,
#   # 'Health' : health_keys,
#   # 'Quality_of_life' : quality_of_life_keys,
#   # 'Population': population_keys
#   }

# for key in tables:
#   getData(columns, tables[key], countries, key)


In [23]:
def getData(columns, keys, countries, tableName):
  selected_data = pd.DataFrame()
  for country in countries:
    current_info = data.loc[data['Country Code'] == country]
    print(current_info.head())
    current_info = current_info.loc[data['Indicator Code'].isin(keys)]
    current_info = current_info[columns]
    cols = current_info.columns.tolist()
    # convert Year into its own column
    current_info = current_info.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], var_name= "Year", value_name="Value")
    cols = current_info.columns.tolist()
    cols.insert(2, cols.pop(cols.index("Year")))
    current_info = current_info.reindex(columns= cols) #Reposition the Year column into index 2
    current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column
    current_info = current_info.pivot_table('Value', ['Country Name', 'Country Code', 'Year'], 'Indicator Name').reset_index() #Pivot the table so that Each value in indicator Name is its own column


  #   cols = current_info.columns.tolist()
  #   numeric_info = current_info.iloc[:,3:]
  #   print(current_info)

  #   imputer = KNNImputer()
  #   imputer.fit(numeric_info)
  #   sample_incomplete_rows = numeric_info[numeric_info.isnull().any(axis=1)].head()
  #   input_x = imputer.transform(numeric_info)
  #   imputed_data = pd.DataFrame(input_x, columns=numeric_info.columns, index=numeric_info.index)
  #   result = pd.concat([current_info.iloc[:,:3], imputed_data], axis=1, join="inner")

  #   # current_info.to_csv(path_or_buf=f'./seed_data/{country}_{tableName}finished.csv', columns=cols)
  #   selected_data = pd.concat((selected_data, result[cols]))
  # selected_data.insert(0, 'key', [uuid.uuid4() for _ in range(len(selected_data.index))])
  # cols.insert(0,'key')
  # print(selected_data.head())
  # selected_data.to_csv(path_or_buf=f'./seed_data/{tableName}.csv', columns=cols, index=False)
  

tables = {
  'Education': education_keys,
  # 'Health' : health_keys,
  # 'Quality_of_life' : quality_of_life_keys,
  # 'Population': population_keys
  }

# for key in tables:
#   getData(columns, education_keys, countries, 'Education')
getData(columns, education_keys, countries, 'Education')

Empty DataFrame
Columns: [Country Name, Country Code, Indicator Name, Indicator Code, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, Unnamed: 66]
Index: []

[0 rows x 67 columns]
Empty DataFrame
Columns: [Country Name, Country Code, Year]
Index: []


  current_info = current_info.drop('Indicator Code', 1) #Drop the Indicator Code column


ValueError: at least one array or dtype is required