In [2]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pandas import concat, read_csv

In [3]:
""" Data locations
"""

# Output folder for the prepared extracts written by this notebook.
prepped_data_root = "../../data/prepared_data"

# Input folders holding the raw GBD and UNPD downloads.
gbd_root = "../../data/gbd_data"
unpd_root = "../../data/unpd_data"

# UN Data

Extract a "World" subset to use for plotting, for global forecasts, and for working out the next 8BN (8 billion).

In [4]:
# Values of the Variant column that are kept when filtering the UNPD indicators.
unpd_scenarios = [
    "High",
    "Medium",
    "Low",
]

In [5]:
# Read both WPP2022 demographic-indicator files from unpd_root: the medium-variant
# file first, then the file holding the other variants.
# low_memory=False makes pandas parse each file in one pass instead of in chunks.
df_unpd_medium_indicators, df_unpd_other_indicators = (
    pd.read_csv(os.path.join(unpd_root, csv_name), low_memory=False)
    for csv_name in (
        "WPP2022_Demographic_Indicators_Medium.csv",
        "WPP2022_Demographic_Indicators_OtherVariants.csv",
    )
)

In [6]:
# Stack the medium-variant rows on top of the other-variant rows.
# Row labels from each source file are kept as-is (no ignore_index), so labels repeat.
df_unpd_all_indicators = pd.concat(
    objs=[df_unpd_medium_indicators, df_unpd_other_indicators],
    axis=0,
)

In [7]:
# Keep only the rows whose Variant is one of the values listed in unpd_scenarios.
# Series.isin is the vectorised equivalent of testing every value with `in`,
# so no Python-level lambda is called per row.
df_unpd_all_indicators_scenario_filter = df_unpd_all_indicators[
    df_unpd_all_indicators.Variant.isin(unpd_scenarios)
]

In [8]:
# Narrow the scenario-filtered indicators down to the rows labelled "World".
df_unpd_world_indicators = df_unpd_all_indicators_scenario_filter.loc[
    lambda frame: frame.Location == "World"
]

In [13]:
# Add a BirthsSingle column equal to Births * 1,000
# (presumably Births is reported in thousands — confirm against the WPP2022 documentation).
# df_unpd_world_indicators is a boolean-filtered slice of a larger frame, and writing a
# column into it with .loc triggers pandas' SettingWithCopyWarning. .assign returns a
# new frame instead, and the cell stays safe to re-run.
df_unpd_world_indicators = df_unpd_world_indicators.assign(
    BirthsSingle=df_unpd_world_indicators.Births * 1_000
)

In [14]:
# Show the column labels of the World frame; the derived BirthsSingle column is listed last.
df_unpd_world_indicators.columns

Index(['SortOrder', 'LocID', 'Notes', 'ISO3_code', 'ISO2_code', 'SDMX_code',
       'LocTypeID', 'LocTypeName', 'ParentID', 'Location', 'VarID', 'Variant',
       'Time', 'TPopulation1Jan', 'TPopulation1July', 'TPopulationMale1July',
       'TPopulationFemale1July', 'PopDensity', 'PopSexRatio', 'MedianAgePop',
       'NatChange', 'NatChangeRT', 'PopChange', 'PopGrowthRate',
       'DoublingTime', 'Births', 'Births1519', 'CBR', 'TFR', 'NRR', 'MAC',
       'SRB', 'Deaths', 'DeathsMale', 'DeathsFemale', 'CDR', 'LEx', 'LExMale',
       'LExFemale', 'LE15', 'LE15Male', 'LE15Female', 'LE65', 'LE65Male',
       'LE65Female', 'LE80', 'LE80Male', 'LE80Female', 'InfantDeaths', 'IMR',
       'LBsurvivingAge1', 'Under5Deaths', 'Q5', 'Q0040', 'Q0040Male',
       'Q0040Female', 'Q0060', 'Q0060Male', 'Q0060Female', 'Q1550',
       'Q1550Male', 'Q1550Female', 'Q1560', 'Q1560Male', 'Q1560Female',
       'NetMigrations', 'CNMR', 'BirthsSingle'],
      dtype='object')

In [15]:
# Write the World-level UNPD indicators (including BirthsSingle) to the prepared-data folder.
# The path is built from prepped_data_root, matching the GBD exports in this notebook.
# NOTE(review): this is the World-only frame, the same one the next cell writes to
# unpd_world_indicators.csv. If the "combined" file was meant to hold every location,
# the source should probably be df_unpd_all_indicators_scenario_filter — confirm before changing.
df_unpd_world_indicators.to_csv(
    os.path.join(prepped_data_root, "unpd_combined_indicators.csv")
)

In [16]:
# World-only extract, used for global stats and for working out 8BN.
# The path is built from prepped_data_root rather than repeated as a hard-coded literal.
df_unpd_world_indicators.to_csv(
    os.path.join(prepped_data_root, "unpd_world_indicators.csv")
)

# Global Burden of Disease

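Repeat the same extraction for the GBD study forecasts: keep only the scenarios of interest, pull out the "Global" rows, and save the results.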

In [12]:
# Paths of the two GBD (IHME) forecast files, built from gbd_root so the data
# folder is defined in exactly one place.
gbd_pop_all_sex_all_age = os.path.join(
    gbd_root, "IHME_POP_2017_2100_POP_BOTH_SEX_ALL_AGE_Y2020M05D01.CSV"
)
gbd_births = os.path.join(
    gbd_root, "IHME_POP_2017_2100_LIVE_BIRTHS_Y2020M05D01.CSV"
)

# Values of scenario_name that are kept when filtering the GBD tables.
gbd_scenarios = ["Reference", "SDG Met Need and Education"]

In [7]:
# Load the raw GBD tables: the population file first, then the live-births file.
df_gbd_pop_all, df_gbd_births = (
    pd.read_csv(csv_path) for csv_path in (gbd_pop_all_sex_all_age, gbd_births)
)

In [16]:
# Keep only the rows whose scenario_name is listed in gbd_scenarios.
# Series.isin does the membership test in one vectorised call instead of
# invoking a Python lambda for every row.
df_gbd_pop_all_scenario_filtered = df_gbd_pop_all[
    df_gbd_pop_all.scenario_name.isin(gbd_scenarios)
]

df_gbd_births_scenario_filtered = df_gbd_births[
    df_gbd_births.scenario_name.isin(gbd_scenarios)
]

In [19]:
# From each scenario-filtered GBD table, keep only the rows labelled "Global".
df_gbd_births_global = df_gbd_births_scenario_filtered.loc[
    lambda frame: frame.location_name == "Global"
]

df_gbd_pop_all_global = df_gbd_pop_all_scenario_filtered.loc[
    lambda frame: frame.location_name == "Global"
]

In [22]:
# Save the GBD extracts into the prepared-data folder:
# population for all locations, then the Global-only population and births tables.
df_gbd_pop_all_scenario_filtered.to_csv(
    path_or_buf=os.path.join(prepped_data_root, "gbd_scenarios_pop.csv")
)
df_gbd_pop_all_global.to_csv(
    path_or_buf=os.path.join(prepped_data_root, "gbd_world_pop.csv")
)
df_gbd_births_global.to_csv(
    path_or_buf=os.path.join(prepped_data_root, "gbd_world_births.csv")
)

## Back to UNPD: compute cumulative births

In [2]:
# Re-load both WPP2022 indicator files for every location and stack them.
# File paths are built from unpd_root instead of repeating the hard-coded folder,
# and concat / read_csv come from the notebook's top import cell rather than a
# mid-notebook import.
df_unpd_all_locs = concat(
    [
        read_csv(
            os.path.join(unpd_root, "WPP2022_Demographic_Indicators_Medium.csv"),
            low_memory=False,
        ),
        read_csv(
            os.path.join(unpd_root, "WPP2022_Demographic_Indicators_OtherVariants.csv"),
            low_memory=False,
        ),
    ]
)

In [6]:
# Add a BirthsSingle column equal to Births * 1,000
# (presumably Births is reported in thousands — confirm against the WPP2022 documentation).
df_unpd_all_locs["BirthsSingle"] = df_unpd_all_locs["Births"].mul(1_000)

In [18]:
# Keep only the variants listed in unpd_scenarios ("High", "Medium", "Low").
# This reuses the notebook's single scenario list instead of a second hard-coded
# tuple, and replaces the per-row lambda with the vectorised Series.isin.
df_unpd_all_locs = df_unpd_all_locs[df_unpd_all_locs.Variant.isin(unpd_scenarios)]

In [23]:

# Running total of BirthsSingle for every (Variant, Location) pair.
# Rows are limited to Time strictly between 2022 and 2101 and sorted so that, within
# each pair, Time is ascending before the cumulative sum is taken.
# The result keeps the column name BirthsSingle, but the values are cumulative.
cumulative_births_unpd = (
    df_unpd_all_locs
    .loc[lambda frame: frame.Time.gt(2022) & frame.Time.lt(2101)]
    .sort_values(by=["Location", "Variant", "Time"], ascending=True)
    .set_index(["Variant", "Time", "Location"])
    .groupby(["Variant", "Location"])["BirthsSingle"]
    .cumsum()
    .reset_index()
)

In [24]:
# Display the cumulative-births table (columns: Variant, Time, Location, BirthsSingle).
cumulative_births_unpd

Unnamed: 0,Variant,Time,Location,BirthsSingle
0,High,2023,Afghanistan,1542341.0
1,High,2024,Afghanistan,3096794.0
2,High,2025,Afghanistan,4663043.0
3,High,2026,Afghanistan,6248250.0
4,High,2027,Afghanistan,7898637.0
...,...,...,...,...
66919,Medium,2096,Zimbabwe,39039139.0
66920,Medium,2097,Zimbabwe,39514130.0
66921,Medium,2098,Zimbabwe,39987212.0
66922,Medium,2099,Zimbabwe,40457429.0


In [25]:
# Save the per-location cumulative births to the prepared-data folder.
# The path is built from prepped_data_root, matching the GBD exports in this notebook.
cumulative_births_unpd.to_csv(
    os.path.join(prepped_data_root, "cumulative_births_unpd_all_countries.csv")
)

In [26]:
# Sanity check: number of rows whose Location is exactly "Americas".
# The vectorised .sum() counts the True values without iterating the Series in Python;
# int() keeps the displayed result a plain Python integer.
int((df_unpd_all_locs.Location == "Americas").sum())

0