In [2]:
import pkg_resources
import os
import numpy as np
import pandas as pd
import matplotlib
import statepop
import statepop.multistate as multi

In [174]:
# Path to the results folder, resolved inside the installed `statepop` package
results_path = pkg_resources.resource_filename('statepop', 'data/inputs/No_Mig')

# Path to the folder holding the state-level inputs
state_inputs = pkg_resources.resource_filename('statepop', 'data/State_Inputs')

# TODO: path to the package itself -- potentially use an absolute import statement above;
# the package may need to be recreated first.
# NOTE(review): results_path uses 'data/inputs' while the life tables below use 'data/Inputs';
# confirm the directory casing, which matters on case-sensitive file systems.

# UN standard life table e0=30; used for linear interpolation of lx values
mortality_30 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')

# UN standard life table e0=100; used for linear interpolation of lx values
mortality_100 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe0100.csv')

# Names of every entry found in the state inputs folder (order is whatever os.listdir returns)
states_list = os.listdir(state_inputs)

# Names of the available scenarios
scenarios = ("Constant_rate", "SSP2", "SSP3", "SSP5")

In [6]:
def get_state_dirs(root_dir, delimeter='-', target_index=0):
    """Get the full paths of the state entries found directly under ``root_dir``.

    An entry counts as a state entry when the piece of its name at position
    ``target_index`` (after splitting the name on ``delimeter``) parses as an
    integer, e.g. ``53-WA`` with the defaults. Every other entry is ignored.

    :param root_dir:                Full path to the directory containing the state level data

    :type root_dir:                 str

    :param delimeter:               Separator used to split each entry name

    :type delimeter:                str

    :param target_index:            Position, within the split name, of the piece that must be an integer

    :type target_index:             int

    :return:                        A list of full paths, in ``os.listdir`` order

    """

    state_dirs = []
    for entry in os.listdir(root_dir):
        name_parts = entry.split(delimeter)

        try:
            # Only the parse matters; the integer value itself is not used
            int(name_parts[target_index])
        except (ValueError, IndexError):
            # Not a numeric code, or the name has no piece at target_index
            continue

        state_dirs.append(os.path.join(root_dir, entry))

    return state_dirs

In [7]:
def get_contents_file_list(state_dirs, target_file):
    """Collect the full path of ``target_file`` from every directory that contains it.

    :param state_dirs:              Directories to search (only their top level is inspected)

    :type state_dirs:               list

    :param target_file:             File name, with extension, to look for in each directory

    :type target_file:              str

    :return:                        A list of full paths to ``target_file``, one per directory
                                    that holds it, in the order of ``state_dirs``

    """
    found_paths = []

    for directory in state_dirs:
        if target_file not in os.listdir(directory):
            # Report the gap and move on; a missing file is not treated as an error
            print(f"No contents file in directory {directory}")
            continue

        found_paths.append(os.path.join(directory, target_file))

    return found_paths

In [8]:
def make_dataframes(file_list, skiprows=None):
    """Read each CSV file in ``file_list`` into its own pandas DataFrame.

    :param file_list:               Paths of the CSV files to read

    :type file_list:                list

    :param skiprows:                Forwarded to ``pandas.read_csv``: line numbers to skip
                                    (list-like), number of lines to skip (int), or a callable.
                                    ``None`` skips nothing.

    :return:                        A list of DataFrames, one per file, in the order of ``file_list``

    """
    # ``False`` was the former default but is not one of the values read_csv
    # documents for skiprows; keep accepting it as "skip nothing".
    if skiprows is False:
        skiprows = None

    return [pd.read_csv(csv_path, skiprows=skiprows) for csv_path in file_list]

In [9]:
""""""""""""""""""""""""""""""""""
# Not sure how to directly incorporate this, come back to it later

# Specify the domestic migration factor
# If scenario is not "Constant_rate" (for fertility, mortality and international migration), this factor will become dynamic later
scen.factor <- 0 # 1 for regular, 0 for no domestic migration, 0.5 for half scenario and 2 for double scenario

# Specify if international migration is applied
int.mig <- 0 # 1 applied 0 not applied

#* Should details for projection model adjustment be printed?
vis <- F # TRUE (print details); FALSE (don't print details)

#* Should the Brass Relational Model be used or a simple scaling approach to compute future fertility schedules
useBrassf <- T # TRUE (use Brass); FALSE (use scaling)

"""""""""""""""""""""""""""""""""

'"\n# Not sure how to directly incorporate this, come back to it later\n\n# Specify the domestic migration factor\n# If scenario is not "Constant_rate" (for fertility, mortality and international migration), this factor will become dynamic later\nscen.factor <- 0 # 1 for regular, 0 for no domestic migration, 0.5 for half scenario and 2 for double scenario\n\n# Sepecify if international migration is applied\nint.mig <- 0 # 1 applied 0 not applied\n\n#* Should details for projection model adjustment be printed?\nvis <- F # TRUE (print details); FALSE (don\'t print details)\n\n#* Should the Brass Relational Model be used or a simple scaling approach to compute future fertility schedules\nuseBrassf <- T # TRUE (use Brass); FALSE (use scaling)\n\n'

In [13]:
# TODO (disabled): create the results directories
#for results in results_path:
#    os.makedirs(results, exist_ok=True)

# Load the two UN standard life tables (e0=30 and e0=100) from the paths defined above
mort_30 = pd.read_csv(mortality_30)
mort_100 = pd.read_csv(mortality_100)

# Single-year ages from 0 to 100 inclusive (101 values)
num_ages = list(range(101))

# TODO (disabled): `regions` is not defined in this notebook yet
#initial_all_base_population = pd.DataFrame(index = 4*num_ages, columns=regions)
#updated_all_base_population = pd.DataFrame(index = 4*num_ages, columns=regions)

# TODO (disabled): starting year, end year and step of the projection
#year_start = 2010
#year_through = 2100
#timestep = 5
#steps = year_through - year_start

# These dataframes will hold population projections and net/in/out state-level migrations for all states and years
total_population_projection = pd.DataFrame()
total_state_net_migration = pd.DataFrame()
total_state_in_migration = pd.DataFrame()
total_state_out_migration = pd.DataFrame()

# This dataframe holds population values before applying domestic migration. It is necessary to disaggregate
# domestic migration to/from each state across all other states
total_population_no_domestic_migration = pd.DataFrame()

# TODO (disabled): dataframe for holding international migration for all years and states
#total_international_migration = pd.DataFrame(index = 4*num_ages*(steps+1), columns=regions)

# Full paths of the state entries inside the state inputs folder
state_dir_list = get_state_dirs(state_inputs) # Input data directory

In [14]:
def single_state_constant_rate(state_abbr, year_start=2010, year_through=2100, timestep=5):
    """Load one state's ``Constant_rate.csv`` scenario file, limited to the projection years.

    Relies on the notebook-level ``states_list`` and ``state_dir_list``.

    :param state_abbr:              Two-character state abbreviation; matched as a substring of
                                    the entries in ``states_list`` (the first match is used)

    :type state_abbr:               str

    :param year_start:              (Optional) First year to keep

    :type year_start:               int

    :param year_through:            (Optional) Last year to keep; years beyond it are never returned

    :type year_through:             int

    :param timestep:                (Optional) Spacing, in years, between the years to keep

    :type timestep:                 int

    :return:                        DataFrame holding the rows of the scenario file whose ``year``
                                    is one of ``year_start, year_start + timestep, ...`` up to
                                    ``year_through``

    :raises IndexError:             If no entry of ``states_list`` contains ``state_abbr``, or no
                                    ``Constant_rate.csv`` was found for the matching state

    """
    target_state = [s for s in states_list if state_abbr in s]
    if not target_state:
        raise IndexError(f"No entry in states_list matches state abbreviation {state_abbr!r}")

    constant_rate_list = get_contents_file_list(state_dir_list, target_file='Constant_rate.csv')
    matching_files = [i for i in constant_rate_list if target_state[0] in i]
    if not matching_files:
        raise IndexError(f"No Constant_rate.csv file found for state {target_state[0]!r}")

    # Lines 1-3 of the file (the three lines right after the header) are skipped
    constant_rate_df = pd.read_csv(matching_files[0], skiprows=[1, 2, 3])

    # Stop at year_through + 1 so the end year is included when it falls on a
    # timestep, without overshooting it when it does not
    projection_years = list(range(year_start, year_through + 1, timestep))

    return constant_rate_df.loc[constant_rate_df["year"].isin(projection_years)]

In [80]:
def single_state_base_pop(state_abbr):
    """Load the ``basePop.csv`` file of the state matching ``state_abbr``.

    Relies on the notebook-level ``states_list`` and ``state_dir_list``.

    :param state_abbr:              State abbreviation; matched as a substring of the entries in
                                    ``states_list`` (the first match is used)

    :type state_abbr:               str

    :return:                        DataFrame read from the state's ``basePop.csv``

    """
    # Entries of states_list whose name contains the abbreviation
    matching_states = list(filter(lambda name: state_abbr in name, states_list))

    # Every basePop.csv found across the state input directories
    candidate_files = get_contents_file_list(state_dir_list, target_file='basePop.csv')

    # Keep the candidates whose path contains the first matching state name
    selected_files = []
    for path in candidate_files:
        if matching_states[0] in path:
            selected_files.append(path)

    return pd.read_csv(selected_files[0])

In [16]:
def single_state_int_migration(state_abbr):
    """Load the ``intMig.csv`` file of the state matching ``state_abbr``.

    Relies on the notebook-level ``states_list`` and ``state_dir_list``.

    :param state_abbr:              State abbreviation; matched as a substring of the entries in
                                    ``states_list`` (the first match is used)

    :type state_abbr:               str

    :return:                        DataFrame read from the state's ``intMig.csv``

    """
    # Entries of states_list whose name contains the abbreviation
    matching_states = list(filter(lambda name: state_abbr in name, states_list))

    # Every intMig.csv found across the state input directories
    candidate_files = get_contents_file_list(state_dir_list, target_file='intMig.csv')

    # Keep the candidates whose path contains the first matching state name
    selected_files = []
    for path in candidate_files:
        if matching_states[0] in path:
            selected_files.append(path)

    return pd.read_csv(selected_files[0])

In [76]:
# Scenario data: rows of the "WA" Constant_rate.csv file for the default projection years
cr_df = single_state_constant_rate("WA")
cr_df

Unnamed: 0,year,f_A,f_R,f_U,m_AM,m_AF,m_RM,m_RF,m_UM,m_UF,pu,sr_A,sr_R,sr_U,nim_M,nim_F
0,2010,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
1,2015,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
2,2020,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
3,2025,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
4,2030,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
5,2035,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
6,2040,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
7,2045,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
8,2050,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093
9,2055,1.88,1.88,1.88,77.6,81.6,77.6,81.6,77.6,81.6,0.841,105,105,105,15656.90687,14575.13093


In [81]:
# Base population data: contents of the "WA" basePop.csv file
bp_df = single_state_base_pop("WA")
bp_df

Unnamed: 0,region,region.code,age,rural_female,rural_male,urban_female,urban_male,all_female,all_male
0,WA,840-53,0,,,41910.000000,42790.000000,,
1,WA,840-53,1,,,44000.000000,44500.000000,,
2,WA,840-53,2,,,43540.000000,46510.000000,,
3,WA,840-53,3,,,43990.000000,46470.000000,,
4,WA,840-53,4,,,42360.000000,44140.000000,,
...,...,...,...,...,...,...,...,...,...
96,WA,840-53,96,,,1568.257468,522.311937,,
97,WA,840-53,97,,,1129.572812,350.373981,,
98,WA,840-53,98,,,783.325206,225.345442,,
99,WA,840-53,99,,,544.186865,144.831048,,


In [99]:
# International migration data: contents of the "WA" intMig.csv file
# Note: The international migration rates input data needs to be in relative rates (summing up to 1)
im_df = single_state_int_migration("WA")
im_df

Unnamed: 0,region,region_code,age,net_male,net_female,net_all
0,WA,840-53,0,0.013864,0.014405,
1,WA,840-53,1,0.013864,0.014405,
2,WA,840-53,2,0.013864,0.014405,
3,WA,840-53,3,0.013864,0.014405,
4,WA,840-53,4,0.013864,0.014405,
...,...,...,...,...,...,...
96,WA,840-53,96,0.000000,0.000000,
97,WA,840-53,97,0.000000,0.000000,
98,WA,840-53,98,0.000000,0.000000,
99,WA,840-53,99,0.000000,0.000000,


In [19]:
# Annual total net international migration counts for 5-year periods,
# rounded to whole persons (one value per row of cr_df).
# The builtin int replaces np.int, a deprecated alias that newer NumPy releases no longer provide.
female_net_international_migration = cr_df["nim_F"].round(0).astype(int).values
male_net_international_migration = cr_df["nim_M"].round(0).astype(int).values

In [136]:
# Interleave the urban and rural female base populations row by row:
# females -> urban0, rural0, urban1, rural1,...
# Both columns share the same row labels, so concatenating them and sorting on
# that label with a stable sort (mergesort) keeps urban ahead of rural for each row.
females_base_pop = pd.concat([bp_df['urban_female'], bp_df['rural_female']]).sort_index(kind='mergesort')
# Replace the duplicated row labels with a fresh 0..n-1 index
females_base_pop = females_base_pop.reset_index(drop=True)
females_base_pop

0      41910.000000
1               NaN
2      44000.000000
3               NaN
4      43540.000000
           ...     
197             NaN
198      544.186865
199             NaN
200      940.399316
201             NaN
Length: 202, dtype: float64

In [134]:
# males -> urban0, rural0, urban1, rural1,...
# Concatenate the urban and rural columns, then stable-sort (mergesort) on the shared
# row label so urban stays ahead of rural for each row.
males_base_pop = pd.concat([bp_df['urban_male'], bp_df['rural_male']]).sort_index(kind='mergesort')
# Replace the duplicated row labels with a fresh 0..n-1 index
males_base_pop = males_base_pop.reset_index(drop=True)
males_base_pop

0      42790.000000
1               NaN
2      44500.000000
3               NaN
4      46510.000000
           ...     
197             NaN
198      144.831048
199             NaN
200      221.788105
201             NaN
Length: 202, dtype: float64

In [177]:
# Combine the female and male pieces into one column: all female rows first, then all male rows
base_pop_df = pd.DataFrame(pd.concat([females_base_pop,
                                      males_base_pop]).reset_index(drop=True))
# Missing values count as zero population
base_pop_df = base_pop_df.fillna(0)
# Round to whole persons; the builtin int replaces np.int, a deprecated alias
# that newer NumPy releases no longer provide
base_pop_df = base_pop_df.round(0).astype(int)
base_pop_df.columns = ['53-WA']
base_pop_df

Unnamed: 0,53-WA
0,41910
1,0
2,44000
3,0
4,43540
...,...
399,0
400,145
401,0
402,222


In [171]:
# Spread migrant numbers according to profile: multiply the relative rates in im_df by the
# net international migration count of the first period (index 0).
# int_mig_rates is a switch: 1 applies international migration, anything else zeroes it out.
int_mig_rates = 1

# Note: this adds the columns nmUF / nmRF / nmUM / nmRM to im_df in place.
if int_mig_rates == 1:
    im_df["nmUF"] = im_df['net_female'] * female_net_international_migration[0]
    # Rural values are a copy of the urban values
    im_df["nmRF"] = im_df['nmUF']
    im_df["nmUM"] = im_df['net_male'] * male_net_international_migration[0]
    im_df['nmRM'] = im_df['nmUM']
else:
    # Multiplying by 0 keeps the column shape while cancelling the migration
    im_df["nmUF"] = im_df['net_female'] * 0
    im_df["nmRF"] = im_df['nmUF']
    im_df["nmUM"] = im_df['net_male'] * 0
    im_df['nmRM'] = im_df['nmUM']

im_df

Unnamed: 0,region,region_code,age,net_male,net_female,net_all,nmUF,nmRF,nmUM,nmRM
0,WA,840-53,0,0.013864,0.014405,,52.17387,52.17387,53.931477,53.931477
1,WA,840-53,1,0.013864,0.014405,,52.17387,52.17387,53.931477,53.931477
2,WA,840-53,2,0.013864,0.014405,,52.17387,52.17387,53.931477,53.931477
3,WA,840-53,3,0.013864,0.014405,,52.17387,52.17387,53.931477,53.931477
4,WA,840-53,4,0.013864,0.014405,,52.17387,52.17387,53.931477,53.931477
...,...,...,...,...,...,...,...,...,...,...
96,WA,840-53,96,0.000000,0.000000,,0.00000,0.00000,0.000000,0.000000
97,WA,840-53,97,0.000000,0.000000,,0.00000,0.00000,0.000000,0.000000
98,WA,840-53,98,0.000000,0.000000,,0.00000,0.00000,0.000000,0.000000
99,WA,840-53,99,0.000000,0.000000,,0.00000,0.00000,0.000000,0.000000


In [157]:
# Interleave the urban and rural migrant counts row by row (urban0, rural0, urban1, rural1, ...)
# so they line up with the interleaved base population series.
# mergesort is a stable sort, so urban stays ahead of rural for each row label.
urban_rural_females = pd.concat([im_df['nmUF'], im_df['nmRF']]).sort_index(kind='mergesort')
urban_rural_females = urban_rural_females.reset_index(drop=True)

urban_rural_males = pd.concat([im_df['nmUM'], im_df['nmRM']]).sort_index(kind='mergesort')
urban_rural_males = urban_rural_males.reset_index(drop=True)

urban_rural_females

0      52.17387
1      52.17387
2      52.17387
3      52.17387
4      52.17387
         ...   
197     0.00000
198     0.00000
199     0.00000
200     0.00000
201     0.00000
Length: 202, dtype: float64

In [179]:
# Lay the single-state base population out on an all-states frame: rows 0..403 and one column
# per entry of states_list. Only the column already present in base_pop_df carries values;
# every other column is NaN.
initial_all_base_population = pd.DataFrame(data = base_pop_df, index = list(range(404)), columns=states_list)
initial_all_base_population

Unnamed: 0,1-AL,10-DE,11-DC,12-FL,13-GA,15-HI,16-ID,17-IL,18-IN,19-IA,...,5-AR,50-VT,51-VA,53-WA,54-WV,55-WI,56-WY,6-CA,8-CO,9-CT
0,,,,,,,,,,,...,,,,41910,,,,,,
1,,,,,,,,,,,...,,,,0,,,,,,
2,,,,,,,,,,,...,,,,44000,,,,,,
3,,,,,,,,,,,...,,,,0,,,,,,
4,,,,,,,,,,,...,,,,43540,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,,,,,,,,,,,...,,,,0,,,,,,
400,,,,,,,,,,,...,,,,145,,,,,,
401,,,,,,,,,,,...,,,,0,,,,,,
402,,,,,,,,,,,...,,,,222,,,,,,


In [None]:
def _interleave_urban_rural(urban, rural):
    """Return urban0, rural0, urban1, rural1, ... as one Series with a fresh 0..n-1 index."""
    # mergesort is stable, so for each shared row label the urban value
    # (concatenated first) stays ahead of the rural one
    return pd.concat([urban, rural]).sort_index(kind='mergesort').reset_index(drop=True)


def single_state_base_pop_df(state_abbr, int_mig_rates=1):
    """Build one state's base-year population, updated with net international migration.

    Relies on the notebook-level ``states_list`` and on the ``single_state_*`` loaders.

    Rows are ordered as all female values (urban/rural interleaved row by row)
    followed by all male values in the same layout.

    :param state_abbr:              State abbreviation; matched as a substring of the entries in
                                    ``states_list``. Exactly one entry must match, because the
                                    matches become the column labels of a one-column frame.

    :type state_abbr:               str

    :param int_mig_rates:           1 applies international migration; any other value adds
                                    zero migrants

    :type int_mig_rates:            int

    :return:                        DataFrame with rows 0..403 and one column per entry of
                                    ``states_list``. The requested state's column holds its base
                                    population plus international migrants; all other columns are 0.

    NOTE(review): the original function built this frame but returned nothing; returning it is
    assumed to be the intent -- confirm against the callers.

    """
    # Scenario data (the Constant_rate scenario)
    cr_df = single_state_constant_rate(state_abbr)

    # Base population data
    bp_df = single_state_base_pop(state_abbr)

    # International migration data
    # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
    im_df = single_state_int_migration(state_abbr)

    # Annual total net international migration counts for 5-year periods, rounded to whole persons.
    # The builtin int replaces np.int, a deprecated alias that newer NumPy releases no longer provide.
    female_net_international_migration = cr_df["nim_F"].round(0).astype(int).values
    male_net_international_migration = cr_df["nim_M"].round(0).astype(int).values

    # Base population: urban0, rural0, urban1, rural1, ... for each sex
    females_base_pop = _interleave_urban_rural(bp_df['urban_female'], bp_df['rural_female'])
    males_base_pop = _interleave_urban_rural(bp_df['urban_male'], bp_df['rural_male'])

    # Combine the female and male pieces; missing values count as zero population
    base_pop_df = pd.DataFrame(pd.concat([females_base_pop,
                                          males_base_pop]).reset_index(drop=True))
    base_pop_df = base_pop_df.fillna(0).round(0).astype(int)
    base_pop_df.columns = [s for s in states_list if state_abbr in s]

    # Spread migrant numbers according to profile, using the first period's totals
    if int_mig_rates == 1:
        female_total = female_net_international_migration[0]
        male_total = male_net_international_migration[0]
    else:
        female_total = 0
        male_total = 0

    # The same counts are used for the urban and the rural rows
    female_migrants = im_df['net_female'] * female_total
    male_migrants = im_df['net_male'] * male_total

    urban_rural_females = _interleave_urban_rural(female_migrants, female_migrants)
    urban_rural_males = _interleave_urban_rural(male_migrants, male_migrants)

    # A missing rate counts as zero migrants, so it cannot blank out the population it is added to
    migrants = pd.concat([urban_rural_females,
                          urban_rural_males]).reset_index(drop=True).fillna(0)

    # axis=0 matches the Series to the frame's rows; a plain ``+`` would match the
    # Series index against the column labels and produce an all-NaN result
    base_pop_df = base_pop_df.add(migrants, axis=0)

    # Lay the state out on the all-states frame: 404 rows and one column per entry of states_list
    initial_all_base_population = pd.DataFrame(data=base_pop_df, index=list(range(404)), columns=states_list)

    # fillna returns a new frame, so its result must be the value handed back
    return initial_all_base_population.fillna(0)
    

In [None]:
def multi_state_base_pop_df(state_dirs, year_start=2010, year_through=2100, timestep=5):
    #* Generate paths
    state_dir_list = get_state_dirs(state_inputs) # Input data directory

    #* Scenario data (The Constant_rate scenario.)
    constant_rates = get_contents_file_list(state_dir_list, target_file='Constant_rate.csv', skiprows=[1,2,3])
    constant_rate_dfs = make_dataframes(constant_rates)

    #* Base Population data
    base_population = get_contents_file_list(state_dir_list, target_file='basePop.csv')
    base_population_dfs = make_dataframes(base_population)

    #* International migration data
    # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
    international_migration_rates = get_contents_file_list(state_dir_list, target_file='intMig.csv')
    internation_migration_rate_dfs = make_dataframes(international_migration_rates)
    
    for i in dfs:
        