In [1]:
%matplotlib inline

import os
import pkg_resources

import numpy as np
import pandas as pd
import matplotlib

import statepop.multistate
from statepop.utils import alphabetize_states, single_state_files
from statepop.multistate.domestic_migration import in_domestic_migration, out_domestic_migration

## 0. Set up development environment

In [2]:
# directory housing the data
root_dir = r'C:\Users\deci859\PycharmProjects\IM3\statepop\statepop\data'

# directory containing data from R run to compare against
comp_data_dir = os.path.join(root_dir, 'comp_data')

# Path to results folder
results_path = pkg_resources.resource_filename('statepop', 'data/inputs/No_Mig')

# Path to state-level inputs folders
state_inputs = pkg_resources.resource_filename('statepop', 'data/State_Inputs')

# List of states, alphebetized
states_list = os.listdir(state_inputs)
states_list = sorted(states_list, key = lambda x: int(x.split('-')[0]))

# Specify scenario
scenarios = ("Constant_rate", "SSP2", "SSP3", "SSP5")

In [3]:
""""""""""""""""""""""""""""""""""
# Not sure how to directly incorporate this, come back to it later

# Specify the domestic migration factor
# If scenario is not "Constant_rate" (for fertility, mortality and international migration), this factor will become dynamic later
scen.factor <- 0 # 1 for regular, 0 for no domestic migration, 0.5 for half scenario and 2 for double scenario

if scenario == 'Constant_rate':
    scenario_factor = 0
elif scenario == 

# Sepecify if international migration is applied
int.mig <- 0 # 1 applied 0 not applied

#* Should details for projection model adjustment be printed?
vis <- F # TRUE (print details); FALSE (don't print details)

#* Should the Brass Relational Model be used or a simple scaling approach to compute future fertility schedules
useBrassf <- T # TRUE (use Brass); FALSE (use scaling)

"""""""""""""""""""""""""""""""""

'"\n# Not sure how to directly incorporate this, come back to it later\n\n# Specify the domestic migration factor\n# If scenario is not "Constant_rate" (for fertility, mortality and international migration), this factor will become dynamic later\nscen.factor <- 0 # 1 for regular, 0 for no domestic migration, 0.5 for half scenario and 2 for double scenario\n\nif scenario == \'Constant_rate\':\n    scenario_factor = 0\nelif scenario == \n\n# Sepecify if international migration is applied\nint.mig <- 0 # 1 applied 0 not applied\n\n#* Should details for projection model adjustment be printed?\nvis <- F # TRUE (print details); FALSE (don\'t print details)\n\n#* Should the Brass Relational Model be used or a simple scaling approach to compute future fertility schedules\nuseBrassf <- T # TRUE (use Brass); FALSE (use scaling)\n\n'

## 1. Generate baseline population matrix (from R code)

In [121]:
def out_domestic_migration2(updated_all_base_pop_df, scen_factor=0):
  """Calculate the total out migration numbers for all states.

  :param updated_all_base_pop_df:         A data frame containing population numbers per state
  :type updated_all_base_pop_df:          data frame

  :param scen_factor:                     (Optional) Scale applied to migration rates based on the scenario.
                                          Default scenario = 0, no domestic migration.
  :type scen_factor:                      (Optional) int

  :return:                                A data frame with total out migration population

  """

  print("Calculating domestic migration rates out of states.")
  tot_out_mig_pop = pd.DataFrame()

  for states in alphabetize_states():

    # get target state id_abbrev
    target_state = states.split('-')[1]

    # read in the contributing states in migration rates
    current_out_mig = single_state_files(target_state, '_out_mig.csv')

    # get population data for every state except the target state
    to_states = [i for i in updated_all_base_pop_df.columns if states != i]
    current_to_pop = updated_all_base_pop_df[to_states].copy()

    # set data frame to hold contributing states population
    total_out_mig_pop = pd.DataFrame()

    for state in to_states:
      state_abbrev = state.split('-')[1]

      # create a copy of the out migration data frame for the to state
      state_out_mig_urban = current_out_mig.loc[current_out_mig['to'] == state_abbrev].copy()

      # create copy for rural and zero out rate
      state_out_mig_rural = state_out_mig_urban.copy()
      state_out_mig_rural['rate'] *= 0

      # add sorting value for rural
      state_out_mig_rural['setting'] = 1

      # add sorting value for urban
      state_out_mig_urban['setting'] = 0

      # combine urban and rural data frames
      state_out_mig = pd.concat([state_out_mig_urban, state_out_mig_rural])

      # sort data frame by gender and age
      state_out_mig.sort_values(by=['gender', 'age', 'setting'], inplace=True)

      # add to all states data frame
      total_out_mig_pop[state] = state_out_mig['rate'].values

    # multiply to get the out migration numbers from each contributing state
    total_out_net_pop = (total_out_mig_pop * scen_factor) * current_to_pop

    # sum the total out migration numbers across all contributing states with one value per age group
    tot_out_mig_pop[states] = total_out_net_pop.sum(axis=1)

  return tot_out_mig_pop

In [107]:
def multi_state_base_pop(int_mig_rates=0, domestic_migration_factor='Constant_rate', scen_factor=0):
    
    current_int_mig = []
    initial_all_base_pop = []
    upd_all_base_pop = []
    
    print("Importing data for baseline population matrix.")
    for states in states_list:
        
        state_abbrev = states.split('-')[1]
        
        # Scenario data (The Constant_rate scenario. )
        cr_df = single_state_files(state_abbrev, 'Constant_rate.csv')

        # Base Population data
        bp_df = single_state_files(state_abbrev, 'basePop.csv')
        
        # remove any empty rows
        bp_df = bp_df.loc[bp_df['region'] == state_abbrev]

        #* International migration data
        # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
        im_df = single_state_files(state_abbrev, 'intMig.csv')

        # Annual total net international migration counts for 5-year periods
        female_net_international_migration = cr_df["nim_F"].round(0).astype(np.int).values
        male_net_international_migration = cr_df["nim_M"].round(0).astype(np.int).values

        # Update the base year population with the international migration
        # females -> urban0, rural0, urban1, rural1,...
        females_base_pop = pd.concat([bp_df['urban_female'], bp_df['rural_female']]).sort_index(kind='mergesort')
        females_base_pop = females_base_pop.reset_index(drop=True) #fBP

        # males
        males_base_pop = pd.concat([bp_df['urban_male'], bp_df['rural_male']]).sort_index(kind='mergesort')
        males_base_pop = males_base_pop.reset_index(drop=True) #mBP

        # combines female and male pieces
        base_pop_df = pd.DataFrame(pd.concat([females_base_pop, males_base_pop]).reset_index(drop=True))                  
        base_pop_df.columns = [s for s in states_list if states in s] #matBP
        

        # Spread migrant numbers according to profile
        if int_mig_rates == 1:
            im_df["nmUF"] = im_df['net_female'] * female_net_international_migration[0]
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * male_net_international_migration[0]
            im_df['nmRM'] = im_df['nmUM']
        else:
            im_df["nmUF"] = im_df['net_female'] * 0
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * 0
            im_df['nmRM'] = im_df['nmUM']

        # Update the base year population with the international migration rates
        urban_rural_females = pd.concat([im_df['nmUF'], im_df['nmRF']]).sort_index(kind='mergesort')
        urban_rural_females = urban_rural_females.reset_index(drop=True)

        urban_rural_males = pd.concat([im_df['nmUM'], im_df['nmRM']]).sort_index(kind='mergesort')
        urban_rural_males = urban_rural_males.reset_index(drop=True)

        urban_rural_all = pd.DataFrame(pd.concat([urban_rural_females, urban_rural_males]).reset_index(drop=True)) 
        urban_rural_all.columns = [s for s in states_list if states in s]

        # Populate the data frames that store base-year population and its updated population by international migration
        initial_all_base_pop.append(base_pop_df)

        updated_all_base_pop = base_pop_df + urban_rural_all
        updated_all_base_pop.fillna(0, inplace=True) #matBP_upd
        upd_all_base_pop.append(updated_all_base_pop)

        # Keep the international migration for the current state
        current_int_migration = updated_all_base_pop - base_pop_df
        current_int_migration.fillna(0, inplace=True)        
        current_int_mig.append(current_int_migration)
    
    print("Populating baseline population matrix.")
    # base population dataframe for all states
    updated_all_base_pop_df = pd.concat(upd_all_base_pop, axis=1)

    print("Calculating the net domestic migration numbers.")
    # Calculate the total in/out and net migration numbers for all states                     
    #in_migration = in_domestic_migration(updated_all_base_pop_df, scen_factor)
    #out_migration = out_domestic_migration2(updated_all_base_pop_df, scen_factor)
    #net_migration =

    # in migration rates dataframe for all states
    total_int_migration = pd.concat(current_int_mig, axis=1)
    
    # initial base population dataframe for all states
    init_all_base_pop_df = pd.concat(initial_all_base_pop, axis=1)
    init_all_base_pop_df.fillna(0, inplace=True)
    
    
    return total_int_migration, init_all_base_pop_df, updated_all_base_pop_df

## 1.1 Generate outputs

In [117]:
int_mig_rates = 1
domestic_migration_factor = 'Constant_rate'
scen_factor = 1

bpop = multi_state_base_pop(int_mig_rates, domestic_migration_factor, scen_factor)

# unpack outputs
total_int_migration, init_all_base_pop_df, updated_all_base_pop_df = bpop

Importing data for baseline population matrix.
Populating baseline population matrix.
Calculating the net domestic migration numbers.


In [118]:
total_int_migration

Unnamed: 0,1-AL,2-AK,4-AZ,5-AR,6-CA,8-CO,9-CT,10-DE,11-DC,12-FL,...,46-SD,47-TN,48-TX,49-UT,50-VT,51-VA,53-WA,54-WV,55-WI,56-WY
0,89.899814,31.330251,175.92476,56.077548,729.440262,157.270657,52.17387,28.795021,42.234619,516.898721,...,11.077224,130.405867,577.124826,73.147133,8.5564,282.649278,209.948692,14.059,90.043861,11.149248
1,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000,0.000000,0.000000
2,89.899814,31.330251,175.92476,56.077548,729.440262,157.270657,52.17387,28.795021,42.234619,516.898721,...,11.077224,130.405867,577.124826,73.147133,8.5564,282.649278,209.948692,14.059,90.043861,11.149248
3,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000,0.000000,0.000000
4,89.899814,31.330251,175.92476,56.077548,729.440262,157.270657,52.17387,28.795021,42.234619,516.898721,...,11.077224,130.405867,577.124826,73.147133,8.5564,282.649278,209.948692,14.059,90.043861,11.149248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000,0.000000,0.000000
400,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000,0.000000,0.000000
401,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000,0.000000,0.000000
402,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000,0.000000,0.000000


In [122]:
out_mig = out_domestic_migration2(updated_all_base_pop_df, scen_factor=1)
out_mig

Calculating domestic migration rates out of states.


Unnamed: 0,1-AL,2-AK,4-AZ,5-AR,6-CA,8-CO,9-CT,10-DE,11-DC,12-FL,...,46-SD,47-TN,48-TX,49-UT,50-VT,51-VA,53-WA,54-WV,55-WI,56-WY
0,876.293107,6337.978526,1881.714709,1428.750554,580.070190,1484.659045,1138.884094,1393.858685,2759.234997,1200.106128,...,1109.877534,1198.597992,581.109262,1033.946277,1282.447166,1580.850011,1309.650041,1312.953987,839.154134,1231.518475
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,908.120520,6566.180544,1973.350574,1476.824518,598.867799,1546.831418,1177.002557,1425.853763,2799.144230,1239.966750,...,1150.568304,1237.259647,603.697215,1080.198855,1306.648451,1631.687774,1372.589178,1356.784789,864.233841,1275.010406
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,943.877945,6830.885474,2052.151290,1533.211308,621.763165,1607.055096,1223.200707,1490.824193,2951.076417,1290.340265,...,1194.803450,1289.403644,628.775045,1125.854486,1357.108482,1702.549837,1431.043050,1411.545366,903.172504,1325.182904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
400,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
401,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
402,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [101]:
r_out_migration = pd.read_csv(os.path.join(comp_data_dir, "out.migration.csv"), dtype=np.float64)
r_out_migration = r_out_migration[out_mig.columns]
r_out_migration

Unnamed: 0,1-AL,2-AK,4-AZ,5-AR,6-CA,8-CO,9-CT,10-DE,11-DC,12-FL,...,46-SD,47-TN,48-TX,49-UT,50-VT,51-VA,53-WA,54-WV,55-WI,56-WY
0,431.048883,417.513521,828.693942,339.047068,2274.763167,684.840148,268.372991,115.509220,193.991529,1741.929890,...,131.616010,730.441120,1805.016933,404.568035,66.494330,993.981111,760.188181,201.630441,391.760828,130.027373
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,417.766065,450.583899,862.632263,358.372939,2425.179654,687.416349,277.652628,117.852206,190.187774,1780.274802,...,157.130822,764.348300,1861.608386,423.703009,74.603394,1006.687072,798.097828,189.769827,414.244698,139.898561
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,444.760180,409.245926,921.525232,376.573031,2526.618702,739.369740,306.669906,137.767589,174.972752,1867.673565,...,147.278568,776.800089,1934.726546,424.185404,73.908332,1051.886967,789.754078,200.424277,415.068714,139.217790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
400,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
401,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
402,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [43]:
int_mig_rates=0
domestic_migration_factor='Constant_rate'

bpop = multi_state_base_pop(int_mig_rates, domestic_migration_factor)

# unpack outputs
total_int_migration, init_all_base_pop_df, updated_all_base_pop_df = bpop

Importing data for baseline population matrix.
Populating baseline population matrix.
Calculating the net domestic migration numbers.
Calculating domestic migration rates out of states.


ValueError: too many values to unpack (expected 3)

## 1.1.1 Test against R outputs

In [115]:
def compare_dfs(df_r, df_python, description):
    """
    Compares values between R and Python dataframes
    """
    
    df_r = df_r.round(0)
    df_python = df_python.round(0)
    print('Testing: {}'.format(description))
    pd.testing.assert_frame_equal(df_r, df_python)

In [116]:
# R population outputs as float64 data frames
r_ini_all_base_pop = pd.read_csv(os.path.join(comp_data_dir, "ini.all.base.pop.csv"), dtype=np.float64)
r_upd_all_base_pop = pd.read_csv(os.path.join(comp_data_dir, "upd.all.base.pop.csv"), dtype=np.float64)
r_tot_int_mig = pd.read_csv(os.path.join(comp_data_dir, "tot.int.mig.csv"), dtype=np.float64)

# reorder columns to match new format
r_ini_all_base_pop = r_ini_all_base_pop[init_all_base_pop_df.columns]
r_upd_all_base_pop = r_upd_all_base_pop[updated_all_base_pop_df.columns]
r_tot_int_mig = r_tot_int_mig[total_int_migration.columns]

# compare equality with the Python version; will return nothing if successful
compare_dfs(r_ini_all_base_pop, init_all_base_pop_df, '`init_all_base_pop_df`')
compare_dfs(r_upd_all_base_pop, updated_all_base_pop_df, '`updated_all_base_pop_df`')
compare_dfs(r_tot_int_mig, total_int_migration, '`total_int_migration`')

Testing: `init_all_base_pop_df`
Testing: `updated_all_base_pop_df`
Testing: `total_int_migration`


AssertionError: DataFrame are different

DataFrame shape mismatch
[left]:  (36764, 51)
[right]: (404, 51)

In [103]:
init_all_base_pop_df.head()

NameError: name 'init_all_base_pop_df' is not defined

## 1.2.1 Set up and run in_migration code

In [32]:
# set the scenario factor to 1 so we will have non-zero output for testing
scen_factor = 1

# get a list of state directories
state_dirs = [i for i in os.listdir(state_inputs) if '.DS_Store' not in i]

# sort state directories by their leading id
state_dirs = sorted(state_dirs, key = lambda x: int(x.split('-')[0]))

# create a list of in migration files from each state
in_migration_files = [os.path.join(state_inputs, i, f"{i}_in_mig.csv") for i in state_dirs]

# create the in migration population data frame for all states
in_migration_df = in_domestic_migration(in_migration_files, updated_all_base_pop_df, scen_factor)

In [33]:
in_migration_df

Unnamed: 0,1-AL,2-AK,4-AZ,5-AR,6-CA,8-CO,9-CT,10-DE,11-DC,12-FL,...,46-SD,47-TN,48-TX,49-UT,50-VT,51-VA,53-WA,54-WV,55-WI,56-WY
0,491.061984,190.126802,868.925702,358.058339,1869.815634,766.794796,299.500064,157.022591,73.294516,1758.452766,...,89.125192,697.525495,2454.624518,482.346340,63.292001,1170.708483,917.064522,177.112712,387.694901,135.498583
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,508.057262,197.308111,908.993771,370.201540,1938.948596,795.443960,308.578216,161.267896,73.716656,1810.137835,...,93.276723,714.062019,2538.187138,506.593298,64.749029,1197.497111,959.005538,181.375396,399.317746,142.453514
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,527.024983,203.706732,937.160190,383.042573,2000.638602,822.180684,318.570628,168.274183,78.278187,1875.929812,...,95.415738,742.972946,2628.891745,522.483854,67.032171,1247.469451,996.816251,187.787375,416.268286,145.456149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
400,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.952246,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
401,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
402,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.458228,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## 1.2.2 Test in_migration code against R version output

In [15]:
def single_state_base_pop_df(state_abbr, int_mig_rates=0, domestic_migration_factor='Constant_rate'):
        
    #* Scenario data (The Constant_rate scenario. )
    cr_df = single_state_files(state_abbr, 'Constant_rate.csv')
    
    #* Base Population data
    bp_df = single_state_files(state_abbr, 'basePop.csv')
    
    #* International migration data
    # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
    im_df = single_state_files(state_abbr, 'intMig.csv')
    
    # Annual total net international migration counts for 5-year periods
    female_net_international_migration = cr_df["nim_F"].round(0).astype(np.int).values
    male_net_international_migration = cr_df["nim_M"].round(0).astype(np.int).values
    
    # Update the base year population with the international migration
    # females -> urban0, rural0, urban1, rural1,...
    females_base_pop = pd.concat([bp_df['urban_female'], bp_df['rural_female']]).sort_index(kind='mergesort')
    females_base_pop = females_base_pop.reset_index(drop=True) #fBP
    
    # males
    males_base_pop = pd.concat([bp_df['urban_male'], bp_df['rural_male']]).sort_index(kind='mergesort')
    males_base_pop = males_base_pop.reset_index(drop=True) #mBP
    
    # combines female and male pieces
    base_pop_df = pd.DataFrame(pd.concat([females_base_pop, 
                                           males_base_pop]).reset_index(drop=True))
    # for some reason, Alabama is 4 rows longer than any other state (contains no data)
    if state_abbr == 'AL':
        base_pop_df.drop(base_pop_df.tail(4).index,inplace=True)
    base_pop_df.columns = [s for s in states_list if state_abbr in s] #matBP
    
    # Spread migrant numbers according to profile
    if int_mig_rates == 1:
        im_df["nmUF"] = im_df['net_female'] * female_net_international_migration[0]
        im_df["nmRF"] = im_df['nmUF']
        im_df["nmUM"] = im_df['net_male'] * male_net_international_migration[0]
        im_df['nmRM'] = im_df['nmUM']
    else:
        im_df["nmUF"] = im_df['net_female'] * 0
        im_df["nmRF"] = im_df['nmUF']
        im_df["nmUM"] = im_df['net_male'] * 0
        im_df['nmRM'] = im_df['nmUM']
    
    # Update the base year population with the international migration rates
    urban_rural_females = pd.concat([im_df['nmUF'], im_df['nmRF']]).sort_index(kind='mergesort')
    urban_rural_females = urban_rural_females.reset_index(drop=True)
    
    urban_rural_males = pd.concat([im_df['nmUM'], im_df['nmRM']]).sort_index(kind='mergesort')
    urban_rural_males = urban_rural_males.reset_index(drop=True)
    
    urban_rural_all = pd.DataFrame(pd.concat([urban_rural_females, 
                                           urban_rural_males]).reset_index(drop=True)) 
    urban_rural_all.columns = [s for s in states_list if state_abbr in s]

    # Populate the data frames that store base-year population and its updated population by international migration
    initial_all_base_population = pd.DataFrame(data = base_pop_df, index = list(range(404)), 
                                               columns=states_list).fillna(0)

    updated_all_base_pop = base_pop_df + urban_rural_all
    updated_all_base_pop.fillna(0, inplace=True) #matBP_upd
    
    # Keep the international migration for the current state
    current_int_migration = updated_all_base_pop - base_pop_df
    current_int_migration.fillna(0, inplace=True)
    total_int_migration = pd.DataFrame(data = current_int_migration, index = list(range(404)), columns=states_list)
    
    # Calculate the total in out and net migration numbers for all states
    in_migration = in_domestic_migration_calc_ss(state_abbr, updated_all_base_pop, domestic_migration_factor=0)
    #out_migration = 
    #net_migration = 
    
    return current_int_migration #total_int_migration

In [49]:
# R in_migration output as float64 data frames
r_in_migration = pd.read_csv(os.path.join(comp_data_dir, "in.migration.csv"), dtype=np.float64)

# reorder columns to match new format
r_in_migration = r_in_migration[in_migration_df.columns]

# compare equality with the Python version; will return nothing if successful
compare_dfs(r_in_migration, in_migration_df, '`in_migration_df`')

Testing: `in_migration_df`


AssertionError: DataFrame.iloc[:, 1] (column name="2-AK") are different

DataFrame.iloc[:, 1] (column name="2-AK") values are different (3.46535 %)
[left]:  [190.0, 0.0, 197.0, 0.0, 204.0, 0.0, 205.0, 0.0, 201.0, 0.0, 140.0, 0.0, 139.0, 0.0, 138.0, 0.0, 138.0, 0.0, 141.0, 0.0, 130.0, 0.0, 129.0, 0.0, 128.0, 0.0, 126.0, 0.0, 127.0, 0.0, 116.0, 0.0, 119.0, 0.0, 119.0, 0.0, 121.0, 0.0, 126.0, 0.0, 403.0, 0.0, 388.0, 0.0, 380.0, 0.0, 378.0, 0.0, 379.0, 0.0, 315.0, 0.0, 311.0, 0.0, 319.0, 0.0, 316.0, 0.0, 318.0, 0.0, 231.0, 0.0, 214.0, 0.0, 216.0, 0.0, 211.0, 0.0, 208.0, 0.0, 144.0, 0.0, 140.0, 0.0, 144.0, 0.0, 150.0, 0.0, 158.0, 0.0, 138.0, 0.0, 129.0, 0.0, 128.0, 0.0, 127.0, 0.0, 129.0, 0.0, 71.0, 0.0, 72.0, 0.0, 72.0, 0.0, 72.0, 0.0, 73.0, 0.0, ...]
[right]: [190.0, 0.0, 197.0, 0.0, 204.0, 0.0, 205.0, 0.0, 201.0, 0.0, 140.0, 0.0, 139.0, 0.0, 138.0, 0.0, 138.0, 0.0, 141.0, 0.0, 130.0, 0.0, 129.0, 0.0, 128.0, 0.0, 126.0, 0.0, 127.0, 0.0, 116.0, 0.0, 119.0, 0.0, 119.0, 0.0, 121.0, 0.0, 126.0, 0.0, 403.0, 0.0, 388.0, 0.0, 380.0, 0.0, 378.0, 0.0, 379.0, 0.0, 315.0, 0.0, 311.0, 0.0, 319.0, 0.0, 316.0, 0.0, 318.0, 0.0, 231.0, 0.0, 214.0, 0.0, 216.0, 0.0, 211.0, 0.0, 208.0, 0.0, 144.0, 0.0, 140.0, 0.0, 144.0, 0.0, 150.0, 0.0, 158.0, 0.0, 138.0, 0.0, 129.0, 0.0, 128.0, 0.0, 127.0, 0.0, 129.0, 0.0, 71.0, 0.0, 72.0, 0.0, 72.0, 0.0, 72.0, 0.0, 73.0, 0.0, ...]

In [16]:
single_state_base_pop_df("AL", int_mig_rates=1)

TypeError: in_domestic_migration_calc_ss() got multiple values for argument 'domestic_migration_factor'

## Come back to this later

In [266]:
def multi_state_base_pop_df(int_mig_rates=0):
    
    current_int_mig = []
    
    for states in states_list:
        
        #* Scenario data (The Constant_rate scenario. )
        cr_df = single_state_files(states.split('-')[1], 'Constant_rate.csv')

        #* Base Population data
        bp_df = single_state_files(states.split('-')[1], 'basePop.csv')

        #* International migration data
        # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
        im_df = single_state_files(states.split('-')[1], 'intMig.csv')

        # Annual total net international migration counts for 5-year periods
        female_net_international_migration = cr_df["nim_F"].round(0).astype(np.int).values
        male_net_international_migration = cr_df["nim_M"].round(0).astype(np.int).values

        # Update the base year population with the international migration
        # females -> urban0, rural0, urban1, rural1,...
        females_base_pop = pd.concat([bp_df['urban_female'], bp_df['rural_female']]).sort_index(kind='mergesort')
        females_base_pop = females_base_pop.reset_index(drop=True)

        # males
        males_base_pop = pd.concat([bp_df['urban_male'], bp_df['rural_male']]).sort_index(kind='mergesort')
        males_base_pop = males_base_pop.reset_index(drop=True)

        # combines female and male pieces
        base_pop_df = pd.DataFrame(pd.concat([females_base_pop, 
                                               males_base_pop]).reset_index(drop=True))
        base_pop_df = base_pop_df.fillna(0)
        base_pop_df = base_pop_df.round(0).astype(np.int)
        base_pop_df.columns = [s for s in states_list if states in s]

        # Spread migrant numbers according to profile
        if int_mig_rates == 1:
            im_df["nmUF"] = im_df['net_female'] * female_net_international_migration[0]
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * male_net_international_migration[0]
            im_df['nmRM'] = im_df['nmUM']
        else:
            im_df["nmUF"] = im_df['net_female'] * 0
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * 0
            im_df['nmRM'] = im_df['nmUM']

        # Update the base year population with the international migration rates
        urban_rural_females = pd.concat([im_df['nmUF'], im_df['nmRF']]).sort_index(kind='mergesort')
        urban_rural_females = urban_rural_females.reset_index(drop=True)

        urban_rural_males = pd.concat([im_df['nmUM'], im_df['nmRM']]).sort_index(kind='mergesort')
        urban_rural_males = urban_rural_males.reset_index(drop=True)

        urban_rural_all = pd.DataFrame(pd.concat([urban_rural_females, 
                                               urban_rural_males]).reset_index(drop=True))
        urban_rural_all.columns = [s for s in states_list if states in s]

        # Populate the data frames that store base-year population and its updated population by international migration
        initial_all_base_population = pd.DataFrame(data = base_pop_df, index = list(range(404)), 
                                                   columns=states_list).fillna(0)
        update_all_base_pop = base_pop_df.add(urban_rural_all, fill_value=0)    

        # Keep the international migration for the current state
        current_int_migration = update_all_base_pop.subtract(base_pop_df, fill_value=0)
        current_int_mig.append(current_int_migration)
        
        # Calculate the total in out and net migration numbers for all states
        #in_migration = 
        #out_migration = 
        #net_migration = 

    total_int_migration = pd.concat(current_int_mig, axis=1)
    # drop final 4 rows, for some reason AL is 4 rows longer than the rest
    total_int_migration.drop(total_int_migration.tail(4).index,inplace=True)
    
    
    
    
    return update_all_base_pop

In [267]:
multi_state_base_pop_df(int_mig_rates=1)

Unnamed: 0,56-WY
0,3831.149248
1,11.149248
2,4121.149248
3,11.149248
4,4101.149248
...,...
399,0.000000
400,10.000000
401,0.000000
402,15.000000


## Mortality

In [None]:
def single_state_mortality(state_abbr, scenario='Constant_rate'):
    #* Scenario data (The Constant_rate scenario. )
    cr_df = single_state_files(state_abbr, 'Constant_rate.csv')
    
    #* Mortality data
    # Note: The variable names need to be the same in all three mortality data files
    mortality_df = single_state_files(state_abbr, 'mortality.csv')
    
    # UN e0=30 (life expectancy at age 0 is 30 years)
    mortality_30 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
    mort_30 = pd.read_csv(mortality_30)
    mort_30.columns = mort_30.columns.map(lambda x : x+'_30' if x !='age' else x)
    
    # UN e0=100 (life expectancy at age 0 is 100 years)
    mortality_100 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
    mort_100 = pd.read_csv(mortality_100)
    mort_100.columns = mort_100.columns.map(lambda x : x+'_100' if x !='age' else x)
    
    # Merge UN e0=100 and UN e0=30 values into mortality_df
    mort_joined_df = mortality_df.merge(mort_30,on='age').merge(mort_100,on='age')
    
    # Life expectancies at age zero (e0) for projection period according to the scenario
    # Note: Start with e0 of baseline year and provide expected e0 values for 5-year intervals; 
    #single year e0 values will be linearly interpolated 

    urban_females_e0 = cr_df["m_UF"].round(0).astype(np.int).values
    rural_females_e0 = cr_df["m_RF"].round(0).astype(np.int).values
    urban_males_e0 = cr_df["m_UM"].round(0).astype(np.int).values
    rural_males_e0 = cr_df["m_RM"].round(0).astype(np.int).values
    
    

In [453]:
state_abbr = 'CT'

#* Scenario data (The Constant_rate scenario. )
cr_df = single_state_files(state_abbr, 'Constant_rate.csv')
cr_df

Unnamed: 0,year,f_A,f_R,f_U,m_AM,m_AF,m_RM,m_RF,m_UM,m_UF,pu,sr_A,sr_R,sr_U,nim_M,nim_F
0,2010,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
1,2015,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
2,2020,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
3,2025,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
4,2030,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
5,2035,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
6,2040,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
7,2045,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
8,2050,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
9,2055,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555


In [454]:
#* Mortality data
# Note: The variable names need to be the same in all three mortality data files
mortality_df = single_state_files(state_abbr, 'mortality.csv')
mortality_df

Unnamed: 0,region,age,lx_rural_female,lx_rural_male,lx_urban_female,lx_urban_male
0,CT,0,100000.000000,100000.000000,100000.000000,100000.000000
1,CT,1,99544.192200,99451.307230,99544.192200,99451.307230
2,CT,2,99516.735590,99417.353710,99516.735590,99417.353710
3,CT,3,99499.592940,99393.406250,99499.592940,99393.406250
4,CT,4,99487.272120,99375.493850,99487.272120,99375.493850
...,...,...,...,...,...,...
96,CT,96,11817.653210,6807.857033,11817.653210,6807.857033
97,CT,97,9185.842397,5131.178056,9185.842397,5131.178056
98,CT,98,6950.327109,3766.674881,6950.327109,3766.674881
99,CT,99,5109.747346,2689.508377,5109.747346,2689.508377


In [455]:
# UN e0=30 (life expectancy at age 0 is 30 years)
mortality_30 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
mort_30 = pd.read_csv(mortality_30)
mort_30.columns = mort_30.columns.map(lambda x : x+'_30' if x !='age' else x)
mort_30

Unnamed: 0,age,lx_rural_male_30,lx_rural_female_30,lx_urban_male_30,lx_urban_female_30
0,0,100000,100000,100000,100000
1,1,72835,74529,72835,74529
2,2,69291,70504,69291,70504
3,3,66373,67202,66373,67202
4,4,64014,64538,64014,64538
...,...,...,...,...,...
96,96,77,52,77,52
97,97,56,36,56,36
98,98,41,24,41,24
99,99,29,16,29,16


In [456]:
# UN e0=100 (life expectancy at age 0 is 100 years)
mortality_100 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
mort_100 = pd.read_csv(mortality_100)
mort_100.columns = mort_100.columns.map(lambda x : x+'_100' if x !='age' else x)
mort_100

Unnamed: 0,age,lx_rural_male_100,lx_rural_female_100,lx_urban_male_100,lx_urban_female_100
0,0,100000,100000,100000,100000
1,1,72835,74529,72835,74529
2,2,69291,70504,69291,70504
3,3,66373,67202,66373,67202
4,4,64014,64538,64014,64538
...,...,...,...,...,...
96,96,77,52,77,52
97,97,56,36,56,36
98,98,41,24,41,24
99,99,29,16,29,16


In [457]:
# Merge UN e0=100 and UN e0=30 values into mortality_df
mort_joined_df = mortality_df.merge(mort_30,on='age').merge(mort_100,on='age')
mort_joined_df

Unnamed: 0,region,age,lx_rural_female,lx_rural_male,lx_urban_female,lx_urban_male,lx_rural_male_30,lx_rural_female_30,lx_urban_male_30,lx_urban_female_30,lx_rural_male_100,lx_rural_female_100,lx_urban_male_100,lx_urban_female_100
0,CT,0,100000.000000,100000.000000,100000.000000,100000.000000,100000,100000,100000,100000,100000,100000,100000,100000
1,CT,1,99544.192200,99451.307230,99544.192200,99451.307230,72835,74529,72835,74529,72835,74529,72835,74529
2,CT,2,99516.735590,99417.353710,99516.735590,99417.353710,69291,70504,69291,70504,69291,70504,69291,70504
3,CT,3,99499.592940,99393.406250,99499.592940,99393.406250,66373,67202,66373,67202,66373,67202,66373,67202
4,CT,4,99487.272120,99375.493850,99487.272120,99375.493850,64014,64538,64014,64538,64014,64538,64014,64538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,CT,96,11817.653210,6807.857033,11817.653210,6807.857033,77,52,77,52,77,52,77,52
97,CT,97,9185.842397,5131.178056,9185.842397,5131.178056,56,36,56,36,56,36,56,36
98,CT,98,6950.327109,3766.674881,6950.327109,3766.674881,41,24,41,24,41,24,41,24
99,CT,99,5109.747346,2689.508377,5109.747346,2689.508377,29,16,29,16,29,16,29,16


In [458]:
# Life expectancies at age zero (e0) for projection period according to the scenario
# Note: Start with e0 of baseline year and provide expected e0 values for 5-year intervals; 
#single year e0 values will be linearly interpolated 

urban_females_e0 = cr_df["m_UF"].round(0).astype(np.int).values
rural_females_e0 = cr_df["m_RF"].round(0).astype(np.int).values
urban_males_e0 = cr_df["m_UM"].round(0).astype(np.int).values
rural_males_e0 = cr_df["m_RM"].round(0).astype(np.int).values

urban_females_e0

array([82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
       82, 82])

In [459]:
scenario ='Constant_rate'

# Modify the constant rate scenario mortality assumptions if current scenario is something else
if scenario != 'Constant_rate':
    
    # Retrive the national-level changes of female life expectancy according to the current scenario
    # Currently urban and rural are assumed equal
    urban_female_mort_rate = np.cumprod(scenario_df['F_Mor_change'].values())
    rural_female_mort_rate = urban_female_mort_rate
    
    # Retrive the national-level changes of male life expectancy according to the current scenario
    # Currently urban and rural are assumed equal
    urban_male_mort_rate = np.cumprod(scenario_df['M_Mor_change'].values())
    rural_male_mort_rate = urban_female_mort_rate    
    
    # Apply the national-level rates to the state-level life expectancy assumptions
    # Female
    e0_urban_female = urban_female_mort_rate * urban_females_e0[0]
    
    e0_rural_female = rural_female_mort_rate * rural_females_e0[0]

else:
    

SyntaxError: unexpected EOF while parsing (<ipython-input-459-1c35f08dc7ea>, line 23)