In [1]:
%matplotlib inline

import os
import pkg_resources

import numpy as np
import pandas as pd
import matplotlib

import statepop.multistate
from statepop.utils import single_state_files

# 0. Set up development environment

In [2]:
# directory housing the data
root_dir = '/Users/d3y010/repos/temp/statepop/statepop/data'

# directory containing data from R run to compare against
comp_data_dir = os.path.join(root_dir, 'comp_data')

#Path to results folder
results_path = pkg_resources.resource_filename('statepop', 'data/inputs/No_Mig')

# Path to state-level inputs folders
state_inputs = pkg_resources.resource_filename('statepop', 'data/State_Inputs')

# List of states, alphebetized
states_list = os.listdir(state_inputs)
states_list = sorted(states_list, key = lambda x: int(x.split('-')[0]))

# Specify scenario
scenarios = ("Constant_rate", "SSP2", "SSP3", "SSP5")

# 1. Generate baseline population matrix (from R code)

In [3]:
def multi_state_base_pop(int_mig_rates=0, domestic_migration_factor='Constant_rate'):
    
    current_int_mig = []
    initial_all_base_pop = []
    upd_all_base_pop = []
    
    for states in states_list:
        
        state_abbrev = states.split('-')[1]
        
        # Scenario data (The Constant_rate scenario. )
        cr_df = single_state_files(state_abbrev, 'Constant_rate.csv')

        # Base Population data
        bp_df = single_state_files(state_abbrev, 'basePop.csv')
        
        # remove any empty rows
        bp_df = bp_df.loc[bp_df['region'] == state_abbrev]

        #* International migration data
        # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
        im_df = single_state_files(state_abbrev, 'intMig.csv')

        # Annual total net international migration counts for 5-year periods
        female_net_international_migration = cr_df["nim_F"].round(0).astype(np.int).values
        male_net_international_migration = cr_df["nim_M"].round(0).astype(np.int).values

        # Update the base year population with the international migration
        # females -> urban0, rural0, urban1, rural1,...
        females_base_pop = pd.concat([bp_df['urban_female'], bp_df['rural_female']]).sort_index(kind='mergesort')
        females_base_pop = females_base_pop.reset_index(drop=True) #fBP

        # males
        males_base_pop = pd.concat([bp_df['urban_male'], bp_df['rural_male']]).sort_index(kind='mergesort')
        males_base_pop = males_base_pop.reset_index(drop=True) #mBP

        # combines female and male pieces
        base_pop_df = pd.DataFrame(pd.concat([females_base_pop, males_base_pop]).reset_index(drop=True))                  
        base_pop_df.columns = [s for s in states_list if states in s] #matBP
        

        # Spread migrant numbers according to profile
        if int_mig_rates == 1:
            im_df["nmUF"] = im_df['net_female'] * female_net_international_migration[0]
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * male_net_international_migration[0]
            im_df['nmRM'] = im_df['nmUM']
        else:
            im_df["nmUF"] = im_df['net_female'] * 0
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * 0
            im_df['nmRM'] = im_df['nmUM']

        # Update the base year population with the international migration rates
        urban_rural_females = pd.concat([im_df['nmUF'], im_df['nmRF']]).sort_index(kind='mergesort')
        urban_rural_females = urban_rural_females.reset_index(drop=True)

        urban_rural_males = pd.concat([im_df['nmUM'], im_df['nmRM']]).sort_index(kind='mergesort')
        urban_rural_males = urban_rural_males.reset_index(drop=True)

        urban_rural_all = pd.DataFrame(pd.concat([urban_rural_females, urban_rural_males]).reset_index(drop=True)) 
        urban_rural_all.columns = [s for s in states_list if states in s]

        # Populate the data frames that store base-year population and its updated population by international migration
        initial_all_base_pop.append(base_pop_df)

        updated_all_base_pop = base_pop_df + urban_rural_all
        updated_all_base_pop.fillna(0, inplace=True) #matBP_upd
        upd_all_base_pop.append(updated_all_base_pop)

        # Keep the international migration for the current state
        current_int_migration = updated_all_base_pop - base_pop_df
        current_int_migration.fillna(0, inplace=True)        
        current_int_mig.append(current_int_migration)

    # in migration rates dataframe for all states
    total_int_migration = pd.concat(current_int_mig, axis=1)
    
    # initial base population dataframe for all states
    init_all_base_pop_df = pd.concat(initial_all_base_pop, axis=1)
    init_all_base_pop_df.fillna(0, inplace=True)
    
    # base population dataframe for all states
    updated_all_base_pop_df = pd.concat(upd_all_base_pop, axis=1)

    return total_int_migration, init_all_base_pop_df, updated_all_base_pop_df

## 1.1 Generate outputs

In [4]:
int_mig_rates=0
domestic_migration_factor='Constant_rate'

bpop = multi_state_base_pop(int_mig_rates, domestic_migration_factor)

# unpack outputs
total_int_migration, init_all_base_pop_df, updated_all_base_pop_df = bpop

## 1.1.1 Test against R outputs

In [5]:
# R population outputs as float64 data frames
r_ini_all_base_pop = pd.read_csv(os.path.join(comp_data_dir, "ini.all.base.pop.csv"), dtype=np.float64)
r_upd_all_base_pop = pd.read_csv(os.path.join(comp_data_dir, "upd.all.base.pop.csv"), dtype=np.float64)
r_tot_int_mig = pd.read_csv(os.path.join(comp_data_dir, "tot.int.mig.csv"), dtype=np.float64)

# reorder columns to match new format
r_ini_all_base_pop = r_ini_all_base_pop[init_all_base_pop_df.columns]
r_upd_all_base_pop = r_upd_all_base_pop[updated_all_base_pop_df.columns]
r_tot_int_mig = r_tot_int_mig[total_int_migration.columns]

# compare equality with the Python version; will return nothing if successful
print("Testing:  `init_all_base_pop_df`")
pd.testing.assert_frame_equal(r_ini_all_base_pop, init_all_base_pop_df)

print("Testing:  `updated_all_base_pop_df`")
pd.testing.assert_frame_equal(r_upd_all_base_pop, updated_all_base_pop_df)

print("Testing:  `total_int_migration`")
pd.testing.assert_frame_equal(r_tot_int_mig, total_int_migration)


Testing:  `init_all_base_pop_df`
Testing:  `updated_all_base_pop_df`
Testing:  `total_int_migration`


AssertionError: DataFrame are different

DataFrame shape mismatch
[left]:  (36764, 51)
[right]: (404, 51)

## 1.2 Chris' take on the `in_domestic_migration_calc_ss` function:

In [93]:
def sort_dirs_by_number(dir_list):
    """Sort directories by their leading number.
    
    :param dir_list:      List if directories containing a "<number>_<abbrev>" structure
    :type dir_list:       list 
    
    :return:              List of sorted directories by leading number
    
    """
    
    # create a dictionary of {state_id_integer: directory_name, ...}
    d_states = {int(i.split('-')[0]): i for i in dir_list}
    
    return [d_states[k] for k in sorted(d_states.keys())]


def in_domestic_migration(in_migration_files, updated_all_base_pop_df, scen_factor):
    """Calculate the total in migration numbers for all states.
    
    :param in_migration_files:              A list of full path with file name and extensions to the 
                                            in migrations files for each state.
    :type in_migration_files:               list 
    
    :param updated_all_base_pop_df:         A data frame containing population numbers per state
    :type updated_all_base_pop_df:          data frame 
    
    :param scen_factor:                     Scale applied to migration rates based on the scenario.
    :type scen_factor:                      int 
    
    :return:                                A data frame with in migration population
    
    """
    
    final_in_mig_pop = pd.DataFrame()

    for current_in_mig_file in in_migration_files:

        # get target state id_abbrev
        target_state = os.path.basename(current_in_mig_file).split('_')[0]

        # read in the contributing states in migration rates
        current_in_mig = pd.read_csv(current_in_mig_file)

        # get population data for every state except the target state
        from_states = [i for i in updated_all_base_pop_df.columns if target_state != i]
        current_from_pop = updated_all_base_pop_df[from_states].copy()

        # set data frame to hold contributing states population
        total_in_mig_pop = pd.DataFrame()

        for state in from_states:

            state_abbrev = state.split('-')[1]

            # create a copy of the in migration data frame for the from state
            state_in_mig_urban = current_in_mig.loc[current_in_mig['from'] == state_abbrev].copy()

            # create copy for rural and zero out rate
            state_in_mig_rural = state_in_mig_urban.copy()
            state_in_mig_rural['rate'] *= 0 

            # add sorting value for rural
            state_in_mig_rural['setting'] = 1

            # add sorting value for urban
            state_in_mig_urban['setting'] = 0

            # combine urban and rural data frames
            state_in_mig = pd.concat([state_in_mig_urban, state_in_mig_rural])

            # sort data frame by gender and age
            state_in_mig.sort_values(by=['gender', 'age', 'setting'], inplace=True)

            # add to all states data frame
            total_in_mig_pop[state] = state_in_mig['rate'].values

        # multiply to get the in migration numbers from each contributing state
        total_in_net_pop = (total_in_mig_pop * scen_factor) * current_from_pop

        # sum the total in migration numbers across all contributing states with one value per age group
        final_in_mig_pop[target_state] = total_in_net_pop.sum(axis=1)
        
    return final_in_mig_pop

## 1.2.1 Set up and run `in_migration` code

In [94]:
# set the scenario factor to 1 so we will have non-zero output for testing
scen_factor = 1

# get a list of state directories
state_dirs = [i for i in os.listdir(state_inputs) if '.DS_Store' not in i]

# sort state directories by their leading id
state_dirs_sorted = sort_dirs_by_number(state_dirs)

# create a list of in migration files from each state
in_migration_files = [os.path.join(state_inputs, i, f"{i}_in_mig.csv") for i in state_dirs_sorted]

# create the in migration population data frame for all states
in_migration_df = in_domestic_migration(in_migration_files, updated_all_base_pop_df, scen_factor)

## 1.2.2 Test `in_migration` code against R version output

In [95]:
# R in_migration output as float64 data frames
r_in_migration = pd.read_csv(os.path.join(comp_data_dir, "in.migration.csv"), dtype=np.float64)

# reorder columns to match new format
r_in_migration = r_in_migration[in_migration_df.columns]

# compare equality with the Python version; will return nothing if successful
print("Testing:  `in_migration_df`")
pd.testing.assert_frame_equal(r_in_migration, in_migration_df)

Testing:  `in_migration_df`


AssertionError: DataFrame.iloc[:, 0] (column name="1-AL") are different

DataFrame.iloc[:, 0] (column name="1-AL") values are different (47.0297 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
[left]:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]
[right]: [491.06198357200003, 0.0, 508.05726212600007, 0.0, 527.024982669, 0.0, 531.148187779, 0.0, 522.558092793, 0.0, 504.311085865, 0.0, 502.23155887100006, 0.0, 496.78466965300004, 0.0, 501.588586078, 0.0, 512.620739576, 0.0, 383.968043124, 0.0, 377.677431176, 0.0, 375.351135528, 0.0, 375.568140116, 0.0, 375.7499065239999, 0.0, 839.9320897620001, 0.0, 856.073327061, 0.0, 866.5893653180002, 0.0, 875.1576010690001, 0.0, 904.3733114429998, 0.0, 1081.539135558, 0.0, 1046.4552884299999, 0.0, 1016.324102878, 0.0, 1005.8326098580001, 0.0, 1022.597032002, 0.0, 937.2558478839999, 0.0, 924.109196813, 0.0, 945.6115529819999, 0.0, 933.4296556449999, 0.0, 951.019012219, 0.0, 632.993295785, 0.0, 593.731357844, 0.0, 598.3909243669999, 0.0, 582.521883855, 0.0, 577.968854502, 0.0, 443.172668839, 0.0, 434.076934803, 0.0, 450.338391312, 0.0, 472.52254678799994, 0.0, 497.667686747, 0.0, 503.97030802800003, 0.0, 475.21315713800004, 0.0, 471.97397665, 0.0, 476.8436818520001, 0.0, 485.395726816, 0.0, 395.847272966, 0.0, 396.61714070299996, 0.0, 399.86931539, 0.0, 398.00851605500003, 0.0, 402.305051497, 0.0, ...]

# Preexisting Code

In [11]:
""""""""""""""""""""""""""""""""""
# Not sure how to directly incorporate this, come back to it later

# Specify the domestic migration factor
# If scenario is not "Constant_rate" (for fertility, mortality and international migration), this factor will become dynamic later
scen.factor <- 0 # 1 for regular, 0 for no domestic migration, 0.5 for half scenario and 2 for double scenario

if scenario == 'Constant_rate':
    scenario_factor = 0
elif scenario == 

# Sepecify if international migration is applied
int.mig <- 0 # 1 applied 0 not applied

#* Should details for projection model adjustment be printed?
vis <- F # TRUE (print details); FALSE (don't print details)

#* Should the Brass Relational Model be used or a simple scaling approach to compute future fertility schedules
useBrassf <- T # TRUE (use Brass); FALSE (use scaling)

"""""""""""""""""""""""""""""""""

'"\n# Not sure how to directly incorporate this, come back to it later\n\n# Specify the domestic migration factor\n# If scenario is not "Constant_rate" (for fertility, mortality and international migration), this factor will become dynamic later\nscen.factor <- 0 # 1 for regular, 0 for no domestic migration, 0.5 for half scenario and 2 for double scenario\n\nif scenario == \'Constant_rate\':\n    scenario_factor = 0\nelif scenario == \n\n# Sepecify if international migration is applied\nint.mig <- 0 # 1 applied 0 not applied\n\n#* Should details for projection model adjustment be printed?\nvis <- F # TRUE (print details); FALSE (don\'t print details)\n\n#* Should the Brass Relational Model be used or a simple scaling approach to compute future fertility schedules\nuseBrassf <- T # TRUE (use Brass); FALSE (use scaling)\n\n'

## Base Population Matrix (Dataframe)

In [None]:
#' Function to calculate the total in migration numbers for all states
#'
#' @param input.path input folder containing the migration rates
#' @param updated_all_base_pop population data frame
#' @param domestic_migration_factor scale applied to migration rates based on the scenario
#' @return data frame containing in-migration values for all states
#' @export
#'
f.in.dom.mig.calc <- function(input.path, updated_all_base_pop, domestic_migration_factor){

  # The final output dataframe that holds total in-migration numbers for all states
  tot.tot.in           <- data.frame(matrix(0, nrow = nrow(updated_all_base_pop), ncol = ncol(updated_all_base_pop)))
  colnames(tot.tot.in) <- colnames(updated_all_base_pop)

  states <- colnames(updated_all_base_pop)
  for (state in states){

    # Read the csv file holding in migration rates to the current state
    cur.in.path <- file.path(input.path, state, paste0(state, "_in_mig.csv"))
    in_mig_df  <- read.csv(cur.in.path, stringsAsFactors = F, check.names = F)

    # Retrieve population of all states that contributed migration to the current one
    from.states  <- colnames(updated_all_base_pop)[which(colnames(updated_all_base_pop) != state)]
    updated_all_base_pop <- updated_all_base_pop[, from.states]

    # Create a dataframe to hold in migration rates of all states contributing to the current one
    in_mig_df_all           <- data.frame(matrix(0, nrow = nrow(updated_all_base_pop), ncol = ncol(updated_all_base_pop)))
    colnames(in_mig_df_all) <- colnames(updated_all_base_pop)

    # Populate the in migration dataframe with the in migration rates of all contributing states
    for (col in colnames(in_mig_df_all)){

      # The current state that is contributing population to the focal state (state)
      cur.from.state  <- substr(col, nchar(col) - 1, nchar(col))

      # Female rates, rural is assumed 0
      in_mig_df_all[seq(1, nrow(updated_all_base_pop) / 2 , 2), col] <- in_mig_df[in_mig_df$from == cur.from.state & in_mig_df$gender == "f",
                                                                          ncol(in_mig_df)]
        
        1-range(# number of rows in upd_abp/2) by 2s

      # Male rates, rural is assumed 0
      in_mig_df_all[seq((nrow(updated_all_base_pop) * 0.5) + 1, nrow(updated_all_base_pop), 2), col] <- in_mig_df[in_mig_df$from == cur.from.state & in_mig_df$gender == "m",
                                                                                                  ncol(in_mig_df)]
    }

    # Multiplication gives us the in migration numbers from each contributing state
    cur.in.net <- (in_mig_df_all * domestic_migration_factor) * updated_all_base_pop

    # Sum the total in migration numbers across all contributing states, now we have one total value per age group
    cur.tot.in <- data.frame(apply(cur.in.net, 1, sum, na.rm = T))

    # Add the values of the current focal state to the output dataframe
    tot.tot.in[, state] <- cur.tot.in
  }

  return(tot.tot.in)
}

            

### Original function

In [451]:
def in_domestic_migration_calc_ss(updated_all_base_pop, domestic_migration_factor=0):
    
    """Requires all states"""
    #updated_all_base_pop = statepop.multi_state_model_bilateral.multi_state_base_pop()[2]
    
    from_states_df = []
    for states in states_list:
        # Read the csv file holding in migration rates to the current state
        in_mig_df = single_state_files(states, '_in_mig.csv')

        # Retrieve population of all states that contributed migration to the current one
        from_states = updated_all_base_pop.columns
        from_states_pop = updated_all_base_pop.loc[:, updated_all_base_pop.columns != states]

        for s in from_states:
            # Female rates, rural is assumed 0
            from_states_mig_female = in_mig_df.loc[(in_mig_df['from'] == s.split('-')[1]) & 
                                                 (in_mig_df['gender'] == 'f'), ['rate']]

            zeros = from_states_mig_female*0
            data_f = np.hstack([from_states_mig_female.values, zeros.values]).reshape(-1, from_states_mig_female.shape[1])
            from_states_female = pd.DataFrame(data_f, columns=[s])

            # Male rates, rural is assumed 0
            from_states_mig_male = in_mig_df.loc[(in_mig_df['from'] == s.split('-')[1]) & 
                                             (in_mig_df['gender'] == 'm'), ['rate']]
            data_m = np.hstack([from_states_mig_male.values, zeros.values]).reshape(-1, from_states_mig_male.shape[1])
            from_states_male = pd.DataFrame(data_m, columns=[s])

            from_states_mig = pd.concat([from_states_female, from_states_male], axis=0)
            from_states_df.append(from_states_mig)
    
    from_states = pd.concat(from_states_df, axis=1)
    
    # Multiplication gives us the in migration numbers from each contributing state
    #in_mig_total = (from_states_mig * domestic_migration_factor) * from_states_pop
    
    # Sum the total in migration numbers across all contributing states, now we have one total value per age group
    

    
    #uabp = updated_all_base_pop
    
    # Create a dataframe to hold in migration rates of all states contributing to the current one
    # Populate the in migration dataframe with the in migration rates of all contributing states
    
    return(from_states)


In [452]:
in_domestic_migration_calc_ss(updated_all_base_pop_df, domestic_migration_factor=1)

Unnamed: 0,1-AL,2-AK,4-AZ,5-AR,6-CA,8-CO,9-CT,10-DE,11-DC,12-FL,...,46-SD,47-TN,48-TX,49-UT,50-VT,51-VA,53-WA,54-WV,55-WI,56-WY
0,,0.00215,0.000227,0.000292,0.000104,0.000138,0.0,0.00124,0.0,0.000492,...,0.00164,0.000066,0.000044,0.000602,0.0,0.0,0.000241,0.0,0.0,
1,,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,
2,,0.00215,0.000227,0.000292,0.000104,0.000138,0.0,0.00124,0.0,0.000492,...,0.00164,0.000066,0.000044,0.000602,0.0,0.0,0.000241,0.0,0.0,
3,,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,
4,,0.00215,0.000227,0.000292,0.000104,0.000138,0.0,0.00124,0.0,0.000492,...,0.00164,0.000066,0.000044,0.000602,0.0,0.0,0.000241,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,
198,,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,
199,,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,
200,,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,


In [122]:
total_int_migration, init_all_base_pop_df, updated_all_base_pop_df = multi_state_base_pop()

In [123]:
dfx = pd.read_csv(os.path.join(comp_data_dir, 'upd.all.base.pop.csv'), dtype=np.float64)
dfx.head()

Unnamed: 0,9-CT,23-ME,25-MA,33-NH,44-RI,50-VT,34-NJ,36-NY,42-PA,17-IL,...,30-MT,32-NV,35-NM,49-UT,56-WY,2-AK,6-CA,15-HI,41-OR,53-WA
0,18220.0,6510.0,34800.0,6500.0,5170.0,2870.0,50620.0,109240.0,67810.0,79750.0,...,5940.0,17000.0,13400.0,25160.0,3820.0,5050.0,228510.0,7870.0,21580.0,41910.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18850.0,6520.0,35400.0,6510.0,5820.0,3220.0,51850.0,111360.0,69900.0,79860.0,...,5940.0,18720.0,13720.0,26350.0,4110.0,5450.0,243620.0,8440.0,22740.0,44000.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20820.0,6770.0,36500.0,6690.0,5810.0,3190.0,53500.0,114630.0,73170.0,84090.0,...,6090.0,18240.0,14240.0,26380.0,4090.0,4950.0,253810.0,8590.0,24690.0,43540.0


In [124]:
dfi = updated_all_base_pop_df[dfx.columns]
dfi.head()

Unnamed: 0,9-CT,23-ME,25-MA,33-NH,44-RI,50-VT,34-NJ,36-NY,42-PA,17-IL,...,30-MT,32-NV,35-NM,49-UT,56-WY,2-AK,6-CA,15-HI,41-OR,53-WA
0,18220.0,6510.0,34800.0,6500.0,5170.0,2870.0,50620.0,109240.0,67810.0,79750.0,...,5940.0,17000.0,13400.0,25160.0,3820.0,5050.0,228510.0,7870.0,21580.0,41910.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18850.0,6520.0,35400.0,6510.0,5820.0,3220.0,51850.0,111360.0,69900.0,79860.0,...,5940.0,18720.0,13720.0,26350.0,4110.0,5450.0,243620.0,8440.0,22740.0,44000.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20820.0,6770.0,36500.0,6690.0,5810.0,3190.0,53500.0,114630.0,73170.0,84090.0,...,6090.0,18240.0,14240.0,26380.0,4090.0,4950.0,253810.0,8590.0,24690.0,43540.0


In [125]:
pd.testing.assert_frame_equal(dfx, dfi)

In [89]:
dfz = pd.DataFrame({'r': dfx['1-AL'], 'py': dfi['1-AL']})
dfz.to_csv('/Users/d3y010/Desktop/test.csv', index=False)

In [191]:
def single_state_base_pop_df(state_abbr, int_mig_rates=0, domestic_migration_factor='Constant_rate'):
        
    #* Scenario data (The Constant_rate scenario. )
    cr_df = single_state_files(state_abbr, 'Constant_rate.csv')
    
    #* Base Population data
    bp_df = single_state_files(state_abbr, 'basePop.csv')
    
    #* International migration data
    # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
    im_df = single_state_files(state_abbr, 'intMig.csv')
    
    # Annual total net international migration counts for 5-year periods
    female_net_international_migration = cr_df["nim_F"].round(0).astype(np.int).values
    male_net_international_migration = cr_df["nim_M"].round(0).astype(np.int).values
    
    # Update the base year population with the international migration
    # females -> urban0, rural0, urban1, rural1,...
    females_base_pop = pd.concat([bp_df['urban_female'], bp_df['rural_female']]).sort_index(kind='mergesort')
    females_base_pop = females_base_pop.reset_index(drop=True) #fBP
    
    # males
    males_base_pop = pd.concat([bp_df['urban_male'], bp_df['rural_male']]).sort_index(kind='mergesort')
    males_base_pop = males_base_pop.reset_index(drop=True) #mBP
    
    # combines female and male pieces
    base_pop_df = pd.DataFrame(pd.concat([females_base_pop, 
                                           males_base_pop]).reset_index(drop=True))
    # for some reason, Alabama is 4 rows longer than any other state (contains no data)
    if state_abbr == 'AL':
        base_pop_df.drop(base_pop_df.tail(4).index,inplace=True)
    base_pop_df.columns = [s for s in states_list if state_abbr in s] #matBP
    
    # Spread migrant numbers according to profile
    if int_mig_rates == 1:
        im_df["nmUF"] = im_df['net_female'] * female_net_international_migration[0]
        im_df["nmRF"] = im_df['nmUF']
        im_df["nmUM"] = im_df['net_male'] * male_net_international_migration[0]
        im_df['nmRM'] = im_df['nmUM']
    else:
        im_df["nmUF"] = im_df['net_female'] * 0
        im_df["nmRF"] = im_df['nmUF']
        im_df["nmUM"] = im_df['net_male'] * 0
        im_df['nmRM'] = im_df['nmUM']
    
    # Update the base year population with the international migration rates
    urban_rural_females = pd.concat([im_df['nmUF'], im_df['nmRF']]).sort_index(kind='mergesort')
    urban_rural_females = urban_rural_females.reset_index(drop=True)
    
    urban_rural_males = pd.concat([im_df['nmUM'], im_df['nmRM']]).sort_index(kind='mergesort')
    urban_rural_males = urban_rural_males.reset_index(drop=True)
    
    urban_rural_all = pd.DataFrame(pd.concat([urban_rural_females, 
                                           urban_rural_males]).reset_index(drop=True)) 
    urban_rural_all.columns = [s for s in states_list if state_abbr in s]

    # Populate the data frames that store base-year population and its updated population by international migration
    initial_all_base_population = pd.DataFrame(data = base_pop_df, index = list(range(404)), 
                                               columns=states_list).fillna(0)

    updated_all_base_pop = base_pop_df + urban_rural_all
    updated_all_base_pop.fillna(0, inplace=True) #matBP_upd
    
    # Keep the international migration for the current state
    current_int_migration = updated_all_base_pop - base_pop_df
    current_int_migration.fillna(0, inplace=True)
    total_int_migration = pd.DataFrame(data = current_int_migration, index = list(range(404)), columns=states_list)
    
    # Calculate the total in out and net migration numbers for all states
    in_migration = in_domestic_migration_calc_ss(state_abbr, updated_all_base_pop, domestic_migration_factor=0)
    #out_migration = 
    #net_migration = 
    
    return current_int_migration #total_int_migration

In [176]:
single_state_base_pop_df("AL", int_mig_rates=1)

Unnamed: 0,1-AL
0,89.899814
1,0.000000
2,89.899814
3,0.000000
4,89.899814
...,...
399,0.000000
400,0.000000
401,0.000000
402,0.000000


## Come back to this later

In [266]:
def multi_state_base_pop_df(int_mig_rates=0):
    
    current_int_mig = []
    
    for states in states_list:
        
        #* Scenario data (The Constant_rate scenario. )
        cr_df = single_state_files(states.split('-')[1], 'Constant_rate.csv')

        #* Base Population data
        bp_df = single_state_files(states.split('-')[1], 'basePop.csv')

        #* International migration data
        # Note: The international migration rates input data needs to be in relative rates (summing up to 1)
        im_df = single_state_files(states.split('-')[1], 'intMig.csv')

        # Annual total net international migration counts for 5-year periods
        female_net_international_migration = cr_df["nim_F"].round(0).astype(np.int).values
        male_net_international_migration = cr_df["nim_M"].round(0).astype(np.int).values

        # Update the base year population with the international migration
        # females -> urban0, rural0, urban1, rural1,...
        females_base_pop = pd.concat([bp_df['urban_female'], bp_df['rural_female']]).sort_index(kind='mergesort')
        females_base_pop = females_base_pop.reset_index(drop=True)

        # males
        males_base_pop = pd.concat([bp_df['urban_male'], bp_df['rural_male']]).sort_index(kind='mergesort')
        males_base_pop = males_base_pop.reset_index(drop=True)

        # combines female and male pieces
        base_pop_df = pd.DataFrame(pd.concat([females_base_pop, 
                                               males_base_pop]).reset_index(drop=True))
        base_pop_df = base_pop_df.fillna(0)
        base_pop_df = base_pop_df.round(0).astype(np.int)
        base_pop_df.columns = [s for s in states_list if states in s]

        # Spread migrant numbers according to profile
        if int_mig_rates == 1:
            im_df["nmUF"] = im_df['net_female'] * female_net_international_migration[0]
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * male_net_international_migration[0]
            im_df['nmRM'] = im_df['nmUM']
        else:
            im_df["nmUF"] = im_df['net_female'] * 0
            im_df["nmRF"] = im_df['nmUF']
            im_df["nmUM"] = im_df['net_male'] * 0
            im_df['nmRM'] = im_df['nmUM']

        # Update the base year population with the international migration rates
        urban_rural_females = pd.concat([im_df['nmUF'], im_df['nmRF']]).sort_index(kind='mergesort')
        urban_rural_females = urban_rural_females.reset_index(drop=True)

        urban_rural_males = pd.concat([im_df['nmUM'], im_df['nmRM']]).sort_index(kind='mergesort')
        urban_rural_males = urban_rural_males.reset_index(drop=True)

        urban_rural_all = pd.DataFrame(pd.concat([urban_rural_females, 
                                               urban_rural_males]).reset_index(drop=True))
        urban_rural_all.columns = [s for s in states_list if states in s]

        # Populate the data frames that store base-year population and its updated population by international migration
        initial_all_base_population = pd.DataFrame(data = base_pop_df, index = list(range(404)), 
                                                   columns=states_list).fillna(0)
        update_all_base_pop = base_pop_df.add(urban_rural_all, fill_value=0)    

        # Keep the international migration for the current state
        current_int_migration = update_all_base_pop.subtract(base_pop_df, fill_value=0)
        current_int_mig.append(current_int_migration)
        
        # Calculate the total in out and net migration numbers for all states
        #in_migration = 
        #out_migration = 
        #net_migration = 

    total_int_migration = pd.concat(current_int_mig, axis=1)
    # drop final 4 rows, for some reason AL is 4 rows longer than the rest
    total_int_migration.drop(total_int_migration.tail(4).index,inplace=True)
    
    
    
    
    return update_all_base_pop

In [267]:
multi_state_base_pop_df(int_mig_rates=1)

Unnamed: 0,56-WY
0,3831.149248
1,11.149248
2,4121.149248
3,11.149248
4,4101.149248
...,...
399,0.000000
400,10.000000
401,0.000000
402,15.000000


## Mortality

In [None]:
def single_state_mortality(state_abbr, scenario='Constant_rate'):
    #* Scenario data (The Constant_rate scenario. )
    cr_df = single_state_files(state_abbr, 'Constant_rate.csv')
    
    #* Mortality data
    # Note: The variable names need to be the same in all three mortality data files
    mortality_df = single_state_files(state_abbr, 'mortality.csv')
    
    # UN e0=30 (life expectancy at age 0 is 30 years)
    mortality_30 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
    mort_30 = pd.read_csv(mortality_30)
    mort_30.columns = mort_30.columns.map(lambda x : x+'_30' if x !='age' else x)
    
    # UN e0=100 (life expectancy at age 0 is 100 years)
    mortality_100 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
    mort_100 = pd.read_csv(mortality_100)
    mort_100.columns = mort_100.columns.map(lambda x : x+'_100' if x !='age' else x)
    
    # Merge UN e0=100 and UN e0=30 values into mortality_df
    mort_joined_df = mortality_df.merge(mort_30,on='age').merge(mort_100,on='age')
    
    # Life expectancies at age zero (e0) for projection period according to the scenario
    # Note: Start with e0 of baseline year and provide expected e0 values for 5-year intervals; 
    #single year e0 values will be linearly interpolated 

    urban_females_e0 = cr_df["m_UF"].round(0).astype(np.int).values
    rural_females_e0 = cr_df["m_RF"].round(0).astype(np.int).values
    urban_males_e0 = cr_df["m_UM"].round(0).astype(np.int).values
    rural_males_e0 = cr_df["m_RM"].round(0).astype(np.int).values
    
    

In [453]:
state_abbr = 'CT'

#* Scenario data (The Constant_rate scenario. )
cr_df = single_state_files(state_abbr, 'Constant_rate.csv')
cr_df

Unnamed: 0,year,f_A,f_R,f_U,m_AM,m_AF,m_RM,m_RF,m_UM,m_UF,pu,sr_A,sr_R,sr_U,nim_M,nim_F
0,2010,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
1,2015,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
2,2020,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
3,2025,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
4,2030,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
5,2035,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
6,2040,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
7,2045,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
8,2050,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555
9,2055,1.79,1.79,1.79,78.2,82.5,78.2,82.5,78.2,82.5,0.88,105,105,105,3890.347677,3621.553555


In [454]:
#* Mortality data
# Note: The variable names need to be the same in all three mortality data files
mortality_df = single_state_files(state_abbr, 'mortality.csv')
mortality_df

Unnamed: 0,region,age,lx_rural_female,lx_rural_male,lx_urban_female,lx_urban_male
0,CT,0,100000.000000,100000.000000,100000.000000,100000.000000
1,CT,1,99544.192200,99451.307230,99544.192200,99451.307230
2,CT,2,99516.735590,99417.353710,99516.735590,99417.353710
3,CT,3,99499.592940,99393.406250,99499.592940,99393.406250
4,CT,4,99487.272120,99375.493850,99487.272120,99375.493850
...,...,...,...,...,...,...
96,CT,96,11817.653210,6807.857033,11817.653210,6807.857033
97,CT,97,9185.842397,5131.178056,9185.842397,5131.178056
98,CT,98,6950.327109,3766.674881,6950.327109,3766.674881
99,CT,99,5109.747346,2689.508377,5109.747346,2689.508377


In [455]:
# UN e0=30 (life expectancy at age 0 is 30 years)
mortality_30 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
mort_30 = pd.read_csv(mortality_30)
mort_30.columns = mort_30.columns.map(lambda x : x+'_30' if x !='age' else x)
mort_30

Unnamed: 0,age,lx_rural_male_30,lx_rural_female_30,lx_urban_male_30,lx_urban_female_30
0,0,100000,100000,100000,100000
1,1,72835,74529,72835,74529
2,2,69291,70504,69291,70504
3,3,66373,67202,66373,67202
4,4,64014,64538,64014,64538
...,...,...,...,...,...
96,96,77,52,77,52
97,97,56,36,56,36
98,98,41,24,41,24
99,99,29,16,29,16


In [456]:
# UN e0=100 (life expectancy at age 0 is 100 years)
mortality_100 = pkg_resources.resource_filename('statepop', 'data/Inputs/AllRegions_mortality_UNe030.csv')
mort_100 = pd.read_csv(mortality_100)
mort_100.columns = mort_100.columns.map(lambda x : x+'_100' if x !='age' else x)
mort_100

Unnamed: 0,age,lx_rural_male_100,lx_rural_female_100,lx_urban_male_100,lx_urban_female_100
0,0,100000,100000,100000,100000
1,1,72835,74529,72835,74529
2,2,69291,70504,69291,70504
3,3,66373,67202,66373,67202
4,4,64014,64538,64014,64538
...,...,...,...,...,...
96,96,77,52,77,52
97,97,56,36,56,36
98,98,41,24,41,24
99,99,29,16,29,16


In [457]:
# Merge UN e0=100 and UN e0=30 values into mortality_df
mort_joined_df = mortality_df.merge(mort_30,on='age').merge(mort_100,on='age')
mort_joined_df

Unnamed: 0,region,age,lx_rural_female,lx_rural_male,lx_urban_female,lx_urban_male,lx_rural_male_30,lx_rural_female_30,lx_urban_male_30,lx_urban_female_30,lx_rural_male_100,lx_rural_female_100,lx_urban_male_100,lx_urban_female_100
0,CT,0,100000.000000,100000.000000,100000.000000,100000.000000,100000,100000,100000,100000,100000,100000,100000,100000
1,CT,1,99544.192200,99451.307230,99544.192200,99451.307230,72835,74529,72835,74529,72835,74529,72835,74529
2,CT,2,99516.735590,99417.353710,99516.735590,99417.353710,69291,70504,69291,70504,69291,70504,69291,70504
3,CT,3,99499.592940,99393.406250,99499.592940,99393.406250,66373,67202,66373,67202,66373,67202,66373,67202
4,CT,4,99487.272120,99375.493850,99487.272120,99375.493850,64014,64538,64014,64538,64014,64538,64014,64538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,CT,96,11817.653210,6807.857033,11817.653210,6807.857033,77,52,77,52,77,52,77,52
97,CT,97,9185.842397,5131.178056,9185.842397,5131.178056,56,36,56,36,56,36,56,36
98,CT,98,6950.327109,3766.674881,6950.327109,3766.674881,41,24,41,24,41,24,41,24
99,CT,99,5109.747346,2689.508377,5109.747346,2689.508377,29,16,29,16,29,16,29,16


In [458]:
# Life expectancies at age zero (e0) for projection period according to the scenario
# Note: Start with e0 of baseline year and provide expected e0 values for 5-year intervals; 
#single year e0 values will be linearly interpolated 

urban_females_e0 = cr_df["m_UF"].round(0).astype(np.int).values
rural_females_e0 = cr_df["m_RF"].round(0).astype(np.int).values
urban_males_e0 = cr_df["m_UM"].round(0).astype(np.int).values
rural_males_e0 = cr_df["m_RM"].round(0).astype(np.int).values

urban_females_e0

array([82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
       82, 82])

In [459]:
scenario ='Constant_rate'

# Modify the constant rate scenario mortality assumptions if current scenario is something else
if scenario != 'Constant_rate':
    
    # Retrive the national-level changes of female life expectancy according to the current scenario
    # Currently urban and rural are assumed equal
    urban_female_mort_rate = np.cumprod(scenario_df['F_Mor_change'].values())
    rural_female_mort_rate = urban_female_mort_rate
    
    # Retrive the national-level changes of male life expectancy according to the current scenario
    # Currently urban and rural are assumed equal
    urban_male_mort_rate = np.cumprod(scenario_df['M_Mor_change'].values())
    rural_male_mort_rate = urban_female_mort_rate    
    
    # Apply the national-level rates to the state-level life expectancy assumptions
    # Female
    e0_urban_female = urban_female_mort_rate * urban_females_e0[0]
    
    e0_rural_female = rural_female_mort_rate * rural_females_e0[0]

else:
    

SyntaxError: unexpected EOF while parsing (<ipython-input-459-1c35f08dc7ea>, line 23)