In [None]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from IPython.display import display


In [None]:
# Input folders: every source dataset lives in its own sub-folder of ./data/.
DATA_ROOT = './data/'
CENSUS_DATA_ROOT = DATA_ROOT + 'clean_census_data/'
PEW_RESEARCH_DATA_ROOT = DATA_ROOT + 'pew_research_data/'
NHTS_DATA_ROOT = DATA_ROOT + 'fha_nhts_data/'

# Output folder that receives the generated tables:
SYNTHETIC_DATA = './synthetic_data/'


### Hardcode sample data:


In [None]:
# Fixed attributes of the eight dummy locations, one record per location.
# Rates are fractions between 0 and 1.
location_attribute_fields = ['location_name', 'density', 'population', 'employment_rate', 'wealth_rate']
location_attribute_records = [
    ('loc1', 'rural', 3000, 0.7, 0.6),
    ('loc2', 'rural', 7000, 0.7, 0.7),
    ('loc3', 'rural', 4000, 0.6, 0.4),
    ('loc4', 'rural', 3000, 0.6, 0.5),
    ('loc5', 'urban', 1000, 0.6, 0.5),
    ('loc6', 'urban', 2000, 0.6, 0.5),
    ('loc7', 'urban', 3000, 0.8, 0.5),
    ('loc8', 'urban', 2000, 0.7, 0.5),
]
location_attributes_dummy = pd.DataFrame(location_attribute_records, columns=location_attribute_fields)
location_attributes_dummy


In [None]:
# Define profiles of sub-populations for each location.
# Every location has four sub-populations: wealth_status (0/1) x employment_status (0/1).
location_profile_fields = [
    'location_name',
    'wealth_status',
    'employment_status',
    'phoneownership_rate',
    'worktravel_mean',
    'socialtravel_mean',
    'grocerytravel_mean',
]
location_profile_records = [
    ('loc1', 0, 0, 0.7,   0, 30, 10),
    ('loc1', 0, 1, 0.7, 100, 40, 10),
    ('loc1', 1, 0, 0.9,   0, 40, 10),
    ('loc1', 1, 1, 0.9, 100, 50, 10),
    ('loc2', 0, 0, 0.7,   0, 30, 10),
    ('loc2', 0, 1, 0.7, 100, 40, 10),
    ('loc2', 1, 0, 0.9,   0, 40, 10),
    ('loc2', 1, 1, 0.9, 100, 50, 10),
    ('loc3', 0, 0, 0.7,   0, 30, 10),
    ('loc3', 0, 1, 0.7, 150, 40, 10),
    ('loc3', 1, 0, 0.9,   0, 40, 10),
    ('loc3', 1, 1, 0.9, 150, 50, 10),
    ('loc4', 0, 0, 0.7,   0, 30, 10),
    ('loc4', 0, 1, 0.7, 150, 40, 10),
    ('loc4', 1, 0, 0.9,   0, 40, 10),
    ('loc4', 1, 1, 0.9, 150, 50, 10),
    ('loc5', 0, 0, 0.7,   0, 30,  5),
    ('loc5', 0, 1, 0.7, 100, 40,  5),
    ('loc5', 1, 0, 0.9,   0, 40,  5),
    ('loc5', 1, 1, 0.9, 100, 50,  5),
    ('loc6', 0, 0, 0.7,   0, 30,  5),
    ('loc6', 0, 1, 0.7, 100, 40,  5),
    ('loc6', 1, 0, 0.9,   0, 40,  5),
    ('loc6', 1, 1, 0.9,  80, 50,  5),
    ('loc7', 0, 0, 0.7,   0, 30,  5),
    ('loc7', 0, 1, 0.7,  80, 40,  5),
    ('loc7', 1, 0, 0.9,   0, 40,  5),
    ('loc7', 1, 1, 0.9,  80, 50,  5),
    ('loc8', 0, 0, 0.7,   0, 30,  5),
    ('loc8', 0, 1, 0.7,  80, 40,  5),
    ('loc8', 1, 0, 0.9,   0, 40,  5),
    ('loc8', 1, 1, 0.9,  80, 50,  5),
]
location_profiles_dummy = pd.DataFrame(location_profile_records, columns=location_profile_fields)

# Derive each travel standard deviation from its mean: sqrt(mean/10), rounded to one decimal.
for travel_kind in ('worktravel', 'socialtravel', 'grocerytravel'):
    travel_mean = location_profiles_dummy[travel_kind + '_mean']
    location_profiles_dummy[travel_kind + '_std'] = np.sqrt(travel_mean/10).round(1)
location_profiles_dummy


In [None]:
# Write both hard-coded tables to the synthetic-data folder (without the index column)
# and report where each one went.
for dummy_table, dummy_filename in [
    (location_attributes_dummy, "location_attributes_dummy.csv"),
    (location_profiles_dummy, "location_profiles_dummy.csv"),
]:
    dummy_path = SYNTHETIC_DATA+dummy_filename
    dummy_table.to_csv(dummy_path,index=False)
    print("Saved : {} .".format(dummy_path))


### Load official statistics:

**2018 ACS 1-Year Estimates Subject Tables :**

- [`S0101`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S0101.html):
"Total population : SELECTED AGE CATEGORIES".

- [`S1901`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S1901.html):
"Households : Total".

- [`S2301`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S2301.html): 
"Labor Force Participation Rate".

- [`S2801`](https://api.census.gov/data/2018/acs/acs1/subject/groups/S2801.html): 
"Has one or more types of computing devices : Smartphone".

**ACS Cartographic Boundary Shapefile:**

- [`cb_2018_us_county_500k.gdb`](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.2018.html):
"Land area by county".

**2011-2015 5-Year ACS Commuting Flows (from Micro-data):**

- [`Table 1`](https://www.census.gov/data/tables/2015/demo/metro-micro/commuting-flows-2015.html): 
"Residence County to Workplace County Commuting Flows for the United States and Puerto Rico Sorted by Residence Geography: 5-Year ACS, 2011-2015".

**National Center for Health Statistics (NCHS) Urban-Rural Classification:**

- [`NCHSurbruralcodes`](https://www.cdc.gov/nchs/data_access/urban_rural.htm):
"NCHS Urban-Rural Classification Scheme for Counties : 2013 codes".

**Federal Highway Administration:**

[`National Household Travel Survey`](https://nhts.ornl.gov/person-miles):
"Person Miles of Travel"

**Pew Research:**

[`Mobile Phone Factsheet`](https://www.pewresearch.org/internet/fact-sheet/mobile/):


In [None]:
# --- Census (ACS) tables, one CSV per topic ---

# Population by age category:
census_population = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S0101_population.csv")

# Employment / labor-force data:
census_employment = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S2301_employment.csv")

# Household income brackets:
census_income = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S2503_income.csv")

# Phone ownership (internet/computing devices table):
census_internet = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSST1Y2018_S2801_internet.csv")

# County geography (land area):
census_geography = pd.read_csv(f"{CENSUS_DATA_ROOT}cb_2018_us_county_500k.csv")

# County-to-county commuting flows:
census_commuting = pd.read_csv(f"{CENSUS_DATA_ROOT}ACSCommutingFlows.csv")

# NCHS urban/rural classification:
census_urban_rural = pd.read_csv(f"{CENSUS_DATA_ROOT}NCHSURCodes2013_urbanrural.csv")

# --- Pew Research ---

# Mobile phone fact sheet (ownership rates by demographic category):
pew_mobile = pd.read_csv(f"{PEW_RESEARCH_DATA_ROOT}MobilePhoneFactsheet2019.csv")

# --- FHA National Household Travel Survey ---

# Trip-level records:
nhts_trips = pd.read_csv(f"{NHTS_DATA_ROOT}trippub.csv")

# Household-level records:
nhts_households = pd.read_csv(f"{NHTS_DATA_ROOT}hhpub.csv")


In [None]:
# ACS household-income bracket columns, grouped into four bands from lowest to highest.
income_lowest_cols = ['Less than $10,000', '$10,000 to $14,999', '$15,000 to $24,999']
income_midlow_cols = ['$25,000 to $34,999', '$35,000 to $49,999']
income_midhigh_cols = ['$50,000 to $74,999']
income_highest_cols = ['$75,000 to $99,999', '$100,000 to $149,999', '$150,000 to $199,999', '$200,000 or more']

# Two-level split at $50,000: the two lower bands versus the two upper bands.
income_low_cols = [*income_lowest_cols, *income_midlow_cols]
income_high_cols = [*income_midhigh_cols, *income_highest_cols]


In [None]:
# Build one row per county by inner-joining every source table on the county `id`.
# Because the joins are inner joins, a county missing from any table is dropped.

# Start from the geography table:
data = census_geography[['AFFGEOID','NAME','ALAND']].rename(columns={
    'AFFGEOID' : 'id',
    'NAME' : 'county_name',
    'ALAND' : 'area_square_meters',
})

# Urban/rural codes, with columns renamed before joining:
urban_rural_codes = census_urban_rural.rename(columns={
    '2013 code' : 'urbanrural_code_2013', 'type_2013' : 'urbanrural_type_2013',
})[['id','urbanrural_code_2013','urbanrural_type_2013']]

# Tables are merged in this order, which also fixes the column order of `data`:
tables_to_join = [
    # Population:
    census_population[['id','Total Population']],
    # Employment (all columns):
    census_employment,
    # Urban/rural classification:
    urban_rural_codes,
    # Commuting flows:
    census_commuting[['id','State Name','County Name','number_work_in_county','number_work_out_of_county']],
    # Income (summary statistics plus every bracket column):
    census_income[[
        'id',
        'Median income (dollars)',
        'Mean income (dollars)',
    ]+income_high_cols+income_low_cols],
    # Phone ownership:
    census_internet[['id','smartphone_ownership']],
]
for table in tables_to_join:
    data = data.merge(table, on='id')

# Compute metrics:

# Share of households with income of $50,000 or more, as a fraction.
# This must be computed from the merged frame: assigning a Series taken from
# `census_income` would be aligned on row labels rather than on county id, and
# after the inner joins those labels no longer refer to the same counties.
data['pct_above_income_50000'] = data[income_high_cols].sum(axis=1)/100

# People per square kilometre (land area is stored in square metres):
data['pop_density'] = data['Total Population']/(data['area_square_meters']/1e6)

# Fraction of commuters working outside their county of residence.
# The ratio is already a fraction, so it is not divided by 100.
data['pct_work_out_of_county'] = data['number_work_out_of_county']/(
    data['number_work_in_county']+data['number_work_out_of_county']
)

# Labor force participation: 'N' marks an unavailable value; convert percent to fraction.
data['Labor Force Participation Rate'] = (
    data['Labor Force Participation Rate'].replace('N',np.nan).astype(float)/100
)

# NCHS codes 1-3 are treated as urban, every other code as rural:
data['urban_rural'] = np.where(data['urbanrural_code_2013'].isin([1,2,3]),'urban','rural')

# Rename columns:
data = data.rename(columns={'id':'county_id'})

data


### Build `location_attributes` table:


- For `population`, we used total population in age/gender breakdown (from ACS). 

- For `density`, we classified counties as `urban` if they had code 1, 2, or 3 in the NCHS data and `rural` if they had code 4, 5, or 6 (NCHS classification).

- For `wealth_rate`, we took the proportion of population in households with income of 50,000 or more (from ACS).

- For `employment_rate`, we used labor force participation rate (from ACS).


In [None]:
# Mapping from merged census column names to the `location_attributes` schema.
# The county label comes from 'County Name' (commuting table) rather than
# 'county_name' (geography table).
new_column_names = {
    'county_id' : 'location_id',
    'County Name' : 'location_county',
    'State Name' : 'location_state',
    'Geographic Area Name' : 'location_name',
    'Total Population' : 'population',
    'urban_rural' : 'density',
    'Labor Force Participation Rate' : 'employment_rate',
    'pct_above_income_50000' : 'wealth_rate',
}

# The output keeps exactly the renamed columns, in the order they are mapped above.
data_columns = list(new_column_names.values())

location_attributes_census = data.rename(columns=new_column_names)[data_columns]
location_attributes_census


### Build `location_profiles` table:

- We calculated population density by using the county boundary shapefiles and used it to calibrate parameters of work/social/grocery travel distances.


In [None]:
def estimate_smartphone_breakdown(row):
    """
    Estimate smartphone ownership rates for the low- and high-income groups of one location.

    The national ownership rate per income bracket (Pew Research, `pew_mobile`) is
    rescaled by a single factor so that the population-weighted average over this
    location's income bands equals the location's own overall ownership rate
    (`smartphone_ownership`, from ACS).

    Parameters
    ----------
    row : pandas.Series
        One row of the merged county table. Reads 'Total Population',
        'smartphone_ownership' and every ACS income-bracket column
        (bracket values are divided by 100, i.e. treated as percentages).

    Returns
    -------
    numpy.ndarray
        Two values: the estimated ownership rate for the two lower income bands
        combined, then for the two upper income bands combined.

    Raises
    ------
    AssertionError
        If the rescaled rates do not reproduce the location's overall rate
        to within 0.0001.
    """

    # National ownership rate per Pew income bracket, converted from "NN%" text to a fraction.
    # Pew brackets are paired by position with the four ACS bands below; the lowest
    # boundary differs between the two sources ($30,000 vs $25,000).
    pew_by_category = pew_mobile.set_index(['Category'])
    national_rates = np.array([
        float(pew_by_category.loc[bracket]['Smartphone'].replace('%','').strip())/100
        for bracket in ['Less than $30,000','$30,000-$49,999','$50,000-$74,999','$75,000+']
    ])

    # Number of people in each of the four income bands for this location:
    band_counts = np.array([
        row['Total Population']*row[band_cols].sum()/100
        for band_cols in [income_lowest_cols,income_midlow_cols,income_midhigh_cols,income_highest_cols]
    ])

    # Known overall ownership rate for this location:
    known_rate = row['smartphone_ownership']

    # Scale the national breakdown so its weighted average matches the known rate:
    national_average_here = (national_rates @ band_counts) / band_counts.sum()
    local_rates = national_rates * (known_rate / national_average_here)

    # Sanity check: the rescaled breakdown must reproduce the known overall rate.
    implied_rate = (local_rates @ band_counts) / band_counts.sum()
    assert np.abs(known_rate-implied_rate)<0.0001

    # Collapse four bands into two (first two = low income, last two = high income)
    # by summing people and owners before taking the ratio:
    owner_counts = local_rates * band_counts
    grouped_people = np.array([band_counts[:2].sum(),band_counts[2:].sum()])
    grouped_owners = np.array([owner_counts[:2].sum(),owner_counts[2:].sum()])

    return grouped_owners/grouped_people


In [None]:
# How to Calculate NHTS Person-Miles data (https://nhts.ornl.gov/person-miles): 
#   Sum TRPMILES, weighted by the trip weight (WTTRDFIN),
#   for records in the Trip file where TRPMILES >= 0
#
# Household income codes (https://nhts.ornl.gov/2009/pub/Codebook.pdf):
# HHFAMINC:
#   -9 = Not ascertained
#   -8 = Don't know
#   -7 = Refused
#   01 = < $5,000
#   02 = $5,000 - $9,999
#   03 = $10,000 - $14,999
#   04 = $15,000 - $19,999
#   05 = $20,000 - $24,999
#   06 = $25,000 - $29,999
#   07 = $30,000 - $34,999
#   08 = $35,000 - $39,999
#   09 = $40,000 - $44,999
#   10 = $45,000 - $49,999
#   11 = $50,000 - $54,999
#   12 = $55,000 - $59,999
#   13 = $60,000 - $64,999
#   14 = $65,000 - $69,999
#   15 = $70,000 - $74,999
#   16 = $75,000 - $79,999
#   17 = $80,000 - $99,999
#   18 = > = $100,000
#
# Urban/rural codes (https://nhts.ornl.gov/2009/pub/Codebook.pdf):
# URBRUR
#   -9 = Not ascertained
#   01 = Urban
#   02 = Rural
#
# Travel reason codes (https://nhts.ornl.gov/2009/pub/Codebook.pdf):
# TRIPPURP:
#   -9 = Not Ascertained
#   HBO = Other home-based
#   HBSHOP = Home-based shopping
#   HBSOCREC = Home-based social/recreational
#   HBW = Home-based work
#   NHB = Not home-based
# WHYFROM:
#   -7 = Refused
#   -9 = Not ascertained
#   01 = Home
#   10 = Work
#   11 = Go to work
#   12 = Return to work
#   13 = Attend business meeting/trip
#   14 = Other work related
#   20 = School/religious activity
#   21 = Go to school as student
#   22 = Go to religious activity
#   23 = Go to library: school related
#   24 = OS - Day care
#   30 = Medical/dental services
#   40 = Shopping/errands
#   41 = Buy goods: groceries/clothing/hardware store
#   42 = Buy services: video rentals/dry cleaner/post office/carservice/bank
#   43 = Buy gas
#   50 = Social/recreational
#   51 = Go to gym/exercise/play sports
#   52 = Rest or relaxation/vacation
#   53 = Visit friends/relatives
#   54 = Go out/hang out: entertainment/theater/sports event/go to bar
#   55 = Visit public place: historical site/museum/park/library
#   60 = Family personal business/obligations
#   61 = Use professional services: attorney/accountant
#   62 = Attend funeral/wedding
#   63 = Use personal services: grooming/haircut/nails
#   64 = Pet care: walk the dog/vet visits
#   65 = Attend meeting: PTA/home owners association/local government
#   70 = Transport someone
#   71 = Pick up someone
#   72 = Take and wait
#   80 = Meals
#   81 = Social event
#   82 = Get/eat meal
#   83 = Coffee/ice cream/snacks
#   97 = Other reason
# WHYTO:
#   -1 = Appropriate skip
#   -7 = Refused
#   -8 = Don't know
#   -9 = Not ascertained
#   01 = Home
#   10 = Work
#   11 = Go to work
#   12 = Return to work
#   13 = Attend business meeting/trip
#   14 = Other work related
#   20 = School/religious activity
#   21 = Go to school as student
#   22 = Go to religious activity
#   23 = Go to library: school related
#   24 = OS - Day care
#   30 = Medical/dental services
#   40 = Shopping/errands
#   41 = Buy goods: groceries/clothing/hardware store
#   42 = Buy services: video rentals/dry cleaner/post office/carservice/bank
#   43 = Buy gas
#   50 = Social/recreational
#   51 = Go to gym/exercise/play sports
#   52 = Rest or relaxation/vacation
#   53 = Visit friends/relatives
#   54 = Go out/hang out: entertainment/theater/sports event/go to bar
#   55 = Visit public place: historical site/museum/park/library
#   60 = Family personal business/obligations
#   61 = Use professional services: attorney/accountant
#   62 = Attend funeral/wedding
#   63 = Use personal services: grooming/haircut/nails
#   64 = Pet care: walk the dog/vet visits
#   65 = Attend meeting: PTA/home owners association/local government
#   70 = Transport someone
#   71 = Pick up someone
#   72 = Take and wait
#   73 = Drop someone off
#   80 = Meals
#   81 = Social event
#   82 = Get/eat meal
#   83 = Coffee/ice cream/snacks
#   97 = Other reason
#
    
def label_nhts_urbrur(code):
    """Translate an NHTS URBRUR code into 'urban' (1), 'rural' (2) or '[missing]' (anything else)."""
    for known_code, label in ((1, "urban"), (2, "rural")):
        if code==known_code:
            return label
    return "[missing]"
    
def label_nhts_income(code):
    """
    Collapse an NHTS HHFAMINC household-income code into a two-level label.

    Returns '[missing]' for codes of 0 or below (the negative codes are the
    not-ascertained / don't-know / refused markers), 'under $50,000' for codes
    1 through 10, and 'over $50,000' for anything larger.

    NOTE(review): the cut-off at code 10 follows the 18-level coding listed in the
    comment block above (2009 codebook). If the loaded trip file uses a different
    HHFAMINC coding (e.g. a later survey year), this threshold needs to change - confirm.
    """
    if code<=0:
        return "[missing]"
    return "under $50,000" if code<=10 else "over $50,000"
    
def label_nhts_reason(code):
    """
    Collapse an NHTS WHYFROM/WHYTO trip-reason code into a coarse travel category.

    Returns one of:
      'work'    - work (10-19) and school/religious (20-29) codes
      'grocery' - medical (30-39) and shopping/errands (40-49) codes
      'social'  - social/recreational (50-59), family/personal business (60-69)
                  and meals (80-89) codes
      '[other]' - everything else: negative (missing) codes, home (0-9),
                  transporting someone (70-79) and 90 or above

    Social/recreational codes (50-59) are included in 'social'; previously they
    fell through to '[other]'.

    NOTE(review): the ranges follow the code listing in the comment block above;
    confirm the loaded trip file uses that same coding.
    """
    if (code>=10) and (code<30):
        return "work"
    if (code>=30) and (code<50):
        return "grocery"
    if (code>=50) and (code<70):
        return "social"
    if (code>=80) and (code<90):
        return "social"
    return "[other]"
    
def label_nhts_purpose(code):
    """
    Translate an NHTS TRIPPURP value into a travel category.

    Returns 'grocery' for 'HBSHOP' (home-based shopping), 'social' for 'HBSOCREC'
    (home-based social/recreational), 'work' for 'HBW' (home-based work),
    '[missing]' for the not-ascertained marker -9, and '[other]' for everything
    else (including 'HBO' and 'NHB').

    The missing marker is recognised both as the number -9 and as the text '-9':
    TRIPPURP mixes text codes with -9, so a value read from CSV can arrive as a
    string, which never compares equal to the integer. Surrounding whitespace on
    text codes is ignored.
    """
    purpose_labels = {
        'HBSHOP' : "grocery",
        'HBSOCREC' : "social",
        'HBW' : "work",
    }
    if isinstance(code, str):
        text_code = code.strip()
        if text_code=='-9':
            return "[missing]"
        return purpose_labels.get(text_code, "[other]")
    return "[missing]" if code==-9 else "[other]"

# Keep only trips with a valid (non-negative) distance. Copy after filtering so the
# columns added below go into an independent frame, not a slice of `nhts_trips`.
person_miles = nhts_trips[nhts_trips['TRPMILES']>=0].copy()

# A person is identified by household id plus person id within the household:
person_miles['person_id'] = person_miles['HOUSEID'].astype(str)+"_"+person_miles['PERSONID'].astype(str)

# Human-readable labels for the coded columns:
person_miles['nhts_household_income'] = person_miles['HHFAMINC'].apply(label_nhts_income)
person_miles['nhts_household_type'] = person_miles['URBRUR'].apply(label_nhts_urbrur)
person_miles['nhts_trip_reason'] = person_miles['WHYFROM'].apply(label_nhts_reason)
person_miles['nhts_trip_purpose'] = person_miles['TRIPPURP'].apply(label_nhts_purpose)
person_miles['nhts_trip_miles'] = person_miles['TRPMILES']

# Note: the survey trip weights (WTTRDFIN) are not applied; all statistics below are unweighted.

groupcols2 = ['nhts_household_type', 'nhts_household_income', 'nhts_trip_purpose']
groupcols1 = groupcols2 + ['person_id']

# Stage 1: total miles and number of trips per person within each type/income/purpose cell.
# String aggregator names are used instead of numpy callables, which pandas has
# deprecated inside groupby aggregation.
miles_per_person = (
    person_miles
    .assign(trip_count=1)  # One per row, so the sum counts trips.
    .groupby(groupcols1)
    .agg({'trip_count' : 'sum', 'nhts_trip_miles' : 'sum'})
    .reset_index()
)

# Stage 2: summarise the per-person totals within each type/income/purpose cell.
# 'std' is the sample standard deviation (pandas default, ddof=1).
travel_profiles = miles_per_person.groupby(groupcols2).agg(
    trip_count=('trip_count', 'sum'),
    people_count=('person_id', 'nunique'),
    baseline_total=('nhts_trip_miles', 'sum'),
    baseline_mean=('nhts_trip_miles', 'mean'),
    baseline_std=('nhts_trip_miles', 'std'),
)
travel_profiles


In [None]:
# Lookup of NHTS travel baselines, keyed by (density, wealth_status, travel type, statistic).
# wealth_status 0/1 corresponds to the NHTS income labels below; values are rounded to one decimal.
income_label_by_wealth = [(0,'under $50,000'),(1,'over $50,000')]
tavel_baselines = {}
for density in ['rural','urban']:
    for wealth_status,income_label in income_label_by_wealth:
        for tavel_type in ['work','social','grocery']:
            row = travel_profiles.loc[(density,income_label,tavel_type)]
            for statistic in ['mean','std']:
                baseline_key = (density,wealth_status,tavel_type,statistic)
                tavel_baselines[baseline_key] = np.round(row['baseline_'+statistic],1)


In [None]:
# Build four profile rows per county: wealth_status (0/1) x employment_status (0/1).
wealth_status_values = [0,1]
employment_status_values = [0,1]
identity_fields = ['location_id','location_county','location_state','location_name']

profile_records = []
for i,row in data.iterrows():
    new_row = row.rename(new_column_names)
    density = new_row['density']

    # Estimated smartphone ownership, indexed by wealth_status:
    smartphone_ownership = dict(zip([0,1], estimate_smartphone_breakdown(row)))

    for wealth_status in wealth_status_values:
        for employment_status in employment_status_values:
            profile = { field : new_row[field] for field in identity_fields }
            profile['wealth_status'] = wealth_status
            profile['employment_status'] = employment_status
            profile['phoneownership_rate'] = smartphone_ownership[wealth_status]
            for travel_kind in ['work','social','grocery']:
                for statistic in ['mean','std']:
                    baseline = tavel_baselines[(density,wealth_status,travel_kind,statistic)]
                    if travel_kind=='work':
                        # Work travel is zeroed out for the non-employed (employment_status 0).
                        baseline = baseline * employment_status
                    profile[travel_kind+'travel_'+statistic] = baseline
            profile_records.append(profile)

location_profiles_census = pd.DataFrame(profile_records)
location_profiles_census


### Save results:


In [None]:
# Write both census-derived tables to the synthetic-data folder (without the index column)
# and report where each one went.
for census_table, census_filename in [
    (location_attributes_census, "location_attributes_census.csv"),
    (location_profiles_census, "location_profiles_census.csv"),
]:
    census_path = SYNTHETIC_DATA+census_filename
    census_table.to_csv(census_path,index=False)
    print("Saved : {} .".format(census_path))
