In [1]:
import pandas as pd
import numpy as np
import os

In [12]:
# ---------------------------------------------------------------------------
# Notebook configuration: column vocabularies, city list and folder locations
# ---------------------------------------------------------------------------

# Row codes that receive no readable name in rename_col below.
columns_drop = [
    '002', '003', '011', '014', '022', '023', '039',
    '048', '049', '050', '051', '052', '053',
]

# Three-digit row code -> readable attribute name.
rename_col = {
    # people counted
    '001': 'PPL1Y',
    # age bands
    '004': 'Age_18T24',
    '005': 'Age_25T34',
    '006': 'Age_35T44',
    '007': 'Age_45T54',
    '008': 'Age_55T64',
    '009': 'Age_65T74',
    '010': 'Age_75T',
    # gender
    '012': 'Gender_Male',
    '013': 'Gender_Female',
    # race
    '015': 'Race_White',
    '016': 'Race_Black',
    '017': 'Race_AI_AN',
    '018': 'Race_Asian',
    '019': 'Race_NH_OPI',
    '020': 'Race_Other',
    '021': 'Race_Mixed',
    # citizenship
    '024': 'Citizen_Native',
    '025': 'Citizen_Foreign_Born',
    '026': 'Citizen_Naturalized',
    '027': 'Citizen_NotUS',
    # education
    '034': 'Edu_Less_High',
    '035': 'Edu_High_Grad',
    '036': 'Edu_Some_College',
    '037': 'Edu_Bachelor',
    '038': 'Edu_Grad_Prof',
    # income
    '040': 'Income_1k',
    '041': 'Income_15k',
    '042': 'Income_25k',
    '043': 'Income_35k',
    '044': 'Income_50k',
    '045': 'Income_65k',
    '046': 'Income_75k',
    '047': 'Income_75k+',
    # housing tenure
    '054': 'Housing_Owner',
    '055': 'Housing_Rental',
}

# Coarse group -> the readable attributes that are summed to form it.
# NOTE(review): 'Age_75T' is named in rename_col but belongs to no group here,
# so it never contributes to a grouped column -- confirm this is intended.
# NOTE(review): if 'Citizen_Foreign_Born' is itself the total of
# 'Citizen_Naturalized' and 'Citizen_NotUS' in the source table, then
# CITIZENSHIP_0 double-counts -- verify against the table layout.
transform_col = {
    'PPL1Y': ['PPL1Y'],
    'AGE_LOW': ['Age_18T24', 'Age_25T34'],
    'AGE_MID': ['Age_35T44', 'Age_45T54', 'Age_55T64'],
    'AGE_HIGH': ['Age_65T74'],
    'GENDER_0': ['Gender_Male'],
    'GENDER_1': ['Gender_Female'],
    'RACE_0': ['Race_White'],
    'RACE_1': [
        'Race_Black', 'Race_AI_AN', 'Race_Asian',
        'Race_NH_OPI', 'Race_Other', 'Race_Mixed',
    ],
    'CITIZENSHIP_0': ['Citizen_Native', 'Citizen_Foreign_Born', 'Citizen_Naturalized'],
    'CITIZENSHIP_1': ['Citizen_NotUS'],
    'EDU_LOW': ['Edu_Less_High', 'Edu_High_Grad'],
    'EDU_MID': ['Edu_Some_College', 'Edu_Bachelor'],
    'EDU_HIGH': ['Edu_Grad_Prof'],
    'INCOME_LOW': ['Income_1k', 'Income_15k', 'Income_25k', 'Income_35k'],
    'INCOME_MID': ['Income_50k', 'Income_65k', 'Income_75k'],
    'INCOME_HIGH': ['Income_75k+'],
    'HOUSE_0': ['Housing_Owner'],
    'HOUSE_1': ['Housing_Rental'],
}

# Metro area name -> numeric code matched against the crosswalk's 'cbsa' column.
top15_cities = {
    'New York-Newark-Jersey City, NY-NJ-PA': 35620,
    'Los Angeles-Long Beach-Anaheim, CA': 31080,
    'Chicago-Naperville-Elgin, IL-IN-WI': 16980,
    'Dallas-Fort Worth-Arlington, TX': 19100,
    'Houston-The Woodlands-Sugar Land, TX': 26420,
    'Washington-Arlington-Alexandria, DC-VA-MD-WV': 47900,
    'Miami-Fort Lauderdale-Pompano Beach, FL': 33100,
    'Philadelphia-Camden-Wilmington, PA-NJ-DE-MD': 37980,
    'Atlanta-Sandy Springs-Alpharetta, GA': 12060,
    'Phoenix-Mesa-Chandler, AZ': 38060,
    'Boston-Cambridge-Newton, MA-NH': 14460,
    'San Francisco-Oakland-Berkeley, CA': 41860,
    'Riverside-San Bernardino-Ontario, CA': 40140,
    'Detroit-Warren-Dearborn, MI': 19820,
    'Seattle-Tacoma-Bellevue, WA': 42660,
}

# Raw column id -> prefixed readable name, one mapping per table column set
# (C01 -> TT_, C02 -> SC_, C03 -> SS_, C04 -> DS_).
total_ppl = {f'S0701_C01_{code}E': 'TT_' + name for code, name in rename_col.items()}
within_county = {f'S0701_C02_{code}E': 'SC_' + name for code, name in rename_col.items()}
within_state = {f'S0701_C03_{code}E': 'SS_' + name for code, name in rename_col.items()}
diff_state = {f'S0701_C04_{code}E': 'DS_' + name for code, name in rename_col.items()}

all_column = {**total_ppl, **within_county, **within_state, **diff_state}

# The four names are now rebound to: prefixed group name -> prefixed member columns.
total_ppl = {
    'TT_' + group: ['TT_' + member for member in members]
    for group, members in transform_col.items()
}
within_county = {
    'SC_' + group: ['SC_' + member for member in members]
    for group, members in transform_col.items()
}
within_state = {
    'SS_' + group: ['SS_' + member for member in members]
    for group, members in transform_col.items()
}
diff_state = {
    'DS_' + group: ['DS_' + member for member in members]
    for group, members in transform_col.items()
}

all_transform_column = {**total_ppl, **within_county, **within_state, **diff_state}

# Folder holding the raw extracts, and folder receiving the per-file ZCTA outputs.
input_path = './Data/Migration/'
output_path = './Output/Migration/'

In [3]:
# Crosswalk table used to attach a cbsa to each zcta.
# The 'tabblk2020' column is removed first so that rows which differ only in
# that column collapse into a single row when duplicates are dropped.
walk = (
    pd.read_csv('./Usage/us_xwalk.csv.gz')
    .drop(columns=['tabblk2020'])
    .drop_duplicates()
)

In [23]:
def transform_migration_data(file):
    '''
    Turn one raw migration extract into a ZCTA-level file of grouped totals.

    input:
        str file: name of a gzip-compressed csv located in input_path
    output:
        None. Writes '<file without its last 7 characters>-ZCTA.csv.gz'
        into output_path (the 7 characters are expected to be '.csv.gz').
    '''

    # read raw data and pre-process: everything is read as strings and the
    # first data row is skipped (presumably a label row -- TODO confirm).
    # .copy() gives an independent frame so the GEO_ID assignment below does
    # not write into a slice of the freshly read table.
    df = pd.read_csv(input_path + file, dtype='O', compression='gzip').iloc[1:, :].copy()
    # keep only the trailing five characters of the geography identifier
    df['GEO_ID'] = df['GEO_ID'].str.slice(start=-5)

    # keep GEO_ID plus every known raw column, rename to readable names, turn
    # the '-' and '**' placeholders into missing values, then into 0.
    # Note that astype(float) also converts GEO_ID, so it is numeric from here on.
    wanted = [col for col in df.columns if col in all_column or col == 'GEO_ID']
    all_needed_data = (
        df[wanted]
        .rename(columns=all_column)
        .replace('-', pd.NA)
        .replace('**', pd.NA)
        .fillna('0')
        .astype(float)
    )

    # merge cross-walk and raw data; only ZCTAs present in the crosswalk survive
    all_needed_data = pd.merge(
        all_needed_data, walk, how='inner', left_on='GEO_ID', right_on='zcta'
    )[['GEO_ID', 'cbsa'] + list(all_column.values())]

    # create transform dataset: each grouped column is the row-wise sum of its members
    grouped = {
        key: all_needed_data[val].sum(axis=1)
        for key, val in all_transform_column.items()
    }

    for key in transform_col.keys():
        # TM_ = SC_ + SS_ + DS_ ; SCP_ = SC_ / TM_ (NaN when TM_ is 0)
        grouped['TM_' + key] = grouped['SC_' + key] + grouped['SS_' + key] + grouped['DS_' + key]
        grouped['SCP_' + key] = grouped['SC_' + key] / grouped['TM_' + key].values

    grouped['cbsa'] = all_needed_data['cbsa']
    grouped['GEO_ID'] = all_needed_data['GEO_ID']

    # build the frame once and write it out
    cities_migration = pd.DataFrame.from_dict(grouped)
    cities_migration.to_csv(
        f'{output_path}{file[:-7]}-ZCTA.csv.gz',
        compression='gzip', sep=",", header=True, encoding='utf-8-sig', index=False,
    )

In [24]:
# transform all migration data
# Only '.csv.gz' entries are processed: transform_migration_data reads its input
# with gzip compression and strips a 7-character suffix to build the output name,
# so any other directory entry (hidden files, sub-folders) would fail or be misnamed.
# Sorting makes the processing order deterministic.
for file in sorted(os.listdir(input_path)):
    if not file.endswith('.csv.gz'):
        continue
    transform_migration_data(file)

In [25]:
# merge all output file to single historical data file
migration_historical = []
all_zipcode = set(range(0, 100000))
start = 10000
end = 0

# merge all ZCTA that has data available from 2015 to 2022
# os.listdir returns entries in arbitrary order, while the differencing and lag
# steps below pair each frame with its neighbours in the list. The files are
# therefore ordered by the year embedded at characters 7..10 of the file name.
for file in sorted(os.listdir(output_path), key=lambda name: int(name[7:11])):
    year = int(file[7:11])
    start = min(year, start)
    end = max(year, end)
    migration_by_year = pd.read_csv(output_path + file, compression='gzip')
    migration_by_year['year'] = year
    # keep only the identifiers that appear in every year
    all_zipcode = all_zipcode.intersection(migration_by_year.GEO_ID)
    migration_historical.append(migration_by_year)

all_zipcode = sorted(all_zipcode)

# restrict every year to the common identifiers, in the same row order,
# so the positional subtraction below lines rows up across years
migration_historical = [
    yearly.set_index('GEO_ID').loc[all_zipcode].reset_index().sort_values('GEO_ID')
    for yearly in migration_historical
]

# calculate the annual growth for total population (tt) and total migration rate (tm)
# column lists come from the first frame instead of a variable left over from the loop above
tt_cols = [col for col in migration_historical[0].columns if 'TT_' in col]
tm_cols = [col for col in migration_historical[0].columns if 'TM_' in col]

# tt / tm hold copies of the original levels, so overwriting the frames
# below does not disturb later differences
tt = [yearly[tt_cols] for yearly in migration_historical]
tm = [yearly[tm_cols] for yearly in migration_historical]

population_change = []  # not used by the visible cells; kept so the name stays defined

for idx in range(1, len(migration_historical)):
    # replace levels with the change relative to the previous year (row-by-row, positional)
    migration_historical[idx][tt_cols] = tt[idx] - tt[idx - 1].values
    migration_historical[idx][tm_cols] = tm[idx] - tm[idx - 1].values

# the first year has no predecessor, hence no change values
migration_historical.pop(0)

all_tt_col = [col for col in migration_historical[0].columns if col not in ['year', 'cbsa', 'GEO_ID']]

migration_historical_with_lag = []

# NOTE(review): only lags 1 and 2 are attached, yet the loop starts at index 4 --
# confirm that skipping the earlier years is intended.
for idx in range(4, len(migration_historical)):
    migration_current = migration_historical[idx]
    for lag in range(1, 3):
        lagging = (
            migration_historical[idx - lag]
            .rename(columns={col: f'{col}_LAG_{lag}' for col in all_tt_col})
            .drop(columns=['year'])
        )
        # joins on the columns both frames still share (GEO_ID and cbsa)
        migration_current = pd.merge(migration_current, lagging)
    migration_historical_with_lag.append(migration_current)

migration_historical_with_lag = pd.concat(migration_historical_with_lag)


In [26]:
# collect data for cities in top 15
# (same mapping as in the configuration cell; repeated so this cell can be run on its own)
top15_cities = {
    'New York-Newark-Jersey City, NY-NJ-PA': 35620,
    'Los Angeles-Long Beach-Anaheim, CA': 31080,
    'Chicago-Naperville-Elgin, IL-IN-WI': 16980,
    'Dallas-Fort Worth-Arlington, TX': 19100,
    'Houston-The Woodlands-Sugar Land, TX': 26420,
    'Washington-Arlington-Alexandria, DC-VA-MD-WV': 47900,
    'Miami-Fort Lauderdale-Pompano Beach, FL': 33100,
    'Philadelphia-Camden-Wilmington, PA-NJ-DE-MD': 37980,
    'Atlanta-Sandy Springs-Alpharetta, GA': 12060,
    'Phoenix-Mesa-Chandler, AZ': 38060,
    'Boston-Cambridge-Newton, MA-NH': 14460,
    'San Francisco-Oakland-Berkeley, CA': 41860,
    'Riverside-San Bernardino-Ontario, CA': 40140,
    'Detroit-Warren-Dearborn, MI': 19820,
    'Seattle-Tacoma-Bellevue, WA': 42660
    }

# keep the rows of the listed metro areas and only the TT / TM columns (lags included)
change_cols = [col for col in migration_historical_with_lag.columns if 'TT' in col or 'TM' in col]
migration_historical_top15 = (
    migration_historical_with_lag
    .set_index('cbsa')
    .loc[list(top15_cities.values())]
    .reset_index()[['year', 'cbsa', 'GEO_ID'] + change_cols]
    .copy()
)

# np.nan rather than the np.NaN alias, which no longer exists in NumPy 2.x
migration_historical_top15 = (
    migration_historical_top15[migration_historical_top15['year'] > 2015]
    .set_index('GEO_ID')
    .replace(np.inf, np.nan)
)

# fill gaps with the mean of the same year and metro area.
# The grouping keys themselves are skipped: the previous test
# "col != 'year' or col != 'cbsa'" was true for every column.
for col in migration_historical_top15.columns:
    if col not in ('year', 'cbsa'):
        group_mean = migration_historical_top15.groupby(['year', 'cbsa'])[col].transform('mean')
        migration_historical_top15[col] = migration_historical_top15[col].fillna(group_mean)

migration_historical_top15.to_csv('./Output/final/MIGRATION_CHANGE_TOP15.csv.gz', compression='gzip', sep = ",", header=True, encoding='utf-8-sig')

In [27]:
# ZIP codes whose 'dist_to_cbd' is below 7000 (unit not stated in this notebook -- TODO confirm)
cbd_info = pd.read_csv('./Usage/zori_panel_zips.csv')[['zip', 'dist_to_cbd']]
cbd_close = cbd_info.loc[cbd_info['dist_to_cbd'] < 7000]['zip']

# rows for those ZIP codes, keeping year, cbsa and every SCP column (lags included)
scp_cols = [col for col in migration_historical_with_lag.columns if 'SCP' in col]
migration_historical_cbd = migration_historical_with_lag.set_index('GEO_ID').loc[cbd_close][['year', 'cbsa'] + scp_cols]

# fill gaps with the mean of the same year and metro area.
# The grouping keys themselves are skipped: the previous test
# "col != 'year' or col != 'cbsa'" was true for every column.
for col in migration_historical_cbd.columns:
    if col not in ('year', 'cbsa'):
        group_mean = migration_historical_cbd.groupby(['year', 'cbsa'])[col].transform('mean')
        migration_historical_cbd[col] = migration_historical_cbd[col].fillna(group_mean)

migration_historical_cbd.to_csv('./Output/final/MIGRATION_CBD.csv.gz', compression='gzip', sep = ",", header=True, encoding='utf-8-sig')

In [20]:
# Levels (not year-over-year changes) for the top-15 metro areas, with two lags.
# NOTE: this cell rebinds migration_historical, migration_historical_with_lag and
# migration_historical_top15, which other cells also define.
migration_historical = []
all_zipcode = set(range(0, 100000))
start = 10000
end = 0

# os.listdir returns entries in arbitrary order, while the lag step below pairs
# each frame with the ones before it in the list. The files are therefore
# ordered by the year embedded at characters 7..10 of the file name.
for file in sorted(os.listdir(output_path), key=lambda name: int(name[7:11])):
    year = int(file[7:11])
    start = min(year, start)
    end = max(year, end)
    migration_by_year = pd.read_csv(output_path + file, compression='gzip')
    migration_by_year['year'] = year
    # keep only the identifiers that appear in every year
    all_zipcode = all_zipcode.intersection(migration_by_year.GEO_ID)
    migration_historical.append(migration_by_year)

all_zipcode = sorted(all_zipcode)

# restrict every year to the common identifiers, then to the listed metro areas
migration_historical = [
    yearly.set_index('GEO_ID').loc[all_zipcode].reset_index().sort_values('GEO_ID')
    for yearly in migration_historical
]
migration_historical = [
    yearly.set_index('cbsa').loc[list(top15_cities.values())].reset_index()
    for yearly in migration_historical
]

all_ppl_col = [col for col in migration_historical[0].columns if col not in ['year', 'cbsa', 'GEO_ID']]

migration_historical_with_lag = []

# NOTE(review): only lags 1 and 2 are attached, yet the loop starts at index 4 --
# confirm that skipping the earlier years is intended.
for idx in range(4, len(migration_historical)):
    migration_current = migration_historical[idx]
    for lag in range(1, 3):
        lagging = (
            migration_historical[idx - lag]
            .rename(columns={col: f'{col}_LAG_{lag}' for col in all_ppl_col})
            .drop(columns=['year'])
        )
        # joins on the columns both frames still share (GEO_ID and cbsa)
        migration_current = pd.merge(migration_current, lagging)
    migration_historical_with_lag.append(migration_current)

migration_historical_top15 = pd.concat(migration_historical_with_lag)
migration_historical_top15.to_csv('./Output/final/MIGRATION_TOTAL_TOP15.csv.gz', compression='gzip', sep = ",", header=True, encoding='utf-8-sig', index = False)