In [3]:
import pandas as pd
import os

In [4]:
# Crosswalk for CBSA and ZCTA: remove the `tabblk2020` column, then collapse
# the rows that are identical once that column is gone.
walk = (
    pd.read_csv('./Usage/us_xwalk.csv.gz')
    .drop(columns=['tabblk2020'])
    .drop_duplicates()
)

# `teleworkable_emp` column of the NAICS work-from-home table.
occ_score = pd.read_csv('./Usage/NAICS_workfromhome.csv')['teleworkable_emp']

# S2404 estimate column names, built from their three-digit line codes.
# The order is significant: it is used positionally against `occ_score`.
columns = [
    f'S2404_C01_{code}E'
    for code in (
        '003', '004', '011', '005', '006', '007', '008', '010', '012', '014',
        '015', '017', '018', '019', '021', '022', '024', '025', '026', '027',
    )
]

In [5]:
# Folder holding the raw gzip-compressed occupation CSV files.
# The trailing slash is required: file names are appended by plain string
# concatenation, not os.path.join.
input_path =  './Data/Occupation/'
# Folder receiving one `<name>-ZCTA.csv.gz` file per processed input file
# (same trailing-slash requirement as above).
output_path = './Output/Occupation/'

def transform_occupation_data(file):
    '''
    Turn one raw occupation file into ZCTA-level remote-work measures.

    input:
        str file: name of a gzip-compressed CSV inside `input_path`; the
            trailing '.csv.gz' is stripped to build the output name
    output:
        None. Writes `<output_path><name>-ZCTA.csv.gz` with the columns
            GEO_ID     : last five characters of the raw GEO_ID, as an integer
            REMOTE     : sum(count_i * occ_score_i) / S2404_C01_001E,
                         0 when the total is not positive
            REMOTE_PPL : S2404_C01_012E + S2404_C01_014E + S2404_C01_015E
        Only rows whose GEO_ID appears in the crosswalk `zcta` column are
        kept, one output row per matching input row.
    '''
    total_col = 'S2404_C01_001E'
    remote_ppl_cols = ['S2404_C01_012E', 'S2404_C01_014E', 'S2404_C01_015E']
    gz_suffix = '.csv.gz'

    # read raw data and pre-process; the first data row is discarded
    # (presumably the descriptive label row of the export - TODO confirm).
    # A fresh frame is built below instead of assigning into this slice, which
    # avoids pandas' chained-assignment warning.
    raw = pd.read_csv(input_path + file, dtype='O', compression='gzip').iloc[1:]

    # Coerce every estimate to an integer count. Any non-numeric annotation
    # ('-', '**', but also others such as 'N' or '(X)') becomes 0 instead of
    # making astype(int) raise.
    counts = (
        raw[[total_col] + columns]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
        .astype(int)
    )
    # NOTE(review): casting to int drops leading zeros of the ZCTA code; kept
    # because the merge key and the existing output format rely on it.
    counts.insert(0, 'GEO_ID', raw['GEO_ID'].str.slice(start=-5).fillna('0').astype(int))

    # Keep only ZCTAs present in the crosswalk. The crosswalk is reduced to
    # its distinct `zcta` values first: merging against the full table would
    # emit the same ZCTA once per crosswalk row, duplicating output rows.
    zcta_keys = walk[['zcta']].drop_duplicates()
    all_needed_data = counts.merge(zcta_keys, how='inner', left_on='GEO_ID', right_on='zcta')

    # calculate remote score: positional dot product of the count columns
    # with `occ_score` (shapes must agree, otherwise numpy raises ValueError)
    weighted_jobs = pd.Series(
        all_needed_data[columns].to_numpy(dtype=float) @ occ_score.to_numpy(dtype=float),
        index=all_needed_data.index,
    )
    total_jobs = all_needed_data[total_col].astype(float)
    # A non-positive total would yield NaN/inf; report 0 for those rows.
    all_needed_data['REMOTE'] = (weighted_jobs / total_jobs).where(total_jobs > 0, 0.0)

    # remote population: plain sum of the three selected count columns
    all_needed_data['REMOTE_PPL'] = all_needed_data[remote_ppl_cols].sum(axis=1)

    # save occupation data at ZCTA, creating the output folder on first use
    out_file = f'{output_path}{file[:-len(gz_suffix)]}-ZCTA.csv.gz'
    os.makedirs(os.path.dirname(out_file) or '.', exist_ok=True)
    all_needed_data[['GEO_ID', 'REMOTE', 'REMOTE_PPL']].fillna(0).to_csv(
        out_file,
        compression='gzip',
        sep=",",
        header=True,
        encoding='utf-8-sig',
        index=False,
    )

In [6]:
# Process every raw file in a stable (sorted) order. Entries that are not
# gzip-compressed CSVs (hidden files, checkpoint folders, ...) are skipped,
# because transform_occupation_data reads with compression='gzip' and strips
# a '.csv.gz' suffix from the name.
for file in sorted(os.listdir(input_path)):
    if not file.endswith('.csv.gz'):
        continue
    transform_occupation_data(file)

In [7]:
# Combine all yearly occupation files into one file.
# Only the `*-ZCTA.csv.gz` files produced by transform_occupation_data are
# read, in sorted order so the combined file is reproducible between runs.
all_file = sorted(
    name for name in os.listdir(output_path) if name.endswith('-ZCTA.csv.gz')
)

occ_frames = []
for file in all_file:
    # assumes the four-digit year occupies characters 7-10 of the file name
    # (e.g. after a seven-character prefix) - TODO confirm against the inputs
    year = int(file[7:11])
    occ_by_year = pd.read_csv(output_path + file, compression='gzip')
    occ_by_year['year'] = year
    occ_frames.append(occ_by_year)

# pd.concat raises ValueError when no yearly file was found, which surfaces a
# missing or empty output folder instead of writing an empty result.
occ_historical = pd.concat(occ_frames, ignore_index=True)

final_file = './Output/final/OCCUPATION_ZCTA.csv.gz'
os.makedirs(os.path.dirname(final_file), exist_ok=True)
occ_historical.to_csv(final_file, compression='gzip', sep=",", header=True, encoding='utf-8-sig', index=False)