In [1]:
import zipfile
import geopandas as gpd
import numpy as np
import pandas as pd

In [2]:
# Unpack the county archive into a folder named after it; the shapefile is read
# from that folder in the next step.
with zipfile.ZipFile(file="tl_2024_us_county.zip", mode="r") as zip_ref:
    zip_ref.extractall(path="tl_2024_us_county")

In [3]:
# Load the extracted county shapefile, then write the same layer back out as GeoJSON.
county_shapefile = "tl_2024_us_county/tl_2024_us_county.shp"
counties = gpd.read_file(county_shapefile)
counties.to_file("counties.geojson", driver="GeoJSON")

In [4]:
# Load the flow table; later cells read its dms_orig, dms_dest, sctg2 and
# tons_2022 columns.
cmd_df = pd.read_csv(filepath_or_buffer="Indiana_Rail_FAF.csv")

In [5]:
# Aggregate 2022 tonnage by zone and commodity, once for flows originating in
# Indiana and once for flows with a destination in Indiana.
# 2022 is used because the 2023 estimates are not yet finalized.

# Inclusive range of dms zone codes that is kept; the notebook treats this
# range as the Indiana zones.
IN_DMS_ZONE_MIN = 180
IN_DMS_ZONE_MAX = 189


def _indiana_tons_by_zone(flows, zone_col):
    """Sum ``tons_2022`` per (zone, ``sctg2``) and keep the Indiana zone range.

    Parameters
    ----------
    flows : pandas.DataFrame
        Flow records containing ``zone_col``, ``sctg2`` and ``tons_2022``.
    zone_col : str
        Column holding the zone to aggregate on (``'dms_orig'`` or
        ``'dms_dest'``); it is renamed to ``dms_loc`` in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``dms_loc``, ``sctg2`` and ``tons_2022``, restricted to
        ``IN_DMS_ZONE_MIN <= dms_loc <= IN_DMS_ZONE_MAX``. The result is an
        independent copy, so later column assignments do not write into a
        slice of the aggregated frame.
    """
    tons_by_zone = (
        flows.groupby([zone_col, 'sctg2'])
        .agg({'tons_2022': 'sum'})
        .reset_index()
        .rename(columns={zone_col: 'dms_loc'})
    )
    # `between` is inclusive on both ends, matching the previous >= / <= filter.
    in_indiana = tons_by_zone['dms_loc'].between(IN_DMS_ZONE_MIN, IN_DMS_ZONE_MAX)
    return tons_by_zone[in_indiana].copy(deep=True)


# Tonnage keyed by the zone the flow originates in.
cmd_dms_orig_df = _indiana_tons_by_zone(cmd_df, 'dms_orig')

# Tonnage keyed by the zone the flow is destined for.
cmd_dms_dest_df = _indiana_tons_by_zone(cmd_df, 'dms_dest')




In [6]:
# Stack the origin and destination tonnage and express every row as a share of
# its zone's total tonnage (city level commodity shares).
# NOTE(review): the two frames are stacked, not summed, so a (dms_loc, sctg2)
# pair can appear once per source frame, each row carrying only a partial
# share -- confirm whether they should be collapsed before the shares are used.
cmd_dms_loc_df = pd.concat(
    [cmd_dms_orig_df, cmd_dms_dest_df],
    ignore_index=True,
)

# Zone totals over all stacked rows; this is the denominator of the share.
cmd_dms_ref_df = (
    cmd_dms_loc_df
    .groupby('dms_loc')
    .agg({'tons_2022': 'sum'})
    .reset_index()
    .rename(columns={'tons_2022': 'total_tons_2022'})
)

# Attach each zone's total to its rows, then divide row tonnage by that total.
cmd_dms_loc_df = cmd_dms_loc_df.merge(cmd_dms_ref_df, on='dms_loc', how='left')
cmd_dms_loc_df['loc_cmd_share'] = cmd_dms_loc_df['tons_2022'].div(
    cmd_dms_loc_df['total_tons_2022']
)

# Open question: should the city level commodity shares be distributed to the
# city and its collar counties?
# The logic here is preliminary and still has to be combined with the
# employment data methodology and structure.

In [7]:
# Keep only the rows whose STATEFP is '18' (the Indiana counties) as an
# independent copy, so the columns added below never touch `counties`.
IN_Counties = counties[counties['STATEFP'] == '18'].copy(deep = True)

#Primary and Collar Counties for listed cities
FW_Cnt = ['Allen', 'Wells', 'Whitley', 'Huntington', 'Adams']
Gary_Cnt = ['Lake', 'Porter']
Ind_Cnt = ['Marion', 'Boone', 'Hamilton', 'Madison', 'Hancock', 'Shelby', 'Johnson', 'Morgan', 'Hendricks']


def _county_name_mask(names, wanted):
    """Flag the entries of ``names`` that equal one of the ``wanted`` names.

    Whole names are compared, ignoring case. Substring or regex matching is
    deliberately avoided: it would also select any county whose name merely
    contains a listed name, and it would treat regex metacharacters in a
    name as pattern syntax.

    Parameters
    ----------
    names : pandas.Series
        County names to test.
    wanted : list of str
        County names that belong to the group.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``names``; missing names are ``False``.
    """
    wanted_lower = {name.lower() for name in wanted}
    return names.str.lower().isin(wanted_lower)


# Compute each membership mask once and reuse it for the subsets and both columns.
fw_mask = _county_name_mask(IN_Counties['NAME'], FW_Cnt)
gary_mask = _county_name_mask(IN_Counties['NAME'], Gary_Cnt)
ind_mask = _county_name_mask(IN_Counties['NAME'], Ind_Cnt)

# Per-city subsets, taken before the mapping columns are added (as before).
FW_df = IN_Counties[fw_mask]
Gary_df = IN_Counties[gary_mask]
Ind_df = IN_Counties[ind_mask]

# np.select picks the first matching condition, the same precedence as the
# previous nested np.where ladder (Fort Wayne, then Gary, then Indianapolis);
# counties matching none fall through to the "rest of state" default.
zone_conditions = [fw_mask, gary_mask, ind_mask]

IN_Counties['Commodity_Mapping'] = np.select(
    zone_conditions,
    ['Fort Wayne IN', 'Chicago IL-IN-WI (IN Part)', 'Indianapolis IN'],
    default='Rest of IN',
)

# Numeric zone code for each label above; presumably these line up with the
# dms_loc values used for the commodity shares -- confirm against the zone list.
IN_Counties['dms_code'] = np.select(
    zone_conditions,
    [183, 181, 182],
    default=189,
)

In [9]:
# Display the combined table: tonnage, zone total and share per zone/commodity row.
cmd_dms_loc_df

Unnamed: 0,dms_loc,sctg2,tons_2022,total_tons_2022,loc_cmd_share
0,181,2,80.646470,22847.787763,3.529728e-03
1,181,3,103.557118,22847.787763,4.532479e-03
2,181,4,7.620252,22847.787763,3.335225e-04
3,181,5,3.470854,22847.787763,1.519120e-04
4,181,6,39.512312,22847.787763,1.729371e-03
...,...,...,...,...,...
307,189,38,0.029585,76815.439607,3.851439e-07
308,189,39,1.549730,76815.439607,2.017472e-05
309,189,40,0.920492,76815.439607,1.198316e-05
310,189,41,658.989263,76815.439607,8.578865e-03
