In [35]:
import zipfile
import geopandas as gpd
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from shapely.geometry import LineString, Point



In [36]:

# Initialize the geolocator
geolocator = Nominatim(user_agent="geoapi")

# Function to get latitude and longitude for city names, used for the purpose of determining what IN countires are between them
def get_lat_lon(location_name):
    location = geolocator.geocode(location_name)
    if location:
        return location.latitude, location.longitude
    else:
        return None, None
    

# fucntion to check which counties reside between an origin and destination
def add_intersected_counties(df, county_gdf):

    # Create LineStrings for all start and end points in the DataFrame
    lines = gpd.GeoSeries([
        LineString([(lon1, lat1), (lon2, lat2)]) 
        for lon1, lat1, lon2, lat2 in zip(df['dms_orig_lon'], df['dms_orig_lat'], df['dms_dest_lon'], df['dms_dest_lat'])
    ], crs="EPSG:4326")

    # Check intersections for all lines against the cities
    counties_crossed = []
    for line in lines:
        intersected_counties = county_gdf[county_gdf.geometry.intersects(line)]['NAME'].tolist()
        counties_crossed.append(intersected_counties)

    # Add results to a new column in the DataFrame
    df['counties_crossed'] = counties_crossed
    return df


In [37]:
#Load In Datasets
data_df = pd.read_csv('FAF5.6.1_2018-2023.csv')

with zipfile.ZipFile("tl_2024_us_county.zip", "r") as zip_ref:
    zip_ref.extractall("tl_2024_us_county")

counties = gpd.read_file("tl_2024_us_county/tl_2024_us_county.shp")
counties.to_file('counties.geojson', driver = 'GeoJSON')

In [38]:
#Create Indiana Counties df to reference for intersections
IN_Counties = counties[counties['STATEFP'] == '18'].copy(deep = True)


#Create lists of relevant counties for the locations with specified origins (for flagging)
FW_Cnt = ['Allen', 'Wells', 'Whitley', 'Huntington', 'Adams']
Gary_Cnt = ['Lake', 'Porter']
Ind_Cnt = ['Marion', 'Boone', 'Hamilton', 'Madison', 'Hancock', 'Shelby', 'Johnson', 'Morgan', 'Hendricks']

In [39]:
#Load in refference dfs, filter down to just rail
city_df = pd.read_excel('FAF5_metadata.xlsx', sheet_name = 'FAF Zone (Domestic)', header= 0)
comm_type_df = pd.read_excel('FAF5_metadata.xlsx', sheet_name = 'Commodity (SCTG2)', header= 0)
dist_df = pd.read_excel('FAF5_metadata.xlsx', sheet_name = 'Distance Band', header= 0)
rail_df = data_df[data_df['dms_mode'] == 2].copy(deep = True)
indiana_codes = [181, 182, 183, 189]

In [40]:
#Enrich city dataframe with longitude and latitude data
city_df[['loc_lat', 'loc_lon']] = city_df['City_State_Cleaned'].apply(lambda x: pd.Series(get_lat_lon(x)))

In [41]:
orig_df = city_df[['Numeric Label', 'Short Description','loc_lat', 'loc_lon']].rename(columns= {'Numeric Label': 'dms_orig', 'Short Description': 'dms_orig_desc', 'City_State_Cleaned' : 'dms_orig_city_st',
                                                                                                'loc_lat': 'dms_orig_lat', 'loc_lon': 'dms_orig_lon'})
dest_df = city_df[['Numeric Label', 'Short Description','loc_lat', 'loc_lon']].rename(columns= {'Numeric Label': 'dms_dest', 'Short Description': 'dms_dest_desc', 'City_State_Cleaned' : 'dms_dest_city_st',
                                                                                                'loc_lat': 'dms_dest_lat', 'loc_lon': 'dms_dest_lon'})
comm_type_df = comm_type_df.rename(columns= {'Numeric Label': 'sctg2', 'Description' : 'Commodity_Type'})
dist_df = dist_df.rename(columns = {'Numeric Label': 'dist_band', 'Description' : 'Distance_Band'})


In [42]:
#Enrich Data frame
rail_df = pd.merge(rail_df, orig_df, on = 'dms_orig', how = 'left')
rail_df = pd.merge(rail_df, dest_df, on = 'dms_dest', how = 'left')
rail_df = pd.merge(rail_df, comm_type_df, on = 'sctg2', how = 'left')
rail_df = pd.merge(rail_df, dist_df, on = 'dist_band', how = 'left')
rail_df = rail_df.drop(columns = ['fr_orig', 'fr_dest', 'fr_inmode', 'fr_outmode'])

In [43]:
#Create Dataframe for where freight stops and ends
ind_stop_df = rail_df[rail_df['dms_orig'].isin(indiana_codes) | rail_df['dms_dest'].isin(indiana_codes)]
ind_stop_df.to_csv('Indiana_Rail_FAF.csv')

In [44]:

#Filter down Rail DF to only Longitudes and latitudes that could realistically cross through Indiana
ind_min_lat = 37.7667
ind_max_lat = 41.7667
ind_min_lon = -88.1
ind_max_lon = -84.7667

#Only keep records that are east of Indianas westmost point for either the origin or the destination
lim_rail_df = rail_df[(rail_df['dms_dest_lon'] > ind_min_lon) | (rail_df['dms_orig_lon'] > ind_min_lon)]
#Only keep records that are west of Indianas eastmost point for either the origin or the destination
lim_rail_df = lim_rail_df[(lim_rail_df['dms_dest_lon'] < ind_max_lon) | (lim_rail_df['dms_orig_lon'] < ind_max_lon)]
#Only keep records that are north of Indianas southmost point for either the origin or the destination
lim_rail_df = lim_rail_df[(lim_rail_df['dms_dest_lat'] > ind_min_lat) | (lim_rail_df['dms_orig_lat'] > ind_min_lat)]
#Only keep records that are south of Indianas northmost point for either the origin or the destination
lim_rail_df = lim_rail_df[(rail_df['dms_dest_lat'] < ind_max_lat) | (lim_rail_df['dms_orig_lat'] < ind_max_lat)]



  


In [45]:
#Determine which counties in Indiana lay between all Origin and destination cities; only check against Indiana counties as this is expensive to run (20+ minutes)
#Use a limited version of rail df to only feasible origins and destinations given how long this takes to run (20+ min on rail_df)
ind_thru_df = add_intersected_counties(lim_rail_df, IN_Counties)

In [46]:
ind_thru_df

Unnamed: 0,dms_orig,dms_dest,dms_mode,sctg2,trade_type,dist_band,tons_2018,tons_2019,tons_2020,tons_2021,...,tmiles_2023,dms_orig_desc,dms_orig_lat,dms_orig_lon,dms_dest_desc,dms_dest_lat,dms_dest_lon,Commodity_Type,Distance_Band,counties_crossed
4,89,131,2,2,1,7,0.305325,0.347056,0.229162,0.301842,...,0.467771,Rest of CO,38.725178,-105.607716,Atlanta GA,33.748992,-84.390264,Cereal grains,"1,500 - 2,000",[]
20,171,280,2,2,1,5,1.229276,1.004656,1.150573,1.192772,...,0.932487,Chicago IL-IN-WI (IL Part),41.875562,-87.624421,Mississippi,32.971528,-89.734850,Cereal grains,750 - 999,[]
23,172,392,2,2,1,4,0.088608,0.072417,0.082935,0.085977,...,0.048347,St. Louis MO-IL (IL Part),38.585444,-90.223596,Cleveland OH,41.499657,-81.693677,Cereal grains,500 - 749,"[Hamilton, Randolph, Vigo, Delaware, Boone, Ja..."
24,179,131,2,2,1,4,259.605309,212.168824,242.984359,251.896342,...,194.192311,Rest of IL,40.079661,-89.433729,Atlanta GA,33.748992,-84.390264,Cereal grains,500 - 749,"[Posey, Gibson, Vanderburgh]"
25,179,139,2,2,1,5,567.403683,463.724615,531.076275,550.554658,...,508.760506,Rest of IL,40.079661,-89.433729,Rest of GA,32.329381,-83.113737,Cereal grains,750 - 999,"[Posey, Gibson, Vanderburgh]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327859,299,451,2,43,3,6,0.002533,0.000000,0.000194,0.005054,...,0.000222,Rest of MO,38.760481,-92.561787,Charleston SC,32.788436,-79.939931,Mixed freight,"1,000 - 1,499",[]
327863,329,512,2,43,3,8,0.000000,0.000000,0.000000,0.000000,...,0.000000,Rest of NV,39.515882,-116.853722,Virginia Beach-Norfolk VA-NC (VA Part),36.849658,-75.976075,Mixed freight,"Over 2,000",[]
327870,392,61,2,43,3,8,0.000000,0.000000,0.000000,0.000508,...,0.001570,Cleveland OH,41.499657,-81.693677,Los Angeles CA,34.053691,-118.242766,Mixed freight,"Over 2,000","[Huntington, Wabash, Wells, Warren, Tippecanoe..."
327910,513,61,2,43,3,8,0.000000,0.000000,0.000000,0.000000,...,0.004139,Washington DC-VA-MD-WV (VA Part),38.876933,-77.089309,Los Angeles CA,34.053691,-118.242766,Mixed freight,"Over 2,000",[]


In [47]:
#filter to only records where Indiana resides along the expected path
ind_thru_df = ind_thru_df[ind_thru_df['counties_crossed'].apply(len) > 0]

In [48]:
def check_counties_crossed(df, city_list, return_column_name, column_name="counties_crossed"):

    # Create a set for faster lookup
    city_set = set(city_list)
    
    # Check if any city in the list exists in the city_set
    df[return_column_name] = df[column_name].apply(lambda x: any(city in city_set for city in x))
    
    return df

In [49]:
ind_thru_df = check_counties_crossed(ind_thru_df, FW_Cnt, 'crosses_FW_counties')
ind_thru_df = check_counties_crossed(ind_thru_df, Ind_Cnt, 'crosses_IND_counties')
ind_thru_df = check_counties_crossed(ind_thru_df, Gary_Cnt, 'crosses_Gary_counties')
ind_thru_df['any_IN_counties'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [51]:
ind_thru_df.to_csv('ind_passthrough_estimates.csv', index = False)

PermissionError: [Errno 13] Permission denied: 'ind_passthrough_estimates.csv'