In [1]:
## Built-in modules
import os

## Third party modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import OrderedDict
%matplotlib inline

In [2]:
#pd.__version__

## CFS 2017 Data Import and Data Cleaning

In [3]:
# Load the CFS 2017 shipment records; 'cfs_2017.csv' is resolved relative to
# the notebook's working directory.
df_raw = pd.read_csv('cfs_2017.csv')

In [4]:
# Only keep the TX Austin data + "Rest of TX": a shipment is retained when its
# origin OR its destination CFS area is one of the three codes below.
# NOTE(review): the original comment calls the first two codes "TX Austin";
# confirm that both metro-area codes are intended to be treated as Austin.
tx_cfs_areas = ['48-12420', '48-41700', '48-99999']
touches_tx = (df_raw['ORIG_CFS_AREA'].isin(tx_cfs_areas) |
              df_raw['DEST_CFS_AREA'].isin(tx_cfs_areas))

# .copy() makes df_tx an independent frame. Later cells add many columns to
# df_tx; without the copy those assignments target a slice of df_raw and
# pandas emits SettingWithCopyWarning.
df_tx = df_raw.loc[touches_tx].copy()

In [5]:
# Preview the first rows of the Texas subset.
df_tx.head()

Unnamed: 0,SHIPMT_ID,ORIG_STATE,ORIG_MA,ORIG_CFS_AREA,DEST_STATE,DEST_MA,DEST_CFS_AREA,NAICS,QUARTER,SCTG,MODE,SHIPMT_VALUE,SHIPMT_WGHT,SHIPMT_DIST_GC,SHIPMT_DIST_ROUTED,TEMP_CNTL_YN,EXPORT_YN,EXPORT_CNTRY,HAZMAT,WGT_FACTOR
5,6,48,12420,48-12420,36,160,36-160,4541,2,40,14,605,4,1348,1561,N,N,N,N,18201.7
8,9,17,176,17-176,48,41700,48-41700,333,1,40,14,1918,38,1057,1224,N,N,N,N,566.8
12,13,48,206,48-206,48,99999,48-99999,322,2,28,4,672,684,466,517,N,N,N,N,121.0
43,44,13,122,13-122,48,99999,48-99999,4242,2,21,14,3,1,729,831,N,N,N,N,551.7
65,66,48,99999,48-99999,48,99999,48-99999,4239,2,26,15,14782,138577,180,419,N,N,N,N,57.3


In [6]:
# (rows, columns) of the Texas subset.
df_tx.shape

(253810, 20)

### Generate mode choice variables

In [8]:
# Raw CFS MODE code -> aggregated mode label. Five labels are modelled;
# everything else is labelled 'Other' and removed in the next cell.
mode_agg5_dict = {4: 'For-hire Truck',
                  5: 'Private Truck',
                  6: 'Rail/IMX',
                  15: 'Rail/IMX',
                  11: 'Air',
                  14: 'Parcel',
                  0: 'Other',
                  2: 'Other',
                  3: 'Other',
                  7: 'Other',
                  8: 'Other',
                  10: 'Other',
                  12: 'Other',
                  13: 'Other',
                  16: 'Other',
                  17: 'Other',
                  20: 'Other',
                  101: 'Other'}

# map() + fillna() instead of replace(): replace() leaves any MODE code that is
# not a key of the dict (or is missing) unchanged, i.e. as a raw number. Such
# rows would not be caught by the "!= 'Other'" filter and would later break the
# integer 'choice' encoding. Unlisted codes are now labelled 'Other' as well.
df_tx['mode_agg5'] = df_tx['MODE'].map(mode_agg5_dict).fillna('Other')

In [10]:
# Remove shipments whose mode is not one of the five modelled modes
# (i.e. rows labelled 'Other').
# .copy() gives df_tx its own data: the following cells assign new columns to
# it, which on a plain boolean slice triggers SettingWithCopyWarning.
df_tx = df_tx.loc[df_tx['mode_agg5'] != 'Other'].copy()

In [11]:
# Number of shipments per aggregated mode (sanity check after removing 'Other').
df_tx['mode_agg5'].value_counts()

For-hire Truck    107411
Private Truck      72413
Parcel             65151
Rail/IMX            4595
Air                 3111
Name: mode_agg5, dtype: int64

### Generate commodity type variables

In [12]:
# Cast SCTG to integer; the commodity grouping below compares it against
# integer codes.
df_tx['SCTG'] = df_tx['SCTG'].astype(int)

In [13]:
# One 0/1 indicator column per commodity group, keyed on the integer SCTG code.
sctg_code = df_tx['SCTG']

df_tx['bulk'] = sctg_code.isin([2, 10, 11, 12, 13, 14, 15, 25, 26, 32]).astype(int)
df_tx['fuel_fert'] = sctg_code.isin([16, 17, 18, 19, 20, 22, 23]).astype(int)
df_tx['interm_food'] = sctg_code.isin([1, 3, 4, 5, 6, 7, 8]).astype(int)
df_tx['mfr_goods'] = sctg_code.isin([9, 21, 24, 27, 28, 29, 30, 31,
                                     33, 34, 35, 36, 37, 38, 39, 40]).astype(int)
df_tx['other'] = sctg_code.isin([41, 43, 99]).astype(int)

In [14]:
# SCTG codes belonging to each commodity group (same grouping as the 0/1
# indicator columns created in the previous cell).
sctg_by_commodity = [
    ('bulk', [2, 10, 11, 12, 13, 14, 15, 25, 26, 32]),
    ('fuel_fert', [16, 17, 18, 19, 20, 22, 23]),
    ('interm_food', [1, 3, 4, 5, 6, 7, 8]),
    ('mfr_goods', [9, 21, 24, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40]),
    ('other', [41, 43, 99]),
]

# Flatten to {SCTG code: group name}.
commodity_dict = {code: group for group, codes in sctg_by_commodity for code in codes}

# Codes absent from the dict become NaN in 'commodity'.
df_tx['commodity'] = df_tx['SCTG'].map(commodity_dict)

### Generate NAICS industry type variables

In [15]:
# First two characters of the NAICS code, converted back to an integer; used
# in the next cell to assign an industry name.
df_tx['naics2'] = df_tx['NAICS'].astype(str).str[:2].astype(int)

In [16]:
# Two-digit NAICS prefix -> industry label.
naics2_to_name = {21: 'Mining'}
naics2_to_name.update(dict.fromkeys([31, 32, 33], 'Manufacturing'))
naics2_to_name.update({42: 'Wholesale',
                       45: 'Retail',
                       49: 'Trans_Warehouse',
                       51: 'Information',
                       55: 'Mgt_companies'})

# replace() keeps any prefix that is not listed above as its original number.
df_tx['naics_name'] = df_tx['naics2'].replace(naics2_to_name)

In [17]:
# 0/1 indicator column for each industry label in 'naics_name'.
industry = df_tx['naics_name']

df_tx["wholesale"] = industry.eq("Wholesale").astype(int)
df_tx["mfring"] = industry.eq("Manufacturing").astype(int)
df_tx["mining"] = industry.eq("Mining").astype(int)
df_tx["retail"] = industry.eq("Retail").astype(int)
df_tx["info"] = industry.eq("Information").astype(int)
df_tx["management"] = industry.eq("Mgt_companies").astype(int)
df_tx["transwarehouse"] = industry.eq("Trans_Warehouse").astype(int)

### Generate additional variables

In [20]:
# 'Within Austin' only when BOTH ends of the shipment lie in one of the two
# metro CFS areas; every other shipment is 'External'.
austin_cfs_areas = ['48-12420', '48-41700']
starts_in_austin = df_tx['ORIG_CFS_AREA'].isin(austin_cfs_areas)
ends_in_austin = df_tx['DEST_CFS_AREA'].isin(austin_cfs_areas)
df_tx['geo'] = np.where(starts_in_austin & ends_in_austin, 'Within Austin', 'External')

# Weight divided by 2000 (column name indicates tons; presumably pounds -> short tons).
df_tx['SHIPMT_WGHT_TON'] = df_tx['SHIPMT_WGHT'] / 2000

# Shipment value per unit of weight.
df_tx['value_density'] = df_tx['SHIPMT_VALUE'] / df_tx['SHIPMT_WGHT']

# Use GC distance for Air and Parcel and Routed distance for the rest of modes.
uses_gc_distance = df_tx['mode_agg5'].isin(['Air', 'Parcel'])
df_tx['SHIPMT_DIST'] = np.where(uses_gc_distance,
                                df_tx['SHIPMT_DIST_GC'],
                                df_tx['SHIPMT_DIST_ROUTED'])

### Additional data removal rules

In [21]:
# Based on Stinson et al. (2017)
#
# A row is removed when it matches ANY of these rules:
#   * Air heavier than 15000
#   * Air or Parcel heavier than 150 with value density below 1
#   * MODE 6 lighter than 1500
#   * MODE 6 with value density of 4 or more
# The rules are combined into one boolean mask instead of five in-place,
# label-based drops: the mask form does not mutate a sliced frame in place and
# does not depend on the index labels being unique.
is_air = df_tx['mode_agg5'] == 'Air'
is_parcel = df_tx['mode_agg5'] == 'Parcel'
is_mode_6 = df_tx['MODE'] == 6
shipmt_wght = df_tx['SHIPMT_WGHT']
val_density = df_tx['value_density']

to_remove = (
    (is_air & (shipmt_wght > 15000))
    | ((is_air | is_parcel) & (shipmt_wght > 150) & (val_density < 1))
    | (is_mode_6 & (shipmt_wght < 1500))
    | (is_mode_6 & (val_density >= 4))
)

df_tx = df_tx.loc[~to_remove].copy()

In [23]:
# For the ML paper, remove "export" shipments (EXPORT_YN == 'Y').
# Boolean filtering + reassignment replaces the in-place drop; rows whose
# EXPORT_YN is anything other than 'Y' are kept, exactly as before.
df_tx = df_tx.loc[df_tx['EXPORT_YN'] != 'Y'].copy()

## Data Setup for Biogeme

### Create binary variables for alternative-specific variables using cutoffs informed by SHAP partial dependence plots

In [24]:
# 0/1 indicators at fixed cutoffs. Note that the "less_than" columns include
# the cutoff value itself (<=), while "more_than" columns are strict (>).
dist = df_tx['SHIPMT_DIST']
wght = df_tx['SHIPMT_WGHT']
val_den = df_tx['value_density']

df_tx['shipmt_dist_less_than_500'] = dist.le(500).astype(int)
df_tx['shipmt_dist_more_than_500'] = dist.gt(500).astype(int)

df_tx['shipmt_wght_less_than_150'] = wght.le(150).astype(int)  # For air, parcel

df_tx['val_den_less_than_1'] = val_den.le(1).astype(int)
df_tx['val_den_less_than_5'] = val_den.le(5).astype(int)
df_tx['val_den_more_than_25'] = val_den.gt(25).astype(int)
df_tx['val_den_more_than_5'] = val_den.gt(5).astype(int)
df_tx['val_den_5to25'] = (val_den.gt(5) & val_den.le(25)).astype(int)   # (5, 25]
df_tx['val_den_1to10'] = (val_den.gt(1) & val_den.le(10)).astype(int)   # (1, 10]

### Create weight bin binary variables

In [25]:
# Five mutually exclusive 0/1 weight bins on SHIPMT_WGHT; each bin is open on
# the left and closed on the right: <=150, (150, 1500], (1500, 30000],
# (30000, 45000], >45000.
shipmt_wght = df_tx['SHIPMT_WGHT']

df_tx['wght_bin_1'] = shipmt_wght.le(150).astype(int)
df_tx['wght_bin_2'] = (shipmt_wght.gt(150) & shipmt_wght.le(1500)).astype(int)
df_tx['wght_bin_3'] = (shipmt_wght.gt(1500) & shipmt_wght.le(30000)).astype(int)
df_tx['wght_bin_4'] = (shipmt_wght.gt(30000) & shipmt_wght.le(45000)).astype(int)
df_tx['wght_bin_5'] = shipmt_wght.gt(45000).astype(int)

### Create the 'choice' and 'availability' variables

In [26]:
## alt_1 = Air, alt_2 = For-hire Truck, alt_3 = Parcel, alt_4 = Private Truck, alt_5 = Rail/IMX

# Alternatives are numbered 1..5 in the order listed above.
choice_dictionary = dict(zip(['Air', 'For-hire Truck', 'Parcel', 'Private Truck', 'Rail/IMX'],
                             range(1, 6)))
df_tx['choice'] = df_tx['mode_agg5'].map(choice_dictionary).astype(int)

## Mode availability
# An alternative is available (1) when the shipment is within the threshold
# OR the alternative is the mode actually chosen; otherwise 0.
# For-hire Truck (2) and Rail/IMX (5) are always available.
chosen_mode = df_tx['mode_agg5']

# Air: per the original note, the threshold is the national max (unweighted).
df_tx['AV_1c'] = (df_tx['SHIPMT_WGHT_TON'].le(410) | chosen_mode.eq('Air')).astype(int)
df_tx['AV_2c'] = 1
df_tx['AV_3c'] = (df_tx['SHIPMT_WGHT'].le(150) | chosen_mode.eq('Parcel')).astype(int)
# Private Truck: per the original note, the threshold is the national max (unweighted).
df_tx['AV_4c'] = (df_tx['SHIPMT_DIST_ROUTED'].le(468) | chosen_mode.eq('Private Truck')).astype(int)
df_tx['AV_5c'] = 1

### Create TravelTime and ShipCost variables

In [27]:
## We assume that For-hire Truck and Private Truck have the same travel time and shipping costs.
## The calculations for Rail/IMX, For-hire Truck and Private Truck are based on Stinson et al. (2017)
## The calculations for Air and Parcel are based on Keya (2016), minor modification to Air (adding loading time for external shipments)

# Parcel shipping speed is assigned from a distribution, so a uniform [0, 1)
# draw is generated first for every row where Parcel is available (AV_3c == 1);
# all other rows get NaN in 'random'.
# The generator is seeded so that the derived Parcel travel times and shipping
# costs are identical on every "Restart & Run All"; change the seed to obtain a
# different (but still reproducible) draw.
PARCEL_SPEED_SEED = 2017
parcel_speed_rng = np.random.default_rng(PARCEL_SPEED_SEED)

m = df_tx['AV_3c'] == 1
df_tx.loc[m, 'random'] = parcel_speed_rng.random(int(m.sum()))


def _truck_traveltime(geo, routed_miles):
    """Travel time formula shared by For-hire Truck (alt_2) and Private Truck (alt_4).

    geo          -- 'Within Austin' / 'External' label per shipment
    routed_miles -- routed distance per shipment

    Within Austin: 4 + distance/20.
    External:      16 + distance/d, with d = 65 up to 650 routed miles,
                   38 above 650 up to 1299, and 32 above 1299.
    Rows that match none of the cases get NaN.
    """
    is_external = geo == 'External'
    cases = [
        geo == 'Within Austin',
        is_external & (routed_miles <= 650),
        is_external & (routed_miles > 650) & (routed_miles <= 1299),
        is_external & (routed_miles > 1299),
    ]
    hours = [
        4 + routed_miles / 20,
        16 + routed_miles / 65,
        16 + routed_miles / 38,
        16 + routed_miles / 32,
    ]
    return np.select(cases, hours, default=np.nan)


shipmt_geo = df_tx['geo']
parcel_draw = df_tx['random']

# Air: fixed 1 (within Austin) or 12 (external) plus great-circle distance
# divided by the assumed average speed of 549.5 mph.
df_tx['alt_1_traveltime'] = np.select(
    [shipmt_geo == 'Within Austin', shipmt_geo == 'External'],
    [1 + df_tx['SHIPMT_DIST_GC'] / 549.5, 12 + df_tx['SHIPMT_DIST_GC'] / 549.5],
    default=np.nan)

df_tx['alt_2_traveltime'] = _truck_traveltime(shipmt_geo, df_tx['SHIPMT_DIST_ROUTED'])

# Parcel: service speed picked by the uniform draw in 'random':
# <= 0.09 -> 3*24, (0.09, 0.27] -> 24, > 0.27 -> 5*24; NaN where no draw exists.
df_tx['alt_3_traveltime'] = np.select(
    [parcel_draw <= 0.09, (parcel_draw > 0.09) & (parcel_draw <= 0.27), parcel_draw > 0.27],
    [3 * 24, 24, 5 * 24],
    default=np.nan)

df_tx['alt_4_traveltime'] = _truck_traveltime(shipmt_geo, df_tx['SHIPMT_DIST_ROUTED'])

df_tx['alt_5_traveltime'] = 12 + df_tx['SHIPMT_DIST_ROUTED']/22 + 12*2 # Assume 2 trackage changes

# Parcel cost model: cost = exp(intercept + slope * SHIPMT_WGHT).
# Rows of the tables are the service tier chosen by the 'random' draw, columns
# are the routed-distance band (<=150, (150,300], (300,600], (600,1000],
# (1000,1400], (1400,1800], >1800).
_PARCEL_DIST_EDGES = np.array([150, 300, 600, 1000, 1400, 1800])
_PARCEL_INTERCEPT = np.array([
    [2.056, 2.251, 2.362, 2.555, 2.739, 2.905, 3.023],   # tier 0: random > 0.27
    [3.666, 3.993, 4.631, 4.700, 4.767, 4.798, 4.855],   # tier 1: 0.09 < random <= 0.27
    [3.208, 3.399, 3.560, 3.624, 3.908, 4.010, 4.158],   # tier 2: random <= 0.09
])
_PARCEL_SLOPE = np.array([
    [0.016, 0.015, 0.015, 0.014, 0.013, 0.013, 0.013],
    [0.015, 0.016, 0.01, 0.01, 0.015, 0.015, 0.015],
    [0.014, 0.015, 0.015, 0.016, 0.016, 0.016, 0.016],
])


def _truck_shipcost(wght_lb, wght_ton, dist_gc):
    """Cost formula shared by For-hire Truck (alt_2) and Private Truck (alt_4).

    rate * tons * great-circle distance with a minimum charge; the rate and the
    minimum depend on SHIPMT_WGHT: < 150 -> 2.83 (min 10), [150, 1500) -> 0.50
    (min 15), >= 1500 -> 0.18 (min 15). Rows matching no weight class get NaN.
    """
    return np.select(
        [wght_lb < 150, (wght_lb >= 150) & (wght_lb < 1500), wght_lb >= 1500],
        [np.maximum(2.83 * wght_ton * dist_gc, 10),
         np.maximum(0.50 * wght_ton * dist_gc, 15),
         np.maximum(0.18 * wght_ton * dist_gc, 15)],
        default=np.nan)


def _parcel_shipcost(speed_draw, routed_miles, wght_lb):
    """Parcel cost before the minimum charge; NaN where no tier/band applies.

    The coefficients are looked up per row first and np.exp is evaluated once.
    Rows without a draw carry NaN coefficients, so the exponential is no longer
    computed (and no longer overflows) for shipments that have no Parcel tier.
    """
    tier = np.select(
        [speed_draw > 0.27, (speed_draw > 0.09) & (speed_draw <= 0.27), speed_draw <= 0.09],
        [0, 1, 2],
        default=-1)
    # side='left' puts a distance equal to an edge into the lower band (<= edge).
    band = np.searchsorted(_PARCEL_DIST_EDGES, np.asarray(routed_miles), side='left')
    priced = (tier >= 0) & np.asarray(pd.notna(routed_miles))

    intercept = np.full(len(tier), np.nan)
    slope = np.full(len(tier), np.nan)
    intercept[priced] = _PARCEL_INTERCEPT[tier[priced], band[priced]]
    slope[priced] = _PARCEL_SLOPE[tier[priced], band[priced]]
    return np.exp(intercept + slope * np.asarray(wght_lb))


shipmt_lb = df_tx['SHIPMT_WGHT']
shipmt_ton = df_tx['SHIPMT_WGHT_TON']
gc_miles = df_tx['SHIPMT_DIST_GC']
routed_miles = df_tx['SHIPMT_DIST_ROUTED']

# Air: flat 55 up to a weight of 100, plus 1 per additional weight unit above 100.
df_tx['alt_1_shipcost'] = np.where(shipmt_lb <= 100, 55, 55 + (shipmt_lb - 100))

df_tx['alt_2_shipcost'] = _truck_shipcost(shipmt_lb, shipmt_ton, gc_miles)

## set minimum shipping cost for Parcel based on Stinson et al. (2017)
# Minimum of 10 below a weight of 150 and 15 from 150 upward; rows where Parcel
# is unavailable (AV_3c != 1) are NaN. np.nan is used throughout because the
# np.NaN alias was removed in NumPy 2.0.
parcel_raw_cost = _parcel_shipcost(df_tx['random'], routed_miles, shipmt_lb)
parcel_available = df_tx['AV_3c'] == 1
df_tx['alt_3_shipcost'] = np.select(
    [parcel_available & (shipmt_lb < 150), parcel_available & (shipmt_lb >= 150)],
    [np.maximum(parcel_raw_cost, 10), np.maximum(parcel_raw_cost, 15)],
    default=np.nan)

df_tx['alt_4_shipcost'] = _truck_shipcost(shipmt_lb, shipmt_ton, gc_miles)

# Rail/IMX: 0.039 * tons * routed distance with a minimum of 200.
df_tx['alt_5_shipcost'] = np.maximum(0.039 * shipmt_ton * routed_miles, 200)

  result = getattr(ufunc, method)(*inputs, **kwargs)


### Create Biogeme datasets

In [28]:
## Biogeme only takes datasets that contain numbers, so keep the numerical variables only

shipment_cols = ['SHIPMT_ID', 'SHIPMT_DIST', 'SHIPMT_DIST_GC', 'SHIPMT_DIST_ROUTED',
                 'SHIPMT_WGHT_TON', 'SHIPMT_WGHT', 'value_density']
commodity_cols = ['bulk', 'fuel_fert', 'interm_food', 'mfr_goods', 'other']
industry_cols = ['wholesale', 'mfring', 'mining', 'retail', 'info', 'management', 'transwarehouse']
traveltime_cols = ['alt_%d_traveltime' % alt for alt in range(1, 6)]
shipcost_cols = ['alt_%d_shipcost' % alt for alt in range(1, 6)]
availability_cols = ['AV_%dc' % alt for alt in range(1, 6)]
cutoff_cols = ['shipmt_dist_less_than_500', 'shipmt_dist_more_than_500', 'shipmt_wght_less_than_150',
               'val_den_less_than_1', 'val_den_less_than_5', 'val_den_more_than_5',
               'val_den_more_than_25', 'val_den_5to25', 'val_den_1to10']
weight_bin_cols = ['wght_bin_%d' % k for k in range(1, 6)]

# Column order of the exported table.
biogeme_columns = (shipment_cols + commodity_cols + industry_cols
                   + traveltime_cols + shipcost_cols + ['choice']
                   + availability_cols + ['WGT_FACTOR']
                   + cutoff_cols + weight_bin_cols)

df_tx_short = df_tx[biogeme_columns]

# Names of the columns holding NaN. This is not the last expression of the
# cell, so the list is computed but not displayed.
df_tx_short.columns[df_tx_short.isna().any()].tolist() # 'alt_3_traveltime' and 'alt_3_shipcost' contain NaN

# Biogeme does not allow NaN in the dataset, so NaN is replaced by 0.
df_tx_short = df_tx_short.fillna(0).copy()

In [None]:
# Write the Biogeme input table to CSV in the working directory, without the
# DataFrame index column.
df_tx_short.to_csv('CFS2017_austin_forbiogeme.csv', index = False)