In [1]:
## Built-in modules
import os

## Third party modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import OrderedDict
%matplotlib inline

## Process the CFS data

In [2]:
# Read the CFS extract from disk. This statement must be live: the next cell
# reads `df_raw`, so leaving it commented out makes a fresh-kernel
# "Restart & Run All" fail with a NameError.
df_raw = pd.read_csv('cfs_2017.csv')

In [3]:
# Work on an independent copy: later cells drop rows and add columns in place,
# and a plain `df = df_raw` alias would apply those changes to the raw frame too.
# NOTE: this holds a second copy of the data in memory.
df = df_raw.copy()

In [4]:
# Show the first rows as a quick check of the loaded columns and values.
df.head()

Unnamed: 0,SHIPMT_ID,ORIG_STATE,ORIG_MA,ORIG_CFS_AREA,DEST_STATE,DEST_MA,DEST_CFS_AREA,NAICS,QUARTER,SCTG,MODE,SHIPMT_VALUE,SHIPMT_WGHT,SHIPMT_DIST_GC,SHIPMT_DIST_ROUTED,TEMP_CNTL_YN,EXPORT_YN,EXPORT_CNTRY,HAZMAT,WGT_FACTOR
0,1,6,99999,06-99999,6,260,06-260,326,4,43,5,4380,391,54,60,N,N,N,N,328.3
1,2,49,482,49-482,47,314,47-314,4541,3,43,14,56,4,1524,1810,N,N,N,N,8425.3
2,3,6,348,06-348,6,348,06-348,4231,4,34,5,255,440,2,5,N,N,N,N,9120.7
3,4,6,260,06-260,6,99999,06-99999,212,4,11,5,250,44912,30,35,N,N,N,N,20.9
4,5,45,273,45-273,45,273,45-273,45431,4,19,5,46,73,9,11,N,N,N,H,1733.8


In [5]:
# Report the (rows, columns) size of the working frame.
df.shape

(5978523, 20)

### Generate mode choice variables

In [6]:
# Five-way grouping of the MODE codes: the labels below are kept distinct and
# every other listed code is pooled under 'Other'.
mode_agg5_dict = {
    4: 'For-hire Truck',
    5: 'Private Truck',
    6: 'Rail/IMX',
    15: 'Rail/IMX',
    11: 'Air',
    14: 'Parcel',
    **dict.fromkeys((0, 2, 3, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 20, 101), 'Other'),
}

# Three-way grouping: all truck codes (including code 3) share one label, as do
# air and parcel; the remaining listed codes are pooled under 'Other'.
mode_agg3_dict = {
    **dict.fromkeys((3, 4, 5), 'Truck'),
    **dict.fromkeys((6, 15), 'Rail/IMX'),
    **dict.fromkeys((11, 14), 'Air/Parcel'),
    **dict.fromkeys((0, 2, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 20, 101), 'Other'),
}

# Series.replace returns a new Series; any MODE code missing from a mapping is
# passed through unchanged rather than becoming NaN.
df['mode_agg5'] = df['MODE'].replace(mode_agg5_dict)
df['mode_agg3'] = df['MODE'].replace(mode_agg3_dict)

In [7]:
# Finer-grained labels than the five-way grouping: start with every
# non-headline code labelled 'Other', then name the codes that get their own
# label. Updating existing keys keeps their position in the dict.
mode_decode_dict = {
    4: 'For-hire Truck',
    5: 'Private Truck',
    6: 'Rail/IMX',
    15: 'Rail/IMX',
    11: 'Air',
    14: 'Parcel',
    **dict.fromkeys((0, 2, 3, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 20, 101), 'Other'),
}
mode_decode_dict.update({
    8: 'Inland Water',
    9: 'Great Lake',
    10: 'Deep Sea',
    12: 'Pipeline',
    16: 'Truck and Water',
    17: 'Rail and Water',
})

# Codes absent from the mapping are left as-is by Series.replace.
df['mode_decode'] = df['MODE'].replace(mode_decode_dict)

In [8]:
# Count rows per decoded mode label.
df['mode_decode'].value_counts()

For-hire Truck     2354704
Private Truck      1825760
Parcel             1566194
Air                  93633
Rail/IMX             62965
Truck and Water      43929
Other                18933
Pipeline              4487
Inland Water          3758
Rail and Water        2542
Deep Sea              1529
Great Lake              89
Name: mode_decode, dtype: int64

In [9]:
# Count rows per five-way mode group.
df['mode_agg5'].value_counts()

For-hire Truck    2354704
Private Truck     1825760
Parcel            1566194
Air                 93633
Other               75267
Rail/IMX            62965
Name: mode_agg5, dtype: int64

### Generate commodity type variables

In [10]:
# SCTG values recorded as a code range (or as '00') instead of a single code.
# Rows carrying any of them are removed from the working frame in place.
aggregated_sctg = ['25-30', '20-24', '10-14', '01-05', '31-34',
                   '06-09', '39-43', '35-38', '15-19', '00']
has_aggregated_sctg = df['SCTG'].isin(aggregated_sctg)
df.drop(df[has_aggregated_sctg].index, inplace=True)

In [11]:
# Cast SCTG to integers so it can be matched against the numeric commodity
# codes used in the cells below.
df['SCTG'] = df['SCTG'].astype(int)

In [12]:
# One 0/1 indicator column per commodity group; each group is the set of SCTG
# codes listed next to its column name. Pairs are processed in order so the
# columns are added in the same sequence as before.
sctg_indicator_groups = (
    ('bulk', [2, 10, 11, 12, 13, 14, 15, 25, 26, 32]),
    ('fuel_fert', [16, 17, 18, 19, 20, 22, 23]),
    ('interm_food', [1, 3, 4, 5, 6, 7, 8]),
    ('mfr_goods', [9, 21, 24, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40]),
    ('other', [41, 43, 99]),
)

for indicator_name, member_codes in sctg_indicator_groups:
    df[indicator_name] = np.where(df['SCTG'].isin(member_codes), 1, 0)

In [13]:
# Commodity label -> SCTG codes belonging to it. The flat code -> label lookup
# is derived from this table, keeping the same key order as a hand-written dict.
commodity_groups = (
    ('bulk', (2, 10, 11, 12, 13, 14, 15, 25, 26, 32)),
    ('fuel_fert', (16, 17, 18, 19, 20, 22, 23)),
    ('interm_food', (1, 3, 4, 5, 6, 7, 8)),
    ('mfr_goods', (9, 21, 24, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40)),
    ('other', (41, 43, 99)),
)
commodity_dict = {
    code: label
    for label, codes in commodity_groups
    for code in codes
}

# Series.map yields NaN for any SCTG code that is not a key of the lookup.
df['commodity'] = df['SCTG'].map(commodity_dict)

### Generate NAICS industry type variables

In [14]:
# Keep only the first two digits of each NAICS code, stored as an integer.
naics_text = df['NAICS'].astype(str)
df['naics2'] = naics_text.str.slice(stop=2).astype(int)

In [15]:
# Two-digit NAICS prefix -> industry label. Prefixes without an entry are left
# as their integer value by Series.replace.
naics_sector_names = {
    21: 'Mining',
    **dict.fromkeys((31, 32, 33), 'Manufacturing'),
    42: 'Wholesale',
    45: 'Retail',
    49: 'Trans_Warehouse',
    51: 'Information',
    55: 'Mgt_companies',
}
df['naics_name'] = df['naics2'].replace(naics_sector_names)

# Count rows per industry label.
df['naics_name'].value_counts()

Wholesale          2569175
Manufacturing      2550711
Retail              305866
Trans_Warehouse     265653
Mining              150835
Information          64535
Mgt_companies        57265
Name: naics_name, dtype: int64

In [16]:
# 0/1 indicator column for each industry label: (new column, label it flags).
sector_flags = (
    ("wholesale", "Wholesale"),
    ("mfring", "Manufacturing"),
    ("mining", "Mining"),
    ("retail", "Retail"),
    ("info", "Information"),
    ("management", "Mgt_companies"),
    ("transwarehouse", "Trans_Warehouse"),
)

for flag_column, sector_label in sector_flags:
    df[flag_column] = df['naics_name'].eq(sector_label).astype(int)

### Generate shipment weight bins

In [17]:
# Bin shipments by weight (same units as SHIPMT_WGHT):
#   1: <= 150, 2: (150, 1500], 3: (1500, 30000], 4: (30000, 45000], 5: everything else.
# np.select takes the first condition that holds, so testing only the upper
# bounds in ascending order reproduces the half-open intervals above.
shipment_weight = df['SHIPMT_WGHT'].values
bin_upper_bounds = [150, 1500, 30000, 45000]
df['wght_bin1'] = np.select(
    [shipment_weight <= upper for upper in bin_upper_bounds],
    [1, 2, 3, 4],
    default=5,
)

### Generate additional variables

In [18]:
# Shipment weight rescaled by a factor of 2000.
df['SHIPMT_WGHT_TON'] = df['SHIPMT_WGHT'].div(2000)

# Shipment value per unit of shipment weight.
df['value_density'] = df['SHIPMT_VALUE'].div(df['SHIPMT_WGHT'])

# Use GC distance for Air and Parcel and Routed distance for the rest of modes.
uses_gc_distance = df['mode_agg5'].isin(['Air', 'Parcel'])
df['SHIPMT_DIST'] = np.where(uses_gc_distance,
                             df['SHIPMT_DIST_GC'],
                             df['SHIPMT_DIST_ROUTED'])

### Additional data removal rules

In [19]:
## based on Stinson et al. (2017)

# Each rule depends only on a row's own values, so the five filters are
# combined into one mask and applied with a single in-place drop.
is_air = df['mode_agg5'] == 'Air'
is_parcel = df['mode_agg5'] == 'Parcel'
is_rail_code = df['MODE'] == 6  # only MODE code 6; code 15 is not filtered here
over_150_low_density = (df['SHIPMT_WGHT'] > 150) & (df['value_density'] < 1)

rows_failing_rules = (
    (is_air & (df['SHIPMT_WGHT'] > 15000))
    | ((is_air | is_parcel) & over_150_low_density)
    | (is_rail_code & (df['SHIPMT_WGHT'] < 1500))
    | (is_rail_code & (df['value_density'] >= 4))
)
df.drop(df[rows_failing_rules].index, inplace=True)
