In [1]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable, Derive, log
import math
import random
import biogeme.results as res
from random import randint
import os, inspect
from sklearn.utils import shuffle
import numpy as np


import warnings
warnings.filterwarnings("ignore")

# Working directory that holds the CFS input/output files. The default is the
# original hard-coded location; set the SYNTHFIRM_CFS_DIR environment variable
# to run the notebook from another machine without editing this cell.
os.chdir(os.environ.get('SYNTHFIRM_CFS_DIR',
                        '/Users/xiaodanxu/Documents/SynthFirm.nosync/CFS'))

In [2]:
# Read the shipment table from CFS2012_austin_forML.csv (resolved against the
# current working directory), list its columns and preview the first rows.
austin_csv_name = 'CFS2012_austin_forML.csv'
Austin_df = pd.read_csv(austin_csv_name)
print(Austin_df.columns)
Austin_df.head(n=5)

Index(['SHIPMT_ID', 'ORIG_STATE', 'ORIG_MA', 'ORIG_CFS_AREA', 'DEST_STATE',
       'DEST_MA', 'DEST_CFS_AREA', 'NAICS', 'QUARTER', 'SCTG', 'MODE',
       'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_GC', 'SHIPMT_DIST_ROUTED',
       'TEMP_CNTL_YN', 'EXPORT_YN', 'EXPORT_CNTRY', 'HAZMAT', 'WGT_FACTOR',
       'mode_agg5', 'bulk', 'fuel_fert', 'interm_food', 'mfr_goods', 'other',
       'commodity', 'naics2', 'naics_name', 'wholesale', 'mfring', 'mining',
       'retail', 'info', 'management', 'transwarehouse', 'SHIPMT_WGHT_TON',
       'value_density', 'SHIPMT_DIST', 'choice', 'travel_time', 'ship_cost'],
      dtype='object')


Unnamed: 0,SHIPMT_ID,ORIG_STATE,ORIG_MA,ORIG_CFS_AREA,DEST_STATE,DEST_MA,DEST_CFS_AREA,NAICS,QUARTER,SCTG,...,retail,info,management,transwarehouse,SHIPMT_WGHT_TON,value_density,SHIPMT_DIST,choice,travel_time,ship_cost
0,25,48,99999,48-99999,48,99999,48-99999,311,1,5,...,0,0,0,0,16.6345,0.796477,219,2,19.369231,467.09676
1,44,48,41700,48-41700,48,41700,48-41700,4232,2,43,...,0,0,0,0,0.058,0.965517,3,4,4.15,0.49242
2,114,48,41700,48-41700,48,41700,48-41700,4244,2,5,...,0,0,0,0,0.014,3.357143,8,4,4.4,0.23772
3,146,48,99999,48-99999,48,99999,48-99999,4238,4,40,...,0,0,0,0,0.018,10.0,37,2,16.569231,1.37538
4,170,51,99999,51-99999,48,99999,48-99999,326,4,24,...,0,0,0,0,14.1195,0.783031,1290,2,49.947368,2648.25342


In [3]:
# clean the data

# Keep only rows whose EXPORT_YN flag is 'N', i.e. drop international exports.
is_not_export = Austin_df['EXPORT_YN'] == 'N'
df_clean = Austin_df.loc[is_not_export]

def removeOutliers(sName, df, k=3):
    """Drop rows whose value in column ``sName`` exceeds the upper IQR fence.

    Only the upper tail is trimmed: a row is kept when
    ``df[sName] <= Q3 + k * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``. There is no lower bound.

    Parameters
    ----------
    sName : str
        Name of the numeric column to screen. Any column label works,
        including names with spaces or other non-identifier characters.
    df : pandas.DataFrame
        Table to filter; it is not modified.
    k : float, default 3
        Multiplier applied to the IQR to place the upper fence.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` at or below the fence, with their original index.
        Rows whose value is NaN are dropped because the comparison is False.
    """
    Q1 = df[sName].quantile(0.25)
    Q3 = df[sName].quantile(0.75)
    IQR = Q3 - Q1

    # A boolean mask (rather than a string-built query) keeps this working
    # for column names that are not valid Python identifiers.
    upper_fence = Q3 + k * IQR
    return df.loc[df[sName] <= upper_fence]

# Trim the upper tail of shipment weight first, then of shipment distance
# (the second filter's quartiles are computed on the already-trimmed table).
for outlier_column in ('SHIPMT_WGHT', 'SHIPMT_DIST'):
    df_clean = removeOutliers(outlier_column, df_clean)
print(len(df_clean))

148912


In [7]:
# Estimate value density (unit cost): WGT_FACTOR-scaled shipment value divided
# by WGT_FACTOR-scaled tonnage, first per SCTG commodity and then per
# SCTG commodity x destination CFS area. Both tables are written to CSV.
scaled_cols = ['SHIPMT_WGHT_scaled', 'SHIPMT_VALUE_scaled']
df_clean.loc[:, 'SHIPMT_WGHT_scaled'] = df_clean['SHIPMT_WGHT_TON'] * df_clean['WGT_FACTOR']
df_clean.loc[:, 'SHIPMT_VALUE_scaled'] = df_clean['SHIPMT_VALUE'] * df_clean['WGT_FACTOR']

# --- unit cost by commodity ---
value_density_by_sctg = df_clean.groupby(['SCTG'])[scaled_cols].sum().reset_index()
value_density_by_sctg['unitcost'] = (
    value_density_by_sctg['SHIPMT_VALUE_scaled'] / value_density_by_sctg['SHIPMT_WGHT_scaled']
)
value_density_by_sctg = value_density_by_sctg[['SCTG', 'unitcost']]
value_density_by_sctg.columns = ['Commodity_SCTG', 'UnitCost']
value_density_by_sctg.to_csv('data_unitcost_cfs2012.csv', index = False)

# --- unit cost by commodity and destination zone ---
value_density_by_sctg_zone = (
    df_clean.groupby(['SCTG', 'DEST_CFS_AREA'])[scaled_cols].sum().reset_index()
)
value_density_by_sctg_zone['unitcost'] = (
    value_density_by_sctg_zone['SHIPMT_VALUE_scaled']
    / value_density_by_sctg_zone['SHIPMT_WGHT_scaled']
)
value_density_by_sctg_zone = value_density_by_sctg_zone[['SCTG', 'DEST_CFS_AREA', 'unitcost']]
value_density_by_sctg_zone.columns = ['Commodity_SCTG', 'DEST_CFS_AREA', 'UnitCost']
value_density_by_sctg_zone.to_csv('data_unitcost_by_zone_cfs2012.csv', index = False)

In [5]:
# Columns carried into the supplier-selection choice model.
var_to_keep = [
    'SHIPMT_ID', 'ORIG_CFS_AREA', 'DEST_CFS_AREA',
    'NAICS', 'naics_name',
    'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_GC', 'value_density',
]
df_clean_choice_model = df_clean[var_to_keep]

# Mean SHIPMT_DIST_GC for every observed origin/destination CFS-area pair.
od_pair = ['ORIG_CFS_AREA', 'DEST_CFS_AREA']
dist_matrix = (
    df_clean_choice_model
    .groupby(od_pair)[['SHIPMT_DIST_GC']]
    .mean()
    .reset_index()
)
print(dist_matrix.head(5))



  ORIG_CFS_AREA DEST_CFS_AREA  SHIPMT_DIST_GC
0        01-142      48-12420      683.695652
1        01-142      48-41700      742.881356
2        01-142      48-99999      700.282230
3        01-380      48-12420      589.692308
4        01-380      48-41700      621.314286


In [6]:
# Build the long-format choice set. Every observed shipment is a "chosen"
# alternative (chosen = 1) and is paired with non-chosen alternatives
# (chosen = 0) sampled with replacement from other shipments that share its
# NAICS code / naics_name.
N_CHUNKS = 10                 # each NAICS group is processed in this many row chunks
N_SAMPLED_ALTERNATIVES = 9    # non-chosen alternatives drawn per shipment
FALLBACK_DISTANCE = 3000      # used when an O-D pair is missing from dist_matrix

unique_naics = df_clean_choice_model.NAICS.unique()
# assign() returns a new frame, so nothing is written through to the frame
# this one was sliced from and re-running the cell is idempotent.
df_clean_choice_model = df_clean_choice_model.assign(chosen=0)

choice_set_parts = []
for naics in unique_naics:
    print(naics)
    all_suppliers = df_clean_choice_model.loc[df_clean_choice_model['NAICS'] == naics]
    # Split row positions instead of the DataFrame itself: this yields the
    # same partition without passing a DataFrame through numpy's splitter.
    row_chunks = np.array_split(np.arange(len(all_suppliers)), N_CHUNKS)
    for rows in row_chunks:
        # Work on an explicit copy so that flagging the chunk as chosen can
        # never leak into all_suppliers (the pool of non-chosen candidates).
        chunk = all_suppliers.iloc[rows].copy()
        chunk['chosen'] = 1

        shipment_to_match = chunk[['SHIPMT_ID', 'ORIG_CFS_AREA',
                                   'NAICS', 'naics_name', 'SHIPMT_WGHT']]
        selected_shipment = chunk.SHIPMT_ID.unique()
        is_candidate = ~all_suppliers['SHIPMT_ID'].isin(selected_shipment)
        non_chosen_set = all_suppliers.loc[
            is_candidate,
            ['DEST_CFS_AREA', 'NAICS', 'naics_name', 'value_density', 'chosen']]

        # Pair every shipment in the chunk with every candidate of the same
        # industry, then attach the mean O-D distance for the pairing.
        non_chosen_set = pd.merge(shipment_to_match, non_chosen_set,
                                  on=['NAICS', 'naics_name'], how='left')
        non_chosen_set = pd.merge(non_chosen_set, dist_matrix,
                                  on=['ORIG_CFS_AREA', 'DEST_CFS_AREA'], how='left')
        # Plain assignment instead of a chained in-place fillna, which may
        # operate on a temporary and silently leave the NaNs in place.
        non_chosen_set['SHIPMT_DIST_GC'] = (
            non_chosen_set['SHIPMT_DIST_GC'].fillna(FALLBACK_DISTANCE))
        non_chosen_set['SHIPMT_VALUE'] = (
            non_chosen_set['SHIPMT_WGHT'] * non_chosen_set['value_density'])
        non_chosen_set = non_chosen_set.groupby('SHIPMT_ID').sample(
            n=N_SAMPLED_ALTERNATIVES, replace=True, random_state=1)

        combined_set = pd.concat([chunk, non_chosen_set]).sort_values('SHIPMT_ID')
        choice_set_parts.append(combined_set)

# Concatenate once at the end; growing a frame inside the loop is quadratic.
supplier_selection_set = pd.concat(choice_set_parts) if choice_set_parts else None

311
4232
4244
4238
326
4541
4235
332
5111
331
323
4233
4234
339
4246
336
4243
325
333
551114
324
321
4231
4249
4931
4242
212
4237
315
314
335
4247
327
4236
334
312
337
4239
4241
4245
45431
313
316
322
4248


In [7]:
# create label
# Shuffle with a fixed seed so that the position of the chosen alternative
# inside each shipment's choice set is random but reproducible across runs.
SHUFFLE_SEED = 1
supplier_selection_set = shuffle(supplier_selection_set, random_state=SHUFFLE_SEED)
print(len(supplier_selection_set))
# A stable sort groups rows by shipment while keeping the shuffled order
# within each SHIPMT_ID, so the shuffle alone decides the alternative order.
supplier_selection_set = supplier_selection_set.sort_values('SHIPMT_ID', kind='mergesort')
supplier_selection_set['alternative'] = supplier_selection_set.groupby('SHIPMT_ID').cumcount() + 1
# choice holds the alternative number on the chosen row and 0 on every other
# row. np.where works positionally, so the non-unique row index is irrelevant.
supplier_selection_set['choice'] = np.where(
    supplier_selection_set['chosen'] == 1,
    supplier_selection_set['alternative'],
    0,
)

supplier_selection_set.head(10)

1489120


Unnamed: 0,SHIPMT_ID,ORIG_CFS_AREA,DEST_CFS_AREA,NAICS,naics_name,SHIPMT_VALUE,SHIPMT_WGHT,SHIPMT_DIST_GC,value_density,chosen,alternative,choice
5192,25,48-99999,48-99999,311,Manufacturing,6237.842417,33269,58.565367,0.187497,0,1,0
0,25,48-99999,48-99999,311,Manufacturing,26498.0,33269,156.0,0.796477,1,2,2
2895,25,48-99999,06-348,311,Manufacturing,16468.151074,33269,1168.25624,0.495,0,3,0
235,25,48-99999,39-198,311,Manufacturing,112415.75814,33269,1072.559524,3.378994,0,4,0
3980,25,48-99999,48-99999,311,Manufacturing,4241.280533,33269,58.565367,0.127484,0,5,0
5056,25,48-99999,48-41700,311,Manufacturing,22955.834898,33269,192.303529,0.690007,0,6,0
5157,25,48-99999,48-99999,311,Manufacturing,28798.776548,33269,58.565367,0.865634,0,7,0
905,25,48-99999,48-41700,311,Manufacturing,84974.053146,33269,192.303529,2.554151,0,8,0
144,25,48-99999,48-41700,311,Manufacturing,164656.218285,33269,192.303529,4.949239,0,9,0
2763,25,48-99999,48-99999,311,Manufacturing,4628.202663,33269,58.565367,0.139115,0,10,0


In [8]:
# convert long data to wide: one row per shipment, with one value_<k> and one
# distance_<k> column for every alternative number k.
choice = supplier_selection_set.loc[supplier_selection_set['chosen'] == 1,
                                    ['SHIPMT_ID', 'NAICS', 'choice']]

# The string alias 'mean' replaces the deprecated callable form (np.mean).
# Column names are derived from the pivot's own labels so that each name is
# guaranteed to match the alternative number it holds.
factor_1 = pd.pivot_table(supplier_selection_set, values='SHIPMT_VALUE', index=['SHIPMT_ID'],
                          columns=['alternative'], aggfunc='mean')
factor_1.columns = ['value_' + str(alt) for alt in factor_1.columns]
factor_1 = factor_1.reset_index()

factor_2 = pd.pivot_table(supplier_selection_set, values='SHIPMT_DIST_GC', index=['SHIPMT_ID'],
                          columns=['alternative'], aggfunc='mean')
factor_2.columns = ['distance_' + str(alt) for alt in factor_2.columns]
factor_2 = factor_2.reset_index()

destination_choice_data_wide = pd.merge(choice, factor_1,
                                        on='SHIPMT_ID', how='left')
destination_choice_data_wide = pd.merge(destination_choice_data_wide, factor_2,
                                        on='SHIPMT_ID', how='left')
destination_choice_data_wide.head(5)

Unnamed: 0,SHIPMT_ID,NAICS,choice,value_1,value_2,value_3,value_4,value_5,value_6,value_7,...,distance_1,distance_2,distance_3,distance_4,distance_5,distance_6,distance_7,distance_8,distance_9,distance_10
0,25,311,2,6237.842417,26498.0,16468.151074,112415.75814,4241.280533,22955.834898,28798.776548,...,58.565367,156.0,1168.25624,1072.559524,58.565367,192.303529,58.565367,192.303529,192.303529,58.565367
1,44,4232,2,217.128205,112.0,201.333333,277.111111,194.241611,591.485149,1415.2,...,789.36,3.0,11.730588,166.548253,166.548253,166.548253,63.556857,63.556857,166.548253,180.171304
2,114,4244,2,1.9628,94.0,79.946381,6.609946,162.33543,178.652874,19.056172,...,166.548253,6.0,166.548253,166.548253,166.548253,166.548253,11.730588,63.556857,11.730588,11.730588
3,146,4238,2,27.169811,360.0,2068.363636,920.25,140.273482,1884.0,3888.0,...,58.565367,27.0,58.565367,58.565367,58.565367,192.303529,58.565367,136.59375,192.303529,58.565367
4,170,326,7,136112.938883,46786.36012,61046.073533,593019.0,69545.1149,51292.872887,22112.0,...,1170.714286,3000.0,3000.0,3000.0,3000.0,3000.0,1042.0,1170.714286,1170.714286,1170.714286


In [9]:
# Estimate a multinomial logit for supplier selection on the full wide table.
N_ALTERNATIVES = 10  # the wide table carries value_1..value_10 / distance_1..distance_10

database = db.Database('destination_choice', destination_choice_data_wide)
# Kept for backward compatibility: exposes every column as a Biogeme variable
# in the notebook namespace. NOTE: this rebinds the name `choice`, which until
# now referred to the DataFrame built in the long-to-wide cell.
globals().update(database.variables)

# define parameters (start value 0, no bounds; last argument 0 —
# presumably "estimate this parameter", confirm against the Biogeme docs)
B_VALUE = Beta('B_VALUE', 0, None, None, 0)
B_DISTANCE = Beta('B_DISTANCE', 0, None, None, 0)

# Same linear utility for every alternative, with generic coefficients:
#   V_k = B_VALUE * value_k + B_DISTANCE * distance_k
V = {
    alt: (B_VALUE * database.variables[f'value_{alt}']
          + B_DISTANCE * database.variables[f'distance_{alt}'])
    for alt in range(1, N_ALTERNATIVES + 1)
}

# No availability conditions (None); the chosen alternative is read explicitly
# from the database instead of relying on the injected global.
logprob = models.loglogit(V, None, database.variables['choice'])
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'supplier_selection'

# Estimate the parameters
results = biogeme.estimate()

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
goodness_of_fit = results.getGeneralStatistics()
print('estimation results')
print(pandasResults)
print(goodness_of_fit['Rho-square-bar for the init. model'][0])

estimation results
               Value       Std err     t-test  p-value  Rob. Std err  \
B_DISTANCE -0.000564  5.800288e-06 -97.237104      0.0  5.043293e-06   
B_VALUE    -0.000012  1.382641e-07 -83.458203      0.0  4.486299e-07   

            Rob. t-test  Rob. p-value  
B_DISTANCE  -111.832333           0.0  
B_VALUE      -25.721134           0.0  
0.0006999544854197071


In [19]:
# Estimate the same multinomial logit on a subset of the wide table.
N_ALTERNATIVES = 10        # value_1..value_10 / distance_1..distance_10
N_ESTIMATION_ROWS = 50000  # number of leading rows used for this run

# Alternative subset (single industry):
# destination_choice_data_selected = destination_choice_data_wide.loc[destination_choice_data_wide['NAICS'] == 333]
destination_choice_data_selected = destination_choice_data_wide.head(N_ESTIMATION_ROWS)
database = db.Database('destination_choice', destination_choice_data_selected)
# Kept for backward compatibility: exposes every column as a Biogeme variable
# in the notebook namespace. NOTE: this rebinds the name `choice`.
globals().update(database.variables)

# define parameters (start value 0, no bounds; last argument 0 —
# presumably "estimate this parameter", confirm against the Biogeme docs)
B_VALUE = Beta('B_VALUE', 0, None, None, 0)
B_DISTANCE = Beta('B_DISTANCE', 0, None, None, 0)

# Same linear utility for every alternative, with generic coefficients:
#   V_k = B_VALUE * value_k + B_DISTANCE * distance_k
V = {
    alt: (B_VALUE * database.variables[f'value_{alt}']
          + B_DISTANCE * database.variables[f'distance_{alt}'])
    for alt in range(1, N_ALTERNATIVES + 1)
}

# No availability conditions (None); the chosen alternative is read explicitly
# from the database instead of relying on the injected global.
logprob = models.loglogit(V, None, database.variables['choice'])
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'supplier_selection'

# Estimate the parameters
results = biogeme.estimate()

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
goodness_of_fit = results.getGeneralStatistics()
print('estimation results')
print(pandasResults)
print(goodness_of_fit['Rho-square-bar for the init. model'][0])

estimation results
               Value       Std err     t-test  p-value  Rob. Std err  \
B_DISTANCE -0.000451  9.287972e-06 -48.529805      0.0      0.000009   
B_VALUE    -0.000011  2.250443e-07 -46.980704      0.0      0.000001   

            Rob. t-test  Rob. p-value  
B_DISTANCE   -50.218252           0.0  
B_VALUE       -9.797934           0.0  
-5.898282442906577e-06


In [12]:
# Split the current database into 5 slices, hand them to biogeme.validate
# together with the estimation results, and report the summed
# "Loglikelihood" column of each returned table.
validationData = database.split(slices=5)
validation_results = biogeme.validate(results, validationData)

for fold_frame in validation_results:
    n_obs = fold_frame.shape[0]
    total_loglik = fold_frame["Loglikelihood"].sum()
    print(f'Log likelihood for {n_obs} validation data: {total_loglik}')

Log likelihood for 10000 validation data: -22521.86062041235
Log likelihood for 10000 validation data: -22447.934767976683
Log likelihood for 10000 validation data: -22186.99405266812
Log likelihood for 10000 validation data: -22063.792608900138
Log likelihood for 10000 validation data: -21961.317473647527


In [20]:
# Print the log-likelihood expression of the most recently defined model; in
# the recorded output each Beta is shown with a numeric value next to its name
# (presumably the latest estimate — confirm against the Biogeme docs).
print(logprob)

_bioLogLogitFullChoiceSet(1:((B_VALUE(-1.0950943913870096e-05) * value_1) + (B_DISTANCE(-0.00044825343739563134) * distance_1)), 2:((B_VALUE(-1.0950943913870096e-05) * value_2) + (B_DISTANCE(-0.00044825343739563134) * distance_2)), 3:((B_VALUE(-1.0950943913870096e-05) * value_3) + (B_DISTANCE(-0.00044825343739563134) * distance_3)), 4:((B_VALUE(-1.0950943913870096e-05) * value_4) + (B_DISTANCE(-0.00044825343739563134) * distance_4)), 5:((B_VALUE(-1.0950943913870096e-05) * value_5) + (B_DISTANCE(-0.00044825343739563134) * distance_5)), 6:((B_VALUE(-1.0950943913870096e-05) * value_6) + (B_DISTANCE(-0.00044825343739563134) * distance_6)), 7:((B_VALUE(-1.0950943913870096e-05) * value_7) + (B_DISTANCE(-0.00044825343739563134) * distance_7)), 8:((B_VALUE(-1.0950943913870096e-05) * value_8) + (B_DISTANCE(-0.00044825343739563134) * distance_8)), 9:((B_VALUE(-1.0950943913870096e-05) * value_9) + (B_DISTANCE(-0.00044825343739563134) * distance_9)), 10:((B_VALUE(-1.0950943913870096e-05) * value_1