In [13]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable, Derive, log, bioDraws, MonteCarlo
import math
import random
import biogeme.results as res
from random import randint
import os, inspect
from sklearn.utils import shuffle
import numpy as np


import warnings
warnings.filterwarnings("ignore")

# Working directory holding the CFS input files. The original location is kept
# as the default; set SYNTHFIRM_CFS_DIR to run the notebook on another machine.
os.chdir(os.environ.get('SYNTHFIRM_CFS_DIR',
                        '/Users/xiaodanxu/Documents/SynthFirm.nosync/CFS'))

In [2]:
# Read the shipment table from the working directory set in the first cell.
Austin_df = pd.read_csv('CFS2017_austin_forML.csv')
# List the available columns, then preview the first five rows.
print(Austin_df.columns)
Austin_df.head(5)

Index(['SHIPMT_ID', 'ORIG_STATE', 'ORIG_MA', 'ORIG_CFS_AREA', 'DEST_STATE',
       'DEST_MA', 'DEST_CFS_AREA', 'NAICS', 'QUARTER', 'SCTG', 'MODE',
       'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_GC', 'SHIPMT_DIST_ROUTED',
       'TEMP_CNTL_YN', 'EXPORT_YN', 'EXPORT_CNTRY', 'HAZMAT', 'WGT_FACTOR',
       'mode_agg5', 'bulk', 'fuel_fert', 'interm_food', 'mfr_goods', 'other',
       'commodity', 'naics2', 'naics_name', 'wholesale', 'mfring', 'mining',
       'retail', 'info', 'management', 'transwarehouse', 'SHIPMT_WGHT_TON',
       'value_density', 'SHIPMT_DIST', 'choice', 'travel_time', 'ship_cost'],
      dtype='object')


Unnamed: 0,SHIPMT_ID,ORIG_STATE,ORIG_MA,ORIG_CFS_AREA,DEST_STATE,DEST_MA,DEST_CFS_AREA,NAICS,QUARTER,SCTG,...,retail,info,management,transwarehouse,SHIPMT_WGHT_TON,value_density,SHIPMT_DIST,choice,travel_time,ship_cost
0,6,48,12420,48-12420,36,160,36-160,4541,2,40,...,1,0,0,0,0.002,151.25,1348,3,24.0,128.766412
1,9,17,176,17-176,48,41700,48-41700,333,1,40,...,0,0,0,0,0.019,50.473684,1057,3,120.0,25.35561
2,13,48,206,48-206,48,99999,48-99999,322,2,28,...,0,0,0,0,0.342,0.982456,517,2,23.953846,79.686
3,44,13,122,13-122,48,99999,48-99999,4242,2,21,...,0,0,0,0,0.0005,3.0,729,3,120.0,13.052765
4,66,48,99999,48-99999,48,99999,48-99999,4239,2,26,...,0,0,0,0,69.2885,0.10667,419,5,55.045455,1132.243379


In [3]:
# Data cleaning, step 1: keep only rows flagged EXPORT_YN == 'N',
# i.e. drop the international export shipments.
domestic_mask = Austin_df['EXPORT_YN'] == 'N'
df_clean = Austin_df.loc[domestic_mask]

def removeOutliers(sName, df, iqr_multiplier=3):
    """Drop rows whose value in column ``sName`` exceeds the upper IQR fence.

    The fence is ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``. Only the upper
    tail is trimmed; no lower bound is applied. Rows whose value is NaN fail
    the comparison and are dropped as well.

    Parameters
    ----------
    sName : str
        Label of the column to screen. Any label works, including ones that
        are not valid Python identifiers (e.g. containing spaces).
    df : pandas.DataFrame
        Input frame; it is not modified.
    iqr_multiplier : float, default 3
        Number of IQRs above Q3 at which the fence is placed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``df[sName] <= Q3 + iqr_multiplier * IQR``.
    """
    q1 = df[sName].quantile(0.25)
    q3 = df[sName].quantile(0.75)
    upper_fence = q3 + iqr_multiplier * (q3 - q1)

    # Boolean mask instead of a string-built query: no expression parsing, so
    # the column label never has to be a valid identifier.
    return df.loc[df[sName] <= upper_fence]

# Trim the upper tail of shipment weight first, then of shipment distance
# (the second pass works on the already-trimmed frame), and report how many
# rows survive.
for outlier_column in ('SHIPMT_WGHT', 'SHIPMT_DIST'):
    df_clean = removeOutliers(outlier_column, df_clean)
print(len(df_clean))

206269


In [4]:
# Columns carried forward into the supplier-selection choice model.
var_to_keep = ['SHIPMT_ID', 'ORIG_CFS_AREA', 'DEST_CFS_AREA', 'NAICS', 'naics_name',
               'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_GC', 'value_density']

# Explicit copy: later cells add columns to this frame, and writing into a
# column subset of df_clean triggers pandas' SettingWithCopy machinery.
df_clean_choice_model = df_clean[var_to_keep].copy()

# Mean SHIPMT_DIST_GC for every observed origin/destination CFS-area pair,
# returned as a flat table with one row per pair.
dist_matrix = (
    df_clean_choice_model
    .groupby(['ORIG_CFS_AREA', 'DEST_CFS_AREA'])[['SHIPMT_DIST_GC']]
    .mean()
    .reset_index()
)
print(dist_matrix.head(5))



  ORIG_CFS_AREA DEST_CFS_AREA  SHIPMT_DIST_GC
0        01-142      48-12420      690.556962
1        01-142      48-41700      755.881579
2        01-142      48-99999      698.684659
3        01-380      48-12420      586.800000
4        01-380      48-41700      631.050000


In [5]:
# Build the long-format choice set. Every observed shipment is the chosen
# alternative (chosen = 1); 9 further alternatives (chosen = 0) are sampled
# with replacement from other shipments of the same NAICS code. Each NAICS
# group is processed in 10 slices to bound the size of the intermediate merge.
unique_naics = df_clean_choice_model.NAICS.unique()

# assign() returns a new frame, so nothing is written into a slice of the
# cleaned data and re-running this cell starts from the same state.
df_clean_choice_model = df_clean_choice_model.assign(chosen=0)

choice_sets = []  # one frame per processed slice, concatenated once at the end
for naics in unique_naics:
    print(naics)
    all_suppliers = df_clean_choice_model.loc[df_clean_choice_model['NAICS'] == naics]

    # Split row positions rather than the DataFrame itself: the partition is
    # the same as np.array_split(all_suppliers, 10), but it does not rely on
    # NumPy calling the deprecated DataFrame.swapaxes.
    for positions in np.array_split(np.arange(len(all_suppliers)), 10):
        # Independent copy, so flagging the slice as chosen can never leak
        # back into all_suppliers, from which non-chosen rows are drawn.
        chunk = all_suppliers.iloc[positions].copy()
        if chunk.empty:
            # Fewer than 10 shipments in this NAICS group: nothing to match.
            continue
        chunk['chosen'] = 1

        shipment_to_match = chunk[['SHIPMT_ID', 'ORIG_CFS_AREA',
                                   'NAICS', 'naics_name', 'SHIPMT_WGHT']]
        selected_shipment = chunk.SHIPMT_ID.unique()

        # Candidate alternatives: same-industry shipments outside this slice.
        non_chosen_set = all_suppliers.loc[
            ~all_suppliers['SHIPMT_ID'].isin(selected_shipment),
            ['DEST_CFS_AREA', 'NAICS', 'naics_name', 'value_density', 'chosen']
        ]
        # Pair every shipment of the slice with every candidate.
        non_chosen_set = pd.merge(shipment_to_match, non_chosen_set,
                                  on=['NAICS', 'naics_name'], how='left')
        non_chosen_set = pd.merge(non_chosen_set, dist_matrix,
                                  on=['ORIG_CFS_AREA', 'DEST_CFS_AREA'], how='left')

        # Origin/destination pairs absent from dist_matrix get the placeholder
        # distance 3000. Assign the result back: an in-place fillna on a .loc
        # selection acts on a temporary under pandas copy-on-write.
        non_chosen_set['SHIPMT_DIST_GC'] = non_chosen_set['SHIPMT_DIST_GC'].fillna(3000)
        # Alternative's shipment value = shipment weight x candidate's value density.
        non_chosen_set['SHIPMT_VALUE'] = (
            non_chosen_set['SHIPMT_WGHT'] * non_chosen_set['value_density']
        )
        non_chosen_set = non_chosen_set.groupby('SHIPMT_ID').sample(
            n=9, replace=True, random_state=1
        )

        combined_set = pd.concat([chunk, non_chosen_set]).sort_values('SHIPMT_ID')
        choice_sets.append(combined_set)

# Single concatenation instead of growing the frame inside the loop.
supplier_selection_set = pd.concat(choice_sets)

4541
333
322
4242
334
314
4244
337
4234
4235
4931
311
4241
4239
4233
321
4248
332
4236
312
323
339
4232
4238
326
4249
313
4243
4237
336
335
325
4231
4247
551114
327
5111
315
4246
316
45431
331
212
324
4245


In [6]:
# Create the choice label.
# Shuffle so that the chosen row does not always appear first within its
# shipment; the seed makes the alternative numbering reproducible.
supplier_selection_set = shuffle(supplier_selection_set, random_state=1)
print(len(supplier_selection_set))

# A stable sort keeps the shuffled order within each SHIPMT_ID, so the
# within-shipment ordering is fully determined by the seeded shuffle.
supplier_selection_set = supplier_selection_set.sort_values('SHIPMT_ID', kind='mergesort')

# Number the alternatives 1..k within each shipment.
supplier_selection_set['alternative'] = supplier_selection_set.groupby('SHIPMT_ID').cumcount() + 1

# 'choice' holds the alternative number on the chosen row and 0 elsewhere.
supplier_selection_set['choice'] = supplier_selection_set['alternative'].where(
    supplier_selection_set['chosen'] == 1, 0
)

supplier_selection_set.head(10)

2062690


Unnamed: 0,SHIPMT_ID,ORIG_CFS_AREA,DEST_CFS_AREA,NAICS,naics_name,SHIPMT_VALUE,SHIPMT_WGHT,SHIPMT_DIST_GC,value_density,chosen,alternative,choice
5192,6,48-12420,48-41700,4541,Retail,64.0,4,60.734532,16.0,0,1,0
5157,6,48-12420,48-99999,4541,Retail,7.529412,4,131.634192,1.882353,0,2,0
144,6,48-12420,48-12420,4541,Retail,4.0,4,12.616728,1.0,0,3,0
905,6,48-12420,36-104,4541,Retail,224.0,4,1575.703704,56.0,0,4,0
5056,6,48-12420,48-41700,4541,Retail,204.0,4,60.734532,51.0,0,5,0
0,6,48-12420,36-160,4541,Retail,605.0,4,1348.0,151.25,1,6,6
3980,6,48-12420,48-99999,4541,Retail,596.8,4,131.634192,149.2,0,7,0
2763,6,48-12420,06-472,4541,Retail,80.0,4,1457.526316,20.0,0,8,0
235,6,48-12420,48-12420,4541,Retail,17.855769,4,12.616728,4.463942,0,9,0
2895,6,48-12420,48-41700,4541,Retail,6444.0,4,60.734532,1611.0,0,10,0


In [7]:
# Convert the long choice set (one row per shipment/alternative) to wide
# format (one row per shipment, one column per alternative and attribute).
def _alternatives_to_wide(long_df, value_col, prefix):
    """Pivot ``value_col`` so each alternative becomes a ``<prefix>_<alt>`` column.

    Column names are derived from the alternative labels actually present in
    the pivot, not from an assumed count, so a missing or extra alternative
    cannot silently shift the names. The result has ``SHIPMT_ID`` as a regular
    column.
    """
    wide = pd.pivot_table(long_df, values=value_col, index=['SHIPMT_ID'],
                          columns=['alternative'], aggfunc='mean')
    wide.columns = [f'{prefix}_{alt}' for alt in wide.columns]
    return wide.reset_index()


# One row per shipment: the number of the alternative that was chosen.
choice = supplier_selection_set.loc[supplier_selection_set['chosen'] == 1,
                                    ['SHIPMT_ID', 'NAICS', 'choice']]

factor_1 = _alternatives_to_wide(supplier_selection_set, 'SHIPMT_VALUE', 'value')
factor_2 = _alternatives_to_wide(supplier_selection_set, 'SHIPMT_DIST_GC', 'distance')

destination_choice_data_wide = (
    choice
    .merge(factor_1, on='SHIPMT_ID', how='left')
    .merge(factor_2, on='SHIPMT_ID', how='left')
)
destination_choice_data_wide.head(5)

Unnamed: 0,SHIPMT_ID,NAICS,choice,value_1,value_2,value_3,value_4,value_5,value_6,value_7,...,distance_1,distance_2,distance_3,distance_4,distance_5,distance_6,distance_7,distance_8,distance_9,distance_10
0,6,4541,6,64.0,7.529412,4.0,224.0,204.0,605.0,596.8,...,60.734532,131.634192,12.616728,1575.703704,60.734532,1348.0,131.634192,1457.526316,12.616728,60.734532
1,9,333,8,11767.333335,99.860465,440.779005,857.111111,11.843501,176.644022,181.20296,...,969.204134,993.896161,1032.37415,993.896161,969.204134,1032.37415,993.896161,1057.0,993.896161,993.896161
2,13,322,8,1597.425,395.242745,1015.557919,1300.018349,2382.411438,476.633484,1465.497223,...,3000.0,198.052255,3000.0,198.052255,3000.0,198.052255,198.052255,466.0,3000.0,3000.0
3,44,4242,1,3.0,226.125,8.0,639.5,44.363636,104.042553,184.0,...,729.0,884.636042,884.636042,884.636042,825.007718,3000.0,3000.0,824.025806,825.007718,825.007718
4,130,334,5,350.35,4884.0,8046.5,16.923077,2984.0,4268.0,241.845506,...,131.634192,1479.373418,1343.764706,60.734532,3.0,60.734532,131.634192,131.634192,12.616728,865.766423


In [None]:
# Mixed logit: shipment value and distance coefficients are normally
# distributed across observations; estimated on the full wide table.
database = db.Database('destination_choice', destination_choice_data_wide)

# Look the Biogeme variables up explicitly instead of injecting every column
# into globals(); the injection rebinds notebook names such as the `choice`
# DataFrame built in the long-to-wide cell.
model_vars = database.variables

# define parameters: means ...
B_VALUE = Beta('B_VALUE', 0, None, None, 0)
B_DISTANCE = Beta('B_DISTANCE', 0, None, None, 0)

# ... and standard deviations of the random coefficients
B_VALUE_S = Beta('B_VALUE_S', 0.0001, None, None, 0)
B_DISTANCE_S = Beta('B_DISTANCE_S', 0.01, None, None, 0)

# Random parameters with a normal distribution, simulated by quasi Monte-Carlo
# with Halton draws. Each coefficient uses its own draw name and its own prime
# base (5 and 3): a Halton sequence is deterministic, so two parameters on the
# same base would presumably receive identical draws and be perfectly
# correlated. NOTE(review): confirm against Biogeme's native draw generators.
B_VALUE_RND = B_VALUE + B_VALUE_S * bioDraws('B_VALUE_RND', 'NORMAL_HALTON5')
B_DISTANCE_RND = B_DISTANCE + B_DISTANCE_S * bioDraws('B_DISTANCE_RND', 'NORMAL_HALTON3')

# Utility of alternatives 1..10; the specification is identical for all of them.
V = {
    alt: B_VALUE_RND * model_vars[f'value_{alt}']
         + B_DISTANCE_RND * model_vars[f'distance_{alt}']
    for alt in range(1, 11)
}

# Conditional logit probability, integrated over the draws, then logged.
# None = no availability conditions are passed.
prob = models.logit(V, None, model_vars['choice'])
logprob = log(MonteCarlo(prob))
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'supplier_selection_MX'

# Estimate the parameters
results = biogeme.estimate()

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
goodness_of_fit = results.getGeneralStatistics()
print('estimation results')
print(pandasResults)
print(goodness_of_fit['Rho-square-bar for the init. model'][0])

In [9]:
# Multinomial logit baseline on the first 50,000 rows of the wide table.
# (To fit a single industry instead, filter on NAICS, e.g. NAICS == 333.)
destination_choice_data_selected = destination_choice_data_wide.head(50000)
database = db.Database('destination_choice', destination_choice_data_selected)

# Look the Biogeme variables up explicitly instead of injecting every column
# into globals(); the injection rebinds notebook names such as the `choice`
# DataFrame built in the long-to-wide cell.
model_vars = database.variables

# define parameters
B_VALUE = Beta('B_VALUE', 0, None, None, 0)
B_DISTANCE = Beta('B_DISTANCE', 0, None, None, 0)

# Utility of alternatives 1..10; the specification is identical for all of them.
V = {
    alt: B_VALUE * model_vars[f'value_{alt}']
         + B_DISTANCE * model_vars[f'distance_{alt}']
    for alt in range(1, 11)
}

# Log-probability of the chosen alternative. None = no availability
# conditions are passed.
logprob = models.loglogit(V, None, model_vars['choice'])
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'supplier_selection'

# Estimate the parameters
results = biogeme.estimate()

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
goodness_of_fit = results.getGeneralStatistics()
print('estimation results')
print(pandasResults)
print(goodness_of_fit['Rho-square-bar for the init. model'][0])

estimation results
               Value       Std err     t-test  p-value  Rob. Std err  \
B_DISTANCE -0.000566  1.064970e-05 -53.166136      0.0  9.463055e-06   
B_VALUE    -0.000001  5.947764e-08 -18.736431      0.0  5.169378e-08   

            Rob. t-test  Rob. p-value  
B_DISTANCE   -59.833025           0.0  
B_VALUE      -21.557694           0.0  
0.004612733807154434


In [12]:
# Split the estimation database into 5 slices, validate the estimated model
# on them, and report the summed log likelihood of each validation slice.
validationData = database.split(slices=5)
validation_results = biogeme.validate(results, validationData)

for fold_result in validation_results:
    n_observations = fold_result.shape[0]
    total_loglikelihood = fold_result["Loglikelihood"].sum()
    print(f'Log likelihood for {n_observations} validation data: {total_loglikelihood}')

Log likelihood for 10000 validation data: -22521.86062041235
Log likelihood for 10000 validation data: -22447.934767976683
Log likelihood for 10000 validation data: -22186.99405266812
Log likelihood for 10000 validation data: -22063.792608900138
Log likelihood for 10000 validation data: -21961.317473647527


In [20]:
# Print the Biogeme expression currently bound to `logprob`.
print(logprob)

_bioLogLogitFullChoiceSet(1:((B_VALUE(-1.0950943913870096e-05) * value_1) + (B_DISTANCE(-0.00044825343739563134) * distance_1)), 2:((B_VALUE(-1.0950943913870096e-05) * value_2) + (B_DISTANCE(-0.00044825343739563134) * distance_2)), 3:((B_VALUE(-1.0950943913870096e-05) * value_3) + (B_DISTANCE(-0.00044825343739563134) * distance_3)), 4:((B_VALUE(-1.0950943913870096e-05) * value_4) + (B_DISTANCE(-0.00044825343739563134) * distance_4)), 5:((B_VALUE(-1.0950943913870096e-05) * value_5) + (B_DISTANCE(-0.00044825343739563134) * distance_5)), 6:((B_VALUE(-1.0950943913870096e-05) * value_6) + (B_DISTANCE(-0.00044825343739563134) * distance_6)), 7:((B_VALUE(-1.0950943913870096e-05) * value_7) + (B_DISTANCE(-0.00044825343739563134) * distance_7)), 8:((B_VALUE(-1.0950943913870096e-05) * value_8) + (B_DISTANCE(-0.00044825343739563134) * distance_8)), 9:((B_VALUE(-1.0950943913870096e-05) * value_9) + (B_DISTANCE(-0.00044825343739563134) * distance_9)), 10:((B_VALUE(-1.0950943913870096e-05) * value_1