In [1]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable, Derive, log, bioDraws, MonteCarlo
import math
import random
import biogeme.results as res
from random import randint
import os, inspect
from sklearn.utils import shuffle
import numpy as np


import warnings
# NOTE(review): this silences *every* warning for the whole session, including
# pandas chained-assignment / deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

# Directory holding the CFS input files. The original hard-coded location is
# kept as the default; set SYNTHFIRM_CFS_DIR to run on another machine.
os.chdir(os.environ.get('SYNTHFIRM_CFS_DIR',
                        '/Users/xiaodanxu/Documents/SynthFirm.nosync/CFS'))

In [3]:
# Read the shipment table (path is relative to the working directory set
# above), list its columns, then preview the first five rows.
region_df = pd.read_csv('CFS2017_bayarea_forML.csv')
print(region_df.columns)
region_df.head()

Index(['SHIPMT_ID', 'ORIG_STATE', 'ORIG_MA', 'ORIG_CFS_AREA', 'DEST_STATE',
       'DEST_MA', 'DEST_CFS_AREA', 'NAICS', 'QUARTER', 'SCTG', 'MODE',
       'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_GC', 'SHIPMT_DIST_ROUTED',
       'TEMP_CNTL_YN', 'EXPORT_YN', 'EXPORT_CNTRY', 'HAZMAT', 'WGT_FACTOR',
       'mode_agg5', 'mode_agg3', 'bulk', 'fuel_fert', 'interm_food',
       'mfr_goods', 'other', 'commodity', 'naics2', 'naics_name', 'wholesale',
       'mfring', 'mining', 'retail', 'info', 'management', 'transwarehouse',
       'geo', 'SHIPMT_WGHT_TON', 'value_density', 'SHIPMT_DIST', 'choice',
       'AV_1c', 'AV_2c', 'AV_3c', 'AV_4c', 'AV_5c', 'random_b',
       'alt_1_traveltime', 'alt_2_traveltime', 'alt_3_traveltime_b',
       'alt_4_traveltime', 'alt_5_traveltime', 'alt_1_shipcost',
       'alt_2_shipcost', 'alt_3_shipcost_b', 'alt_4_shipcost',
       'alt_5_shipcost'],
      dtype='object')


Unnamed: 0,SHIPMT_ID,ORIG_STATE,ORIG_MA,ORIG_CFS_AREA,DEST_STATE,DEST_MA,DEST_CFS_AREA,NAICS,QUARTER,SCTG,...,alt_1_traveltime,alt_2_traveltime,alt_3_traveltime_b,alt_4_traveltime,alt_5_traveltime,alt_1_shipcost,alt_2_shipcost,alt_3_shipcost_b,alt_4_shipcost,alt_5_shipcost
0,1,6,99999,06-99999,6,260,06-260,326,4,43,...,12.098271,16.923077,,16.923077,38.727273,346,5.2785,,5.2785,0.45747
1,4,6,260,06-260,6,99999,06-99999,212,4,11,...,12.054595,16.538462,,16.538462,37.590909,44867,121.2624,,121.2624,30.65244
2,14,6,488,06-488,48,206,48-206,334,2,35,...,14.644222,68.90625,120.0,68.90625,112.954545,55,2.055995,18.504242,2.055995,0.033014
3,43,6,348,06-348,6,472,06-472,4234,4,43,...,12.709736,22.846154,,22.846154,56.227273,429,46.215,,46.215,4.113135
4,48,6,488,06-488,25,148,25-148,4236,1,35,...,16.857143,110.125,120.0,110.125,172.909091,55,83.08597,27.357754,83.08597,1.292148


In [4]:
# Data cleaning, step 1: keep only the rows whose export flag is 'N'
# (i.e. drop international exports).

df_clean = region_df[region_df['EXPORT_YN'] == 'N']

def removeOutliers(sName, df, k=3):
    """Drop rows whose value in column ``sName`` lies above the upper IQR fence.

    Only the upper tail is trimmed: a row is kept when
    ``df[sName] <= Q3 + k * IQR`` with ``IQR = Q3 - Q1``. Small values are
    never removed, and rows with a missing value in ``sName`` are dropped
    because the comparison is False for NaN.

    Parameters
    ----------
    sName : str
        Name of the numeric column to filter on. Any column label works,
        including names containing spaces or other non-identifier characters.
    df : pandas.DataFrame
        Input table; it is not modified.
    k : float, default 3
        Multiplier applied to the IQR when computing the upper fence.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` at or below the upper fence, original index kept.
    """
    # Quartiles and inter-quartile range of the target column
    q1 = df[sName].quantile(0.25)
    q3 = df[sName].quantile(0.75)
    iqr = q3 - q1

    # Boolean mask instead of a string-built query: no parsing of the column
    # name, so labels that are not valid Python identifiers are safe.
    upper_fence = q3 + k * iqr
    filtered = df.loc[df[sName] <= upper_fence]

    return filtered

# Filter on weight first, then on distance; the distance quartiles are
# therefore computed on the already weight-filtered rows.
for outlier_col in ('SHIPMT_WGHT', 'SHIPMT_DIST'):
    df_clean = removeOutliers(outlier_col, df_clean)
print(len(df_clean))

254555


In [5]:
# Columns carried into the choice-model data set
var_to_keep = [
    'SHIPMT_ID', 'ORIG_CFS_AREA', 'DEST_CFS_AREA', 'NAICS', 'naics_name',
    'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_GC', 'value_density',
]

df_clean_choice_model = df_clean[var_to_keep]

# Origin-destination lookup: mean SHIPMT_DIST_GC over the observed shipments
# of every (origin area, destination area) pair, as a flat table.
dist_matrix = (
    df_clean_choice_model
    .groupby(['ORIG_CFS_AREA', 'DEST_CFS_AREA'])[['SHIPMT_DIST_GC']]
    .mean()
    .reset_index()
)
print(dist_matrix.head(5))


  ORIG_CFS_AREA DEST_CFS_AREA  SHIPMT_DIST_GC
0        01-142        06-260     1882.818182
1        01-142        06-472     1955.916667
2        01-142        06-488     2008.041667
3        01-142      06-99999     1965.477876
4        01-380        06-260     1856.000000


In [6]:
# Build the long-format choice set. Every observed shipment is the chosen
# alternative (chosen = 1); for each one, non-chosen alternatives (chosen = 0)
# are sampled from the other shipments with the same NAICS code.
N_CHUNKS = 10            # pieces per NAICS code; bounds the size of the cross-merge below
N_NON_CHOSEN = 9         # non-chosen alternatives sampled per shipment
FALLBACK_DIST_GC = 3000  # distance used when an O-D pair is absent from dist_matrix

match_cols = ['SHIPMT_ID', 'ORIG_CFS_AREA', 'NAICS', 'naics_name', 'SHIPMT_WGHT']
non_chosen_cols = ['DEST_CFS_AREA', 'NAICS', 'naics_name', 'value_density', 'chosen']

unique_naics = df_clean_choice_model.NAICS.unique()
# assign() returns a new frame, so this cell never writes into a slice of
# df_clean and can be re-run safely.
df_clean_choice_model = df_clean_choice_model.assign(chosen=0)

choice_set_parts = []  # collected pieces; concatenated once after the loops
for naics in unique_naics:
    print(naics)
    all_suppliers = df_clean_choice_model.loc[df_clean_choice_model['NAICS'] == naics]
    # Split row *positions* rather than the frame itself: same partition as
    # np.array_split(frame, N_CHUNKS), without relying on the deprecated
    # DataFrame code path.
    chunk_positions = np.array_split(np.arange(len(all_suppliers)), N_CHUNKS)
    for positions in chunk_positions:
        if len(positions) == 0:
            # fewer rows than chunks for this NAICS code: nothing to add
            continue
        # Independent copy flagged as chosen; all_suppliers keeps chosen = 0,
        # so later chunks never see a stale chosen = 1 among their candidates.
        chunk = all_suppliers.iloc[positions].assign(chosen=1)
        shipment_to_match = chunk[match_cols]

        # Candidates: every same-NAICS shipment that is not in this chunk
        is_other = ~all_suppliers['SHIPMT_ID'].isin(chunk['SHIPMT_ID'])
        non_chosen_set = all_suppliers.loc[is_other, non_chosen_cols]

        # Pair each shipment of the chunk with every candidate of its NAICS code
        non_chosen_set = pd.merge(shipment_to_match, non_chosen_set,
                                  on=['NAICS', 'naics_name'], how='left')
        # Look up the mean observed distance of the (origin, candidate destination) pair
        non_chosen_set = pd.merge(non_chosen_set, dist_matrix,
                                  on=['ORIG_CFS_AREA', 'DEST_CFS_AREA'], how='left')
        # Plain column assignment: an in-place fillna on a .loc selection is
        # not guaranteed to write back to the frame.
        non_chosen_set['SHIPMT_DIST_GC'] = non_chosen_set['SHIPMT_DIST_GC'].fillna(FALLBACK_DIST_GC)
        # Alternative's value = the shipment's weight x the candidate's value density
        non_chosen_set['SHIPMT_VALUE'] = non_chosen_set['SHIPMT_WGHT'] * non_chosen_set['value_density']
        # Sampling with replacement, so shipments with fewer than
        # N_NON_CHOSEN candidates still get a full set (with repeats).
        non_chosen_set = non_chosen_set.groupby('SHIPMT_ID').sample(
            n=N_NON_CHOSEN, replace=True, random_state=1)

        combined_set = pd.concat([chunk, non_chosen_set]).sort_values('SHIPMT_ID')
        choice_set_parts.append(combined_set)

# Single concatenation instead of growing the frame inside the loop
supplier_selection_set = pd.concat(choice_set_parts) if choice_set_parts else None

326
334
4234
4236
4232
4249
325
4235
4541
333
4931
4245
323
335
4244
339
311
4248
315
4242
4239
312
336
4231
4241
332
4237
4243
322
321
551114
45431
4247
337
314
4233
313
4238
327
331
4246
5111
316
212
324


In [7]:
# Create the choice label.
# The rows are shuffled so that the chosen alternative lands at a random
# position within each shipment. A fixed seed plus a stable sort makes that
# position reproducible across kernel restarts.
SHUFFLE_SEED = 1
supplier_selection_set = shuffle(supplier_selection_set, random_state=SHUFFLE_SEED)
print(len(supplier_selection_set))
# mergesort is stable: within a shipment the shuffled order is preserved
supplier_selection_set = supplier_selection_set.sort_values('SHIPMT_ID', kind='mergesort')

# Number the alternatives 1..n inside every shipment
supplier_selection_set['alternative'] = supplier_selection_set.groupby('SHIPMT_ID').cumcount() + 1

# choice = alternative number on the chosen row, 0 on every other row
is_chosen = supplier_selection_set['chosen'] == 1
supplier_selection_set['choice'] = supplier_selection_set['alternative'].where(is_chosen, 0)

supplier_selection_set.head(10)

2545550


Unnamed: 0,SHIPMT_ID,ORIG_CFS_AREA,DEST_CFS_AREA,NAICS,naics_name,SHIPMT_VALUE,SHIPMT_WGHT,SHIPMT_DIST_GC,value_density,chosen,alternative,choice
5157,1,06-99999,06-472,326,Manufacturing,30041.833332,391,128.15625,76.833333,0,1,0
5192,1,06-99999,06-472,326,Manufacturing,687.481405,391,128.15625,1.758264,0,2,0
5056,1,06-99999,39-212,326,Manufacturing,2215.666667,391,1982.833333,5.666667,0,3,0
2895,1,06-99999,29-312,326,Manufacturing,846.586376,391,1435.603774,2.165183,0,4,0
905,1,06-99999,06-41740,326,Manufacturing,2480.368421,391,286.472,6.343653,0,5,0
0,1,06-99999,06-260,326,Manufacturing,4380.0,391,54.0,11.202046,1,6,6
235,1,06-99999,06-488,326,Manufacturing,894.132169,391,95.44364,2.286783,0,7,0
144,1,06-99999,06-488,326,Manufacturing,17583.386139,391,95.44364,44.970297,0,8,0
2763,1,06-99999,06-99999,326,Manufacturing,8211.0,391,30.606095,21.0,0,9,0
3980,1,06-99999,06-99999,326,Manufacturing,4235.635259,391,30.606095,10.832827,0,10,0


In [8]:
# Convert the long choice set (one row per alternative) to wide format
# (one row per shipment, one column per alternative and attribute).

def _alternatives_to_wide(long_df, value_col, prefix):
    """Pivot ``value_col`` to one column per alternative, named ``<prefix>_<alternative>``.

    Column names are derived from the alternative numbers actually present
    in ``long_df`` rather than assigned by position, so a column can never be
    attached to the wrong alternative. Returns a frame with ``SHIPMT_ID`` as
    a regular column.
    """
    wide = pd.pivot_table(long_df, values=value_col, index=['SHIPMT_ID'],
                          columns=['alternative'], aggfunc='mean')
    wide.columns = [f'{prefix}_{alt}' for alt in wide.columns]
    return wide.reset_index()


# One row per shipment: its id, NAICS code and the number of the chosen alternative
choice = supplier_selection_set.loc[supplier_selection_set['chosen'] == 1,
                                     ['SHIPMT_ID', 'NAICS', 'choice']]

factor_1 = _alternatives_to_wide(supplier_selection_set, 'SHIPMT_VALUE', 'value')
factor_2 = _alternatives_to_wide(supplier_selection_set, 'SHIPMT_DIST_GC', 'distance')

destination_choice_data_wide = pd.merge(choice, factor_1,
                                        on='SHIPMT_ID', how='left')
destination_choice_data_wide = pd.merge(destination_choice_data_wide, factor_2,
                                        on='SHIPMT_ID', how='left')
destination_choice_data_wide.head(5)

Unnamed: 0,SHIPMT_ID,NAICS,choice,value_1,value_2,value_3,value_4,value_5,value_6,value_7,...,distance_1,distance_2,distance_3,distance_4,distance_5,distance_6,distance_7,distance_8,distance_9,distance_10
0,1,326,6,30041.833332,687.481405,2215.666667,846.586376,2480.368421,4380.0,894.132169,...,128.15625,128.15625,1982.833333,1435.603774,286.472,54.0,95.44364,95.44364,30.606095,30.606095
1,14,334,6,69.0,63.731707,87.166667,86.442308,32.666667,2681.0,216.625,...,74.089645,22.803029,22.803029,74.089645,22.803029,1453.0,1448.595212,22.803029,110.369583,22.803029
2,43,4234,4,25122.0,206.329412,17656.5,1179.0,1265.680851,43134.0,12640.000002,...,342.437034,204.158879,3000.0,390.0,204.158879,342.437034,220.99872,3000.0,342.437034,342.437034
3,48,4236,5,242.0,506.0,666.844444,608.666667,1925.0,352.956522,114.0,...,22.803029,532.866157,110.369583,110.369583,2669.0,397.966102,2417.638095,110.369583,2026.457143,22.803029
4,73,4232,5,279.074074,310.357143,14740.0,6794.333334,233.0,11696.666663,3813.333334,...,1707.384058,3000.0,1786.688172,1707.384058,1649.0,1750.905882,1707.384058,1707.384058,1707.384058,1750.905882


In [9]:
# Availability flags: every one of the 10 alternatives is marked available
# (av_<k> = 1) for every shipment.
for alt in range(1, 11):
    destination_choice_data_wide.loc[:, f'av_{alt}'] = 1

In [None]:
# Logit estimation on the full wide data set.
database = db.Database('destination_choice', destination_choice_data_wide)
# Expose every column as a Biogeme variable under its own name. This rebinds
# notebook globals such as `choice`, which was a DataFrame in an earlier cell.
globals().update(database.variables)
database.fullData

# Coefficients to estimate, shared by all alternatives.
# Arguments 2-5 are presumably start value 0, no lower/upper bound and
# status 0 (= estimated) -- confirm against the installed Biogeme version.
B_VALUE = Beta('B_VALUE', 0, None, None, 0)
B_DISTANCE = Beta('B_DISTANCE', 0, None, None, 0)

# Disabled random-coefficient variant (Halton draws, base 5), kept for reference:
# B_VALUE_S = Beta('B_VALUE_S', 0.0001, None, None, 0)
# B_DISTANCE_S = Beta('B_DISTANCE_S', 0.01, None, None, 0)
# B_VALUE_RND = B_VALUE + B_VALUE_S * bioDraws('B_TIME_RND', 'NORMAL_HALTON5')
# B_DISTANCE_RND = B_DISTANCE + B_DISTANCE_S * bioDraws('B_DISTANCE_RND', 'NORMAL_HALTON5')

alternative_ids = range(1, 11)

# Utility of alternative k: B_VALUE * value_k + B_DISTANCE * distance_k
V = {
    alt: B_VALUE * database.variables[f'value_{alt}']
    + B_DISTANCE * database.variables[f'distance_{alt}']
    for alt in alternative_ids
}

# Availability expression of each alternative
av = {alt: database.variables[f'av_{alt}'] for alt in alternative_ids}

# logprob = models.loglogit(V, None, choice)
logprob = models.loglogit(V, av, choice)
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'supplier_selection_bay'
biogeme.calculateNullLoglikelihood(av)

# Estimate the parameters
results = biogeme.estimate()

# Parameter table and fit statistics
pandasResults = results.getEstimatedParameters()
goodness_of_fit = results.getGeneralStatistics()
print('estimation results')
print(pandasResults)
print(goodness_of_fit['Rho-square for the null model'][0])

In [9]:
# Logit estimation on a subset of the wide data set.
# Alternative subset (disabled): a single NAICS code
# destination_choice_data_selected = destination_choice_data_wide.loc[destination_choice_data_wide['NAICS'] == 333]
# NOTE(review): head() takes the first 50,000 rows in their current order,
# not a random sample -- confirm this is intended.
destination_choice_data_selected = destination_choice_data_wide.head(50000)
database = db.Database('destination_choice', destination_choice_data_selected)
# Expose every column as a Biogeme variable under its own name. This rebinds
# notebook globals such as `choice`, which was a DataFrame in an earlier cell.
globals().update(database.variables)
database.fullData

# Coefficients to estimate, shared by all alternatives.
# Arguments 2-5 are presumably start value 0, no lower/upper bound and
# status 0 (= estimated) -- confirm against the installed Biogeme version.
B_VALUE = Beta('B_VALUE', 0, None, None, 0)
B_DISTANCE = Beta('B_DISTANCE', 0, None, None, 0)

alternative_ids = range(1, 11)

# Utility of alternative k: B_VALUE * value_k + B_DISTANCE * distance_k
V = {
    alt: B_VALUE * database.variables[f'value_{alt}']
    + B_DISTANCE * database.variables[f'distance_{alt}']
    for alt in alternative_ids
}

# Availability expression of each alternative
av = {alt: database.variables[f'av_{alt}'] for alt in alternative_ids}

logprob = models.loglogit(V, av, choice)
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'supplier_selection'
biogeme.calculateNullLoglikelihood(av)

# Estimate the parameters
results = biogeme.estimate()

# Parameter table and fit statistics
pandasResults = results.getEstimatedParameters()
goodness_of_fit = results.getGeneralStatistics()
print('estimation results')
print(pandasResults)
print(goodness_of_fit['Rho-square-bar for the init. model'][0])

estimation results
               Value       Std err     t-test  p-value  Rob. Std err  \
B_DISTANCE -0.000566  1.064970e-05 -53.166136      0.0  9.463055e-06   
B_VALUE    -0.000001  5.947764e-08 -18.736431      0.0  5.169378e-08   

            Rob. t-test  Rob. p-value  
B_DISTANCE   -59.833025           0.0  
B_VALUE      -21.557694           0.0  
0.004612733807154434


In [12]:
# Split the current database into 5 slices and validate the estimated model
# on them; report the summed log likelihood of each validation slice.
validationData = database.split(slices=5)

validation_results = biogeme.validate(results, validationData)

for fold_result in validation_results:
    n_validation_rows = fold_result.shape[0]
    fold_loglikelihood = fold_result["Loglikelihood"].sum()
    print(f'Log likelihood for {n_validation_rows} validation data: {fold_loglikelihood}')

Log likelihood for 10000 validation data: -22521.86062041235
Log likelihood for 10000 validation data: -22447.934767976683
Log likelihood for 10000 validation data: -22186.99405266812
Log likelihood for 10000 validation data: -22063.792608900138
Log likelihood for 10000 validation data: -21961.317473647527


In [20]:
# Print the log-probability expression; each Beta is shown with the value it
# currently holds.
# NOTE(review): the saved output below shows Beta values that differ from the
# estimates table printed above, so this cell was probably run against a
# different estimation -- re-run the notebook in order to refresh it.
print(logprob)

_bioLogLogitFullChoiceSet(1:((B_VALUE(-1.0950943913870096e-05) * value_1) + (B_DISTANCE(-0.00044825343739563134) * distance_1)), 2:((B_VALUE(-1.0950943913870096e-05) * value_2) + (B_DISTANCE(-0.00044825343739563134) * distance_2)), 3:((B_VALUE(-1.0950943913870096e-05) * value_3) + (B_DISTANCE(-0.00044825343739563134) * distance_3)), 4:((B_VALUE(-1.0950943913870096e-05) * value_4) + (B_DISTANCE(-0.00044825343739563134) * distance_4)), 5:((B_VALUE(-1.0950943913870096e-05) * value_5) + (B_DISTANCE(-0.00044825343739563134) * distance_5)), 6:((B_VALUE(-1.0950943913870096e-05) * value_6) + (B_DISTANCE(-0.00044825343739563134) * distance_6)), 7:((B_VALUE(-1.0950943913870096e-05) * value_7) + (B_DISTANCE(-0.00044825343739563134) * distance_7)), 8:((B_VALUE(-1.0950943913870096e-05) * value_8) + (B_DISTANCE(-0.00044825343739563134) * distance_8)), 9:((B_VALUE(-1.0950943913870096e-05) * value_9) + (B_DISTANCE(-0.00044825343739563134) * distance_9)), 10:((B_VALUE(-1.0950943913870096e-05) * value_1