In [1]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable, Derive, log
import math
import random
import biogeme.results as res
from random import randint
import os, inspect
from sklearn.utils import shuffle
import numpy as np


import warnings
# NOTE: this silences *every* warning raised in the kernel, including pandas
# diagnostics such as SettingWithCopyWarning / FutureWarning.
warnings.filterwarnings("ignore")

# Working directory that holds the CFS input files read with relative paths below.
# Set the SYNTHFIRM_CFS_DIR environment variable to run on another machine;
# when it is unset the original location is used.
os.chdir(os.environ.get('SYNTHFIRM_CFS_DIR',
                        '/Users/xiaodanxu/Documents/SynthFirm.nosync/CFS'))

In [2]:
# Load 'CFS2012_austin_forML.csv' (resolved relative to the current working directory).
Austin_df = pd.read_csv(filepath_or_buffer='CFS2012_austin_forML.csv')

# List the available columns, then preview the first five rows as the cell output.
print(Austin_df.columns)
Austin_df.head()

Index(['SHIPMT_ID', 'ORIG_STATE', 'ORIG_MA', 'ORIG_CFS_AREA', 'DEST_STATE',
       'DEST_MA', 'DEST_CFS_AREA', 'NAICS', 'QUARTER', 'SCTG', 'MODE',
       'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_GC', 'SHIPMT_DIST_ROUTED',
       'TEMP_CNTL_YN', 'EXPORT_YN', 'EXPORT_CNTRY', 'HAZMAT', 'WGT_FACTOR',
       'mode_agg5', 'bulk', 'fuel_fert', 'interm_food', 'mfr_goods', 'other',
       'commodity', 'naics2', 'naics_name', 'wholesale', 'mfring', 'mining',
       'retail', 'info', 'management', 'transwarehouse', 'SHIPMT_WGHT_TON',
       'value_density', 'SHIPMT_DIST', 'choice', 'travel_time', 'ship_cost'],
      dtype='object')


Unnamed: 0,SHIPMT_ID,ORIG_STATE,ORIG_MA,ORIG_CFS_AREA,DEST_STATE,DEST_MA,DEST_CFS_AREA,NAICS,QUARTER,SCTG,...,retail,info,management,transwarehouse,SHIPMT_WGHT_TON,value_density,SHIPMT_DIST,choice,travel_time,ship_cost
0,25,48,99999,48-99999,48,99999,48-99999,311,1,5,...,0,0,0,0,16.6345,0.796477,219,2,19.369231,467.09676
1,44,48,41700,48-41700,48,41700,48-41700,4232,2,43,...,0,0,0,0,0.058,0.965517,3,4,4.15,0.49242
2,114,48,41700,48-41700,48,41700,48-41700,4244,2,5,...,0,0,0,0,0.014,3.357143,8,4,4.4,0.23772
3,146,48,99999,48-99999,48,99999,48-99999,4238,4,40,...,0,0,0,0,0.018,10.0,37,2,16.569231,1.37538
4,170,51,99999,51-99999,48,99999,48-99999,326,4,24,...,0,0,0,0,14.1195,0.783031,1290,2,49.947368,2648.25342


In [3]:
# Data cleaning, step 1: keep only rows flagged EXPORT_YN == 'N',
# i.e. discard international export shipments.
df_clean = Austin_df[Austin_df['EXPORT_YN'].eq('N')]

def removeOutliers(sName, df, iqr_multiplier=3):
    """Drop rows whose value in column ``sName`` lies above the upper IQR fence.

    Only the upper tail is trimmed: a row is kept when
    ``df[sName] <= Q3 + iqr_multiplier * IQR`` where ``IQR = Q3 - Q1``.
    No lower bound is applied. Rows where ``df[sName]`` is NaN are dropped
    because the comparison evaluates to False for them.

    Parameters
    ----------
    sName : str
        Name of the numeric column to screen. Any column label works,
        including names that are not valid Python identifiers.
    df : pandas.DataFrame
        Input frame; it is not modified.
    iqr_multiplier : float, default 3
        Number of IQRs above the third quartile at which the fence is placed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` at or below the fence, with their original index labels.
    """
    # Quartiles and the upper fence
    Q1 = df[sName].quantile(0.25)
    Q3 = df[sName].quantile(0.75)
    upper_fence = Q3 + iqr_multiplier * (Q3 - Q1)

    # A boolean mask is used rather than a string-built query so that the
    # column name never has to be parsed as an expression.
    return df.loc[df[sName] <= upper_fence]

# Data cleaning, step 2: trim outliers on shipment weight first, then on
# shipment distance (the second pass sees the already weight-filtered frame).
for outlier_column in ('SHIPMT_WGHT', 'SHIPMT_DIST'):
    df_clean = removeOutliers(outlier_column, df_clean)

# Number of shipments left after cleaning
print(df_clean.shape[0])

148912


In [10]:
# Columns carried into the supplier-selection (choice model) data set
var_to_keep = ['SHIPMT_ID', 'ORIG_CFS_AREA', 'DEST_CFS_AREA', 'NAICS', 'naics_name',
               'SHIPMT_VALUE', 'SHIPMT_WGHT', 'SHIPMT_DIST_ROUTED', 'value_density']

# .copy() makes this an independent frame, so columns added to it afterwards
# are not assignments on a selection taken from df_clean
# (which pandas reports as SettingWithCopyWarning).
df_clean_choice_model = df_clean[var_to_keep].copy()

# Mean routed distance for every observed (origin CFS area, destination CFS area) pair,
# returned as a flat table with one row per pair.
dist_matrix = (
    df_clean_choice_model
    .groupby(['ORIG_CFS_AREA', 'DEST_CFS_AREA'])[['SHIPMT_DIST_ROUTED']]
    .mean()
    .reset_index()
)
print(dist_matrix.head(5))



  ORIG_CFS_AREA DEST_CFS_AREA  SHIPMT_DIST_ROUTED
0        01-142      48-12420          831.869565
1        01-142      48-41700          873.169492
2        01-142      48-99999          817.024390
3        01-380      48-12420          652.153846
4        01-380      48-41700          663.171429


In [18]:
# Build the supplier-selection choice set.
# For every shipment (the chosen alternative, chosen = 1) sample N_ALTERNATIVES
# non-chosen alternatives (chosen = 0) from the other shipments with the same NAICS code.
# An alternative takes its destination and value density from the sampled shipment,
# while origin and weight come from the shipment being matched.
N_SUPPLIER_CHUNKS = 10      # each NAICS group is processed in this many batches to bound the merge size
N_ALTERNATIVES = 9          # non-chosen alternatives sampled (with replacement) per shipment
DEFAULT_ROUTED_DIST = 3000  # routed distance used when the O-D pair is missing from dist_matrix
SAMPLING_SEED = 1           # random_state passed to the alternative sampling

unique_naics = df_clean_choice_model.NAICS.unique()
# assign() returns a new frame, so this never writes into a slice of another DataFrame.
df_clean_choice_model = df_clean_choice_model.assign(chosen=0)

# Collect the per-batch results and concatenate once at the end
# (concatenating inside the loop re-copies the accumulated frame on every pass).
choice_set_parts = []
for naics in unique_naics:
    print(naics)
    all_suppliers = df_clean_choice_model.loc[df_clean_choice_model['NAICS'] == naics]
    # Split row positions (not the DataFrame itself) into near-equal batches.
    chunk_positions = np.array_split(np.arange(len(all_suppliers)), N_SUPPLIER_CHUNKS)
    for positions in chunk_positions:
        if len(positions) == 0:
            # Fewer suppliers than batches: nothing to process in this batch.
            continue

        # Explicit copy: flagging the batch as chosen must not leak into all_suppliers,
        # which is reused as the pool of alternatives for the other batches.
        chunk = all_suppliers.iloc[positions].copy()
        chunk['chosen'] = 1

        shipment_to_match = chunk[['SHIPMT_ID', 'ORIG_CFS_AREA',
                                   'NAICS', 'naics_name', 'SHIPMT_WGHT']]
        selected_shipment = chunk.SHIPMT_ID.unique()

        # Pool of alternatives: same-NAICS shipments that are not in the current batch.
        non_chosen_set = all_suppliers.loc[~all_suppliers['SHIPMT_ID'].isin(selected_shipment)]
        non_chosen_set = non_chosen_set[['DEST_CFS_AREA', 'NAICS',
                                         'naics_name', 'value_density', 'chosen']]

        # Pair every shipment of the batch with every alternative of the pool.
        # NOTE(review): with how='left', a shipment whose pool is empty keeps one row
        # with missing destination/value_density - confirm this is intended.
        non_chosen_set = pd.merge(shipment_to_match, non_chosen_set,
                                  on=['NAICS', 'naics_name'], how='left')
        non_chosen_set = pd.merge(non_chosen_set, dist_matrix,
                                  on=['ORIG_CFS_AREA', 'DEST_CFS_AREA'], how='left')

        # Plain column assignment: an in-place fillna on a .loc selection acts on an
        # intermediate object and is not guaranteed to update the frame.
        non_chosen_set['SHIPMT_DIST_ROUTED'] = (
            non_chosen_set['SHIPMT_DIST_ROUTED'].fillna(DEFAULT_ROUTED_DIST)
        )
        non_chosen_set['SHIPMT_VALUE'] = (
            non_chosen_set['SHIPMT_WGHT'] * non_chosen_set['value_density']
        )
        # Alternatives are non-chosen by construction.
        non_chosen_set['chosen'] = 0

        non_chosen_set = non_chosen_set.groupby('SHIPMT_ID').sample(
            n=N_ALTERNATIVES, replace=True, random_state=SAMPLING_SEED)

        # Stable sort keeps the chosen row (concatenated first) ahead of its alternatives.
        combined_set = pd.concat([chunk, non_chosen_set])
        combined_set = combined_set.sort_values('SHIPMT_ID', kind='stable')
        choice_set_parts.append(combined_set)

# Stays None when there was nothing to process.
supplier_selection_set = pd.concat(choice_set_parts) if choice_set_parts else None

311
       SHIPMT_ID ORIG_CFS_AREA DEST_CFS_AREA  NAICS     naics_name  \
0             25      48-99999      48-99999    311  Manufacturing   
5192          25      48-99999      48-99999    311  Manufacturing   
3980          25      48-99999      48-99999    311  Manufacturing   
235           25      48-99999        39-198    311  Manufacturing   
2895          25      48-99999        06-348    311  Manufacturing   
5056          25      48-99999      48-41700    311  Manufacturing   
144           25      48-99999      48-41700    311  Manufacturing   
905           25      48-99999      48-41700    311  Manufacturing   
2763          25      48-99999      48-99999    311  Manufacturing   
5157          25      48-99999      48-99999    311  Manufacturing   
9739        1061        33-148      48-12420    311  Manufacturing   
6716        1061        33-148        48-206    311  Manufacturing   
10910       1061        33-148      48-41700    311  Manufacturing   
10888       1061