In [1]:
# Translated to .py by Meritxell Pacheco (December 2016)
# Adapted to PandasBiogeme by Nicola Ortelli (November 2019)


import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta
from biogeme.models import loglogit
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from rumbooster import rum_train

# Load the raw Swissmetro observations (tab-separated) and wrap them in a
# Biogeme database.
df = pd.read_csv("Data/swissmetro.dat", sep = '\t')
database = db.Database("swissmetro", df)

# Expose every database column as a module-level Biogeme variable
# (PURPOSE, CHOICE, TRAIN_TT, ...), so they can be used in expressions below.
globals().update(database.variables)

# Exclude data: drop rows whose PURPOSE is neither 1 nor 3, and rows with CHOICE == 0.
exclude = (( PURPOSE != 1 ) * ( PURPOSE != 3 ) + ( CHOICE == 0 )) > 0
database.remove(exclude)

# Split the *filtered* observations held by the database. Reading them from
# `database.data` (rather than from `df`) makes the split independent of
# whether `Database.remove` happens to mutate the caller's DataFrame in place.
df_train, df_test = train_test_split(database.data, test_size=0.2, random_state = 42)

database_train = db.Database("swissmetro_train", df_train)
database_test = db.Database("swissmetro_test", df_test)

# Parameters to be estimated
# Beta(name, initial value, lower bound, upper bound, status); the last
# argument is 1 for ASC_TRAIN, which is registered under the name 'ASC_SBB'.
ASC_CAR   = Beta('ASC_CAR', 0, None, None, 0)
ASC_SM    = Beta('ASC_SM',  0, None, None, 0)
ASC_TRAIN = Beta('ASC_SBB', 0, None, None, 1)

# Time, cost and headway coefficients are bounded above by 0.
B_TIME = Beta('B_TIME', 0, None, 0, 0)
B_COST = Beta('B_COST', 0, None, 0, 0)
B_HE   = Beta('B_HE',   0, None, 0, 0)

# Definition of new variables: the cost is zeroed whenever GA != 0.
# NOTE(review): these columns are only defined on the training database;
# `database_test` does not get TRAIN_COST / SM_COST here.
TRAIN_COST = database_train.DefineVariable('TRAIN_COST', TRAIN_CO * ( GA == 0 ))
SM_COST    = database_train.DefineVariable('SM_COST', SM_CO * ( GA == 0 ))

# Utilities
V_TRAIN = ASC_TRAIN + B_TIME * TRAIN_TT + B_COST * TRAIN_COST + B_HE * TRAIN_HE
V_SM    = ASC_SM    + B_TIME * SM_TT    + B_COST * SM_COST    + B_HE * SM_HE
V_CAR   = ASC_CAR   + B_TIME * CAR_TT   + B_COST * CAR_CO

# Alternative id -> utility expression / availability variable.
V = {1: V_TRAIN, 2: V_SM, 3: V_CAR}
av = {1: TRAIN_AV, 2: SM_AV, 3: CAR_AV}

# Log-probability of the chosen alternative under a logit model, to be
# estimated on the training database.
logprob = loglogit(V, av, CHOICE)
biogeme_model = bio.BIOGEME(database_train, logprob)
biogeme_model.modelName = "Base Model"

In [5]:
# Inspect the availability dictionary stored on the log-likelihood expression
# (alternative id -> availability variable).
biogeme_model.loglike.av

{1: TRAIN_AV, 2: SM_AV, 3: CAR_AV}

In [3]:
def process_parent(parent, pairs):
    """
    Recursively collect (beta, variable) pairs from an expression tree.

    A node whose class name is 'Times' is treated as a `beta * variable`
    product and converted with `get_pair`. Any other node is descended into
    through its `left` and `right` children when it has both; nodes without
    those attributes are ignored.

    Parameters
    ----------
    parent : expression node
        Object exposing `getClassName()` and, for binary nodes, `left` and
        `right` attributes.
    pairs : list
        Accumulator that found pairs are appended to (mutated in place).

    Returns
    -------
    list
        The same `pairs` list, for every kind of node (including a 'Times'
        node at the root of the tree).

    Raises
    ------
    ValueError
        Propagated from `get_pair` when a 'Times' node is not a product of
        one Beta and one Variable.
    """
    if parent.getClassName() == 'Times':
        pairs.append(get_pair(parent))
        return pairs

    try:
        left = parent.left
        right = parent.right
    except AttributeError:
        # Not a binary node: nothing to descend into. Only the missing
        # attribute is swallowed; any other error is allowed to surface.
        return pairs

    process_parent(left, pairs)
    process_parent(right, pairs)
    return pairs
    
def get_pair(parent):
    """
    Extract the (beta name, variable name) pair from a binary product node.

    Both operands (`parent.left`, `parent.right`) are inspected, in either
    order: the one whose class name is 'Beta' supplies the beta name and the
    one whose class name is 'Variable' supplies the variable name.

    Returns
    -------
    tuple
        `(beta_name, variable_name)`.

    Raises
    ------
    ValueError
        If the two operands do not provide both a beta name and a variable name.
    """
    beta_name = None
    variable_name = None

    for operand in (parent.left, parent.right):
        kind = operand.getClassName()
        if kind == 'Beta':
            beta_name = operand.name
        elif kind == 'Variable':
            variable_name = operand.name

    if not (beta_name and variable_name):
        raise ValueError("Parent does not contain beta and variable")
    return (beta_name, variable_name)
        
def _monotone_from_bounds(lower, upper):
    """Translate the bounds of a beta into a monotone constraint: 1, -1 or 0."""
    if (lower is not None) and (upper is not None):
        raise ValueError("Only one bound can be not None")
    if lower is not None and lower >= 0:
        # Coefficient restricted to non-negative values.
        return 1
    if upper is not None and upper <= 0:
        # Coefficient restricted to non-positive values.
        return -1
    # Unbounded, or a bound that does not pin the sign: no constraint.
    return 0


def bio_to_rumboost(biogeme_model):
    '''
    Converts a biogeme model to a rumboost dict

    For every utility function in `biogeme_model.loglike.util` one dict is
    produced, with four parallel lists (one entry per `beta * variable` term
    found by `process_parent`):

    - 'columns': name of the variable in the term,
    - 'betas': name of the beta in the term,
    - 'interaction_constraints': `[i]`, the position of the term,
    - 'monotone_constraints': 1 if the beta has a lower bound >= 0, -1 if it
      has an upper bound <= 0, 0 otherwise.

    Parameters
    ----------
    biogeme_model
        Object exposing `loglike.util` (mapping of utilities) and
        `getBoundsOnBeta(name)` returning `(lower, upper)`.

    Returns
    -------
    list of dict
        One structure per utility, in the iteration order of
        `biogeme_model.loglike.util`.

    Raises
    ------
    ValueError
        If a beta has both a lower and an upper bound.
    '''
    utils = biogeme_model.loglike.util
    rum_structure = []

    for utility in utils.values():
        structure = {'columns': [], 'monotone_constraints': [], 'interaction_constraints': [], 'betas': []}
        for i, pair in enumerate(process_parent(utility, [])):
            structure['columns'].append(pair[1])
            structure['betas'].append(pair[0])
            structure['interaction_constraints'].append([i])
            bounds = biogeme_model.getBoundsOnBeta(pair[0])
            # Exactly one constraint per column keeps the lists aligned.
            structure['monotone_constraints'].append(
                _monotone_from_bounds(bounds[0], bounds[1])
            )
        rum_structure.append(structure)
    return rum_structure

def bio_rum_train(biogeme_model, param):
    """
    Train a RUM booster from a biogeme model specification.

    The rum structure is derived with `bio_to_rumboost`; the training set is
    built from the model's database, using the model's choice column shifted
    by -1 as the label. The same dataset is also passed as the only
    validation set.

    Parameters
    ----------
    biogeme_model
        Object exposing `database.data` and `loglike.choice.name`.
    param : dict
        Parameters forwarded unchanged to `rum_train`.

    Returns
    -------
    Whatever `rum_train` returns.
    """
    structure = bio_to_rumboost(biogeme_model)
    frame = biogeme_model.database.data
    choice_column = biogeme_model.loglike.choice.name
    labels = frame[choice_column] - 1
    dataset = lgb.Dataset(frame, label=labels, free_raw_data=False)
    return rum_train(
        param,
        dataset,
        valid_sets=[dataset],
        rum_structure=structure,
    )

In [5]:
# Number of observations in the database attached to the model
# (the training database passed to bio.BIOGEME).
biogeme_model.database.getNumberOfObservations()

5414

In [27]:
# Parameter dict handed to bio_rum_train, which forwards it unchanged to rum_train.
# NOTE(review): 'monotone_constraints' and 'interaction_constraints' are hard-coded
# for six features here, while bio_rum_train also derives per-utility constraints
# from the biogeme model (rum_structure) — confirm which of the two rum_train uses.
param = {'max_depth': 3, 
         'num_boost_round': 300, 
         'objective':'multiclass',
         'monotone_constraints': [-1, -1, -1, -1, -1, -1], 
         'interaction_constraints': [[0], [1], [2], [3], [4], [5]],
         'learning_rate': 0.2,
         'verbosity': 2,
         'num_classes': 3
        }

# Train on the data of the biogeme model defined above; the returned booster
# is the cell's displayed value.
bio_rum_train(biogeme_model, param)



[LightGBM] [Info] Number of positive: 747, number of negative: 4667
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.044884
[LightGBM] [Debug] init for col-wise cost 0.000006 seconds, init for row-wise cost 0.000132 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 5414, number of used features: 3
[LightGBM] [Info] Number of positive: 3250, number of negative: 2164
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.044884
[LightGBM] [Debug] init for col-wise cost 0.000008 seconds, init for row-wise cost 0.000107 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 351
[LightGBM] [Info] Number of data points in the train set: 5414, number of used features: 3
[LightGBM] [Info] Number of positive: 1417, number of negative: 3997
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.169745

<rumbooster.RUMBooster at 0x22e85c6d950>

In [1]:
# Helpers imported from the `biogeme_model` and `utils` modules
# (presumably project-local files next to this notebook — verify).
from biogeme_model import estimate_model, SwissMetro
from utils import load_prep_data


# Load and prepare the dataset from 'swissmetro.dat'.
dataset = load_prep_data('swissmetro.dat')

# Build the SwissMetro model object on top of the prepared dataset.
model = SwissMetro(dataset)

# Estimate the model and keep the value returned by estimate_model.
# NOTE(review): the name `crentrop` suggests a cross-entropy value — confirm
# against estimate_model's return value.
crentrop = estimate_model(model)




Im here
            Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CAR  0.189166      0.079763     2.371610  1.771077e-02
ASC_SM   0.451009      0.093241     4.837040  1.317868e-06
B_COST  -0.010847      0.000682   -15.895902  0.000000e+00
B_HE    -0.005354      0.000983    -5.445909  5.154136e-08
B_TIME  -0.012768      0.001044   -12.225484  0.000000e+00
Nbr of observations: 6768
LL(0) = -5315.386
LL(beta) = -5315.386
rho bar square = -0.000941
Output file: None


In [1]:
# compare_models comes from the `utils` module (presumably project-local — verify).
from utils import compare_models

# Run the comparison on a single dataset file; the estimation table and the
# LightGBM / logloss log below are printed by this call.
compare_models(['swissmetro.dat'])

            Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CAR  0.180990      0.088848     2.037087      0.041641
ASC_SM   0.437200      0.107215     4.077789      0.000045
B_COST  -0.010865      0.000727   -14.952283      0.000000
B_HE    -0.005198      0.001093    -4.754450      0.000002
B_TIME  -0.012486      0.001206   -10.356048      0.000000
Nbr of observations: 5414
LL(0) = -4294.640
LL(beta) = -4294.640
rho bar square = -0.00116
Output file: None
[LightGBM] [Info] Number of positive: 747, number of negative: 4667
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.044884
[LightGBM] [Debug] init for col-wise cost 0.000007 seconds, init for row-wise cost 0.000226 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 5414, number of used features: 3
[



[8] -- Logloss value: 0.8451999832570903
[9] -- Logloss value: 0.8396033822368697
[10] -- Logloss value: 0.8344888750446511
[11] -- Logloss value: 0.8294614315856085
[12] -- Logloss value: 0.825368876466664
[13] -- Logloss value: 0.8218296751530165
[14] -- Logloss value: 0.8185435155202303
[15] -- Logloss value: 0.8155873424736181
[16] -- Logloss value: 0.812729642921651
[17] -- Logloss value: 0.8101286317504329
[18] -- Logloss value: 0.8078289755741784
[19] -- Logloss value: 0.8056647513281899
[20] -- Logloss value: 0.8035403319092339
[21] -- Logloss value: 0.801587025013456
[22] -- Logloss value: 0.799832310511406
[23] -- Logloss value: 0.7980143141943479
[24] -- Logloss value: 0.7963508446454496
[25] -- Logloss value: 0.7947201562250229
[26] -- Logloss value: 0.7931371033627805
[27] -- Logloss value: 0.791665150819721
[28] -- Logloss value: 0.7902477367044384
[29] -- Logloss value: 0.7889587301441279
[30] -- Logloss value: 0.7876457693481285
[31] -- Logloss value: 0.786409865998278
