In [None]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably needed so that project-local modules imported in
# later cells can be resolved — confirm which ones live there.
sys.path.append("../")

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Assisted specification

## SwissMetro

In [None]:
import pandas as pd
import copy
import os
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler

from rumboost.metrics import cross_entropy
from rumboost.datasets import load_preprocess_SwissMetro
from rumboost.post_process import (
    estimate_dcm_with_assisted_spec,
    predict_with_assisted_spec,
)
from rumboost.rumboost import RUMBoost

from helper import set_all_seeds
from constants import (
    sm_bin_vars,
    sm_cont_vars,
    sm_structure,
    sm_monotone_constraints,
    PATH_TO_DATA,
)
from utils import transform_mono_cons, transform_vars_list, augment_dataset


# File holding the saved SwissMetro RUMBoost model, resolved against the
# current working directory.
path = f"{os.getcwd()}/results/SwissMetro/RUMBoost/linear/monoTrue/model_True_10bins.json"

# Rebuild the model object from that file.
sm_model_fully_trained = RUMBoost(model_file=path)

In [None]:
# Lookup tables keyed by dataset name; the preparation code below reads the
# "SwissMetro" entry from each of them.
dataset_loader = dict(SwissMetro=load_preprocess_SwissMetro)
dataset_vars = dict(SwissMetro=(sm_bin_vars, sm_cont_vars))
dataset_structure = dict(SwissMetro=sm_structure)
dataset_monotone_constraints = dict(SwissMetro=sm_monotone_constraints)
dataset_num_classes = dict(LPMC=4, SwissMetro=3)

# Set the random seed for reproducibility.
set_all_seeds(42)

# Load the data with the loader registered under "SwissMetro". It returns a
# train frame, a test frame and `folds`; `folds` is not used in the visible
# cells of this notebook.
data_train, data_test, folds = dataset_loader["SwissMetro"](path=PATH_TO_DATA)

# The "choice" column of each frame is used as the label vector.
target = "choice"
y_train = data_train[target]
y_test = data_test[target]

bin_vars, cont_vars = dataset_vars["SwissMetro"]

# Build the feature frames: continuous variables are augmented with
# type="constant_linear", binary variables with type="constant", and the two
# results are concatenated column-wise.
# NOTE(review): the exact columns produced by augment_dataset are not visible
# here — confirm against utils.augment_dataset.
X_train = augment_dataset(data_train, cont_vars, type="constant_linear")
X_test = augment_dataset(data_test, cont_vars, type="constant_linear")
X_train_bin = augment_dataset(data_train, bin_vars, type="constant")
X_test_bin = augment_dataset(data_test, bin_vars, type="constant")
X_train = pd.concat([X_train, X_train_bin], axis=1)
X_test = pd.concat([X_test, X_test_bin], axis=1)

# Transform the monotone constraints to a specific format.
# (monotone_constraints is not read again in the visible cells.)
monotone_constraints = transform_mono_cons(
    dataset_monotone_constraints["SwissMetro"],
    "constant_linear",
    cont_vars,
)
# Binary variables are never boosted from parameter space.
new_bin_vars = transform_vars_list(bin_vars, "constant", bin_vars)
bin_vars = new_bin_vars

# Work on a deep copy so the structure object imported from `constants` is
# not modified by the in-place rewrite of each entry below.
structure = copy.deepcopy(dataset_structure["SwissMetro"])
for u in structure:
    # Uses the original (untransformed) cont_vars and the already-transformed
    # bin_vars, which is why cont_vars is only rebound after this loop.
    structure[u] = (
        transform_vars_list(structure[u], "constant_linear", cont_vars) + bin_vars
    )
new_cont_vars = transform_vars_list(cont_vars, "constant_linear", cont_vars)
cont_vars = new_cont_vars

num_classes = dataset_num_classes["SwissMetro"]

# Scale the features. The scaler is fitted on the training set only and the
# same fitted transform is applied to the test set.
scaler = MinMaxScaler()
# The scaler returns plain arrays, so the rebuilt frames get a fresh default
# integer index; later cells reset the index of y_train / y_test to match.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

### Assisted-specified piece-wise linear DCM

In [None]:
# Estimate the assisted-specification DCM on the scaled training features,
# using the trained SwissMetro RUMBoost model. The labels are re-indexed from
# zero first so they line up with the freshly indexed X_train_scaled.
train_choice = y_train.reset_index(drop=True)
results = estimate_dcm_with_assisted_spec(
    X_train_scaled,
    train_choice,
    sm_model_fully_trained,
)

In [None]:
from rumboost.post_process import (
    predict_with_assisted_spec,
)

# Predict on the test set with the estimated assisted-specification DCM.
#
# NOTE(review): no visible cell adds a "choice" column to X_train_scaled, so
# the drop below relies on an earlier call having added it in place —
# presumably estimate_dcm_with_assisted_spec; confirm. errors="ignore" makes
# the drop a no-op when the column is absent, so this cell no longer raises a
# KeyError depending on which cells were run before it.
preds = predict_with_assisted_spec(
    X_train_scaled.drop(columns=["choice"], errors="ignore"),
    X_test_scaled,
    y_train.reset_index(drop=True),
    y_test.reset_index(drop=True),
    sm_model_fully_trained,
    results.get_beta_values(),
    # The next cell applies softmax to `preds`, so the output is presumably
    # raw utilities rather than probabilities — confirm against rumboost.
    utilities=True,
)

In [None]:
# Apply a row-wise softmax to the predictions, then report the cross-entropy
# against the integer-cast test labels.
real_preds = softmax(preds, axis=1)
test_labels = y_test.astype(int).values
print(cross_entropy(real_preds, test_labels))

0.7672599666764388


### MNL for comparison

In [None]:
from simple_mnl import SwissMetro_normalised

In [None]:
# Estimation frame for the MNL: scaled training features plus the observed
# choice, both on a fresh zero-based index.
new_df = (
    X_train_scaled.copy()
    .reset_index(drop=True)
    .assign(choice=y_train.reset_index(drop=True))
)
swissmetro = SwissMetro_normalised(new_df)

# Estimate the model and keep its coefficient values.
results_s = swissmetro.estimate()
betas_l = results_s.get_beta_values()

In [None]:
# Test frame for the MNL: scaled test features plus the observed choice,
# both on a fresh zero-based index.
new_df_test = (
    X_test_scaled.copy()
    .reset_index(drop=True)
    .assign(choice=y_test.reset_index(drop=True))
)
biogeme_test = SwissMetro_normalised(new_df_test, for_prob=False)

# Simulate with the estimated coefficients and display the column means.
biogeme_test.simulate(results_s.get_beta_values()).mean()

log_like   -0.794958
dtype: float64

## LPMC

In [None]:
import pandas as pd
import copy
import os
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler

from rumboost.metrics import cross_entropy
from rumboost.datasets import load_preprocess_LPMC
from rumboost.post_process import (
    estimate_dcm_with_assisted_spec,
    predict_with_assisted_spec,
)
from rumboost.rumboost import RUMBoost

from helper import set_all_seeds
from constants import (
    lpmc_bin_vars,
    lpmc_cont_vars,
    lpmc_mono_cons,
    lpmc_structure,
    PATH_TO_DATA,
)
from utils import transform_mono_cons, transform_vars_list, augment_dataset


# File holding the saved LPMC RUMBoost model, resolved against the current
# working directory.
path = f"{os.getcwd()}/results/LPMC/RUMBoost/linear/monoTrue/model_True_10bins.json"

# Rebuild the model object from that file.
lpmc_model_fully_trained = RUMBoost(model_file=path)

In [None]:
# Lookup tables keyed by dataset name; the preparation code below reads the
# "LPMC" entry from each of them.
dataset_loader = {
    # Must be the LPMC loader imported in the cell above. The previous mapping
    # to load_preprocess_SwissMetro made this section load SwissMetro data
    # while using the LPMC variables, structure and model.
    "LPMC": load_preprocess_LPMC,
}

dataset_vars = {
    "LPMC": (lpmc_bin_vars, lpmc_cont_vars),
}

dataset_structure = {
    "LPMC": lpmc_structure,
}
dataset_monotone_constraints = {
    "LPMC": lpmc_mono_cons,
}
dataset_num_classes = {
    "LPMC": 4,
    "SwissMetro": 3,
}

# Set the random seed for reproducibility.
set_all_seeds(42)

# Load the data with the loader registered under "LPMC" in dataset_loader.
# It returns a train frame, a test frame and `folds`; `folds` is not used in
# the visible cells of this notebook.
data_train, data_test, folds = dataset_loader["LPMC"](path=PATH_TO_DATA)

# The "choice" column of each frame is used as the label vector.
target = "choice"
y_train = data_train[target]
y_test = data_test[target]

bin_vars, cont_vars = dataset_vars["LPMC"]

# Build the feature frames: continuous variables are augmented with
# type="constant_linear", binary variables with type="constant", and the two
# results are concatenated column-wise.
# NOTE(review): the exact columns produced by augment_dataset are not visible
# here — confirm against utils.augment_dataset.
X_train = augment_dataset(data_train, cont_vars, type="constant_linear")
X_test = augment_dataset(data_test, cont_vars, type="constant_linear")
X_train_bin = augment_dataset(data_train, bin_vars, type="constant")
X_test_bin = augment_dataset(data_test, bin_vars, type="constant")
X_train = pd.concat([X_train, X_train_bin], axis=1)
X_test = pd.concat([X_test, X_test_bin], axis=1)

# Transform the monotone constraints to a specific format.
# (monotone_constraints is not read again in the visible cells.)
monotone_constraints = transform_mono_cons(
    dataset_monotone_constraints["LPMC"],
    "constant_linear",
    cont_vars,
)
# Binary variables are never boosted from parameter space.
new_bin_vars = transform_vars_list(bin_vars, "constant", bin_vars)
bin_vars = new_bin_vars

# Work on a deep copy so the structure object imported from `constants` is
# not modified by the in-place rewrite of each entry below.
structure = copy.deepcopy(dataset_structure["LPMC"])
for u in structure:
    # Uses the original (untransformed) cont_vars and the already-transformed
    # bin_vars, which is why cont_vars is only rebound after this loop.
    structure[u] = (
        transform_vars_list(structure[u], "constant_linear", cont_vars) + bin_vars
    )
new_cont_vars = transform_vars_list(cont_vars, "constant_linear", cont_vars)
cont_vars = new_cont_vars

num_classes = dataset_num_classes["LPMC"]

# Scale the features. The scaler is fitted on the training set only and the
# same fitted transform is applied to the test set.
scaler = MinMaxScaler()
# The scaler returns plain arrays, so the rebuilt frames get a fresh default
# integer index; later cells reset the index of y_train / y_test to match.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

### Assisted-specified piece-wise linear DCM

In [None]:
# Estimate the assisted-specification DCM on the scaled training features,
# using the trained LPMC RUMBoost model. The labels are re-indexed from zero
# first so they line up with the freshly indexed X_train_scaled.
train_choice = y_train.reset_index(drop=True)
results = estimate_dcm_with_assisted_spec(
    X_train_scaled,
    train_choice,
    lpmc_model_fully_trained,
)

In [None]:
from rumboost.post_process import (
    predict_with_assisted_spec,
)

# Predict on the test set with the estimated assisted-specification DCM.
#
# NOTE(review): no visible cell adds a "choice" column to X_train_scaled, so
# the drop below relies on an earlier call having added it in place —
# presumably estimate_dcm_with_assisted_spec; confirm. errors="ignore" makes
# the drop a no-op when the column is absent, so this cell no longer raises a
# KeyError depending on which cells were run before it.
preds = predict_with_assisted_spec(
    X_train_scaled.drop(columns=["choice"], errors="ignore"),
    X_test_scaled,
    y_train.reset_index(drop=True),
    y_test.reset_index(drop=True),
    lpmc_model_fully_trained,
    results.get_beta_values(),
    # The next cell applies softmax to `preds`, so the output is presumably
    # raw utilities rather than probabilities — confirm against rumboost.
    utilities=True,
)

In [None]:
# Apply a row-wise softmax to the predictions, then report the cross-entropy
# against the integer-cast test labels.
real_preds = softmax(preds, axis=1)
test_labels = y_test.astype(int).values
print(cross_entropy(real_preds, test_labels))

0.7672599666764388
