# Introduction:
In this notebook we'll test out Tensorflow Decsion Trees.

In [1]:
!pip install --upgrade tensorflow_decision_forests

[0m

# Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from category_encoders import LeaveOneOutEncoder
import optuna
from sklearn.preprocessing import StandardScaler

import tensorflow_decision_forests as tfdf

2023-02-15 17:40:28.765120: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-15 17:40:28.935079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:
2023-02-15 17:40:28.935115: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-15 17:40:30.203675: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [3]:
from warnings import filterwarnings
filterwarnings("ignore")

# Loading Data

In [5]:
BASE_PATH = Path("/kaggle/input/playground-series-s3e7")

train = pd.read_csv(BASE_PATH / "train.csv").drop(columns="id")
train["is_original"] = 0
test = pd.read_csv(BASE_PATH / "test.csv")
# we'll need the test ids to make the submission file
test_idx = test.id
test = test.drop(columns="id")
test["is_original"] = 0

original = pd.read_csv("/kaggle/input/reservation-cancellation-prediction/train__dataset.csv")
original["is_original"] =  1

In [6]:
all_datasets = {"train": train,
               "test": test,
               "original": original}

# Preliminary Data Analysis

# Removing anomalies.
Huge thanks to https://www.kaggle.com/competitions/playground-series-s3e7/discussion/386655

In [7]:
train['arrival_year_month'] = pd.to_datetime(train['arrival_year'].astype(str)
                                            +train['arrival_month'].astype(str), format='%Y%m')
test['arrival_year_month'] = pd.to_datetime(test['arrival_year'].astype(str)
                                            +test['arrival_month'].astype(str), format='%Y%m')
original["arrival_year_month"] = pd.to_datetime(original["arrival_year"].astype(str)
                                            +original["arrival_month"].astype(str), format="%Y%m")

train.loc[train.arrival_date > train.arrival_year_month.dt.days_in_month, 'arrival_date'] = train.arrival_year_month.dt.days_in_month
test.loc[test.arrival_date > test.arrival_year_month.dt.days_in_month, 'arrival_date'] = test.arrival_year_month.dt.days_in_month
original.loc[original.arrival_date > original.arrival_year_month.dt.days_in_month, 'arrival_date'] = original.arrival_year_month.dt.days_in_month

train.drop(columns='arrival_year_month', inplace=True)
test.drop(columns='arrival_year_month', inplace=True)
original.drop(columns="arrival_year_month", inplace=True)

In [8]:
combined_df = pd.concat([train, original], axis=0)

In [14]:
# train_c_df = train_test_split()

In [15]:
combined_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combined_df, label="booking_status")

In [16]:
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)

In [17]:
# tuner = tfdf.tuner.RandomSearch()
model = tfdf.keras.GradientBoostedTreesModel(verbose=0)
model.compile(metrics=[])
model.fit(combined_ds)

2023-02-15 17:41:32.236015: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:41:32.236252: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:41:32.236292: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:42:03.970290578+00:00 kernel.cc:1214] Loading model from path /tmp/tmp0tnba40i/model/ with prefix 2264d2757cb94d97
[INFO 2023-02-15T17:42:04.034283215+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T17:42:04.034558079+00:00 kernel.cc:1046] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


<keras.callbacks.History at 0x7f8fd831c6d0>

In [18]:
model.predict(test_ds)



array([[0.20864603],
       [0.05845499],
       [0.38541314],
       ...,
       [0.09983215],
       [0.5451986 ],
       [0.9301926 ]], dtype=float32)

# Setting up cross validation

In [19]:
def cross_validate(X, y, X_org, y_org):
    N_FOLDS = 5
    
    skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
    cv_scores = np.zeros(N_FOLDS)
    
    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # combining with original
        X_tr = pd.concat([X_tr, X_org], axis=0)
        y_tr = pd.concat([y_tr, y_org], axis=0)

        X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
        X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
        X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
        model = tfdf.keras.GradientBoostedTreesModel(verbose=0)
        model.fit(combined_ds)
        
        y_pred = model.predict(X_val_ds)[:, 0]
        
        auc = roc_auc_score(y_val, y_pred)
        
        print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
        cv_scores[fold_id] = auc
    
    avg_auc = np.mean(cv_scores)
    print(f"AVG AUC: {avg_auc}")

In [20]:
X = train.drop(columns="booking_status")
y = train.booking_status
X_original = original.drop(columns="booking_status")
y_original = original.booking_status

In [21]:
cross_validate(X, y, X_original, y_original)

2023-02-15 17:42:22.846171: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:42:22.846245: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:42:22.846260: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:42:54.39970624+00:00 kernel.cc:1214] Loading model from path /tmp/tmph969yqo3/model/ with prefix e2fdf5aeba7740c8
[INFO 2023-02-15T17:42:54.463120383+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T17:42:54.463184919+00:00 kernel.cc:1046] Use fast generic engine


Fold: 0 	 | 	 auc: 0.9194227331912879


2023-02-15 17:43:04.560151: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:43:04.560376: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:43:04.560415: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:43:36.164091464+00:00 kernel.cc:1214] Loading model from path /tmp/tmpgrha5zm3/model/ with prefix 2e061e88a5324cba
[INFO 2023-02-15T17:43:36.227065816+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T17:43:36.227124758+00:00 kernel.cc:1046] Use fast generic engine


Fold: 1 	 | 	 auc: 0.9206668328025054


2023-02-15 17:43:46.167845: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:43:46.167935: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:43:46.167950: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:44:17.639698179+00:00 kernel.cc:1214] Loading model from path /tmp/tmp16d0r5sg/model/ with prefix b30846906fe144ee
[INFO 2023-02-15T17:44:17.702737558+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T17:44:17.702803057+00:00 kernel.cc:1046] Use fast generic engine


Fold: 2 	 | 	 auc: 0.9160126818733234


2023-02-15 17:44:27.749405: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:44:27.749467: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:44:27.749479: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:44:59.196459688+00:00 kernel.cc:1214] Loading model from path /tmp/tmpklzq4jna/model/ with prefix 02db5b55baca425c
[INFO 2023-02-15T17:44:59.259962079+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T17:44:59.260036746+00:00 kernel.cc:1046] Use fast generic engine


Fold: 3 	 | 	 auc: 0.9216130199998


2023-02-15 17:45:00.914234: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:45:00.914312: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:45:00.914325: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:45:32.407779845+00:00 kernel.cc:1214] Loading model from path /tmp/tmp6aflbkq0/model/ with prefix fb051d9b62394758
[INFO 2023-02-15T17:45:32.471035026+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T17:45:32.471094746+00:00 kernel.cc:1046] Use fast generic engine


Fold: 4 	 | 	 auc: 0.9168535596221027
AVG AUC: 0.9189137654978039


## SHOCKING RESULTS | AVG AUC: 0.9189137654978039

# NEXT UP:
Currently TFDF is treating all features as numerical, but we clearly know there are many categorical, so let's set those categorical as categorical and see how it fares.

## Checking for categorical values

In [22]:
pd.concat([train.dtypes.rename("Data Type")] + \
          [dataset.nunique().rename(f"{dataset_name} UniqueValues") for dataset_name, dataset in all_datasets.items()],
          axis=1).sort_values(by="train UniqueValues")

Unnamed: 0,Data Type,train UniqueValues,test UniqueValues,original UniqueValues
is_original,int64,1,1.0,1
repeated_guest,int64,2,2.0,2
booking_status,int64,2,,2
required_car_parking_space,int64,2,2.0,2
arrival_year,int64,2,2.0,2
type_of_meal_plan,int64,4,4.0,4
market_segment_type,int64,5,5.0,5
no_of_adults,int64,5,5.0,5
no_of_children,int64,6,6.0,5
no_of_special_requests,int64,6,6.0,6


In [52]:
cat_features = [col for col in train.columns if train[col].nunique() <= 31]

# removinng booking status and is_original
cat_features = cat_features[:-2]
cat_features.append("is_original")

In [53]:
len(cat_features)

15

## Defining feature semantics

In [54]:
tf_cat_features = []
for feature in cat_features:
    tf_cat_features.append(tfdf.keras.FeatureUsage(name=str(feature), semantic=tfdf.keras.FeatureSemantic.CATEGORICAL))

# tf_cat_features

In [56]:
feat_1 = tfdf.keras.FeatureUsage(name="lead_time", semantic=tfdf.keras.FeatureSemantic.NUMERICAL)
feat_2 = tfdf.keras.FeatureUsage(name="avg_price_per_room", semantic=tfdf.keras.FeatureSemantic.NUMERICAL)
feat_3 = tfdf.keras.FeatureUsage(name="no_of_previous_bookings_not_canceled", semantic=tfdf.keras.FeatureSemantic.NUMERICAL)

tf_num_features = [feat_1, feat_2, feat_3]

all_features = tf_cat_features + tf_num_features

In [57]:
len(all_features)

18

In [58]:
set(X.columns) - set(cat_features)

{'avg_price_per_room', 'lead_time', 'no_of_previous_bookings_not_canceled'}

In [66]:
def cross_validate_with_features(X, y, X_org, y_org, all_features):
    N_FOLDS = 5
    
    skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
    cv_scores = np.zeros(N_FOLDS)
    all_test_preds  = []
    
    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # combining with original
        X_tr = pd.concat([X_tr, X_org], axis=0)
        y_tr = pd.concat([y_tr, y_org], axis=0)

        X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
        X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
        X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
        model = tfdf.keras.GradientBoostedTreesModel(verbose=0, features=all_features, exclude_non_specified_features=True)
        model.fit(combined_ds)
        
        y_pred = model.predict(X_val_ds)[:, 0]
        
        auc = roc_auc_score(y_val, y_pred)
        
        print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
        cv_scores[fold_id] = auc
        all_test_preds.append(model.predict(test_ds)[:, 0])

    
    avg_auc = np.mean(cv_scores)
    print(f"AVG AUC: {avg_auc}")
    return all_test_preds

In [67]:
final_test_preds = cross_validate_with_features(X, y, X_original, y_original, all_features)

2023-02-15 18:16:04.369922: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:16:04.370537: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:16:04.370580: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T18:16:25.575637936+00:00 kernel.cc:1214] Loading model from path /tmp/tmp7z58hnlr/model/ with prefix ffb55fea72e547d3
[INFO 2023-02-15T18:16:25.655277885+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T18:16:25.655418296+00:00 kernel.cc:1046] Use fast generic engine


Fold: 0 	 | 	 auc: 0.9316305338541667


2023-02-15 18:16:27.979901: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:16:27.979972: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:16:27.979985: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T18:16:49.018759416+00:00 kernel.cc:1214] Loading model from path /tmp/tmpbqlmhc98/model/ with prefix 86ba417e65744379
[INFO 2023-02-15T18:16:49.091454255+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T18:16:49.091520933+00:00 kernel.cc:1046] Use fast generic engine


Fold: 1 	 | 	 auc: 0.9331438572042936


2023-02-15 18:16:51.190759: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:16:51.190833: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:16:51.190859: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T18:17:12.084912413+00:00 kernel.cc:1214] Loading model from path /tmp/tmpov5f4t70/model/ with prefix d16b7545c0b640a8
[INFO 2023-02-15T18:17:12.152882295+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T18:17:12.152946904+00:00 kernel.cc:1046] Use fast generic engine


Fold: 2 	 | 	 auc: 0.9287090541092907


2023-02-15 18:17:33.639391: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:17:33.639470: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:17:33.639481: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T18:17:54.414394742+00:00 kernel.cc:1214] Loading model from path /tmp/tmpa6ns2a_2/model/ with prefix f64354cb228d455f
[INFO 2023-02-15T18:17:54.483743343+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T18:17:54.483833675+00:00 kernel.cc:1046] Use fast generic engine


Fold: 3 	 | 	 auc: 0.9329264031056316


2023-02-15 18:17:56.626388: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:17:56.626759: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 18:17:56.626799: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T18:18:17.900896799+00:00 kernel.cc:1214] Loading model from path /tmp/tmpxvams44w/model/ with prefix 7e713c7f3dad4008
[INFO 2023-02-15T18:18:17.968603692+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-15T18:18:17.968671987+00:00 kernel.cc:1046] Use fast generic engine


Fold: 4 	 | 	 auc: 0.9273402088162975
AVG AUC: 0.930750011417936


## EVEN MORE SHCOKING RESULTS | AVG AUC: 0.930750011417936

# Let's make the submission

In [76]:
y_preds_final = np.array(final_test_preds).mean(axis=0)

In [78]:
submission = pd.DataFrame({"id": test_idx, "booking_status": y_preds_final})
submission.head()

Unnamed: 0,id,booking_status
0,42100,0.094111
1,42101,0.04302
2,42102,0.257303
3,42103,0.070845
4,42104,0.435102


In [79]:
submission.to_csv("submission.csv", index=False)

# Next Up: Let's use predefined hyperparameters templates

In [32]:
# def cross_validate_with_features_and_hp_template(X, y, X_org, y_org, all_features):
#     N_FOLDS = 5
    
#     skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
#     cv_scores = np.zeros(N_FOLDS)
    
#     for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
#         X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         # combining with original
#         X_tr = pd.concat([X_tr, X_org], axis=0)
#         y_tr = pd.concat([y_tr, y_org], axis=0)

#         X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
#         X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
#         X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
#         model = tfdf.keras.GradientBoostedTreesModel(verbose=0, features=all_features, exclude_non_specified_features=True,
#                                                     hyperparameter_template="benchmark_rank1")
#         model.fit(combined_ds)
        
#         y_pred = model.predict(X_val_ds)[:, 0]
        
#         auc = roc_auc_score(y_val, y_pred)
        
#         print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
#         cv_scores[fold_id] = auc
    
#     avg_auc = np.mean(cv_scores)
#     print(f"AVG AUC: {avg_auc}")

In [33]:
# cross_validate_with_features_and_hp_template(X, y, X_original, y_original, all_features)

Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.


2023-02-15 17:53:17.727612: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:53:17.727712: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:53:17.727725: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:53:49.062177349+00:00 kernel.cc:1214] Loading model from path /tmp/tmps0vmf4no/model/ with prefix 8a81f6327a87497e
[INFO 2023-02-15T17:53:49.137919038+00:00 decision_forest.cc:661] Model loaded with 289 root(s), 17541 node(s), and 15 input feature(s).
[INFO 2023-02-15T17:53:49.138098438+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesGeneric" built
[INFO 2023-02-15T

Fold: 0 	 | 	 auc: 0.9212626657196971
Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.


2023-02-15 17:53:59.541324: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:53:59.541406: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:53:59.541420: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:54:31.030491246+00:00 kernel.cc:1214] Loading model from path /tmp/tmpx4c7ssvw/model/ with prefix 6af1a220a7324a84
[INFO 2023-02-15T17:54:31.106561174+00:00 decision_forest.cc:661] Model loaded with 289 root(s), 17541 node(s), and 15 input feature(s).
[INFO 2023-02-15T17:54:31.106748027+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesGeneric" built
[INFO 2023-02-15T

Fold: 1 	 | 	 auc: 0.9223771718705236
Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.


2023-02-15 17:54:33.234293: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:54:33.234374: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:54:33.234388: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:55:04.622060969+00:00 kernel.cc:1214] Loading model from path /tmp/tmp66tt04yl/model/ with prefix 036d772e11af481b
[INFO 2023-02-15T17:55:04.695497769+00:00 decision_forest.cc:661] Model loaded with 289 root(s), 17541 node(s), and 15 input feature(s).
[INFO 2023-02-15T17:55:04.695884754+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesGeneric" built
[INFO 2023-02-15T

Fold: 2 	 | 	 auc: 0.9194569429344699
Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.


2023-02-15 17:55:06.507816: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:55:06.508056: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:55:06.508085: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:55:37.621475427+00:00 kernel.cc:1214] Loading model from path /tmp/tmp84z61934/model/ with prefix 0b0254cd81e44e72
[INFO 2023-02-15T17:55:37.696144662+00:00 decision_forest.cc:661] Model loaded with 289 root(s), 17541 node(s), and 15 input feature(s).
[INFO 2023-02-15T17:55:37.696211216+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesGeneric" built
[INFO 2023-02-15T

Fold: 3 	 | 	 auc: 0.9253424066147236
Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.


2023-02-15 17:55:39.845949: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:55:39.846021: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-15 17:55:39.846038: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-15T17:56:10.945493516+00:00 kernel.cc:1214] Loading model from path /tmp/tmp8757xtzr/model/ with prefix 3cee2ff447944804
[INFO 2023-02-15T17:56:11.020667592+00:00 decision_forest.cc:661] Model loaded with 289 root(s), 17541 node(s), and 15 input feature(s).
[INFO 2023-02-15T17:56:11.020738362+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesGeneric" built
[INFO 2023-02-15T

Fold: 4 	 | 	 auc: 0.9169879852541916
AVG AUC: 0.9210854344787212


## RESULTS - a bit worse | AVG AUC: 0.9210854344787212

# NEXT UP: Let's tune hps automatically

In [None]:
# tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)
# tuned_model = tfdf.keras.GradientBoostedTreesModel(verbose=2, tuner=tuner)
# tuned_model.fit(combined_ds)

In [None]:
# def cross_validate_with_features_and_hp_template(X, y, X_org, y_org, all_features):
#     N_FOLDS = 5
    
#     skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
#     cv_scores = np.zeros(N_FOLDS)
    
#     for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
#         X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         # combining with original
#         X_tr = pd.concat([X_tr, X_org], axis=0)
#         y_tr = pd.concat([y_tr, y_org], axis=0)

#         X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
#         X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
#         X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
#         model = tfdf.keras.GradientBoostedTreesModel(verbose=0, features=all_features, exclude_non_specified_features=True,
#                                                     hyperparameter_template="benchmark_rank1")
#         model.fit(combined_ds)
        
#         y_pred = model.predict(X_val_ds)[:, 0]
        
#         auc = roc_auc_score(y_val, y_pred)
        
#         print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
#         cv_scores[fold_id] = auc
    
#     avg_auc = np.mean(cv_scores)
#     print(f"AVG AUC: {avg_auc}")

# Preprocessing
Tensorflow docs says, it doesn't need preprocessing, so let's see first how well it fares.

In [None]:
# X = train.drop(columns="booking_status")
# y = train.booking_status
# X_original = original.drop(columns="booking_status")
# y_original = original.booking_status

In [None]:
# len_X = len(X)

In [None]:
# X[cat_features] = X[cat_features].astype("category")
# test[cat_features] = test[cat_features].astype("category")
# X_original[cat_features] = X_original[cat_features].astype("category")

In [None]:
# X_combined = pd.concat([X, X_original], axis=0)
# y_combined = pd.concat([y, y_original], axis=0)

# Encoding Categorical features

In [None]:
# loe = LeaveOneOutEncoder(sigma=0.05)
# loe.fit(X_combined[cat_features], y=y_combined)
# X_combined[cat_features] = loe.transform(X_combined[cat_features])
# test[cat_features] = loe.transform(test[cat_features])

In [None]:
# test["no_of_children"] = test["no_of_children"].astype("int")

In [None]:
# test["no_of_previous_cancellations"] = test["no_of_previous_cancellations"].astype("int")

# Normalizing numeriacl features

In [None]:
# numerical_features = ["lead_time", "avg_price_per_room"]

# sc = StandardScaler()
# sc.fit(X_combined[numerical_features])
# X_combined[numerical_features] = sc.transform(X_combined[numerical_features])
# test[numerical_features] = sc.transform(test[numerical_features])

# Separating datasets

In [None]:
# X = X_combined.iloc[:len_X, :]
# y = y_combined.iloc[:len_X]
# X_org = X_combined.iloc[len_X: , :]
# y_org = y_combined.iloc[len_X:]

In [None]:
# len(X), len(X_org), len(y), len(y_org)

# Preparing for training
We'll only use data points from competition datset for validation since our goal is to get a model that performs best on competition dataset, not original dataset.

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True,
#                                                  random_state=1337, stratify=y)

In [None]:
# y_preds_final = np.mean([y_preds_xgb, y_preds_lgbm], axis=0)
# y_preds_final.shape

In [None]:
# submission = pd.DataFrame({"id": test_idx, "booking_status": y_preds_final})
# submission.head()

In [None]:
# submission.to_csv("submission.csv", index=False)