# Introduction:
In this notebook we'll test out Tensorflow Decsion Trees.

In [1]:
!pip install --upgrade tensorflow_decision_forests

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tensorflow~=2.11.0
  Downloading tensorflow-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting wurlitzer
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_io_gcs_filesystem-0.30.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting absl-py
  Downloading absl_py-1.4.0-py3-none-any.whl (

# Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from category_encoders import LeaveOneOutEncoder
import optuna
from sklearn.preprocessing import StandardScaler

import tensorflow_decision_forests as tfdf

2023-02-16 05:58:58.433918: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-16 05:58:58.585346: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:
2023-02-16 05:58:58.585386: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-16 05:58:59.587779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [10]:
from warnings import filterwarnings
filterwarnings("ignore")

# Loading Data

In [4]:
BASE_PATH = Path("/kaggle/input/playground-series-s3e7")

train = pd.read_csv(BASE_PATH / "train.csv").drop(columns="id")
train["is_original"] = 0
test = pd.read_csv(BASE_PATH / "test.csv")
# we'll need the test ids to make the submission file
test_idx = test.id
test = test.drop(columns="id")
test["is_original"] = 0

original = pd.read_csv("/kaggle/input/reservation-cancellation-prediction/train__dataset.csv")
original["is_original"] =  1

In [5]:
all_datasets = {"train": train,
               "test": test,
               "original": original}

# Preliminary Data Analysis

# Removing anomalies.
Huge thanks to https://www.kaggle.com/competitions/playground-series-s3e7/discussion/386655

In [6]:
train['arrival_year_month'] = pd.to_datetime(train['arrival_year'].astype(str)
                                            +train['arrival_month'].astype(str), format='%Y%m')
test['arrival_year_month'] = pd.to_datetime(test['arrival_year'].astype(str)
                                            +test['arrival_month'].astype(str), format='%Y%m')
original["arrival_year_month"] = pd.to_datetime(original["arrival_year"].astype(str)
                                            +original["arrival_month"].astype(str), format="%Y%m")

train.loc[train.arrival_date > train.arrival_year_month.dt.days_in_month, 'arrival_date'] = train.arrival_year_month.dt.days_in_month
test.loc[test.arrival_date > test.arrival_year_month.dt.days_in_month, 'arrival_date'] = test.arrival_year_month.dt.days_in_month
original.loc[original.arrival_date > original.arrival_year_month.dt.days_in_month, 'arrival_date'] = original.arrival_year_month.dt.days_in_month

train.drop(columns='arrival_year_month', inplace=True)
test.drop(columns='arrival_year_month', inplace=True)
original.drop(columns="arrival_year_month", inplace=True)

In [7]:
combined_df = pd.concat([train, original], axis=0)

In [8]:
# train_c_df = train_test_split()

In [9]:
combined_ds = tfdf.keras.pd_dataframe_to_tf_dataset(combined_df, label="booking_status")

2023-02-16 05:59:14.256542: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:
2023-02-16 05:59:14.256607: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-16 05:59:14.256638: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (9b03a22c60ff): /proc/driver/nvidia/version does not exist
2023-02-16 05:59:14.257036: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other 

In [22]:
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)

In [11]:
# # tuner = tfdf.tuner.RandomSearch()
# model = tfdf.keras.GradientBoostedTreesModel(verbose=0)
# model.compile(metrics=[])
# model.fit(combined_ds)

2023-02-16 05:59:38.947495: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1790] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2023-02-16 05:59:38.947706: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1800] "goss_beta" set but "sampling_method" not equal to "GOSS".
2023-02-16 05:59:38.947734: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1814] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
[INFO 2023-02-16T06:00:04.273370795+00:00 kernel.cc:1214] Loading model from path /tmp/tmpiy3gjau3/model/ with prefix 3b9475149f35445b
[INFO 2023-02-16T06:00:04.301515588+00:00 abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 2023-02-16T06:00:04.301578047+00:00 kernel.cc:1046] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


<keras.callbacks.History at 0x7faf95a08a90>

In [None]:
# model.predict(test_ds)

# Setting up cross validation

In [None]:
def cross_validate(X, y, X_org, y_org):
    N_FOLDS = 5
    
    skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
    cv_scores = np.zeros(N_FOLDS)
    
    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # combining with original
        X_tr = pd.concat([X_tr, X_org], axis=0)
        y_tr = pd.concat([y_tr, y_org], axis=0)

        X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
        X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
        X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
        model = tfdf.keras.GradientBoostedTreesModel(verbose=0)
        model.fit(combined_ds)
        
        y_pred = model.predict(X_val_ds)[:, 0]
        
        auc = roc_auc_score(y_val, y_pred)
        
        print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
        cv_scores[fold_id] = auc
    
    avg_auc = np.mean(cv_scores)
    print(f"AVG AUC: {avg_auc}")

In [None]:
# X = train.drop(columns="booking_status")
# y = train.booking_status
# X_original = original.drop(columns="booking_status")
# y_original = original.booking_status

In [None]:
# cross_validate(X, y, X_original, y_original)

## SHOCKING RESULTS | AVG AUC: 0.9189137654978039

# NEXT UP:
Currently TFDF is treating all features as numerical, but we clearly know there are many categorical, so let's set those categorical as categorical and see how it fares.

## Checking for categorical values

In [12]:
# pd.concat([train.dtypes.rename("Data Type")] + \
#           [dataset.nunique().rename(f"{dataset_name} UniqueValues") for dataset_name, dataset in all_datasets.items()],
#           axis=1).sort_values(by="train UniqueValues")

Unnamed: 0,Data Type,train UniqueValues,test UniqueValues,original UniqueValues
is_original,int64,1,1.0,1
repeated_guest,int64,2,2.0,2
booking_status,int64,2,,2
required_car_parking_space,int64,2,2.0,2
arrival_year,int64,2,2.0,2
type_of_meal_plan,int64,4,4.0,4
market_segment_type,int64,5,5.0,5
no_of_adults,int64,5,5.0,5
no_of_children,int64,6,6.0,5
no_of_special_requests,int64,6,6.0,6


In [13]:
cat_features = [col for col in train.columns if train[col].nunique() <= 31]

# removinng booking status and is_original
cat_features = cat_features[:-2]
cat_features.append("is_original")

In [14]:
len(cat_features)

15

## Defining feature semantics

In [15]:
tf_cat_features = []
for feature in cat_features:
    tf_cat_features.append(tfdf.keras.FeatureUsage(name=str(feature), semantic=tfdf.keras.FeatureSemantic.CATEGORICAL))

# tf_cat_features

In [16]:
feat_1 = tfdf.keras.FeatureUsage(name="lead_time", semantic=tfdf.keras.FeatureSemantic.NUMERICAL)
feat_2 = tfdf.keras.FeatureUsage(name="avg_price_per_room", semantic=tfdf.keras.FeatureSemantic.NUMERICAL)
feat_3 = tfdf.keras.FeatureUsage(name="no_of_previous_bookings_not_canceled", semantic=tfdf.keras.FeatureSemantic.NUMERICAL)

tf_num_features = [feat_1, feat_2, feat_3]

all_features = tf_cat_features + tf_num_features

In [17]:
len(all_features)

18

In [18]:
set(X.columns) - set(cat_features)

NameError: name 'X' is not defined

In [None]:
# def cross_validate_with_features(X, y, X_org, y_org, all_features):
#     N_FOLDS = 5
    
#     skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
#     cv_scores = np.zeros(N_FOLDS)
#     all_test_preds  = []
    
#     for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
#         X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         # combining with original
#         X_tr = pd.concat([X_tr, X_org], axis=0)
#         y_tr = pd.concat([y_tr, y_org], axis=0)

#         X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
#         X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
#         X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
#         model = tfdf.keras.GradientBoostedTreesModel(verbose=0, features=all_features, exclude_non_specified_features=True)
#         model.fit(combined_ds)
        
#         y_pred = model.predict(X_val_ds)[:, 0]
        
#         auc = roc_auc_score(y_val, y_pred)
        
#         print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
#         cv_scores[fold_id] = auc
#         all_test_preds.append(model.predict(test_ds)[:, 0])

    
#     avg_auc = np.mean(cv_scores)
#     print(f"AVG AUC: {avg_auc}")
#     return all_test_preds

In [None]:
# final_test_preds = cross_validate_with_features(X, y, X_original, y_original, all_features)

## EVEN MORE SHCOKING RESULTS | AVG AUC: 0.930750011417936

# Let's make the submission

In [None]:
# y_preds_final = np.array(final_test_preds).mean(axis=0)

In [None]:
# submission = pd.DataFrame({"id": test_idx, "booking_status": y_preds_final})
# submission.head()

In [None]:
# submission.to_csv("submission.csv", index=False)

# Next Up: Let's use predefined hyperparameters templates

In [None]:
# def cross_validate_with_features_and_hp_template(X, y, X_org, y_org, all_features):
#     N_FOLDS = 5
    
#     skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
#     cv_scores = np.zeros(N_FOLDS)
    
#     for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
#         X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         # combining with original
#         X_tr = pd.concat([X_tr, X_org], axis=0)
#         y_tr = pd.concat([y_tr, y_org], axis=0)

#         X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
#         X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
#         X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
#         model = tfdf.keras.GradientBoostedTreesModel(verbose=0, features=all_features, exclude_non_specified_features=True,
#                                                     hyperparameter_template="benchmark_rank1")
#         model.fit(combined_ds)
        
#         y_pred = model.predict(X_val_ds)[:, 0]
        
#         auc = roc_auc_score(y_val, y_pred)
        
#         print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
#         cv_scores[fold_id] = auc
    
#     avg_auc = np.mean(cv_scores)
#     print(f"AVG AUC: {avg_auc}")

In [None]:
# cross_validate_with_features_and_hp_template(X, y, X_original, y_original, all_features)

## RESULTS - a bit worse | AVG AUC: 0.9210854344787212

# NEXT UP: Let's tune hps automatically

In [19]:
tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)
tuned_model = tfdf.keras.GradientBoostedTreesModel(verbose=2, tuner=tuner, features=all_features, exclude_non_specified_features=True)
tuned_model.fit(combined_ds)

Use 4 thread(s) for training
Use /tmp/tmp85tojlvh as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'no_of_adults': <tf.Tensor 'data_7:0' shape=(None,) dtype=int64>, 'no_of_children': <tf.Tensor 'data_8:0' shape=(None,) dtype=int64>, 'no_of_weekend_nights': <tf.Tensor 'data_13:0' shape=(None,) dtype=int64>, 'no_of_week_nights': <tf.Tensor 'data_12:0' shape=(None,) dtype=int64>, 'type_of_meal_plan': <tf.Tensor 'data_17:0' shape=(None,) dtype=int64>, 'required_car_parking_space': <tf.Tensor 'data_15:0' shape=(None,) dtype=int64>, 'room_type_reserved': <tf.Tensor 'data_16:0' shape=(None,) dtype=int64>, 'lead_time': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>, 'arrival_year': <tf.Tensor 'data_2:0' shape=(None,) dtype=int64>, 'arrival_month': <tf.Tensor 'data_1:0' shape=(None,) dtype=int64>, 'arrival_date': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'market_segment_type': <tf.Tensor 'data_6:0' shape=(None,) dtype=int64>, 'repeated_gue

[INFO 2023-02-16T06:01:53.050453918+00:00 kernel.cc:756] Start Yggdrasil model training
[INFO 2023-02-16T06:01:53.050575449+00:00 kernel.cc:757] Collect training examples
[INFO 2023-02-16T06:01:53.052324288+00:00 kernel.cc:388] Number of batches: 61
[INFO 2023-02-16T06:01:53.052345438+00:00 kernel.cc:389] Number of examples: 60237
[INFO 2023-02-16T06:01:53.063126123+00:00 kernel.cc:774] Training dataset:
Number of records: 60237
Number of columns: 19

Number of columns by type:
	CATEGORICAL: 16 (84.2105%)
	NUMERICAL: 3 (15.7895%)

Columns:

CATEGORICAL: 16 (84.2105%)
	0: "__LABEL" CATEGORICAL integerized vocab-size:3 no-ood-item
	1: "arrival_date" CATEGORICAL integerized vocab-size:33 no-ood-item
	2: "arrival_month" CATEGORICAL integerized vocab-size:14 no-ood-item
	3: "arrival_year" CATEGORICAL integerized vocab-size:2020 no-ood-item
	5: "is_original" CATEGORICAL integerized vocab-size:3 no-ood-item
	7: "market_segment_type" CATEGORICAL integerized vocab-size:6 no-ood-item
	8: "no_of_

Model trained in 0:15:37.530129
Compiling model...
Model compiled.


<keras.callbacks.History at 0x7faf8c1b3150>

In [24]:
y_preds_tuned = tuned_model.predict(test_ds)[:, 0]



In [26]:
submission = pd.DataFrame({"id": test_idx, "booking_status": y_preds_tuned})
submission.head()

Unnamed: 0,id,booking_status
0,42100,0.06419
1,42101,0.057226
2,42102,0.219323
3,42103,0.062739
4,42104,0.562021


In [27]:
submission.to_csv("submission.csv", index=False)

In [None]:
# def cross_validate_with_features_and_hp_template(X, y, X_org, y_org, all_features):
#     N_FOLDS = 5
    
#     skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
#     cv_scores = np.zeros(N_FOLDS)
    
#     for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
#         X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         # combining with original
#         X_tr = pd.concat([X_tr, X_org], axis=0)
#         y_tr = pd.concat([y_tr, y_org], axis=0)

#         X_tr = pd.concat([X_tr, y_tr], axis=1)
        
        
#         X_tr_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_tr, label="booking_status")
#         X_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_val)
        
#         model = tfdf.keras.GradientBoostedTreesModel(verbose=0, features=all_features, exclude_non_specified_features=True,
#                                                     hyperparameter_template="benchmark_rank1")
#         model.fit(combined_ds)
        
#         y_pred = model.predict(X_val_ds)[:, 0]
        
#         auc = roc_auc_score(y_val, y_pred)
        
#         print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
#         cv_scores[fold_id] = auc
    
#     avg_auc = np.mean(cv_scores)
#     print(f"AVG AUC: {avg_auc}")

# Preprocessing
Tensorflow docs says, it doesn't need preprocessing, so let's see first how well it fares.

In [None]:
# X = train.drop(columns="booking_status")
# y = train.booking_status
# X_original = original.drop(columns="booking_status")
# y_original = original.booking_status

In [None]:
# len_X = len(X)

In [None]:
# X[cat_features] = X[cat_features].astype("category")
# test[cat_features] = test[cat_features].astype("category")
# X_original[cat_features] = X_original[cat_features].astype("category")

In [None]:
# X_combined = pd.concat([X, X_original], axis=0)
# y_combined = pd.concat([y, y_original], axis=0)

# Encoding Categorical features

In [None]:
# loe = LeaveOneOutEncoder(sigma=0.05)
# loe.fit(X_combined[cat_features], y=y_combined)
# X_combined[cat_features] = loe.transform(X_combined[cat_features])
# test[cat_features] = loe.transform(test[cat_features])

In [None]:
# test["no_of_children"] = test["no_of_children"].astype("int")

In [None]:
# test["no_of_previous_cancellations"] = test["no_of_previous_cancellations"].astype("int")

# Normalizing numeriacl features

In [None]:
# numerical_features = ["lead_time", "avg_price_per_room"]

# sc = StandardScaler()
# sc.fit(X_combined[numerical_features])
# X_combined[numerical_features] = sc.transform(X_combined[numerical_features])
# test[numerical_features] = sc.transform(test[numerical_features])

# Separating datasets

In [None]:
# X = X_combined.iloc[:len_X, :]
# y = y_combined.iloc[:len_X]
# X_org = X_combined.iloc[len_X: , :]
# y_org = y_combined.iloc[len_X:]

In [None]:
# len(X), len(X_org), len(y), len(y_org)

# Preparing for training
We'll only use data points from competition datset for validation since our goal is to get a model that performs best on competition dataset, not original dataset.

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True,
#                                                  random_state=1337, stratify=y)

In [None]:
# y_preds_final = np.mean([y_preds_xgb, y_preds_lgbm], axis=0)
# y_preds_final.shape

In [None]:
# submission = pd.DataFrame({"id": test_idx, "booking_status": y_preds_final})
# submission.head()

In [None]:
# submission.to_csv("submission.csv", index=False)