## Setup

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import tensorflow_decision_forests as tfdf

## Read Input CSV

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/train.csv")

## Feature Engineering

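Remove columns that aren't useful, and fill NaN values with 0 in the flag and spending columns listed below (the other columns keep their NaNs, as the null counts show):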

In [3]:
# Drop identifier-like columns that are not used as model features.
# errors="ignore" keeps the cell re-runnable and, unlike catching KeyError,
# still drops whichever of the two columns is present when the other is gone.
df = df.drop(columns=["PassengerId", "Name"], errors="ignore")

# Treat a missing flag / missing amount as 0. The filled Series is assigned
# back to the frame: calling `fillna(..., inplace=True)` on `df[c]` operates on
# a temporary object and does not update `df` under pandas copy-on-write.
# NOTE(review): "RoomService" is not in this list, so its NaNs are kept —
# confirm that this is intentional.
ZERO_FILL_COLUMNS = ("VIP", "CryoSleep", "FoodCourt", "ShoppingMall", "Spa", "VRDeck")
for c in ZERO_FILL_COLUMNS:
    df[c] = df[c].fillna(value=0)

# Remaining missing values per column, largest count first.
df.isnull().sum().sort_values(ascending=False)

HomePlanet      201
Cabin           199
Destination     182
RoomService     181
Age             179
CryoSleep         0
VIP               0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

Change booleans to integers because TF-DF focuses on numerical and categorical features, and shorten each `Cabin` value to its first and last characters:

In [4]:
LABEL = "Transported"

# Cast the label and the two flag columns to integers, one column at a time.
for flag_column in (LABEL, "VIP", "CryoSleep"):
    df[flag_column] = df[flag_column].astype(int)


def _first_and_last_char(cabin):
    """Return the first plus last character of a cabin value.

    Missing (null) values and empty strings are returned unchanged.
    """
    if pd.isnull(cabin) or cabin == "":
        return cabin
    return cabin[0] + cabin[-1]


df["Cabin"] = df["Cabin"].apply(_first_and_last_char)

In [5]:
# Preview the first rows to sanity-check the transformed columns.
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,0,BP,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0
1,Earth,0,FS,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1
2,Europa,0,AS,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0
3,Europa,0,AS,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0
4,Earth,0,FS,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1


## Train/Validation Split

In [6]:
# Hold out a fixed fraction of the rows for validation; the fixed seed makes
# the split reproducible across runs.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

train_df, val_df = train_test_split(
    df,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)
# Display (rows, columns) of the training portion.
train_df.shape

(6954, 12)

## Convert pandas DataFrames to TensorFlow Datasets

In [7]:
# Wrap both splits as TensorFlow datasets, using the same label column for each.
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(split_df, label=LABEL)
    for split_df in (train_df, val_df)
)

## Automated Hyper-Parameter Tuning

In [None]:
# Disabled alternative: a hand-written hyper-parameter search space for the
# tuner. Every line below is commented out, so this cell executes nothing.
# tuner = tfdf.tuner.RandomSearch(num_trials=50)
# local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
# local_search_space.choice("max_depth", [5, 8, 10, 13])
# global_search_space = tuner.choice(
#     "growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True
# )
# global_search_space.choice("max_num_nodes", [25, 50, 100, 150, 200])
# tuned_model = tfdf.keras.RandomForestModel(tuner=tuner)
# tuned_model.fit(train_ds, verbose=2)

In [8]:
# Automatic hyper-parameter search: with use_predefined_hps=True the tuner
# configures the search space itself instead of requiring a hand-written one.
# NOTE(review): `tuned_model` is not referenced by the later cells shown in
# this notebook (they predict with `model`) — confirm whether it is still needed.
NUM_TUNER_TRIALS = 50

tuner = tfdf.tuner.RandomSearch(
    num_trials=NUM_TUNER_TRIALS,
    use_predefined_hps=True,
)
tuned_model = tfdf.keras.RandomForestModel(tuner=tuner)
tuned_model.fit(train_ds, verbose=2)

Use /var/folders/2j/5hwf_xns24z3hc6bmf268x340000gn/T/tmporblgvqk as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'HomePlanet': <tf.Tensor 'data:0' shape=(None,) dtype=string>, 'CryoSleep': <tf.Tensor 'data_1:0' shape=(None,) dtype=int64>, 'Cabin': <tf.Tensor 'data_2:0' shape=(None,) dtype=string>, 'Destination': <tf.Tensor 'data_3:0' shape=(None,) dtype=string>, 'Age': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'VIP': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>, 'RoomService': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'FoodCourt': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'ShoppingMall': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'Spa': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>, 'VRDeck': <tf.Tensor 'data_10:0' shape=(None,) dtype=float64>}
Label: Tensor("data_11:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'HomePlanet': SemanticTensor(semantic=<Semantic.C

[INFO 23-11-24 10:44:46.1284 CST kernel.cc:771] Start Yggdrasil model training
[INFO 23-11-24 10:44:46.1290 CST kernel.cc:772] Collect training examples
[INFO 23-11-24 10:44:46.1290 CST kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-11-24 10:44:46.1292 CST kernel.cc:391] Number of batches: 7
[INFO 23-11-24 10:44:46.1292 CST kernel.cc:392] Number of examples: 6954
[INFO 23-11-24 10:44:46.1301 CST data_spec_inference.cc:305] 2 item(s) have been pruned (i.e. they are considered out of dictionary) for the column Cabin (14 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO 23-11-24 10:44:46.1305 CST kernel.cc:792] Tr

Model trained in 0:04:20.965194
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x1202bda10>

In [9]:
# Hand-picked random-forest settings, collected in one place so they are easy
# to read and adjust.
RF_HYPERPARAMETERS = {
    "split_axis": "SPARSE_OBLIQUE",
    "sparse_oblique_projection_density_factor": 2.0,
    "sparse_oblique_weights": "CONTINUOUS",
    "max_depth": 12,
    "min_examples": 2,
}

model = tfdf.keras.RandomForestModel(**RF_HYPERPARAMETERS)
model.fit(train_ds)

Use /var/folders/2j/5hwf_xns24z3hc6bmf268x340000gn/T/tmppraqn49l as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.126559. Found 6954 examples.
Training model...


[INFO 23-11-24 10:57:58.8168 CST kernel.cc:1233] Loading model from path /var/folders/2j/5hwf_xns24z3hc6bmf268x340000gn/T/tmppraqn49l/model/ with prefix ab04fa1e16994515


Model trained in 0:00:04.037549
Compiling model...
Model compiled.


[INFO 23-11-24 10:57:59.0286 CST decision_forest.cc:660] Model loaded with 300 root(s), 142808 node(s), and 11 input feature(s).
[INFO 23-11-24 10:57:59.0286 CST abstract_model.cc:1344] Engine "RandomForestGeneric" built
[INFO 23-11-24 10:57:59.0286 CST kernel.cc:1061] Use fast generic engine


<keras.src.callbacks.History at 0x29dce69d0>

In [10]:
# Score the validation split with the hand-configured model.
# NOTE(review): presumably one positive-class probability per row — the
# threshold search below compares these values against cut-offs; confirm shape.
valid_preds = model.predict(valid_ds)



In [11]:
from sklearn.metrics import accuracy_score


def best_threshold(y_test_value, y_pred_probability):
    """Return the decision cut-off in [0.20, 0.80] that maximises accuracy.

    Candidate cut-offs are 0.20, 0.21, ..., 0.80. A row is predicted positive
    when its score is strictly greater than the cut-off. On ties the lowest
    cut-off wins.

    Args:
        y_test_value: True binary labels (0/1 or bool); any array-like. It is
            flattened to 1-D.
        y_pred_probability: Predicted scores, one per label; any array-like.
            A column vector of shape (n, 1) is accepted and flattened.

    Returns:
        The best cut-off as a Python float rounded to two decimals, or 0.0
        when the input is empty or no candidate reaches an accuracy above 0.

    Raises:
        ValueError: If labels and scores do not contain the same number of
            elements.
    """
    y_true = np.asarray(y_test_value).ravel()
    # Flatten explicitly so (n, 1) model outputs line up with 1-D labels, and
    # compare in float64 regardless of the dtype the model produced.
    y_prob = np.asarray(y_pred_probability, dtype=float).ravel()
    if y_true.shape != y_prob.shape:
        raise ValueError(
            "labels and predictions must have the same number of elements: "
            f"{y_true.size} != {y_prob.size}"
        )
    if y_true.size == 0:
        return 0.0

    # linspace + round yields clean two-decimal candidates (0.49 rather than
    # 0.49000000000000027) and a fixed, endpoint-inclusive count.
    candidates = np.round(np.linspace(0.2, 0.8, 61), 2)

    best_score = 0.0
    best_cutoff = 0.0
    for cutoff in candidates:
        predicted = (y_prob > cutoff).astype(int)
        score = np.mean(predicted == y_true)
        # Strict comparison keeps the first (lowest) cut-off among ties.
        if score > best_score:
            best_score = score
            best_cutoff = cutoff

    return float(best_cutoff)


# Tune the decision cut-off on the validation split (true labels vs. model scores).
best_threshold(val_df.Transported, valid_preds)

0.49000000000000027

In [12]:
test_df = pd.read_csv("data/test.csv")
# Keep the passenger ids aside before the frame is transformed.
submission_id = test_df.PassengerId

# Same zero-fill as applied to the training frame. The filled Series is
# assigned back: `fillna(..., inplace=True)` on `test_df[c]` works on a
# temporary object and does not update `test_df` under pandas copy-on-write.
for c in ("VIP", "CryoSleep", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"):
    test_df[c] = test_df[c].fillna(value=0)

# Shorten each cabin value to its first and last characters; null and empty
# values pass through unchanged.
test_df["Cabin"] = test_df["Cabin"].apply(
    lambda x: x[0] + x[-1] if pd.notnull(x) and x != "" else x
)

# Cast the flag columns to integers, matching the training frame.
test_df["VIP"] = test_df["VIP"].astype(int)
test_df["CryoSleep"] = test_df["CryoSleep"].astype(int)

In [13]:
# Build an unlabeled dataset from the test frame and score it with `model`.
# NOTE(review): `test_df` still carries "PassengerId" and "Name", which were
# dropped from the training frame — presumably ignored by the model; confirm.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df)
predictions = model.predict(test_ds)



In [14]:
# Reuse the cut-off tuned on the validation split rather than a hard-coded
# literal, so the submission always agrees with the threshold search above.
submission_threshold = best_threshold(val_df[LABEL], valid_preds)

# Flatten to 1-D so exactly one boolean per row is assigned to the column;
# the comparison already yields booleans.
n_predictions = np.asarray(predictions).ravel() > submission_threshold

# Fill the sample submission's "Transported" column and write it out.
# NOTE(review): this assumes the sample file lists passengers in the same
# order as data/test.csv — confirm.
sample_submission_df = pd.read_csv("data/sample_submission.csv")
sample_submission_df["Transported"] = n_predictions
sample_submission_df.to_csv("my_submission.csv", index=False)