In [1]:
# Heavily based on Gusthema`s "Titanic competition w/ TensorFlow Decision Forests" for learning
# and https://blog.tensorflow.org/2019/03/how-to-train-boosted-trees-models-in-tensorflow.html

In [2]:
# !pip install tensorflow==2.15.1 --force-reinstall
# !pip install tensorflow-estimator==2.15 --force-reinstall
# print("update GPU server TensorFlow version...")
# !pip install cloud-gpu-client
# from cloud_gpu_client import Client
# Client().configure_tpu_version(tf.__version__, restart_type='ifNeeded')

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf 
import tensorflow_decision_forests as tfdf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
print(tf.__version__)
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
2.16.1


In [4]:
# Import for first random forest tutorial:
train_df=pd.read_csv("/kaggle/input/titanic/train.csv")
test_df=pd.read_csv("/kaggle/input/titanic/train.csv")
# train_y=train_df.pop("Survived")
# test_y=test_df.pop("Survived")
train_df.head(10) # Shows the first 10'

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Preprocess (this tokenizes the name and extracts prefixes. However, some of the data will be discarded later anyway). Just for learning
def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    def extract_deck(x):
        if isinstance(x, str):
            return x[0]
        else:
            pass
    def make_categorial(x):
        return str(x)

    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item) 
    df["Cabin"] = df["Cabin"].apply(extract_deck)
    df["Pclass"] = df["Pclass"].apply(make_categorial)
    return df
    
preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

# preprocessed_train_df.head(5) 
# preprocessed_train_df["Cabin"].unique() # Check unique values of Cabin
# preprocessed_train_df["Embarked"].unique() # Check unique values of Cabin



# Trim useless columns
list_useless=["Name","Ticket","PassengerId","Ticket_number","Ticket_item"]
preprocessed_train_df = preprocessed_train_df.drop(columns=list_useless)
preprocessed_test_df = preprocessed_test_df.drop(columns=list_useless)

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_train_df,label="Survived")
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_test_df,label="Survived")

preprocessed_train_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C,S
4,0,3,male,35.0,0,0,8.05,,S
5,0,3,male,,0,0,8.4583,,Q
6,0,1,male,54.0,0,0,51.8625,E,S
7,0,3,male,2.0,3,1,21.075,,S
8,1,3,female,27.0,0,2,11.1333,,S
9,1,2,female,14.0,1,0,30.0708,,C


In [6]:
input_features=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin','Embarked']

model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0, # Very few logs
    features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],
    exclude_non_specified_features=True, # Only use the features in "features"
    random_seed=1999,
    
    min_examples=1,
    categorical_algorithm="RANDOM",
    #max_depth=4,
    shrinkage=0.05,
    #num_candidate_attributes_ratio=0.2,
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    num_trees=2000,
    compute_permutation_variable_importance=True,
)
model.fit(train_ds)

self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")
# model.summary()

[INFO 24-12-02 16:08:19.2411 UTC kernel.cc:1233] Loading model from path /tmp/tmpocc1qnft/model/ with prefix 95316ad64d544d0b
[INFO 24-12-02 16:08:19.2542 UTC decision_forest.cc:734] Model loaded with 49 root(s), 2577 node(s), and 8 input feature(s).
[INFO 24-12-02 16:08:19.2543 UTC abstract_model.cc:1362] Engine "GradientBoostedTreesGeneric" built
[INFO 24-12-02 16:08:19.2544 UTC kernel.cc:1061] Use fast generic engine


Accuracy: 0.791208803653717 Loss:0.8863461017608643


In [7]:
# Automatic hyperparameter_tuning
tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)

# Tune the model. Notice the `tuner=tuner`.
tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
tuned_model.fit(train_ds, verbose=0)

tuned_self_evaluation = tuned_model.make_inspector().evaluation()
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss:{tuned_self_evaluation.loss}")
# Much better!

Use /tmp/tmpg2_8wb6r as temporary training directory


[INFO 24-12-02 16:09:00.5719 UTC kernel.cc:1233] Loading model from path /tmp/tmpg2_8wb6r/model/ with prefix 3016d8681adf467f
[INFO 24-12-02 16:09:00.5935 UTC decision_forest.cc:734] Model loaded with 106 root(s), 4834 node(s), and 8 input feature(s).
[INFO 24-12-02 16:09:00.5935 UTC abstract_model.cc:1362] Engine "GradientBoostedTreesGeneric" built
[INFO 24-12-02 16:09:00.5936 UTC kernel.cc:1061] Use fast generic engine


Accuracy: 0.8767123222351074 Loss:0.5442384481430054


In [8]:
def prediction_to_kaggle_format(model, threshold=0.5):
    proba_survive = model.predict(test_ds, verbose=0)[:,0]
    return pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": (proba_survive >= threshold).astype(int)
    })

def make_submission(kaggle_predictions):
    path="/kaggle/working/submission.csv"
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")
    
kaggle_predictions = prediction_to_kaggle_format(model)
make_submission(kaggle_predictions)
!head /kaggle/working/submission.csv

Submission exported to /kaggle/working/submission.csv
PassengerId,Survived
1,0
2,1
3,1
4,1
5,0
6,0
7,0
8,0
9,1


In [9]:
inspector = tuned_model.make_inspector()
# inspector.__dict__ # See hyperparameters
# tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0, max_depth=3)