In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow_decision_forests as tfdf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)



In [None]:
# Baseline: read the training CSV, recast Python bool cells to 0/1 integers,
# and fit a default TF-DF random forest on the otherwise untouched table.
dataset = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv").map(
    lambda cell: cell if not isinstance(cell, bool) else int(cell)
)

# "Transported" is the label column; every other column is used as a feature.
tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(dataset, label="Transported")

model = tfdf.keras.RandomForestModel()
model.fit(tf_dataset)

#print(model.summary())

# Imports


In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# EXPLORATION


In [None]:
# Load the training CSV and report its (rows, columns) shape.
dataset_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
shape_report = "Full train dataset shape is {}".format(dataset_df.shape)
print(shape_report)

In [None]:
#dataset_df.head(5)

In [None]:
#print(dataset_df.describe())
#dataset_df.info()

In [None]:
# Remove the PassengerId and Name columns from the training frame.
# errors='ignore' keeps the cell re-runnable: the result is assigned back to
# dataset_df, so a second execution would otherwise raise KeyError because the
# columns are already gone.
dataset_df = dataset_df.drop(columns=['PassengerId', 'Name'], errors='ignore')

In [None]:
#dataset_df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = dataset_df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(value=0)
# Number of missing values per column, largest count first.
dataset_df.isna().sum().sort_values(ascending=False)

In [None]:
# Recast Python bool cells to 0/1 integers; every other cell passes through unchanged.
dataset_df = dataset_df.map(lambda cell: cell if not isinstance(cell, bool) else int(cell))

In [None]:
# Split "Cabin" (formatted "deck/num/side") into three columns and drop the original.
# The existence check replaces the old try/except around the drop: when "Cabin"
# is missing (e.g. the cell is run a second time) the split itself raised
# KeyError before the try block was ever reached, so the handler never fired.
if "Cabin" in dataset_df.columns:
    dataset_df[["Deck", "Cabin_num", "Side"]] = dataset_df["Cabin"].str.split("/", expand=True)
    dataset_df = dataset_df.drop(columns="Cabin")
else:
    print("Field does not exist")

In [None]:
#dataset_df.head(5)

# More visualisation

In [None]:
# 1. Pairwise correlation of the numeric features and the target, drawn as an annotated heatmap
num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Transported"]
correlations = dataset_df[num_cols].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# 3. One histogram (with KDE) per numeric feature, coloured by the Transported label
num_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
fig, axes = plt.subplots(2, 3, figsize=(15,10))

# axes.flat walks the 2x3 grid row by row, matching the feature order above.
for panel, feature in zip(axes.flat, num_features):
    sns.histplot(data=dataset_df, x=feature, hue="Transported", element="step", kde=True, ax=panel)
    panel.set_title(f"{feature} Distribution by Transported")

plt.tight_layout()
plt.show()

In [None]:
# 5. Row counts per category value, split by the Transported label
categories = ["HomePlanet", "CryoSleep", "Destination"]
fig, axes = plt.subplots(1, 3, figsize=(18,5))

for panel, cat in zip(axes, categories):
    sns.countplot(data=dataset_df, x=cat, hue="Transported", ax=panel, palette=["#1f77b4", "#ff7f0e"])
    panel.set_title(f"{cat} vs Transported")
    # Tilt the tick labels so longer category names do not overlap.
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Working on features


Replace missing values (NaN) with the column median for numerical features and with the most frequent value for categorical features.

In [None]:
import pandas as pd

# Work on a copy so dataset_df keeps its original missing values.
cleaned_df = dataset_df.copy()

# Columns imputed with the median vs. with the most frequent value.
numerical_columns = ['RoomService', 'Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']
categorical_columns = ['HomePlanet', 'CryoSleep', 'Deck', 'Side', 'Destination', 'VIP']

# 'Name' is only imputed when the column is actually present in the frame.
if 'Name' in cleaned_df.columns:
    categorical_columns.append('Name')

# 'Cabin_num' comes out of a string split; coerce it to numbers (unparseable -> NaN)
# so that a median can be computed for it.
cleaned_df['Cabin_num'] = pd.to_numeric(cleaned_df['Cabin_num'], errors='coerce')

# Compute every fill value first, then apply them column by column.
median_by_column = {name: cleaned_df[name].median() for name in numerical_columns}
for name, median_value in median_by_column.items():
    cleaned_df[name] = cleaned_df[name].fillna(median_value)

mode_by_column = {name: cleaned_df[name].mode()[0] for name in categorical_columns}
for name, most_frequent in mode_by_column.items():
    cleaned_df[name] = cleaned_df[name].fillna(most_frequent)

In [None]:
# Age bucket edges and their labels; right=False below makes each bucket
# include its left edge and exclude its right edge.
bins = [0, 5, 10, 18, 25, 35, 45, 60, 100]
labels = ['0-5', '6-10', '11-18', '19-25', '26-35', '36-45', '46-60', '60+']

# Derived columns are added in one chained assign; later entries may read the
# columns created by earlier ones. "Age" is dropped once nothing needs it.
cleaned_df = (
    cleaned_df
    .assign(
        Age_Group=lambda df: pd.cut(df["Age"], bins=bins, labels=labels, right=False),
        TotalSpending=lambda df: (
            df["Spa"]
            .add(df["VRDeck"])
            .add(df["RoomService"])
            .add(df["ShoppingMall"])
            .add(df["FoodCourt"])
        ),
        # True when any of Spa / VRDeck / RoomService has a positive amount.
        LuxuryUser=lambda df: df[["Spa", "VRDeck", "RoomService"]].gt(0).any(axis=1),
        # +1 in the denominator avoids dividing by zero for Age == 0.
        SpendingPerAge=lambda df: df["TotalSpending"] / (df["Age"] + 1),
        HighSpender=lambda df: df["TotalSpending"] > df["TotalSpending"].median(),
    )
    .drop(columns=["Age"])
)

# One indicator column per age bucket.
cleaned_df = pd.get_dummies(cleaned_df, columns=['Age_Group'], drop_first=False)

print(cleaned_df.head())

# Training


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation (70% train / 30% validation).
# The fixed random_state makes the split repeatable.

# Recast Python bool cells to 0/1 integers before splitting.
cleaned_df = cleaned_df.map(lambda cell: cell if not isinstance(cell, bool) else int(cell))
print(cleaned_df.head())
train_ds_pd, valid_ds_pd = train_test_split(cleaned_df, random_state=42, test_size=0.3)
# Print dataset sizes
#print(f"{len(train_ds_pd)} examples in training, {len(valid_ds_pd)} examples in testing.")

In [None]:
# Sanity check: show the Python type of the training split produced by train_test_split.
print(type(train_ds_pd))

In [None]:
# Ask TF-DF which model classes it provides.
tfdf.keras.get_all_models()

# Random Forest


# Fit a TF-DF random forest (hyperparameter template "benchmark_rank1") on the
# training split, tracking accuracy.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label="Transported")

rf = tfdf.keras.RandomForestModel(hyperparameter_template="benchmark_rank1")
rf.compile(metrics=["accuracy"])
rf.fit(x=train_ds)

# Evaluation stored in the trained model itself.
inspector = rf.make_inspector()
print(inspector.evaluation())

# Metrics on the held-out validation split, one "name: value" line each.
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, label="Transported")
evaluation = rf.evaluate(x=valid_ds, return_dict=True)
for name in evaluation:
    value = evaluation[name]
    print(f"{name}: {value:.4f}")


In [None]:
import tensorflow_decision_forests as tfdf



# Wrap both pandas splits as TF datasets with "Transported" as the label.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label="Transported")
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, label="Transported")

# Hand-picked hyperparameters for the gradient boosted trees model.
gbm_settings = {
    "num_trees": 300,    # number of boosting rounds requested
    "max_depth": 8,      # depth limit of each tree
    "shrinkage": 0.05,   # learning rate applied to each tree's contribution
    "subsample": 0.8,    # fraction of rows sampled per tree
}
gbm = tfdf.keras.GradientBoostedTreesModel(**gbm_settings)
gbm.compile(metrics=["accuracy"])

# Fit on the training split only.
gbm.fit(x=train_ds)

# Evaluation stored in the trained model itself.
inspector = gbm.make_inspector()
print(inspector.evaluation())

# Metrics on the held-out validation split, one "name: value" line each.
evaluation = gbm.evaluate(x=valid_ds, return_dict=True)
for name in evaluation:
    value = evaluation[name]
    print(f"{name}: {value:.4f}")

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Soft-voting ensemble of scikit-learn / XGBoost classifiers, fitted on the same
# pandas split as the TF-DF models above.
#
# Unlike TF-DF, these estimators need an all-numeric feature matrix, so the
# string-valued columns (HomePlanet, Deck, Side, Destination) are one-hot
# encoded first. The validation frame is re-indexed to the training columns so
# that a category missing from one split cannot change the feature layout.
X_train_sk = pd.get_dummies(train_ds_pd.drop(columns=["Transported"]), dtype=int)
y_train_sk = train_ds_pd["Transported"]
X_valid_sk = (
    pd.get_dummies(valid_ds_pd.drop(columns=["Transported"]), dtype=int)
    .reindex(columns=X_train_sk.columns, fill_value=0)
)
y_valid_sk = valid_ds_pd["Transported"]

# NOTE: the scikit-learn forest is deliberately NOT called `rf`. That name
# belongs to the TF-DF RandomForestModel, which the last cell of the notebook
# still uses (rf.make_inspector()); rebinding it here broke that cell.
sk_rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)
xgb = XGBClassifier(n_estimators=300, learning_rate=0.05, random_state=42)
# The features are unscaled, so give the solver more iterations than the default.
lr = LogisticRegression(max_iter=1000)

ensemble_model = VotingClassifier(estimators=[
    ('rf', sk_rf),
    ('xgb', xgb),
    ('lr', lr)
], voting='soft')

ensemble_model.fit(X_train_sk, y_train_sk)

# Report validation accuracy so the ensemble can be compared with the TF-DF models.
print(f"ensemble validation accuracy: {ensemble_model.score(X_valid_sk, y_valid_sk):.4f}")

In [None]:
# List the importance measures exposed by `inspector`, then show the
# "NUM_AS_ROOT" ranking as the cell output.
importances = inspector.variable_importances()
print("Available variable importances:")
for importance_name in importances:
    print("\t", importance_name)
importances["NUM_AS_ROOT"]

In [None]:
#tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=3)

# Submission


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tensorflow_decision_forests as tfdf

# Load the test dataset
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
submission_id = test_df["PassengerId"]

# Statistics used for imputation and thresholds come from the TRAINING data, so
# that every feature means the same thing at inference time as it did when the
# model was fitted. dataset_df still holds the pre-imputation training columns
# (medians/modes skip NaN), and cleaned_df holds the engineered TotalSpending.
train_medians = {
    col: pd.to_numeric(dataset_df[col], errors='coerce').median()
    for col in numerical_columns
}
train_modes = {col: dataset_df[col].mode()[0] for col in categorical_columns}
high_spender_threshold = cleaned_df["TotalSpending"].median()

# Create new features from Cabin, then drop the columns the training frame dropped too
test_df[["Deck", "Cabin_num", "Side"]] = test_df["Cabin"].str.split("/", expand=True)
test_df = test_df.drop(columns=['Cabin', 'PassengerId', 'Name'])
test_df['Cabin_num'] = pd.to_numeric(test_df['Cabin_num'], errors='coerce')

# Fill missing values exactly as in training: median for numerical columns,
# most frequent value for categorical ones (including VIP and CryoSleep).
for col, fill_value in {**train_medians, **train_modes}.items():
    test_df[col] = test_df[col].fillna(fill_value)

# Convert boolean features to 0s and 1s
test_df['VIP'] = test_df['VIP'].astype(int)
test_df['CryoSleep'] = test_df['CryoSleep'].astype(int)

bins = [0, 5, 10, 18, 25, 35, 45, 60, 100]
labels = ['0-5', '6-10', '11-18', '19-25', '26-35', '36-45', '46-60', '60+']

# Create a new feature 'Age_Group' based on the bins
test_df['Age_Group'] = pd.cut(test_df['Age'], bins=bins, labels=labels, right=False)

drop_features = ["Age"]

test_df["TotalSpending"] = (
    test_df["Spa"] + test_df["VRDeck"] +
    test_df["RoomService"] + test_df["ShoppingMall"] +
    test_df["FoodCourt"]
)

test_df["LuxuryUser"] = (test_df["Spa"] > 0) | (test_df["VRDeck"] > 0) | (test_df["RoomService"] > 0)
test_df["SpendingPerAge"] = test_df["TotalSpending"] / (test_df["Age"] + 1)

# Compare against the training median, not the test median: the model learned
# "HighSpender" relative to the training distribution.
test_df["HighSpender"] = test_df["TotalSpending"] > high_spender_threshold
test_df = pd.get_dummies(test_df, columns=['Age_Group'], drop_first=False)

test_df = test_df.drop(columns=drop_features)
# Convert DataFrame to TensorFlow dataset (bool cells become 0/1 first, as in training)
test_df = test_df.map(lambda x: int(x) if isinstance(x,bool) else x)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df)


# Get predictions for test data and threshold them at 0.5
predictions = gbm.predict(test_ds)
n_predictions = (predictions > 0.5).astype(bool)

# Create submission file
output = pd.DataFrame({'PassengerId': submission_id,
                       'Transported': n_predictions.squeeze()})

output.head()


In [None]:
# Write the submission table to the Kaggle working directory (index=False leaves
# out the DataFrame index column), then preview the first rows.
output.to_csv('/kaggle/working/submission.csv', index=False)
output.head()

# Extra visualisation

In [None]:
import matplotlib.pyplot as plt
# Plot accuracy against the number of trees, using the training logs of `rf`.
# `rf` must be a model that exposes make_inspector() (the TF-DF random forest).
logs = rf.make_inspector().training_logs()
tree_counts = [entry.num_trees for entry in logs]
accuracies = [entry.evaluation.accuracy for entry in logs]

plt.plot(tree_counts, accuracies)
plt.xlabel("Number of trees")
plt.ylabel("Accuracy (out-of-bag)")
plt.show()