In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow_decision_forests as tfdf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only Kaggle input tree and echo the full path of every file
# found, so the available datasets are visible in the cell output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))



In [None]:
# Baseline: fit a default TF-DF Random Forest on the raw training table
# (no feature engineering). Python bool cells are cast to 0/1 ints while
# loading; every other value passes through unchanged.
dataset = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv").map(
    lambda cell: int(cell) if isinstance(cell, bool) else cell
)

# "Transported" is the label column; all remaining columns become features.
tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(dataset, label="Transported")

model = tfdf.keras.RandomForestModel()
model.fit(tf_dataset)

#print(model.summary())

# Imports


In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# EXPLORATION


In [None]:
# Load the training split and report its (rows, columns) shape.
dataset_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
print(f"Full train dataset shape is {dataset_df.shape}")

In [None]:
#dataset_df.head(5)

In [None]:
#print(dataset_df.describe())
#dataset_df.info()

In [None]:
#dataset_df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = dataset_df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(value=0)
# Missing-value count per column, most incomplete columns first.
missing_per_column = dataset_df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
def _bool_to_int(value):
    """Return ``int(value)`` for Python bools; return any other value unchanged."""
    return int(value) if isinstance(value, bool) else value


# Element-wise pass over the whole frame: True/False become 1/0.
dataset_df = dataset_df.map(_bool_to_int)

In [None]:
# Split the "/"-separated "Cabin" string into Deck / Cabin_num / Side columns
# and drop the original column.
#
# The whole step is guarded on the column being present so the cell can be
# re-run safely: previously the split itself raised KeyError on a second run,
# before the try/except around the drop could ever be reached.
if "Cabin" in dataset_df.columns:
    dataset_df[["Deck", "Cabin_num", "Side"]] = dataset_df["Cabin"].str.split("/", expand=True)
    dataset_df = dataset_df.drop(columns="Cabin")
else:
    print("Field does not exist")

In [None]:
#dataset_df.head(5)

# More visualisation

In [None]:
# 1. Correlation heatmap over the numerical features and the target
num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Transported"]
corr_matrix = dataset_df[num_cols].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# 3. Per-feature histograms (with KDE), coloured by the target
num_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# axes.flat walks the 2x3 grid row by row, one panel per feature.
for ax, feature in zip(axes.flat, num_features):
    sns.histplot(data=dataset_df, x=feature, hue="Transported", element="step", kde=True, ax=ax)
    ax.set_title(f"{feature} Distribution by Transported")

plt.tight_layout()
plt.show()

In [None]:
# 5. Count plots of the categorical features, split by the target
categories = ["HomePlanet", "CryoSleep", "Destination"]
target_palette = ["#1f77b4", "#ff7f0e"]
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, cat in zip(axes, categories):
    sns.countplot(data=dataset_df, x=cat, hue="Transported", ax=ax, palette=target_palette)
    ax.set_title(f"{cat} vs Transported")
    # Tilt the category labels so longer names do not overlap.
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Working on features


**Filling NaN with median (for numerical) and most frequent (for categorical)**

In [None]:
import pandas as pd

# Work on a copy so the exploration frame (dataset_df) stays untouched.
cleaned_df = dataset_df.copy()

# Columns imputed with the column median / the most frequent value.
numerical_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']
categorical_columns = ['HomePlanet', 'CryoSleep', 'Deck', 'Side', 'Destination', 'VIP', 'Name']

# Cabin_num comes out of a string split, so convert it to a number;
# anything unparseable becomes NaN and is imputed below.
cleaned_df['Cabin_num'] = pd.to_numeric(cleaned_df['Cabin_num'], errors='coerce')

# Age: use the median age of the passenger's HomePlanet. Rows whose HomePlanet
# is itself missing belong to no group (groupby drops NaN keys by default), so
# they would keep a NaN Age; fall back to the overall median for those rows.
age_by_planet = cleaned_df.groupby('HomePlanet')['Age'].transform('median')
overall_age_median = cleaned_df['Age'].median()
cleaned_df['Age'] = cleaned_df['Age'].fillna(age_by_planet).fillna(overall_age_median)

# Fill numerical columns with median (direct assignment)
for col in numerical_columns:
    cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())

# Fill categorical columns with the most frequent value (direct assignment)
for col in categorical_columns:
    cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].mode()[0])

plt.figure(figsize=(12, 6))
sns.violinplot(data=cleaned_df, x='HomePlanet', y='Age', hue='Transported', split=True, palette='coolwarm')

plt.title("Age Distribution by HomePlanet & Transported")
plt.ylabel("Age")
plt.xlabel("HomePlanet")
plt.legend(title="Transported", labels=["Not Transported", "Transported"])
plt.show()

# Age brackets used only for the plot below. With right=False each bracket is
# the half-open interval [lower, upper).
bins = [0, 10, 18, 25, 35, 45, 60, 100]
labels = ['0-10', '11-18', '19-25', '26-35', '36-45', '46-60', '60+']
age_bin = pd.cut(cleaned_df['Age'], bins=bins, labels=labels, right=False)

# Calculate Transported rate per Age Bin & HomePlanet.
# The bin is attached to a temporary frame only: writing it into cleaned_df
# would turn a plotting helper into a training feature that the test set
# does not have.
age_transport_rate = (
    cleaned_df.assign(Age_Bin=age_bin)
    .groupby(['Age_Bin', 'HomePlanet'], observed=False)['Transported']
    .mean()
    .reset_index()
)

# Plot Transported rate for each HomePlanet
plt.figure(figsize=(12, 6))
sns.lineplot(data=age_transport_rate, x='Age_Bin', y='Transported', hue='HomePlanet', marker="o", palette='tab10')

plt.title("Transported Rate by Age Group & HomePlanet")
plt.ylabel("Transported Rate")
plt.xlabel("Age Group")
plt.xticks(rotation=45)
plt.legend(title="HomePlanet")
plt.grid(True)
plt.show()

In [None]:
# Age brackets for the one-hot "Age_Group" feature. With right=False each
# bracket is the half-open interval [lower, upper).
bins = [0, 5, 10, 18, 25, 35, 45, 60, 100]
labels = ['0-5', '6-10', '11-18', '19-25', '26-35', '36-45', '46-60', '60+']

# Spending-based features.
spa = cleaned_df["Spa"]
vr_deck = cleaned_df["VRDeck"]
room_service = cleaned_df["RoomService"]
total_spending = spa + vr_deck + room_service + cleaned_df["ShoppingMall"] + cleaned_df["FoodCourt"]

cleaned_df["TotalSpending"] = total_spending
# True when the passenger spent anything on Spa, VRDeck or RoomService.
cleaned_df["LuxuryUser"] = (spa > 0) | (vr_deck > 0) | (room_service > 0)
# +1 in the denominator avoids dividing by zero for Age == 0.
cleaned_df["SpendingPerAge"] = total_spending / (cleaned_df["Age"] + 1)
# Above-median total spending.
cleaned_df["HighSpender"] = total_spending > total_spending.median()

# Integer suffix after the "_" in PassengerId. Note: despite the column name,
# this is the suffix value itself, not a count of related passengers.
cleaned_df['FamilySize'] = cleaned_df['PassengerId'].str.split('_').str[1].astype('int64')

# Bucket Age, one-hot encode the bucket, then drop the raw columns that are
# not used as features.
cleaned_df['Age_Group'] = pd.cut(cleaned_df['Age'], bins=bins, labels=labels, right=False)
cleaned_df = pd.get_dummies(cleaned_df, columns=['Age_Group'], drop_first=False).drop(
    columns=["Age", 'PassengerId', 'Name']
)
print(cleaned_df.head())

# Training


In [None]:
from sklearn.model_selection import train_test_split

# Cast Python bool cells to 0/1 ints; every other value is left as-is.
cleaned_df = cleaned_df.map(lambda cell: int(cell) if isinstance(cell, bool) else cell)
print(cleaned_df.head())

# Hold out 30% of the rows for validation (70% train); the fixed seed makes
# the split repeatable.
train_ds_pd, valid_ds_pd = train_test_split(
    cleaned_df,
    test_size=0.3,
    random_state=42,
)
# Print dataset sizes
#print(f"{len(train_ds_pd)} examples in training, {len(valid_ds_pd)} examples in testing.")

In [None]:
# Presumably returns the model classes TF-DF exposes through its Keras API
# (Random Forest, Gradient Boosted Trees, ...) — confirm against the TF-DF docs.
tfdf.keras.get_all_models()

# Random Forest


# Wrap both pandas splits as TF datasets with "Transported" as the label.
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label="Transported")
    for frame in (train_ds_pd, valid_ds_pd)
)

# Random Forest built from the "benchmark_rank1" hyperparameter template.
rf = tfdf.keras.RandomForestModel(hyperparameter_template="benchmark_rank1")
rf.compile(metrics=["accuracy"])
rf.fit(x=train_ds)

# Evaluation stored with the trained model, read back through its inspector.
inspector = rf.make_inspector()
print(inspector.evaluation())

# Score on the held-out split and print each metric to four decimals.
evaluation = rf.evaluate(x=valid_ds, return_dict=True)
for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}: {metric_value:.4f}")


In [None]:
import tensorflow_decision_forests as tfdf

# Wrap both pandas splits as TF datasets with "Transported" as the label.
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label="Transported")
    for frame in (train_ds_pd, valid_ds_pd)
)

# Hand-tuned Gradient Boosted Trees hyperparameters.
gbm_params = {
    "num_trees": 400,    # more trees for better learning
    "max_depth": 8,      # deeper trees capture more complexity
    "shrinkage": 0.03,   # lower learning rate for better convergence
    "subsample": 0.7,    # random subsampling to reduce overfitting
}
gbm = tfdf.keras.GradientBoostedTreesModel(**gbm_params)
gbm.compile(metrics=["accuracy"])

# Train on the training split only.
gbm.fit(x=train_ds)

# Evaluation stored with the trained model, read back through its inspector.
inspector = gbm.make_inspector()
print(inspector.evaluation())

# Score on the held-out split and print each metric to four decimals.
evaluation = gbm.evaluate(x=valid_ds, return_dict=True)
for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}: {metric_value:.4f}")


In [None]:
import itertools

import tensorflow_decision_forests as tfdf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

# Exhaustive grid search over Gradient Boosted Trees hyperparameters, scored
# with k-fold cross-validation on the training split.
#
# NOTE: the grid has 5 * 6 * 3 * 4 = 360 combinations and each one trains
# n_splits models, so a full run fits 1800 models.

# Convert boolean columns to int
cleaned_df = cleaned_df.map(lambda x: int(x) if isinstance(x, bool) else x)

# Split the data into training and validation sets (same seed as the earlier split)
train_ds_pd, valid_ds_pd = train_test_split(cleaned_df, test_size=0.3, random_state=42)

# Convert to TensorFlow datasets
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label="Transported")
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, label="Transported")

# Define the hyperparameter space
num_trees_values = [20, 50, 100, 300, 500]  # Different number of trees
max_depth_values = [4, 6, 8, 12, 16, 30]   # Different tree depths
shrinkage_values = [0.01, 0.03, 0.05]      # Different shrinkage values
subsample_values = [0.4, 0.5, 0.6, 0.7]    # Different subsample rates

# One dict per hyperparameter combination, filled in by the loop below
results = []

# Number of cross-validation folds
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# itertools.product yields the combinations in the same order as four nested
# loops would (the last list varies fastest).
for num_trees, max_depth, shrinkage, subsample in itertools.product(
    num_trees_values, max_depth_values, shrinkage_values, subsample_values
):
    fold_accuracies = []
    fold_losses = []

    # Perform k-fold cross-validation on the training split only; the held-out
    # validation split is never touched here.
    for train_index, val_index in kf.split(train_ds_pd):
        # Create the train and validation splits for this fold
        train_fold = train_ds_pd.iloc[train_index]
        valid_fold = train_ds_pd.iloc[val_index]

        # Convert to TensorFlow datasets
        train_fold_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_fold, label="Transported")
        valid_fold_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_fold, label="Transported")

        # Use loop-local names for the candidate model and its metrics.
        # Rebinding the notebook-level `gbm` / `evaluation` here would make
        # later cells (e.g. the submission cell's `gbm.predict`) silently use
        # the last grid-search candidate, trained on a single fold, instead
        # of the tuned model.
        fold_model = tfdf.keras.GradientBoostedTreesModel(
            num_trees=num_trees,
            max_depth=max_depth,
            shrinkage=shrinkage,
            subsample=subsample,
        )
        fold_model.compile(metrics=["accuracy"])

        # Print the current hyperparameters for this fold
        print(f"Training with num_trees={num_trees}, max_depth={max_depth}, shrinkage={shrinkage}, subsample={subsample}")

        # Train the model on the current fold
        fold_model.fit(x=train_fold_ds)

        # Evaluate the model on the validation fold
        fold_evaluation = fold_model.evaluate(x=valid_fold_ds, return_dict=True)

        # Index the metrics directly: a missing key should fail loudly rather
        # than be recorded as a fake 0.0 that would corrupt the ranking.
        fold_accuracies.append(fold_evaluation["accuracy"])
        fold_losses.append(fold_evaluation["loss"])

    # Calculate the average accuracy and loss across all folds
    avg_accuracy = np.mean(fold_accuracies)
    avg_loss = np.mean(fold_losses)

    # Store the results along with the hyperparameters
    results.append({
        "num_trees": num_trees,
        "max_depth": max_depth,
        "shrinkage": shrinkage,
        "subsample": subsample,
        "avg_validation_accuracy": avg_accuracy,
        "avg_validation_loss": avg_loss,
    })

# Print the results
for result in results:
    print(f"Hyperparameters: {result['num_trees']} trees, max_depth={result['max_depth']}, shrinkage={result['shrinkage']}, subsample={result['subsample']}")
    print(f"Average Validation Accuracy: {result['avg_validation_accuracy']:.4f}, Average Validation Loss: {result['avg_validation_loss']:.4f}")
    print("="*50)

# Rank the combinations by average validation accuracy, best first
sorted_results = sorted(results, key=lambda x: x["avg_validation_accuracy"], reverse=True)

# Print sorted best hyperparameters
print("Best hyperparameters based on average validation accuracy:")
for result in sorted_results[:5]:  # Top 5 results
    print(f"{result['num_trees']} trees, max_depth={result['max_depth']}, shrinkage={result['shrinkage']}, subsample={result['subsample']}")
    print(f"Average Validation Accuracy: {result['avg_validation_accuracy']:.4f}")
    print("-"*50)


from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Soft-voting ensemble of three scikit-learn style classifiers, fitted on the
# training split.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label="Transported")
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, label="Transported")

# NOTE: `rf` rebinds the name used for the TF-DF Random Forest in an earlier cell.
rf = RandomForestClassifier(n_estimators=200, max_depth=8)
xgb = XGBClassifier(n_estimators=300, learning_rate=0.05)
lr = LogisticRegression()

ensemble_model = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb),
    ('lr', lr)
], voting='soft')

# Unlike TF-DF, these estimators need a purely numeric, NaN-free matrix.
# train_ds_pd still holds string/categorical columns (e.g. HomePlanet, Deck,
# Side, Destination), which made the original fit raise
# "could not convert string to float". One-hot encode every non-numeric
# column and fill any remaining gaps with the column median before fitting.
X_train = pd.get_dummies(train_ds_pd.drop(columns=["Transported"]), dtype=int)
X_train = X_train.fillna(X_train.median())
y_train = train_ds_pd["Transported"]

ensemble_model.fit(X_train, y_train)

In [None]:
# List the importance measures the current inspector exposes, then display
# the "NUM_AS_ROOT" ranking as the cell result.
importances = inspector.variable_importances()
print("Available variable importances:")
for importance_name in importances:
    print("\t", importance_name)
importances["NUM_AS_ROOT"]

In [None]:
#tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=3)

# Submission


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tensorflow_decision_forests as tfdf

# Load the test split; keep the ids aside for the submission file.
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
submission_id = test_df["PassengerId"]

# Only the two flag columns are filled (with 0); other gaps are left as NaN.
flag_cols = ['VIP', 'CryoSleep']
test_df[flag_cols] = test_df[flag_cols].fillna(value=0)

# Same derived columns as the training frame.
# FamilySize is the integer suffix after the "_" in PassengerId.
test_df['FamilySize'] = test_df['PassengerId'].str.split('_').str[1].astype('int64')
test_df[["Deck", "Cabin_num", "Side"]] = test_df["Cabin"].str.split("/", expand=True)
test_df = test_df.drop(columns=['Cabin'])
test_df['Cabin_num'] = pd.to_numeric(test_df['Cabin_num'], errors='coerce')

# Age brackets: with right=False each bracket is [lower, upper).
bins = [0, 5, 10, 18, 25, 35, 45, 60, 100]
labels = ['0-5', '6-10', '11-18', '19-25', '26-35', '36-45', '46-60', '60+']
test_df['Age_Group'] = pd.cut(test_df['Age'], bins=bins, labels=labels, right=False)

# Spending-based features.
spa = test_df["Spa"]
vr_deck = test_df["VRDeck"]
room_service = test_df["RoomService"]
total_spending = spa + vr_deck + room_service + test_df["ShoppingMall"] + test_df["FoodCourt"]

test_df["TotalSpending"] = total_spending
test_df["LuxuryUser"] = (spa > 0) | (vr_deck > 0) | (room_service > 0)
test_df["SpendingPerAge"] = total_spending / (test_df["Age"] + 1)
# Threshold is the median of the test frame itself.
test_df["HighSpender"] = total_spending > total_spending.median()

# One-hot encode the age bracket, then drop the raw columns not used as features.
drop_features = ["Age", 'Name', 'PassengerId']
test_df = pd.get_dummies(test_df, columns=['Age_Group'], drop_first=False).drop(columns=drop_features)

# Cast Python bool cells to 0/1 ints and wrap as an unlabeled TF dataset.
test_df = test_df.map(lambda cell: int(cell) if isinstance(cell, bool) else cell)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df)

# Predict with whichever model is currently bound to `gbm`, then threshold
# the scores at 0.5 to obtain boolean labels.
predictions = gbm.predict(test_ds)
n_predictions = (predictions > 0.5).astype(bool)

# Assemble the submission table.
output = pd.DataFrame({
    'PassengerId': submission_id,
    'Transported': n_predictions.squeeze(),
})

output.head()


In [None]:
# Write the submission to the Kaggle working directory (without the index
# column), then preview the first rows.
submission_path = '/kaggle/working/submission.csv'
output.to_csv(submission_path, index=False)
output.head()

In [None]:
# Missing-value count per column of the processed test frame, worst first.
test_missing = test_df.isna().sum()
test_missing.sort_values(ascending=False)

# NOTES

Things I tried that produced worse results:

- Age groups grouped by planet
- Keeping the Age column
- Dropping rows with NaN
- Filling all missing values with 0
- Filling the test data the same way as the training data
