In [31]:
# Begin by importing all required libraries
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [34]:
# Read data
train_df = pd.read_csv('csv_files/train.csv')

# Preview data
print('Raw data format:')
display(train_df.head())

# Determining the amount of missing data per column
missing_data = train_df.isna().sum()

# Calculating the percentage of missing data per column
missing_percentage = (missing_data / len(train_df)) * 100

missing_info = pd.DataFrame({
    "Missing Values": missing_data,
    "Percentage": missing_percentage
})

missing_info.sort_values(by="Missing Values", ascending=False)

# Imputers
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# List of numerical and categorical columns that need imputation
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Imputation
train_df[numerical_cols] = median_imputer.fit_transform(train_df[numerical_cols])
train_df[categorical_cols] = mode_imputer.fit_transform(train_df[categorical_cols])


# Assuming train_df is predefined
decision_tree_df = train_df.copy()  # Use copy to avoid SettingWithCopyWarning

# Drop unnecessary columns
decision_tree_df.drop(columns=['PassengerId', 'Name', 'VIP'], inplace=True)

decision_tree_df.dropna(subset=['Cabin'], inplace=True)

# Split 'Cabin' into 'Deck' and 'Side'
decision_tree_df['Deck'] = decision_tree_df['Cabin'].str.split('/').str[0]
decision_tree_df['Side'] = decision_tree_df['Cabin'].str.split('/').str[2]

decision_tree_df.drop(columns=['Cabin'], inplace=True) 

# Convert 'CryoSleep' boolean to int
decision_tree_df['CryoSleep'] = decision_tree_df['CryoSleep'].astype(int)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode categorical variables
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    decision_tree_df[col] = label_encoder.fit_transform(decision_tree_df[col])

# After edits for decision trees
print('DataFrame used for trees:')
decision_tree_df.head()


Raw data format:


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


DataFrame used for trees:


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,1,0,2,39.0,0.0,0.0,0.0,0.0,0.0,False,1,0
1,0,0,2,24.0,109.0,9.0,25.0,549.0,44.0,True,5,1
2,1,0,2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,0,1
3,1,0,2,33.0,0.0,1283.0,371.0,3329.0,193.0,False,0,1
4,0,0,2,16.0,303.0,70.0,151.0,565.0,2.0,True,5,1


In [52]:
# Preprocessing: Convert boolean column to int
decision_tree_df['Transported'] = decision_tree_df['Transported'].astype(int)

# Split the DataFrame into features and the target
features = decision_tree_df.drop('Transported', axis=1)
target = decision_tree_df['Transported']

# Split the data into train and test sets
X = decision_tree_df.drop(['Transported'], axis=1)  # Features
y = decision_tree_df['Transported']  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2)

# Since the label column is not in X_train and X_val, concatenate it back for creating the TF dataset
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)

In [53]:
# Convert the pandas dataframes to tensorflow datasets
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="Transported")
val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(val_df, label="Transported")

# Initialize the Random Forest model
rf_model = tfdf.keras.RandomForestModel()

# Fit the model to the training data
rf_model.fit(train_ds)

# Evaluate the model on the validation set
loss = rf_model.evaluate(val_ds)
print(f"Loss on validation set: {loss}")

# Predict on the validation dataset
predictions = rf_model.predict(val_ds)

# If the predictions are probabilities, convert them to binary labels
if predictions.ndim > 1 and predictions.shape[1] > 1:
    # Assuming the positive class probabilities are in the second column for binary classification
    predicted_labels = (predictions[:, 1] >= 0.5).astype(int)
else:
    # If predictions are already binary labels
    predicted_labels = predictions.flatten().astype(int)  # Ensure binary labels and correct shape

# Verify the content of predicted_labels
print("Sample of predicted labels:", predicted_labels[:10])

# Ensure y_val_array is correctly formatted as a 1D binary array
y_val_array = np.array(y_val).flatten()  # Flatten in case y_val is 2D


Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpncb5tjlb as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.131881. Found 6795 examples.
Training model...


[INFO 24-03-21 20:35:53.5249 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpncb5tjlb/model/ with prefix dea5ce994b024f62


Model trained in 0:00:00.796178
Compiling model...
Model compiled.

[INFO 24-03-21 20:35:53.8063 SAST decision_forest.cc:734] Model loaded with 300 root(s), 258224 node(s), and 11 input feature(s).
[INFO 24-03-21 20:35:53.8063 SAST abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-03-21 20:35:53.8063 SAST kernel.cc:1061] Use fast generic engine


Loss on validation set: 0.0
Sample of predicted labels: [0 0 0 0 0 0 0 0 0 0]
Calculated validation accuracy: 0.5103
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1 Score: 0.0000


In [49]:
import tensorflow_decision_forests as tfdf
from sklearn.metrics import accuracy_score

# Define a range of hyperparameters to test
n_trees_options = [50, 100, 300]
max_depth_options = [5, 10, None]  # 'None' means no maximum depth limit

best_accuracy = 0
best_params = {}

for n_trees in n_trees_options:
    for max_depth in max_depth_options:
        # Initialize the Random Forest model with current hyperparameters
        rf_model = tfdf.keras.RandomForestModel(num_trees=n_trees, max_depth=max_depth)

        # Fit the model to the training data
        rf_model.fit(train_ds)

        # Evaluate the model on the validation set
        loss = rf_model.evaluate(val_ds)

        # Predict on the validation dataset
        predictions = rf_model.predict(val_ds)

        # If the predictions are probabilities, convert them to binary labels
        if predictions.ndim > 1 and predictions.shape[1] > 1:
            # Assuming the positive class probabilities are in the second column for binary classification
            predicted_labels = (predictions[:, 1] >= 0.5).astype(int)
        else:
            # If predictions are already binary labels
            predicted_labels = predictions.flatten().astype(int)  # Ensure binary labels and correct shape
            
        # Calculate the accuracy
        accuracy = accuracy_score(y_val_array, predicted_labels)

        print(f"Num Trees: {n_trees}, Max Depth: {max_depth}, Validation Accuracy: {accuracy:.4f}")

        # Update the best params if current model is better
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'num_trees': n_trees, 'max_depth': max_depth}

print(f"Best Accuracy: {best_accuracy:.4f}")
print("Best Hyperparameters:", best_params)


Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpy9sj120w as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.116396. Found 6795 examples.
Training model...
Model trained in 0:00:00.027110
Compiling model...
Model compiled.
Num Trees: 50, Max Depth: 5, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpqttrx9j6 as temporary training directory
Reading training dataset...


[INFO 24-03-21 20:32:12.0692 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpy9sj120w/model/ with prefix bca574fb743b4ffa
[INFO 24-03-21 20:32:12.0709 SAST decision_forest.cc:734] Model loaded with 50 root(s), 1516 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:12.0709 SAST abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-03-21 20:32:12.0710 SAST kernel.cc:1061] Use fast generic engine


Training dataset read in 0:00:00.095121. Found 6795 examples.
Training model...
Model trained in 0:00:00.071597
Compiling model...
Model compiled.
Num Trees: 50, Max Depth: 10, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpyt1cr6io as temporary training directory
Reading training dataset...


[INFO 24-03-21 20:32:12.3626 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpqttrx9j6/model/ with prefix 496cd9c4fbfa486c
[INFO 24-03-21 20:32:12.3790 SAST decision_forest.cc:734] Model loaded with 50 root(s), 16820 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:12.3791 SAST kernel.cc:1061] Use fast generic engine


Training dataset read in 0:00:00.096862. Found 6795 examples.
Training model...
Model trained in 0:00:00.147057
Compiling model...
Model compiled.
Num Trees: 50, Max Depth: None, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpimlxj0_j as temporary training directory
Reading training dataset...


[INFO 24-03-21 20:32:12.7111 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpyt1cr6io/model/ with prefix fcef183cdd3147ab
[INFO 24-03-21 20:32:12.7555 SAST decision_forest.cc:734] Model loaded with 50 root(s), 43640 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:12.7555 SAST kernel.cc:1061] Use fast generic engine


Training dataset read in 0:00:00.092134. Found 6795 examples.
Training model...
Model trained in 0:00:00.044696
Compiling model...
Model compiled.
Num Trees: 100, Max Depth: 5, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpz49ol8cu as temporary training directory
Reading training dataset...


[INFO 24-03-21 20:32:13.0355 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpimlxj0_j/model/ with prefix 968375cebefa4ab5
[INFO 24-03-21 20:32:13.0384 SAST decision_forest.cc:734] Model loaded with 100 root(s), 3028 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:13.0384 SAST kernel.cc:1061] Use fast generic engine


Training dataset read in 0:00:00.095613. Found 6795 examples.
Training model...
Model trained in 0:00:00.138976
Compiling model...
Model compiled.
Num Trees: 100, Max Depth: 10, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmp83ap7rrs as temporary training directory
Reading training dataset...


[INFO 24-03-21 20:32:13.3688 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpz49ol8cu/model/ with prefix 8f0ea520bc314bd8
[INFO 24-03-21 20:32:13.4014 SAST decision_forest.cc:734] Model loaded with 100 root(s), 33948 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:13.4014 SAST kernel.cc:1061] Use fast generic engine


Training dataset read in 0:00:00.093121. Found 6795 examples.
Training model...
Model trained in 0:00:00.259212
Compiling model...
Model compiled.


[INFO 24-03-21 20:32:13.8030 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmp83ap7rrs/model/ with prefix aba2b56789df4a7a
[INFO 24-03-21 20:32:13.8889 SAST decision_forest.cc:734] Model loaded with 100 root(s), 87700 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:13.8890 SAST kernel.cc:1061] Use fast generic engine


Num Trees: 100, Max Depth: None, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpi54drm_s as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.090923. Found 6795 examples.
Training model...
Model trained in 0:00:00.117083
Compiling model...
Model compiled.
Num Trees: 300, Max Depth: 5, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmp1pc006jd as temporary training directory
Reading training dataset...


[INFO 24-03-21 20:32:14.2475 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpi54drm_s/model/ with prefix 1accc6c860ac4696
[INFO 24-03-21 20:32:14.2561 SAST decision_forest.cc:734] Model loaded with 300 root(s), 9094 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:14.2561 SAST kernel.cc:1061] Use fast generic engine


Training dataset read in 0:00:00.099156. Found 6795 examples.
Training model...
Model trained in 0:00:00.422908
Compiling model...
Model compiled.


[INFO 24-03-21 20:32:14.8095 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmp1pc006jd/model/ with prefix 2618e4d84416491b
[INFO 24-03-21 20:32:14.9112 SAST decision_forest.cc:734] Model loaded with 300 root(s), 101326 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:14.9112 SAST kernel.cc:1061] Use fast generic engine


Num Trees: 300, Max Depth: 10, Validation Accuracy: 0.5021
Use /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpivs3zetp as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.092646. Found 6795 examples.
Training model...


[INFO 24-03-21 20:32:15.6938 SAST kernel.cc:1233] Loading model from path /var/folders/8m/ktb56qhs1_b5w47663nfm5wh0000gn/T/tmpivs3zetp/model/ with prefix 402a0483a7b3454f


Model trained in 0:00:00.779894
Compiling model...
Model compiled.

[INFO 24-03-21 20:32:15.9541 SAST decision_forest.cc:734] Model loaded with 300 root(s), 262786 node(s), and 11 input feature(s).
[INFO 24-03-21 20:32:15.9541 SAST kernel.cc:1061] Use fast generic engine


Num Trees: 300, Max Depth: None, Validation Accuracy: 0.5021
Best Accuracy: 0.5021
Best Hyperparameters: {'num_trees': 50, 'max_depth': 5}


## Test

In [54]:
# Read data
train_df = pd.read_csv('csv_files/test.csv')

# Preview data
print('Raw data format:')
display(train_df.head())

# Determining the amount of missing data per column
missing_data = train_df.isna().sum()

# Calculating the percentage of missing data per column
missing_percentage = (missing_data / len(train_df)) * 100

missing_info = pd.DataFrame({
    "Missing Values": missing_data,
    "Percentage": missing_percentage
})

missing_info.sort_values(by="Missing Values", ascending=False)

# Imputers
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# List of numerical and categorical columns that need imputation
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Imputation
train_df[numerical_cols] = median_imputer.fit_transform(train_df[numerical_cols])
train_df[categorical_cols] = mode_imputer.fit_transform(train_df[categorical_cols])


# Assuming train_df is predefined
decision_tree_df = train_df.copy()  # Use copy to avoid SettingWithCopyWarning

# Drop unnecessary columns
decision_tree_df.drop(columns=['PassengerId', 'Name', 'VIP'], inplace=True)

decision_tree_df.dropna(subset=['Cabin'], inplace=True)

# Split 'Cabin' into 'Deck' and 'Side'
decision_tree_df['Deck'] = decision_tree_df['Cabin'].str.split('/').str[0]
decision_tree_df['Side'] = decision_tree_df['Cabin'].str.split('/').str[2]

decision_tree_df.drop(columns=['Cabin'], inplace=True) 

# Convert 'CryoSleep' boolean to int
decision_tree_df['CryoSleep'] = decision_tree_df['CryoSleep'].astype(int)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode categorical variables
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    decision_tree_df[col] = label_encoder.fit_transform(decision_tree_df[col])

# After edits for decision trees
print('DataFrame used for trees:')
decision_tree_df.head()


Raw data format:


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


DataFrame used for trees:


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,0,1,2,27.0,0.0,0.0,0.0,0.0,0.0,6,1
1,0,0,2,19.0,0.0,9.0,0.0,2823.0,0.0,5,1
2,1,1,0,31.0,0.0,0.0,0.0,0.0,0.0,2,1
3,1,0,2,38.0,0.0,6652.0,0.0,181.0,585.0,2,1
4,0,0,2,20.0,10.0,0.0,635.0,0.0,0.0,5,1
