In [3]:
# Begin by importing all required libraries
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.metrics import accuracy_score

import warnings
# Install a global "ignore" filter: every warning raised after this line is
# silenced for the rest of the kernel session.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn / TensorFlow; consider narrowing it with `category=`
# or `module=` if only specific noise needs to be suppressed.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw training data, report missing values, impute them, and build
# `decision_tree_df`, the integer-encoded frame used by the tree models below.

# Read data
train_df = pd.read_csv('csv_files/train.csv')

# Preview data
print('Raw data format:')
display(train_df.head())

# Count and percentage of missing values per column (measured before imputation)
missing_data = train_df.isna().sum()
missing_percentage = (missing_data / len(train_df)) * 100

missing_info = pd.DataFrame({
    "Missing Values": missing_data,
    "Percentage": missing_percentage
})

# Only the last expression of a cell is rendered automatically, so the sorted
# summary has to be displayed explicitly or it is silently discarded.
print('Missing data per column:')
display(missing_info.sort_values(by="Missing Values", ascending=False))

# Imputers
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# List of numerical and categorical columns that need imputation
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Imputation (overwrites the listed columns of train_df)
train_df[numerical_cols] = median_imputer.fit_transform(train_df[numerical_cols])
train_df[categorical_cols] = mode_imputer.fit_transform(train_df[categorical_cols])

# Build the tree-model frame without touching train_df:
#   - drop identifier / unused columns
#   - drop rows whose 'Cabin' is missing ('Cabin' is not imputed above)
#   - finish with an explicit copy so the column assignments below operate on
#     an independent frame (avoids SettingWithCopyWarning)
decision_tree_df = (
    train_df
    .drop(columns=['PassengerId', 'Name', 'VIP'])
    .dropna(subset=['Cabin'])
    .copy()
)

# Split 'Cabin' once; the first part becomes 'Deck' and the third part 'Side'
# (e.g. 'B/0/P' -> Deck 'B', Side 'P'). The middle part is not used.
cabin_parts = decision_tree_df['Cabin'].str.split('/')
decision_tree_df['Deck'] = cabin_parts.str[0]
decision_tree_df['Side'] = cabin_parts.str[2]
decision_tree_df = decision_tree_df.drop(columns=['Cabin'])

# Convert 'CryoSleep' boolean to int
decision_tree_df['CryoSleep'] = decision_tree_df['CryoSleep'].astype(int)

# Encode categorical variables as integers. Each column gets its own fitted
# LabelEncoder, kept in `label_encoders`, so every mapping can be inspected or
# inverted later. `label_encoder` ends up bound to the encoder of the last
# column ('Side'), which is the state the name had when one encoder was reused.
label_encoders = {}
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    label_encoder = LabelEncoder()
    decision_tree_df[col] = label_encoder.fit_transform(decision_tree_df[col])
    label_encoders[col] = label_encoder

# After edits for decision trees
print('DataFrame used for trees:')
decision_tree_df.head()


Raw data format:


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


DataFrame used for trees:


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,1,0,2,39.0,0.0,0.0,0.0,0.0,0.0,False,1,0
1,0,0,2,24.0,109.0,9.0,25.0,549.0,44.0,True,5,1
2,1,0,2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,0,1
3,1,0,2,33.0,0.0,1283.0,371.0,3329.0,193.0,False,0,1
4,0,0,2,16.0,303.0,70.0,151.0,565.0,2.0,True,5,1


In [None]:
# Train a TensorFlow Decision Forests random forest on `decision_tree_df` and
# report its accuracy on a 20% hold-out validation split.

# Preprocessing: convert the boolean target to 0/1 (idempotent on re-run)
decision_tree_df['Transported'] = decision_tree_df['Transported'].astype(int)

# Split the DataFrame into features and the target
X = decision_tree_df.drop(columns=['Transported'])  # Features
y = decision_tree_df['Transported']  # Target variable
# Aliases for the same feature/target objects under the names this cell also defined before.
features, target = X, y

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the pandas dataframes to tensorflow datasets.
# pd_dataframe_to_tf_dataset looks the label up by column name inside the frame
# it is given, so the target is joined back onto the features first
# (pd.concat aligns on the shared row index).
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    pd.concat([X_train, y_train], axis=1), label="Transported"
)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    pd.concat([X_val, y_val], axis=1), label="Transported"
)

# Initialize the Random Forest model
rf_model = tfdf.keras.RandomForestModel()

# Without an explicit metric, evaluate() only reports the loss, so accuracy
# must be requested at compile time.
rf_model.compile(metrics=["accuracy"])

# Fit the model to the training data
rf_model.fit(train_ds)

# Evaluate the model on the validation set; return_dict=True lets the metric be
# read by name instead of relying on its position in a list.
evaluation = rf_model.evaluate(test_ds, return_dict=True)
accuracy = evaluation["accuracy"]
print(f"Test accuracy: {accuracy:.4f}")

# Cross-check with scikit-learn. For a binary label the model outputs a single
# column holding the probability of the positive class, so the class is found
# by thresholding at 0.5 (argmax over a one-column array would always be 0).
# NOTE(review): assumes test_ds yields rows in the same order as X_val / y_val
# (no shuffling in pd_dataframe_to_tf_dataset) — confirm.
y_prob = rf_model.predict(test_ds)
y_pred = (y_prob.ravel() > 0.5).astype(int)
accuracy = accuracy_score(y_val, y_pred)
print(f"Calculated test accuracy: {accuracy:.4f}")
