# Spaceship Titanic Dataset with scikit-learn Random Forests and fastai

# Import the libraries

In [None]:
from fastai.imports import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from fastai.tabular.all import *

# Allow up to 130 characters per line when NumPy arrays are printed.
np.set_printoptions(linewidth=130)

# Load the Dataset

In [None]:
# Read the training and test CSV files from the Kaggle input directory.
dataset_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
testset = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
# Print the (rows, columns) shape of the training frame (the message is in French).
print("Le dataset d'entrainement est de dimension : {}".format(dataset_df.shape))

The data is composed of 14 columns and 8693 entries. We can see all 14 columns of our dataset by printing out the first 5 entries using the following code:

In [None]:
# Display the first five rows of the training data.
dataset_df.head(5)

There are 12 feature columns. Using these features, the model has to predict the `Transported` column, which indicates whether or not the passenger was transported.

# Let us quickly do a basic exploration of the dataset

In [None]:
# Display summary statistics of the training data.
dataset_df.describe()

In [None]:
# Print a concise per-column overview of the training data.
dataset_df.info()

# Bar chart for label column: Transported



In [None]:
# Count the rows in each `Transported` class ...
plot_df = dataset_df.Transported.value_counts()
# ... and draw the counts as a bar chart to check the class balance.
plot_df.plot(kind="bar")

# Numerical data distribution

Let us plot the distributions of the numerical columns `Age`, `FoodCourt`, `ShoppingMall`, `Spa` and `VRDeck` as histograms:

In [None]:
# One histogram per numerical column, stacked vertically in a single figure.
# NOTE(review): `RoomService` is not plotted here — confirm whether that is intended.
fig, ax = plt.subplots(5, 1, figsize=(10, 10))
# Stretch the figure's top margin so the five stacked axes do not overlap.
plt.subplots_adjust(top=2)

for hist_ax, hist_col in zip(ax, ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']):
    sns.histplot(dataset_df[hist_col], color='b', bins=50, ax=hist_ax)

# Prepare the dataset

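We will drop the `PassengerId` and `Name` columns, as they are not necessary for model training. The `Cabin` column is split into `Deck`, `Cabin_num` and `Side` before being dropped, missing values are filled with the column median (numerical columns) or the most frequent value (categorical columns), and the remaining categorical columns are converted to integer codes.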

In [None]:
def process(df):
    """Turn a raw Spaceship Titanic frame into numeric features and labels.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Split ``Cabin`` ("deck/num/side") into ``Deck``, ``Cabin_num``, ``Side``.
      2. Separate the ``Transported`` label (cast to int) when it is present.
      3. Drop ``PassengerId``, ``Name`` and the original ``Cabin`` column.
      4. Fill missing numeric values with the column median and missing
         object/bool values with the column mode.
      5. Cast ``Cabin_num``, ``CryoSleep`` and ``VIP`` to int.
      6. Replace every remaining object/bool column by its category codes.

    Medians, modes and category codes are computed from the frame passed in,
    so two separate calls (e.g. train and test) each use their own statistics.

    Args:
        df: DataFrame containing at least ``PassengerId``, ``Name``, ``Cabin``,
            ``CryoSleep`` and ``VIP``; ``Transported`` is optional.

    Returns:
        A ``(features, labels)`` tuple. ``labels`` is an int Series named
        ``Transported``, or ``None`` when the input has no such column.
    """
    frame = df.copy()

    cabin_parts = frame["Cabin"].str.split("/", expand=True)
    frame[["Deck", "Cabin_num", "Side"]] = cabin_parts

    # Pull the label out of the feature frame, if there is one.
    target = None
    if "Transported" in frame.columns:
        target = frame.pop("Transported").astype(int)

    frame = frame.drop(columns=["PassengerId", "Name", "Cabin"])

    # Impute: median for numeric columns, most frequent value for the rest.
    numeric_names = frame.select_dtypes(include=['float64', 'int64']).columns
    categorical_names = frame.select_dtypes(include=['object', 'bool']).columns
    medians = frame[numeric_names].median()
    most_frequent = frame[categorical_names].mode().iloc[0]
    frame[numeric_names] = frame[numeric_names].fillna(medians)
    frame[categorical_names] = frame[categorical_names].fillna(most_frequent)

    # These three only become castable once their missing values are filled.
    frame = frame.astype({'Cabin_num': int, 'CryoSleep': int, 'VIP': int})

    # Whatever is still object/bool is encoded as integer category codes.
    for name in frame.select_dtypes(include=['object', 'bool']).columns:
        frame[name] = pd.Categorical(frame[name]).codes

    return frame, target

# Preprocess the training data into a feature frame (DF) and integer labels (Y).
DF, Y = process(dataset_df)
# Preprocess the test data the same way; only the features ([0]) are kept.
# NOTE(review): `process` computes fill values and category codes from the
# frame it receives, so the test set is encoded from its own statistics.
DF_tst = process(testset)[0]
DF.head(10)

In [None]:
# Fixed seed so the train/validation split is reproducible.
seed = 42
# Hold out 20% of the rows as a validation set.
trn_df, val_df, trn_df_y, val_df_y = train_test_split(DF, Y, test_size=0.2, random_state=seed)

In [None]:
# Baseline forest: 500 trees, each leaf holding at least 15 training samples.
# NOTE(review): no random_state is passed, so the fitted forest may differ
# between runs — confirm whether that is intended.
rf0 = RandomForestClassifier(500, min_samples_leaf = 15)

def numper(df):
    """Return the contents of *df* as a flat, one-dimensional NumPy array."""
    values = np.array(df)
    return values.ravel()

def precision(modele, reel):
    """Return the fraction of predictions that equal the true labels.

    Despite its name, this is the accuracy (share of matching positions),
    not the precision metric of binary classification.

    Args:
        modele: Predicted labels; must support element-wise ``==`` with *reel*.
        reel: True labels, same length as *modele*.

    Returns:
        Number of matching positions divided by ``len(reel)``.
    """
    matches = modele == reel
    n_correct = matches.sum()
    return n_correct / len(reel)

def entrainement(df, df_y, df_val, df_val_y, rf):
    """Fit *rf* on the training data and score it on both data sets.

    Args:
        df: Training features.
        df_y: Training labels (flattened with ``numper`` before fitting).
        df_val: Validation features.
        df_val_y: Validation labels.
        rf: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        A ``(training_score, validation_score)`` tuple as computed by
        ``precision``.
    """
    rf.fit(df, numper(df_y))
    train_score = precision(rf.predict(df), df_y)
    val_score = precision(rf.predict(df_val), df_val_y)
    return train_score, val_score

# Fit the baseline forest and display its (training, validation) scores.
entrainement (trn_df,trn_df_y,val_df,val_df_y,rf0) 

In [None]:
def _validation_scores(make_forest, values, df, df_y, df_val, df_val_y):
    """Fit one forest per entry of *values* and return its validation score."""
    labels = numper(df_y)  # identical for every fit, so flatten it once
    scores = []
    for value in values:
        rf = make_forest(value)
        rf.fit(df, labels)
        scores.append(precision(rf.predict(df_val), df_val_y))
    return scores


def _plot_validation_curve(position, x_values, scores, x_label):
    """Draw one validation curve in slot *position* of a 1x2 subplot grid."""
    plt.subplot(1, 2, position)
    plt.plot(x_values, scores, label = 'Precision Cross Validation Set', marker = 'x')
    plt.xlabel(x_label)
    plt.ylabel('Taux de Précision')
    plt.legend()


def evolution(df,df_y,df_val,df_val_y):
    """Plot validation score against two random-forest hyper-parameters.

    Two sweeps are run, each fitting a fresh ``RandomForestClassifier`` per
    setting:
      * number of trees in ``range(10, 511, 50)`` with ``min_samples_leaf=5``;
      * ``min_samples_leaf`` in ``range(2, 102, 5)`` with 300 trees.

    Only the validation score is evaluated; the training-set scores that an
    earlier version collected were never plotted or returned, so they are no
    longer computed.

    Args:
        df: Training features.
        df_y: Training labels.
        df_val: Validation features.
        df_val_y: Validation labels.

    Returns:
        None. The two curves are shown side by side with ``plt.show()``.
    """
    tree_counts = list(range(10, 511, 50))
    leaf_sizes = list(range(2, 102, 5))

    scores_by_trees = _validation_scores(
        lambda n_trees: RandomForestClassifier(n_trees, min_samples_leaf = 5),
        tree_counts, df, df_y, df_val, df_val_y,
    )
    scores_by_leaf = _validation_scores(
        lambda leaf_size: RandomForestClassifier(300, min_samples_leaf = leaf_size),
        leaf_sizes, df, df_y, df_val, df_val_y,
    )

    _plot_validation_curve(1, tree_counts, scores_by_trees,
                           "Nombre d'Arbres de la Random Forest")
    _plot_validation_curve(2, leaf_sizes, scores_by_leaf,
                           "Taille de Ramification Minimale")
    plt.tight_layout()
    plt.show()
        
#evolution (trn_df,trn_df_y,val_df,val_df_y) 

In [None]:
# Display the first rows of the raw test set.
testset.head()

In [None]:
def subm(preds, suff):
    """Write the submission file ``sub-<suff>.csv`` for the test set.

    The file has two columns: ``PassengerId`` (taken from the module-level
    ``testset`` frame) and ``Transported`` (``preds`` cast to bool). It is
    written to the current working directory without the index.

    Args:
        preds: One prediction per row of ``testset``; anything pandas accepts
            as a column value and can cast to bool.
        suff: Suffix inserted into the output file name.
    """
    # Build the submission on a copy: assigning into the shared `testset`
    # would leave a `Transported` column behind, which a later
    # `process(testset)` call would then treat as labels.
    sub_df = testset[["PassengerId"]].copy()
    sub_df["Transported"] = preds
    sub_df["Transported"] = sub_df["Transported"].astype(bool)
    sub_df.to_csv(f'sub-{suff}.csv', index=False)

#subm(rf0.predict(DF_tst), 'Random_Forest')

In [None]:
def process_nn(df,df_y = None):
    """Prepare a preprocessed feature frame for the neural network.

    Works on a copy of *df*. The four spending columns ``ShoppingMall``,
    ``FoodCourt``, ``Spa`` and ``VRDeck`` are replaced by their ``log1p``
    transforms (``Log<name>``); ``RoomService`` is left as it is.

    Args:
        df: Feature frame containing the four spending columns.
        df_y: Optional labels; when given they are stored in a
            ``Transported`` column of the result.

    Returns:
        The new DataFrame with the log columns appended and the raw spending
        columns removed.
    """
    log_names = {
        'ShoppingMall': 'LogShoppingMall',
        'FoodCourt': 'LogFoodCourt',
        'Spa': 'LogSpa',
        'VRDeck': 'LogVRDeck',
    }
    df_nn = df.copy()
    if df_y is not None:
        df_nn["Transported"] = df_y
    for raw_name, log_name in log_names.items():
        df_nn[log_name] = np.log1p(df_nn[raw_name])
    return df_nn.drop(columns=list(log_names))
    
# Network input for training: features plus the `Transported` label column.
DF_NN = process_nn(DF,Y)
# Network input for the test set: features only.
DF_tst_NN = process_nn(DF_tst)

In [None]:
# Seeded random train/validation split of the row indices.
splits = RandomSplitter(seed=42)(DF_NN)
# NOTE(review): this expression is not the last one in the cell, so its
# result is not displayed.
DF_NN.head()

# Wrap the frame with fastai's tabular preprocessing and build the dataloaders.
# NOTE(review): "RoomService" (a spending amount, like the log-transformed
# columns) and "Cabin_num" are listed as categorical — confirm this is intended.
dls = TabularPandas(
    DF_NN, splits=splits,
    procs = [Categorify, FillMissing, Normalize],
    cat_names=["HomePlanet","CryoSleep","Destination","VIP", "Deck","Cabin_num","Side","RoomService"],
    cont_names=['Age', 'LogFoodCourt', 'LogShoppingMall', 'LogSpa', 'LogVRDeck'],
    y_names="Transported", y_block = CategoryBlock(),
).dataloaders(path=".")

In [None]:
# Tabular learner with `layers=[3,3]`, reporting accuracy as its metric.
learn = tabular_learner(dls, metrics=accuracy, layers=[3,3])

In [None]:
# Run the learning-rate finder with the `slide` and `valley` suggestion functions.
Sugest_lr = learn.lr_find(suggest_funcs=(slide, valley))
# Display the suggested learning rates.
Sugest_lr

In [None]:
# Train for 2 epochs with a learning rate of 0.04.
learn.fit(2, lr = 0.04)

In [None]:
# Build a dataloader for the test features from the learner's dataloaders.
tst_dl = learn.dls.test_dl(DF_tst_NN)
# Only the first element returned by get_preds is kept.
prediction,_ = learn.get_preds(dl = tst_dl)
# Threshold column 1 at 0.5 to obtain 0/1 labels
# (presumably the score of the positive class — TODO confirm).
prediction = (prediction[:,1]>0.5).int()

In [None]:
# Write the network's predictions to sub-NN.csv.
subm(prediction, 'NN')