# Spaceship Titanic Challenge

# Import the library

In this challenge I will mainly use tools from fastai, PyTorch and scikit-learn.

In [None]:
from fastai.imports import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from fastai.tabular.all import *

# Allow up to 130 characters per line when NumPy prints arrays.
np.set_printoptions(linewidth=130)

# Load the Dataset

I load the two datasets (the training set and the test set), which are used to train the model and to evaluate it.

In [None]:
# Load the training and test CSV files from the Kaggle input directory.
dataset_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
testset = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
# Report the (rows, columns) shape of the training DataFrame.
print(f"Le dataset d'entrainement est de dimension : {dataset_df.shape}")

The data is composed of 14 columns and 8693 entries. We can see all 14 dimensions of our dataset by printing out the first 5 entries using the following code:

Moreover, we can also use df.info() and df.describe() to obtain other valuable information about the format and values of each column.

In [None]:
# Display the first five rows of the training DataFrame.
dataset_df.head(5)

There are 12 feature columns (the 14 columns minus `PassengerId` and `Transported`). Using these features, the model has to predict whether the passenger was transported or not, as indicated by the `Transported` column.

# Prepare the dataset

I created the process() function to group the preprocessing steps that we need:
- I split the `Cabin` column into 3 different columns: `Deck`, `Cabin_num` and `Side`.
- If the dataframe has a `Transported` column (e.g. the training set), I remove it and return it separately.
- I drop the `PassengerId`, `Cabin` and `Name` columns, as they are not necessary for model training.
- I replace each NaN value by the median for numerical columns, and by the mode (most common value) for categorical columns.
- Finally, I replace every categorical value by a number, for example True/False by 1/0.

In [None]:
def process(df):
    """Clean a raw passenger DataFrame and encode it numerically.

    Works on a copy of ``df``. ``Cabin`` is split on "/" into ``Deck``,
    ``Cabin_num`` and ``Side``; ``PassengerId``, ``Name`` and ``Cabin`` are
    dropped. Missing values are filled with the column median (float64/int64
    columns) or the column mode (object/bool columns), both computed on
    ``df`` itself. ``Cabin_num``, ``CryoSleep`` and ``VIP`` are cast to int
    and every remaining object/bool column is replaced by its category codes.

    Note that the category codes are derived from the values present in the
    frame being processed, so two frames only share an encoding when they
    contain the same set of values in a column.

    Returns:
        A ``(features, target)`` tuple. ``target`` is the ``Transported``
        column cast to int, or ``None`` when ``df`` has no such column.
    """
    features = df.copy()
    cabin_parts = features["Cabin"].str.split("/", expand=True)
    features[["Deck", "Cabin_num", "Side"]] = cabin_parts

    # Separate the label (when present) so it is neither imputed nor encoded.
    target = None
    if "Transported" in features.columns:
        target = features["Transported"].astype(int)
        features = features.drop(columns="Transported")

    features = features.drop(columns=["PassengerId", "Name", "Cabin"])

    # Impute: median for numeric columns, most frequent value for the others.
    numeric_names = features.select_dtypes(include=['float64', 'int64']).columns
    text_names = features.select_dtypes(include=['object', 'bool']).columns
    medians = features[numeric_names].median()
    most_frequent = features[text_names].mode().iloc[0]
    features[numeric_names] = features[numeric_names].fillna(medians)
    features[text_names] = features[text_names].fillna(most_frequent)

    # These three columns carry numbers/booleans stored as objects.
    features = features.astype({'Cabin_num': int, 'CryoSleep': int, 'VIP': int})

    # Whatever is still object/bool is turned into integer category codes.
    for name in features.select_dtypes(include=['object', 'bool']).columns:
        features[name] = features[name].astype('category').cat.codes
    return features, target

# Preprocess the training data into features (DF) and integer labels (Y).
DF, Y = process(dataset_df)
# For the test data only the feature frame (first return value) is kept.
DF_tst = process(testset)[0]
# Display the first rows of the processed training features.
DF.head()

I used the train_test_split() function to separate the training set into a training set and a validation set. I use a seed to ensure the reproducibility of the random parts.

In [None]:
# Fixed seed so that the train/validation split is the same on every run.
seed = 42
# Hold out 20% of the rows (test_size=0.2) as the validation set.
trn_df, val_df, trn_df_y, val_df_y = train_test_split(DF, Y, test_size=0.2, random_state=seed)

The entrainement() function fits the model on an input dataframe (e.g. the training set) and returns the accuracy on the training set and the accuracy on the validation set:

In [None]:
# Baseline forest: 500 trees, each leaf holding at least 15 training samples.
# random_state ties the forest's own randomness to the notebook seed so that
# the reported scores are reproducible from one run to the next.
rf0 = RandomForestClassifier(n_estimators=500, min_samples_leaf=15, random_state=seed)

def numper(df):
    """Return the values of ``df`` as a flat, one-dimensional NumPy array."""
    values = np.array(df)
    return np.ravel(values)

def precision(modele, reel):
    """Return the fraction of predictions in ``modele`` that equal ``reel``.

    Despite its name, the value computed is the accuracy: the number of
    element-wise matches divided by ``len(reel)``.
    """
    matches = modele == reel
    n_correct = matches.sum()
    return n_correct / len(reel)

def entrainement(df, df_y, df_val, df_val_y, rf):
    """Fit ``rf`` on the training data and score it on both data sets.

    Args:
        df: Training features.
        df_y: Training labels (flattened with ``numper`` before fitting).
        df_val: Validation features.
        df_val_y: Validation labels.
        rf: Model exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        A ``(training score, validation score)`` tuple, each computed with
        ``precision``.
    """
    rf.fit(df, numper(df_y))
    score_trn = precision(rf.predict(df), df_y)
    score_val = precision(rf.predict(df_val), df_val_y)
    return score_trn, score_val

# Fit the baseline forest and show its (training, validation) scores.
entrainement (trn_df,trn_df_y,val_df,val_df_y,rf0) 

The evolution() function is a helper that I used to see how to tune the parameters of RandomForestClassifier, i.e. the number of trees and the minimum number of samples per leaf.

In [None]:
def _validation_scores(param_values, build_model, df, df_y, df_val, df_val_y):
    """Fit one model per parameter value and return its validation scores."""
    scores = []
    for value in param_values:
        rf = build_model(value)
        rf.fit(df, numper(df_y))
        scores.append(precision(rf.predict(df_val), df_val_y))
    return scores


def evolution(df, df_y, df_val, df_val_y):
    """Plot validation score against two random-forest hyper-parameters.

    Two sweeps are run and drawn side by side:
      * number of trees from 10 to 510 in steps of 50, with
        ``min_samples_leaf=5``;
      * ``min_samples_leaf`` from 2 to 97 in steps of 5, with 300 trees.

    Only validation scores are computed, because only those are plotted;
    scoring the training set for every model would be wasted work.

    Args:
        df: Training features.
        df_y: Training labels.
        df_val: Validation features.
        df_val_y: Validation labels.
    """
    tree_counts = list(range(10, 511, 50))
    leaf_sizes = list(range(2, 102, 5))

    scores_by_trees = _validation_scores(
        tree_counts,
        lambda n_trees: RandomForestClassifier(n_trees, min_samples_leaf=5),
        df, df_y, df_val, df_val_y,
    )
    scores_by_leaf = _validation_scores(
        leaf_sizes,
        lambda leaf: RandomForestClassifier(300, min_samples_leaf=leaf),
        df, df_y, df_val, df_val_y,
    )

    # Left panel: effect of the number of trees.
    plt.subplot(1, 2, 1)
    plt.plot(tree_counts, scores_by_trees, label='Precision Cross Validation Set', marker='x')
    plt.xlabel("Nombre d'Arbres de la Random Forest")
    plt.ylabel('Taux de Précision')
    plt.legend()

    # Right panel: effect of the minimum leaf size.
    plt.subplot(1, 2, 2)
    plt.plot(leaf_sizes, scores_by_leaf, label='Precision Cross Validation Set', marker='x')
    plt.xlabel("Taille de Ramification Minimale")
    plt.ylabel('Taux de Précision')
    plt.legend()

    plt.tight_layout()
    plt.show()
        
#evolution (trn_df,trn_df_y,val_df,val_df_y) 

In [None]:
# Display the first rows of the raw test DataFrame.
testset.head()

The subm() function writes the submission CSV for a given set of test-set predictions; `suff` is a suffix used in the output file name.

In [None]:
def subm(preds, suff):
    """Write a submission file ``sub-<suff>.csv`` for the test set.

    The file has two columns: ``PassengerId`` (taken from the global
    ``testset``) and ``Transported`` (``preds`` cast to bool).

    The submission is built on a copy so that ``testset`` itself is left
    untouched: adding a ``Transported`` column to it would make a later
    ``process(testset)`` call treat the predictions as labels.

    Args:
        preds: One prediction per row of ``testset``.
        suff: Suffix inserted in the output file name.
    """
    sub_df = testset[["PassengerId"]].copy()
    sub_df["Transported"] = preds
    sub_df["Transported"] = sub_df["Transported"].astype(bool)
    sub_df.to_csv(f'sub-{suff}.csv', index=False)

#subm(rf0.predict(DF_tst), 'Random_Forest')

To train a neural network, I added a second preprocessing step. Unlike the random forest algorithm, a NN benefits from normalized column distributions. To that end, I took the logarithm (log1p) of some of the numerical columns:

In [None]:
def process_nn(df, df_y=None, log_cols=("ShoppingMall", "FoodCourt", "Spa", "VRDeck")):
    """Prepare a processed feature frame for the neural network.

    Works on a copy of ``df``. Each column named in ``log_cols`` is replaced
    by a ``Log<name>`` column holding ``log1p`` of its values; the new
    columns are appended in the order given and the originals are dropped.

    Args:
        df: Feature DataFrame containing every column in ``log_cols``.
        df_y: Optional labels; when given they are stored in a
            ``Transported`` column of the result.
        log_cols: Names of the columns to log-transform. The default is the
            set of columns this notebook has always transformed.

    Returns:
        The transformed copy of ``df``.
    """
    df_nn = df.copy()
    if df_y is not None:
        df_nn["Transported"] = df_y
    log_cols = list(log_cols)
    for col in log_cols:
        # log1p keeps zero values at exactly zero.
        df_nn[f"Log{col}"] = np.log1p(df_nn[col])
    return df_nn.drop(columns=log_cols)
    
# Training frame for the NN: log-transformed features plus the Transported labels.
DF_NN = process_nn(DF,Y)
# Test frame for the NN: same transformation, no label column added.
DF_tst_NN = process_nn(DF_tst)

I use the TabularPandas() class to handle the remaining preprocessing (categorification, missing values, normalization) automatically:

In [None]:
# Random train/validation split of the NN frame, driven by the notebook seed
# so that it stays in sync with the rest of the experiment.
splits = RandomSplitter(seed=seed)(DF_NN)

dls = TabularPandas(
    DF_NN, splits=splits,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Cabin_num", "Side"],
    # RoomService is a spending amount like the Log* columns, so it is fed to
    # the network as a continuous (normalized) input rather than as a
    # category with one embedding row per distinct amount.
    cont_names=['Age', 'RoomService', 'LogFoodCourt', 'LogShoppingMall', 'LogSpa', 'LogVRDeck'],
    y_names="Transported", y_block=CategoryBlock(),
).dataloaders(path=".")

Now we have to create the initial model using the tabular_learner() function, with the metric and the layers that I want:

In [None]:
# Tabular model with two hidden layers of 3 units each, reporting accuracy.
learn = tabular_learner(dls, metrics=accuracy, layers=[3,3])

lr_find() is an informative function that gives a sense of what a good learning rate could be:

In [None]:
# Run the learning-rate finder, asking for the `slide` and `valley` suggestions.
Sugest_lr = learn.lr_find(suggest_funcs=(slide, valley))
# Display the suggested learning rates.
Sugest_lr

Now we can fit the model to the data, choosing a number of iterations (epochs) and a learning rate that we can pick using the plot above.

In [None]:
# Train for 2 epochs with a learning rate of 0.04.
learn.fit(2, lr = 0.04)

The test_dl() method builds a DataLoader for the test set that reuses the preprocessing parameters of the training data.
Once that is done, we can make our predictions:

In [None]:
# Build a DataLoader for the NN test frame from the learner's existing dataloaders.
tst_dl = learn.dls.test_dl(DF_tst_NN)
# Keep only the first value returned by get_preds; the second is discarded.
prediction,_ = learn.get_preds(dl = tst_dl)
# Threshold column 1 at 0.5 to obtain 0/1 predictions.
# NOTE(review): presumably column 1 is the probability of Transported == 1 — confirm against dls.vocab.
prediction = (prediction[:,1]>0.5).int()

In [None]:
# Write the neural-network predictions to 'sub-NN.csv'.
subm(prediction, 'NN')