# Model building for survival prediction

In [1]:
# Install keras tuner
%pip install keras_tuner

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from scipy.stats import randint, uniform
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import keras_tuner as kt

# Apply seaborn's "whitegrid" theme to all graphs drawn in this notebook
sns.set_style(style="whitegrid")

Collecting keras_tuner
  Downloading keras_tuner-1.4.2-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras_tuner)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Collecting namex (from keras-core->keras_tuner)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, kt-legacy, keras-core, keras_tuner
Successfully installed keras-core-0.1.7 keras_tuner-1.4.2 kt-legacy-1.0.5 namex-0.0.7
Using TensorFlow backend


In [2]:
# Read the training and test data from CSV files in the working directory
train_dataset = pd.read_csv(filepath_or_buffer="train.csv")
test_dataset = pd.read_csv(filepath_or_buffer="test.csv")

# Keep an independent copy of the raw test data; its PassengerId column is
# read again when the submission files are built
test_dataset_copy = test_dataset.copy(deep=True)

## Data preparation

### Split data into features and labels

In [3]:
# The "Survived" column is the label; every other column is a feature
y_train = train_dataset["Survived"]
X_train = train_dataset.drop(columns="Survived")

### Transformations

1. Drop PassengerId, Ticket, Cabin and Name
2. Impute numerical columns with mean and categorical columns with most frequent
3. Combine number of parents/children and siblings/spouses into new category "Relatives"
4. Divide the relatives into 3 bins [0, 1-3, >3]
5. Age is divided into 2 categories: age 0-15 and >15
6. Numerical features are scaled (default StandardScaler)
7. Categorical features are one-hot encoded


In [4]:
def titanic_transformation(dataset:pd.DataFrame, scaler=StandardScaler()) -> pd.DataFrame:
  """
  Processes the dataset according to the following steps:
    1. Drops PassengerId, Ticket, Cabin and Name
    2. Imputes numerical columns (Age, Fare) with the mean
       and categorical columns (Pclass, Sex, SibSp, Parch, Embarked)
       with the most frequent value
    3. Combines number of parents/children and siblings/spouses
       into the new feature "Relatives"
    4. Divides the relatives into 3 bins [0, 1-3, >3]
    5. Divides age into 2 categories: 0-15 and >15
    6. Scales numerical features (default StandardScaler)
    7. One-hot encodes categorical features (first level of each dropped)

  The dataframe passed in is left unmodified; a new dataframe is returned.

  NOTE: the imputers, the scaler and the encoder are all fitted on the
  dataframe passed in, so every call learns its own statistics/categories.

  Returns: Dataframe whose columns carry the prefixes generated by the
           transformers ("num__..." for scaled columns, "cat__..." for
           one-hot encoded columns)

  Args: - dataset: dataframe for transformation; must contain the columns
                   PassengerId, Ticket, Cabin, Name, Age, Fare, Pclass,
                   Sex, SibSp, Parch and Embarked
        - scaler: scaler for numerical features

  """
  # Drop identifier-like columns. drop() returns a new frame, so the
  # caller's dataframe is not mutated and the function can be re-applied
  dataset = dataset.drop(columns=["PassengerId", "Ticket", "Cabin", "Name"])

  # Define columns for different transformations
  numerical_columns = ["Age", "Fare"]
  categorical_columns = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]

  # Impute missing values first
  imputer_num = Pipeline([("imputer_num", SimpleImputer(strategy="mean"))])
  imputer_cat = Pipeline([("imputer_cat", SimpleImputer(strategy="most_frequent"))])
  imputer = ColumnTransformer(
      transformers=[
          ("imputer_num", imputer_num, numerical_columns),
          ("imputer_cat", imputer_cat, categorical_columns)
          ], remainder="passthrough")
  dataset = pd.DataFrame(imputer.fit_transform(dataset),
                         columns=imputer.get_feature_names_out())

  # The imputer output is a single array shared with the string columns, so
  # the numeric columns may come back as generic objects; restore a float
  # dtype on the columns used for arithmetic, binning and scaling below
  numeric_after_impute = ["imputer_num__Age", "imputer_num__Fare",
                          "imputer_cat__SibSp", "imputer_cat__Parch"]
  dataset[numeric_after_impute] = dataset[numeric_after_impute].astype("float64")

  # Combine Parch and SibSp into relatives
  dataset["Relatives"] = dataset["imputer_cat__Parch"] + dataset["imputer_cat__SibSp"]

  # Separate relatives into 3 categories: alone, 1-3, >3
  bin_edges = [-1, 0, 3, float("inf")]
  bin_labels = ["0 relatives", "1-3 relatives", ">3 relatives"]
  dataset["Relative_cat"] = pd.cut(dataset["Relatives"],
                                   bins=bin_edges,
                                   labels=bin_labels)

  # Separate age into 2 categories: <=15, >15
  bin_edges = [-1, 15, float("inf")]
  bin_labels = ["0-15", ">15"]
  dataset["Age_cat"] = pd.cut(dataset["imputer_num__Age"],
                              bins=bin_edges,
                              labels=bin_labels)

  # Create transformers for each type of transformation
  numerical_scaler = Pipeline([("scaler", scaler)])
  categorical_encoder = Pipeline([("onehot", OneHotEncoder(drop="first",
                                                           sparse_output=False)
  )])

  # Feature lists after imputation, including the newly engineered features.
  # SibSp and Parch were imputed as categories but are scaled as numbers here
  numerical_columns = ["imputer_num__Age", "imputer_num__Fare","Relatives",
                       "imputer_cat__SibSp", "imputer_cat__Parch"]
  categorical_columns = ["imputer_cat__Pclass", "imputer_cat__Sex",
                         "imputer_cat__Embarked",
                         "Relative_cat", "Age_cat"]

  # Create a ColumnTransformer to apply transformations to the respective columns
  preprocessor = ColumnTransformer(
      transformers=[
          ("num", numerical_scaler, numerical_columns),
          ("cat", categorical_encoder, categorical_columns)]
      ,remainder="passthrough")

  # Transform dataset
  dataset = preprocessor.fit_transform(dataset)
  dataset = pd.DataFrame(dataset,
                         columns=preprocessor.get_feature_names_out())
  return dataset

In [5]:
# Transform the training features.
# A copy is handed over so that X_train keeps all of its original columns
# and this cell gives the same result when it is run again
X_train_transformed = titanic_transformation(X_train.copy())
X_train_transformed

Unnamed: 0,num__imputer_num__Age,num__imputer_num__Fare,num__Relatives,num__imputer_cat__SibSp,num__imputer_cat__Parch,cat__imputer_cat__Pclass_2,cat__imputer_cat__Pclass_3,cat__imputer_cat__Sex_male,cat__imputer_cat__Embarked_Q,cat__imputer_cat__Embarked_S,cat__Relative_cat_1-3 relatives,cat__Relative_cat_>3 relatives,cat__Age_cat_>15
0,-0.592481,-0.502445,0.059160,0.432793,-0.473674,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
1,0.638789,0.786845,0.059160,0.432793,-0.473674,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,-0.284663,-0.488854,-0.560975,-0.474545,-0.473674,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.407926,0.420730,0.059160,0.432793,-0.473674,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,0.407926,-0.486337,-0.560975,-0.474545,-0.473674,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.207709,-0.386671,-0.560975,-0.474545,-0.473674,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
887,-0.823344,-0.044381,-0.560975,-0.474545,-0.473674,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
888,0.000000,-0.176263,1.299429,0.432793,2.008933,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
889,-0.284663,-0.044381,-0.560975,-0.474545,-0.473674,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Divide the dataset into subsets to screen features for different models

* Base: Fare (continuous), Pclass (categorical), Sex (categorical), Embarked (categorical)
* Subset 1: Age (continuous), Relatives (continuous)
* Subset 2: Age (categorical younger and older than 15), Relatives (3 classes)
* Subset 3: Parch (continuous), SibSp (continuous)

In [6]:
# Columns shared by every subset: fare, passenger class, sex and port of embarkation
base_set_columns = [
    "num__imputer_num__Fare",
    "cat__imputer_cat__Pclass_2",
    "cat__imputer_cat__Pclass_3",
    "cat__imputer_cat__Sex_male",
    "cat__imputer_cat__Embarked_Q",
    "cat__imputer_cat__Embarked_S",
]

# Subset 1: base + continuous age and continuous number of relatives
subset_1_columns = ["num__imputer_num__Age", *base_set_columns, "num__Relatives"]

# Subset 2: base + binned relatives and binned age
subset_2_columns = [
    *base_set_columns,
    "cat__Relative_cat_1-3 relatives",
    "cat__Relative_cat_>3 relatives",
    "cat__Age_cat_>15",
]

# Subset 3: base + separate SibSp and Parch counts
subset_3_columns = [
    *base_set_columns,
    "num__imputer_cat__SibSp",
    "num__imputer_cat__Parch",
]

# Slice the transformed training data into the four feature sets
base_set = X_train_transformed.loc[:, base_set_columns]
subset_1 = X_train_transformed.loc[:, subset_1_columns]
subset_2 = X_train_transformed.loc[:, subset_2_columns]
subset_3 = X_train_transformed.loc[:, subset_3_columns]

## Model screening

Screening of different models on different subsets of the data. Performance is measured by accuracy using 10-fold cross-validation.

* Support vector classifier
* KNN classifier
* Random forest
* Gradient boosting classifier
* Multilayer perceptron (2 layers, 100 neurons each)

In [7]:
def model_screening(X_train, y_train, models, dataset_name="", random_state=42):
    """
    Screen several classifiers with 10-fold cross-validation (accuracy).

    Every model is evaluated on a clone, so the estimator objects in
    `models` are neither fitted nor modified by this function.

    Parameters:
    - X_train: Training data
    - y_train: Training labels
    - models: A dictionary: {model names: model objects}
    - dataset_name: Prefix for the metric names in the returned dictionary
    - random_state: Seed given to every model that exposes a `random_state`
      parameter which is still unset (None); explicitly seeded models keep
      their own seed

    Returns:
    - A dictionary {model name: {"<dataset_name> Accuracy": mean accuracy,
      "<dataset_name> Stdev": standard deviation of the fold accuracies}},
      both rounded to 3 decimals
    """
    # Local import keeps the function self-contained
    from sklearn.base import clone

    results = {}

    for model_name, model in models.items():
        # Work on a clone so the caller's estimator is left untouched
        candidate = clone(model)

        # Seed stochastic estimators that were not seeded by the caller
        candidate_params = candidate.get_params(deep=False)
        if "random_state" in candidate_params and candidate_params["random_state"] is None:
            candidate.set_params(random_state=random_state)

        # cross_val_score fits the estimator on every fold itself, so no
        # separate fit on the full training set is needed
        accuracy = cross_val_score(estimator=candidate,
                                   X=X_train,
                                   y=y_train,
                                   cv=10,
                                   scoring="accuracy")

        # Store the evaluation metrics in the results dictionary
        results[model_name] = {
            dataset_name + " Accuracy": round(np.mean(accuracy), 3),
            dataset_name + " Stdev": round(np.std(accuracy), 3)
        }

    return results

In [8]:
# Seed for the stochastic estimators so that the screening table is reproducible
SCREENING_SEED = 42

# Create models (SVC and KNN take no seed here; the others are seeded explicitly)
models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(random_state=SCREENING_SEED),
          "gradient_boosted_clf": GradientBoostingClassifier(random_state=SCREENING_SEED),
          "mlp": MLPClassifier(hidden_layer_sizes=(100, 100),
                               batch_size=32,
                               random_state=SCREENING_SEED)
          }

# Feature sets to screen, keyed by the label used in the results table
screening_sets = {"Base set": base_set,
                  "Subset 1": subset_1,
                  "Subset 2": subset_2,
                  "Subset 3": subset_3}

# Cross-validate every model on every feature set
screening_results = {
    set_name: model_screening(X_train=feature_set,
                              y_train=y_train,
                              models=models,
                              dataset_name=set_name)
    for set_name, feature_set in screening_sets.items()
}

# Expose the per-set results under the names used by the following cells
results_base = screening_results["Base set"]
results_subset1 = screening_results["Subset 1"]
results_subset2 = screening_results["Subset 2"]
results_subset3 = screening_results["Subset 3"]



In [9]:
# Turn each per-subset result dictionary into a dataframe
# (one column per model, one row per metric)
results_base, results_subset1, results_subset2, results_subset3 = [
    pd.DataFrame(data=screening_result)
    for screening_result in (results_base, results_subset1,
                             results_subset2, results_subset3)
]

# Stack the four dataframes into a single overview table
results = pd.concat(objs=[results_base, results_subset1,
                          results_subset2, results_subset3])
results

Unnamed: 0,svc_clf,knn_clf,random_forest_clf,gradient_boosted_clf,mlp
Base set Accuracy,0.814,0.804,0.807,0.809,0.806
Base set Stdev,0.029,0.031,0.041,0.04,0.033
Subset 1 Accuracy,0.827,0.81,0.8,0.834,0.803
Subset 1 Stdev,0.039,0.045,0.051,0.044,0.035
Subset 2 Accuracy,0.822,0.823,0.819,0.834,0.831
Subset 2 Stdev,0.033,0.047,0.046,0.041,0.037
Subset 3 Accuracy,0.808,0.798,0.792,0.808,0.796
Subset 3 Stdev,0.025,0.044,0.049,0.047,0.038


### Conclusions: Model Screening

* In general, all models benefit from the inclusion of relatives and age data.
* The three best models are svc, gradient boosting classifier, and mlp on subset 2.



## Fine tuning

In [10]:
# Fine tuning uses the subset 2 features; features and labels are cast to 32-bit floats
X_train = subset_2.astype(np.float32)
y_train = y_train.astype(np.float32)

### SVC

In [11]:
# Search space sampled by the random search
param_distribs = dict(
    C=uniform(0, 10),
    kernel=["linear", "poly", "rbf", "sigmoid"],
    degree=randint(0, 20),
    class_weight=[None, "balanced"],
)

# Random search: 20 sampled candidates, 5-fold cross-validation, scored by accuracy
rnd_search_svc = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_distribs,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=42,
)

# Run the search on the fine-tuning data
rnd_search_svc.fit(X_train, y_train)

# Keep the best candidate as the tuned classifier
svc_clf = rnd_search_svc.best_estimator_

# Show the candidates ordered from best to worst mean test score
cv_results = pd.DataFrame(data=rnd_search_svc.cv_results_)
cv_results.sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
15,0.100539,0.030072,0.005275,0.000807,4.951769,,2,poly,"{'C': 4.951769101112702, 'class_weight': None,...",0.860335,0.820225,0.825843,0.808989,0.853933,0.833865,0.019863,1
0,0.018498,0.00526,0.004689,0.001384,3.745401,,14,rbf,"{'C': 3.745401188473625, 'class_weight': None,...",0.837989,0.814607,0.825843,0.803371,0.859551,0.828272,0.019419,2
1,0.035731,0.006832,0.002823,0.000151,7.79691,,6,poly,"{'C': 7.796910002727692, 'class_weight': None,...",0.815642,0.792135,0.853933,0.814607,0.837079,0.822679,0.021127,3
3,0.045649,0.020851,0.003792,0.001108,3.337086,balanced,2,poly,"{'C': 3.337086111390218, 'class_weight': 'bala...",0.798883,0.808989,0.825843,0.808989,0.859551,0.820451,0.021381,4
11,0.041492,0.009857,0.008852,0.002068,6.075449,,8,rbf,"{'C': 6.075448519014383, 'class_weight': None,...",0.815642,0.797753,0.825843,0.797753,0.859551,0.819308,0.022825,5
8,1.168697,0.745242,0.005699,0.000303,2.921446,balanced,14,poly,"{'C': 2.9214464853521815, 'class_weight': 'bal...",0.804469,0.741573,0.831461,0.842697,0.853933,0.814826,0.04014,6
19,0.105264,0.049195,0.011383,0.005915,4.497541,balanced,3,poly,"{'C': 4.497541333697656, 'class_weight': 'bala...",0.810056,0.747191,0.837079,0.803371,0.859551,0.811449,0.03786,7
4,0.070694,0.011507,0.006814,0.001403,0.564116,balanced,11,poly,"{'C': 0.5641157902710026, 'class_weight': 'bal...",0.782123,0.735955,0.825843,0.853933,0.842697,0.80811,0.043571,8
6,0.03928,0.01782,0.005709,0.001735,6.116532,,11,linear,"{'C': 6.116531604882809, 'class_weight': None,...",0.826816,0.808989,0.814607,0.780899,0.803371,0.806936,0.015158,9
18,3.185575,2.208428,0.008417,0.003459,1.848545,,17,poly,"{'C': 1.8485445552552704, 'class_weight': None...",0.798883,0.724719,0.825843,0.814607,0.837079,0.800226,0.03981,10


### Gradient boosting classifier

In [12]:
# Set up parameter distributions
param_distribs = {"learning_rate": uniform(0, 1),
                  "loss": ["log_loss", "exponential"],
                  "n_estimators": randint(10, 1000)}

# Set up random search.
# The base estimator is seeded as well: the search's own random_state only
# controls which candidates are sampled, not the estimator's randomness
rnd_search_gradient_boost = RandomizedSearchCV(GradientBoostingClassifier(random_state=42),
                                               param_distributions=param_distribs,
                                               n_iter=20,
                                               cv=5,
                                               scoring="accuracy",
                                               random_state=42)

# Fit random search to data set
rnd_search_gradient_boost.fit(X_train, y_train)

# Save tuned classifier
gradient_boost_clf = rnd_search_gradient_boost.best_estimator_

# Look at evaluation results, best mean test score first
cv_results = pd.DataFrame(rnd_search_gradient_boost.cv_results_)
cv_results.sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_loss,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.409212,0.034869,0.004969,0.00171,0.156019,log_loss,224,"{'learning_rate': 0.15601864044243652, 'loss':...",0.821229,0.797753,0.865169,0.848315,0.837079,0.833909,0.023077,1
12,0.806258,0.153135,0.004685,0.000695,0.090606,exponential,572,"{'learning_rate': 0.0906064345328208, 'loss': ...",0.804469,0.803371,0.865169,0.859551,0.825843,0.83168,0.02636,2
5,0.58297,0.03658,0.004279,0.000514,0.020584,exponential,353,"{'learning_rate': 0.020584494295802447, 'loss'...",0.804469,0.814607,0.876404,0.803371,0.859551,0.83168,0.030366,3
16,0.375707,0.006301,0.003267,0.000178,0.013265,log_loss,325,"{'learning_rate': 0.013264961159866528, 'loss'...",0.832402,0.803371,0.876404,0.797753,0.848315,0.831649,0.029086,4
11,0.31604,0.035887,0.004054,0.000229,0.973756,log_loss,199,"{'learning_rate': 0.9737555188414592, 'loss': ...",0.810056,0.797753,0.859551,0.853933,0.831461,0.83055,0.024018,5
1,0.075089,0.023586,0.004965,0.002312,0.731994,log_loss,30,"{'learning_rate': 0.7319939418114051, 'loss': ...",0.793296,0.820225,0.870787,0.825843,0.837079,0.829446,0.025188,6
14,1.0866,0.191223,0.005814,0.000945,0.466763,log_loss,828,"{'learning_rate': 0.4667628932479799, 'loss': ...",0.793296,0.820225,0.848315,0.848315,0.831461,0.828322,0.020502,7
7,0.194348,0.004206,0.00315,0.000659,0.181825,log_loss,170,"{'learning_rate': 0.18182496720710062, 'loss':...",0.798883,0.786517,0.865169,0.842697,0.848315,0.828316,0.03026,8
3,0.763999,0.056751,0.006047,0.001526,0.058084,exponential,382,"{'learning_rate': 0.05808361216819946, 'loss':...",0.821229,0.797753,0.865169,0.814607,0.842697,0.828291,0.023397,9
4,0.22453,0.062524,0.003167,0.000674,0.601115,exponential,140,"{'learning_rate': 0.6011150117432088, 'loss': ...",0.804469,0.792135,0.848315,0.865169,0.825843,0.827186,0.026965,10


### MLP

For the neural network model, the data set is split into a training set and a validation set to avoid time-consuming cross-validation.

In [13]:
# Hold out 20 % of the fine-tuning data as a validation set
X_mlp_train, X_mlp_val, y_mlp_train, y_mlp_val = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=42,
)

# Wrap both splits as tensorflow datasets, batched (32 samples) and prefetched
mlp_train_ds = (
    tf.data.Dataset.from_tensor_slices((X_mlp_train, y_mlp_train))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
mlp_val_ds = (
    tf.data.Dataset.from_tensor_slices((X_mlp_val, y_mlp_val))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [14]:
def model_builder(hp):
  """
  Build and compile the MLP for one hyperparameter trial.

  Architecture: two hidden Dense layers that share the same tuned width and
  activation (he_normal initialisation), followed by a single sigmoid unit.

  Tuned hyperparameters:
    - "units": width of both hidden layers, 8 to 512 in steps of 8
    - "activation": one of relu, elu, selu, gelu
    - "learning_rate": Adam learning rate, evenly spaced values in (0, 0.1]

  Args: - hp: hyperparameter object handed in by the tuner

  Returns: compiled keras.Sequential model (Adam optimizer, binary
           cross-entropy loss, accuracy metric)
  """
  model = keras.Sequential()

  # Tune the number of units in the hidden Dense layers
  # Choose an optimal value between 8-512
  hp_units = hp.Int("units", min_value=8, max_value=512, step=8)
  hp_activation = hp.Choice("activation",
                            values=["relu", "elu", "selu", "gelu"])
  model.add(keras.layers.Dense(units=hp_units, activation=hp_activation,
                               kernel_initializer="he_normal"))
  model.add(keras.layers.Dense(units=hp_units, activation=hp_activation,
                               kernel_initializer="he_normal"))
  model.add(keras.layers.Dense(1, activation="sigmoid"))

  # Tune the learning rate for the optimizer.
  # The first grid point (0.0) is skipped: with a learning rate of zero the
  # weights are never updated, so such a trial cannot learn anything
  learning_rate_grid = np.linspace(0, 0.1)[1:].tolist()
  hp_learning_rate = hp.Choice("learning_rate", values=learning_rate_grid)

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.BinaryCrossentropy(),
                metrics=["accuracy"])

  return model

In [15]:
# Set up Keras hypertuner using Hyperband for searching.
# A fixed seed makes the sampled hyperparameter candidates reproducible
tuner = kt.Hyperband(model_builder,
                     objective="val_accuracy",
                     max_epochs=20,
                     factor=3,
                     seed=42,
                     directory="mlp_finetuning",
                     project_name="fine_tuning6")

# Run the search on the batched training data, scoring on the validation data
# NOTE(review): Hyperband appears to schedule the epochs of each trial itself,
# so `epochs` here is presumably only an upper bound - confirm
tuner.search(mlp_train_ds,
             validation_data=mlp_val_ds,
             epochs=20)

Trial 30 Complete [00h 00m 06s]
val_accuracy: 0.826815664768219

Best val_accuracy So Far: 0.8379888534545898
Total elapsed time: 00h 01m 41s


In [16]:
# Take the single best hyperparameter set found by the tuner
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build a new mlp classifier from those hyperparameters
mlp_clf = tuner.hypermodel.build(best_hps)

# Callbacks: stop after 10 epochs without improvement (restoring the best
# weights) and reduce the learning rate after 5 epochs on a plateau
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)
lr_plateau_cb = keras.callbacks.ReduceLROnPlateau(patience=5)

# Train for at most 1000 epochs; early stopping ends the run sooner
history = mlp_clf.fit(
    mlp_train_ds,
    validation_data=mlp_val_ds,
    epochs=1000,
    callbacks=[early_stopping_cb, lr_plateau_cb],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

## Prediction of test data by each model and submission to Kaggle

In [17]:
# Transform test dataset (a copy is passed so test_dataset itself stays
# intact and this cell can be run again)
# NOTE(review): the transformation fits its imputers, scaler and encoder on
# the data it receives, i.e. on the test set here rather than on the
# training set - confirm this is intended
X_test = titanic_transformation(test_dataset.copy())

# Filter columns to match with subset 2 and cast to the dtype used for
# training; astype returns a new dataframe, so the result has to be assigned
X_test = X_test[subset_2_columns].astype("float32")

# Predict survival based on the test set
y_pred_svc = svc_clf.predict(X_test).astype("int64")
y_pred_boost = gradient_boost_clf.predict(X_test).astype("int64")

# The mlp outputs probabilities: round them to 0/1 and flatten to one value per passenger
y_pred_mlp = tf.round(mlp_clf.predict(X_test))
y_pred_mlp = tf.cast(tf.squeeze(y_pred_mlp), tf.int32)



In [18]:
# Passenger ids come from the untouched copy of the test data
passenger_ids = test_dataset_copy.loc[:, "PassengerId"]

# For each model: pair the ids with its predictions and write the
# submission csv (without the dataframe index)
svc_submission = pd.DataFrame(data={"PassengerId": passenger_ids,
                                    "Survived": y_pred_svc})
svc_submission.to_csv(path_or_buf="svc_submission.csv", index=False)

boost_submission = pd.DataFrame(data={"PassengerId": passenger_ids,
                                      "Survived": y_pred_boost})
boost_submission.to_csv(path_or_buf="boost_submission.csv", index=False)

mlp_submission = pd.DataFrame(data={"PassengerId": passenger_ids,
                                    "Survived": y_pred_mlp})
mlp_submission.to_csv(path_or_buf="mlp_submission.csv", index=False)

**Scores from Kaggle**:
* MLP: 0.77272
* GradientBoost: 0.76794
* SVC: 0.78229