# Model building for survival prediction

In [50]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
sns.set_style("whitegrid")
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
%pip install keras_tuner



In [51]:
# Receiving datasets
test_dataset = pd.read_csv("test.csv")
train_dataset = pd.read_csv("train.csv")

## Data preparation

Split train dataset into train and validation set

In [52]:
from sklearn.model_selection import train_test_split
# Split train_dataset into train and validation set
train_dataset, val_dataset= train_test_split(train_dataset, random_state=42,
                                             train_size=0.8, shuffle=True)

# Seperate features from labels
X_train = train_dataset.drop("Survived", axis=1)
y_train = train_dataset["Survived"]
X_val = val_dataset.drop("Survived", axis=1)
y_val = val_dataset["Survived"]

**To do:**

1. Pclass: One-hot-encoding
2. Sex: One-hot-encoding
3. Fare: Scaling (StandardScaler because of high std)
4. Age: Maybe imputing and sorting into bins


In [53]:
def titanic_transformation(dataset:pd.DataFrame, include_columns:list=None):
  """
  Processes the dataset according to the following steps:
    1. Combines number of parents/children and siblings/spouses
       into new category "Relatives"
    2. Divides the relatives into 3 bins [0, 1-3, >3]
    3. Age is imputed with mean age values
    4. Imputed age and fare are standard scaled
    5. Pclass, Sex, Relative_cat are onehot-encoded

  Returns: Dataframe

  Args: - dataset: dataframe for transformation
        - include_columns: list of columns to include in output
          "pclass", "sex", "age", "fare", "relatives"

  """
  # Combine Parch and SibSp into relatives
  dataset["Relatives"] = dataset["Parch"] + dataset["SibSp"]
  # Seperate relatives into 3 categories: alone, 1-3, >3
  # Define the bin edges and labels
  bin_edges = [-1, 0, 3, float("inf")]
  bin_labels = ["0 relatives", "1-3 relatives", ">3 relatives"]
  # Create the new categorical column
  dataset["Relative_cat"] = pd.cut(dataset["Relatives"],
                                   bins=bin_edges,
                                   labels=bin_labels)
  # Define columns for different transformations
  numerical_columns = ["Age", "Fare", "Relatives"]
  categorical_columns = ["Pclass", "Sex", "Relative_cat"]
  # Create transformers for each type of transformation
  numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())])
  categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(drop="first"))])

  # Create a ColumnTransformer to apply transformations to the respective columns
  preprocessor = ColumnTransformer(
      transformers=[
        ("num", numerical_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns)
    ], remainder="passthrough")
  # Transform dataset
  dataset_transformed = preprocessor.fit_transform(dataset)
  dataset_transformed = pd.DataFrame(dataset_transformed,
                                     columns=preprocessor.get_feature_names_out())
  dataset_transformed = dataset_transformed[include_columns]
  return dataset_transformed

In [54]:
# Transform all datasets
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male",
                 "cat__Relative_cat_1-3 relatives",
                 "cat__Relative_cat_>3 relatives"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

# Check shapes of arrays
X_train_transformed.shape, X_val_transformed.shape, y_train.shape, y_val.shape

((712, 7), (179, 7), (712,), (179,))

## Model screening:

Simple screening of different models to select model for feature selection and fine tuning. Performance is measured using holdout-validation with binary accuracy on the validation dataset.

* Support vector classifier
* KNN classifier
* Random forest
* Gradient boosted tree
* Multi-layer perceptron

In [55]:
def model_screening(X_train, y_train, X_val, y_val, models, random_state=42):
    """
    Perform model screening

    Parameters:
    - X_train: Training data
    - y_train: Training labels
    - X_val : Validation data
    - y_val: Validation labels
    - models: A dictionary: {model names: model objects}

    Returns:
    - A dictionary containing model names and evaluation metrics
    """
    results = {}

    for model_name, model in models.items():
        # Train the model on the training set
        model.fit(X_train, y_train)

        # Make predictions on the validation set
        y_pred = model.predict(X_val)

        # Calculate evaluation metrics
        accuracy = sklearn.metrics.accuracy_score(y_val, y_pred)
        #roc_auc = sklearn.metrics.roc_auc_score(y_val, y_pred)

        # Store the evaluation metrics in the results dictionary
        results[model_name] = {
            "Accuracy": accuracy,
         #   "ROC-AUC": roc_auc
        }

    return results

In [56]:
# Create models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(),
          "gradient_boosted_clf": GradientBoostingClassifier()
          }
# Train sklearn models and save results
results = model_screening(X_train=X_train_transformed,
                          y_train=y_train,
                          X_val=X_val_transformed,
                          y_val=y_val,
                          models=models)

# Train a simple model with all categories
mlp_clf = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

mlp_clf.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

early = keras.callbacks.EarlyStopping(restore_best_weights=True)


mlp_clf.fit(x=X_train_transformed,
          y=y_train,
          batch_size=32,
          epochs=1000,
          validation_data=(X_val_transformed, y_val),
          callbacks=[early])

mlp_accuracy = mlp_clf.evaluate(X_val_transformed, y_val)[1]
results["mlp"] = mlp_accuracy
pd.DataFrame(results).transpose()

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000


Unnamed: 0,Accuracy
svc_clf,0.815642
knn_clf,0.810056
random_forest_clf,0.776536
gradient_boosted_clf,0.75419
mlp,0.776536


Screen for features:

I want to know if the categorical relatives features is benefitial for the models or not and compare it to the discret relatives feature.

Exclude it at all

In [57]:
# Transform all datasets exluding relatives at all
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(),
          "gradient_boosted_clf": GradientBoostingClassifier()
          }
# Train sklearn models and save results
results_1 = model_screening(X_train=X_train_transformed,
                          y_train=y_train,
                          X_val=X_val_transformed,
                          y_val=y_val,
                          models=models)

# Train a simple model with all categories
mlp_clf = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

mlp_clf.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

early = keras.callbacks.EarlyStopping(restore_best_weights=True)


mlp_clf.fit(x=X_train_transformed,
          y=y_train,
          batch_size=32,
          epochs=1000,
          validation_data=(X_val_transformed, y_val),
          callbacks=[early],
          verbose=0)

mlp_accuracy = mlp_clf.evaluate(X_val_transformed, y_val)[1]
results_1["mlp"] = mlp_accuracy



Now I try it with the continous relatives feature

In [59]:
# Transform all datasets exluding relatives at all
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male",
                 "num__Relatives"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(),
          "gradient_boosted_clf": GradientBoostingClassifier()
          }
# Train sklearn models and save results
results_2 = model_screening(X_train=X_train_transformed,
                          y_train=y_train,
                          X_val=X_val_transformed,
                          y_val=y_val,
                          models=models)

# Train a simple model with all categories
mlp_clf = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

mlp_clf.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

early = keras.callbacks.EarlyStopping(restore_best_weights=True)


mlp_clf.fit(x=X_train_transformed,
          y=y_train,
          batch_size=32,
          epochs=1000,
          validation_data=(X_val_transformed, y_val),
          callbacks=[early],
          verbose=0)

mlp_accuracy = mlp_clf.evaluate(X_val_transformed, y_val)[1]
results_2["mlp"] = mlp_accuracy



Compare the results

In [60]:
# Concatenate results

res0 = pd.DataFrame(results)
res1 = pd.DataFrame(results_1)
res2 = pd.DataFrame(results_2)

combined_res = pd.concat([res0, res1, res2], axis=0, ignore_index=True)
combined_res

Unnamed: 0,svc_clf,knn_clf,random_forest_clf,gradient_boosted_clf,mlp
0,0.815642,0.810056,0.776536,0.75419,0.776536
1,0.804469,0.832402,0.782123,0.759777,0.798883
2,0.804469,0.804469,0.776536,0.765363,0.815642


The relatives are definitely beneficial for SVC and the MLP but rather harm the KNN classifier. Furthermore, it seems that the SVC does not care about how the relatives are presented and the MLP prefers the continous version.

### Check if the Parch and SibSp values are better

In [61]:
# Transform all datasets using parch
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male",
                 "remainder__Parch"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(),
          "gradient_boosted_clf": GradientBoostingClassifier()
          }
# Train sklearn models and save results
results_3 = model_screening(X_train=X_train_transformed,
                          y_train=y_train,
                          X_val=X_val_transformed,
                          y_val=y_val,
                          models=models)

# Train a simple model with all categories
mlp_clf = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

mlp_clf.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

early = keras.callbacks.EarlyStopping(restore_best_weights=True)


mlp_clf.fit(x=X_train_transformed,
          y=y_train,
          batch_size=32,
          epochs=1000,
          validation_data=(X_val_transformed, y_val),
          callbacks=[early],
          verbose=0)

mlp_accuracy = mlp_clf.evaluate(X_val_transformed, y_val)[1]
results_3["mlp"] = mlp_accuracy



In [62]:
# Transform all datasets using sibsp
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male",
                 "remainder__SibSp"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(),
          "gradient_boosted_clf": GradientBoostingClassifier()
          }
# Train sklearn models and save results
results_4 = model_screening(X_train=X_train_transformed,
                          y_train=y_train,
                          X_val=X_val_transformed,
                          y_val=y_val,
                          models=models)

# Train a simple model with all categories
mlp_clf = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

mlp_clf.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

early = keras.callbacks.EarlyStopping(restore_best_weights=True)


mlp_clf.fit(x=X_train_transformed,
          y=y_train,
          batch_size=32,
          epochs=1000,
          validation_data=(X_val_transformed, y_val),
          callbacks=[early],
          verbose=0)

mlp_accuracy = mlp_clf.evaluate(X_val_transformed, y_val)[1]
results_4["mlp"] = mlp_accuracy



In [63]:
# Concatenate results

res0 = pd.DataFrame(results)
res1 = pd.DataFrame(results_1)
res2 = pd.DataFrame(results_2)
res3 = pd.DataFrame(results_3)
res4 = pd.DataFrame(results_4)

combined_res = pd.concat([res0, res1, res2, res3, res4], axis=0, ignore_index=True)
combined_res

Unnamed: 0,svc_clf,knn_clf,random_forest_clf,gradient_boosted_clf,mlp
0,0.815642,0.810056,0.776536,0.75419,0.776536
1,0.804469,0.832402,0.782123,0.759777,0.798883
2,0.804469,0.804469,0.776536,0.765363,0.815642
3,0.810056,0.837989,0.782123,0.765363,0.77095
4,0.804469,0.810056,0.787709,0.765363,0.782123


In [64]:
# Transform all datasets using both
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male",
                 "remainder__SibSp",
                 "remainder__Parch"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(),
          "gradient_boosted_clf": GradientBoostingClassifier()
          }
# Train sklearn models and save results
results_5 = model_screening(X_train=X_train_transformed,
                          y_train=y_train,
                          X_val=X_val_transformed,
                          y_val=y_val,
                          models=models)

# Train a simple model with all categories
mlp_clf = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

mlp_clf.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

early = keras.callbacks.EarlyStopping(restore_best_weights=True)


mlp_clf.fit(x=X_train_transformed,
          y=y_train,
          batch_size=32,
          epochs=1000,
          validation_data=(X_val_transformed, y_val),
          callbacks=[early],
          verbose=0)

mlp_accuracy = mlp_clf.evaluate(X_val_transformed, y_val)[1]
results_5["mlp"] = mlp_accuracy



In [65]:
# Concatenate results

res0 = pd.DataFrame(results)
res1 = pd.DataFrame(results_1)
res2 = pd.DataFrame(results_2)
res3 = pd.DataFrame(results_3)
res4 = pd.DataFrame(results_4)
res5 = pd.DataFrame(results_5)

combined_res = pd.concat([res0, res1, res2, res3, res4, res5], axis=0, ignore_index=True)
combined_res

Unnamed: 0,svc_clf,knn_clf,random_forest_clf,gradient_boosted_clf,mlp
0,0.815642,0.810056,0.776536,0.75419,0.776536
1,0.804469,0.832402,0.782123,0.759777,0.798883
2,0.804469,0.804469,0.776536,0.765363,0.815642
3,0.810056,0.837989,0.782123,0.765363,0.77095
4,0.804469,0.810056,0.787709,0.765363,0.782123
5,0.815642,0.815642,0.798883,0.743017,0.804469


It seems that there is no difference between using relatives and parch+sibsp

##" Conclusions:

Best models:
- MLP with relatives as continuous feature
- SVC with relatives as categorical or continous feature
- KNN with relatives as continous feature or ommitted

So far the continous feature of relatives was not scaled at all

In [66]:
# Transform all datasets using relatives scalled
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male",
                 "num__Relatives"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

models = {"svc_clf": SVC(),
          "knn_clf": KNeighborsClassifier(),
          "random_forest_clf": RandomForestClassifier(),
          "gradient_boosted_clf": GradientBoostingClassifier()
          }
# Train sklearn models and save results
results_6 = model_screening(X_train=X_train_transformed,
                          y_train=y_train,
                          X_val=X_val_transformed,
                          y_val=y_val,
                          models=models)

# Train a simple model with all categories
mlp_clf = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

mlp_clf.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

early = keras.callbacks.EarlyStopping(restore_best_weights=True)


mlp_clf.fit(x=X_train_transformed,
          y=y_train,
          batch_size=32,
          epochs=1000,
          validation_data=(X_val_transformed, y_val),
          callbacks=[early],
          verbose=0)

mlp_accuracy = mlp_clf.evaluate(X_val_transformed, y_val)[1]
results_6["mlp"] = mlp_accuracy



In [67]:
# Concatenate results

res0 = pd.DataFrame(results)
res1 = pd.DataFrame(results_1)
res2 = pd.DataFrame(results_2)
res3 = pd.DataFrame(results_3)
res4 = pd.DataFrame(results_4)
res5 = pd.DataFrame(results_5)

combined_res = pd.concat([res0, res1, res2, res3, res4, res5], axis=0, ignore_index=True)
combined_res

Unnamed: 0,svc_clf,knn_clf,random_forest_clf,gradient_boosted_clf,mlp
0,0.815642,0.810056,0.776536,0.75419,0.776536
1,0.804469,0.832402,0.782123,0.759777,0.798883
2,0.804469,0.804469,0.776536,0.765363,0.815642
3,0.810056,0.837989,0.782123,0.765363,0.77095
4,0.804469,0.810056,0.787709,0.765363,0.782123
5,0.815642,0.815642,0.798883,0.743017,0.804469


So scaling seems to be not a problem since the min-max range is pretty small.

**Conclusion** Now fine tuning with SVC, KNN and MLP using all features and the relative as continous feature.

## Fine tuning

In [68]:
# Prepare dataset for fine tuning
# Split train_dataset into train and validation set
train_dataset, val_dataset= train_test_split(train_dataset, random_state=42,
                                             train_size=0.8, shuffle=True)

# Seperate features from labels
X_train = train_dataset.drop("Survived", axis=1)
y_train = train_dataset["Survived"]
X_val = val_dataset.drop("Survived", axis=1)
y_val = val_dataset["Survived"]

# Transform all datasets
include_columns=["num__Age",
                 "num__Fare",
                 "cat__Pclass_2",
                 "cat__Pclass_3",
                 "cat__Sex_male",
                 "num__Relatives"]

X_train_transformed = titanic_transformation(X_train,
                                             include_columns=include_columns)
X_val_transformed = titanic_transformation(X_val,
                                           include_columns=include_columns)

# Transform dataframes into numpy arrays with float32 dtype
X_train_transformed = np.asarray(X_train_transformed).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_val_transformed = np.asarray(X_val_transformed).astype("float32")
y_val = np.asarray(y_val).astype("float32")

# Check shapes of arrays
X_train_transformed.shape, X_val_transformed.shape, y_train.shape, y_val.shape

((569, 6), (143, 6), (569,), (143,))

### 1. SVC

In [69]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Set up parameter distributions
param_distribs = {"C": randint(1, 10),
                  "kernel": ["linear", "poly", "rbf", "sigmoid"],
                  "degree": randint(0, 20),
                  "class_weight": [None, "balanced"]}

# Set up random search
rnd_search_svc = RandomizedSearchCV(SVC(),
                                    param_distributions=param_distribs,
                                    n_iter=50,
                                    cv=3,
                                    scoring="accuracy",
                                    random_state=42)

# Fit random search to data set
rnd_search_svc.fit(X_train_transformed, y_train)

# Save tuned classifier
svc_clf = rnd_search_svc.best_estimator_

# Look at evaluation results
cv_results = pd.DataFrame(rnd_search_svc.cv_results_)
cv_results.sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
28,0.003981,0.000163,0.002098,9.2e-05,2,,15,rbf,"{'C': 2, 'class_weight': None, 'degree': 15, '...",0.810526,0.826316,0.846561,0.827801,0.014748,1
2,0.004449,0.000698,0.002759,0.000941,3,,10,rbf,"{'C': 3, 'class_weight': None, 'degree': 10, '...",0.805263,0.821053,0.846561,0.824292,0.017015,2
33,0.004496,0.000421,0.001964,6.9e-05,7,,13,rbf,"{'C': 7, 'class_weight': None, 'degree': 13, '...",0.810526,0.815789,0.846561,0.824292,0.015892,2
10,0.004658,0.000578,0.002044,1.8e-05,7,,8,rbf,"{'C': 7, 'class_weight': None, 'degree': 8, 'k...",0.810526,0.815789,0.846561,0.824292,0.015892,2
9,0.004731,0.000436,0.002122,0.000217,9,,4,rbf,"{'C': 9, 'class_weight': None, 'degree': 4, 'k...",0.805263,0.815789,0.846561,0.822538,0.017522,5
23,0.004491,0.000457,0.00208,9.4e-05,8,,10,rbf,"{'C': 8, 'class_weight': None, 'degree': 10, '...",0.805263,0.815789,0.846561,0.822538,0.017522,5
38,0.006859,0.001311,0.003513,0.000818,4,,6,rbf,"{'C': 4, 'class_weight': None, 'degree': 6, 'k...",0.810526,0.805263,0.846561,0.820783,0.018354,7
24,0.004661,0.000252,0.002404,0.000128,1,balanced,2,rbf,"{'C': 1, 'class_weight': 'balanced', 'degree':...",0.794737,0.805263,0.84127,0.813757,0.019924,8
25,0.004298,9.7e-05,0.001657,9.7e-05,1,,4,poly,"{'C': 1, 'class_weight': None, 'degree': 4, 'k...",0.821053,0.8,0.814815,0.811956,0.008829,9
32,0.005374,0.00052,0.001481,3.4e-05,3,,4,poly,"{'C': 3, 'class_weight': None, 'degree': 4, 'k...",0.821053,0.794737,0.804233,0.806674,0.010881,10


### KNN

In [70]:
# Set up parameter distributions
param_distribs = {"n_neighbors": randint(1, 20),
                  "weights": ["uniform", "distance"]}

# Set up random search
rnd_search_knn = RandomizedSearchCV(KNeighborsClassifier(),
                                    param_distributions=param_distribs,
                                    n_iter=50,
                                    cv=3,
                                    scoring="accuracy",
                                    random_state=42)

# Fit random search to data set
rnd_search_knn.fit(X_train_transformed, y_train)

# Save tuned classifier
knn_clf = rnd_search_knn.best_estimator_

# Look at evaluation results
cv_results = pd.DataFrame(rnd_search_knn.cv_results_)
cv_results.sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
43,0.000741,5.4e-05,0.011849,0.001762,13,uniform,"{'n_neighbors': 13, 'weights': 'uniform'}",0.821053,0.826316,0.830688,0.826019,0.003939,1
49,0.000718,4.4e-05,0.010212,0.000114,11,uniform,"{'n_neighbors': 11, 'weights': 'uniform'}",0.815789,0.831579,0.825397,0.824255,0.006496,2
5,0.00074,7e-06,0.010106,0.000232,11,uniform,"{'n_neighbors': 11, 'weights': 'uniform'}",0.815789,0.831579,0.825397,0.824255,0.006496,2
12,0.000738,1.1e-05,0.010265,0.000588,12,uniform,"{'n_neighbors': 12, 'weights': 'uniform'}",0.805263,0.826316,0.835979,0.822519,0.012824,4
1,0.000727,8e-06,0.010255,9.4e-05,15,uniform,"{'n_neighbors': 15, 'weights': 'uniform'}",0.8,0.836842,0.830688,0.82251,0.016114,5
44,0.000847,0.000133,0.011174,0.000658,15,uniform,"{'n_neighbors': 15, 'weights': 'uniform'}",0.8,0.836842,0.830688,0.82251,0.016114,5
40,0.000658,1.8e-05,0.009719,0.000338,8,uniform,"{'n_neighbors': 8, 'weights': 'uniform'}",0.805263,0.831579,0.820106,0.818983,0.010773,7
2,0.000724,3.6e-05,0.010666,0.000871,8,uniform,"{'n_neighbors': 8, 'weights': 'uniform'}",0.805263,0.831579,0.820106,0.818983,0.010773,7
27,0.000689,8e-06,0.009512,0.000258,8,uniform,"{'n_neighbors': 8, 'weights': 'uniform'}",0.805263,0.831579,0.820106,0.818983,0.010773,7
15,0.000828,0.000198,0.012737,0.002781,16,uniform,"{'n_neighbors': 16, 'weights': 'uniform'}",0.794737,0.831579,0.825397,0.817238,0.016109,10


### MLP

In [72]:
import keras_tuner as kt

def model_builder(hp):
  model = keras.Sequential()

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  hp_units = hp.Int("units", min_value=8, max_value=512, step=8)
  hp_activation = hp.Choice("activation", values=["relu", "tanh", "elu", "selu", "gelu"])
  model.add(keras.layers.Dense(units=hp_units, activation=hp_activation, kernel_initializer="he_normal"))
  model.add(keras.layers.Dropout(0.3))
  model.add(keras.layers.Dense(units=hp_units, activation=hp_activation))
  model.add(keras.layers.Dense(1, activation="sigmoid"))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice("learning_rate", values=list(np.linspace(0.1, 0.001, 20)))

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.BinaryCrossentropy(),
                metrics=["accuracy"])

  return model

In [73]:
tuner = kt.Hyperband(model_builder,
                     objective="val_accuracy",
                     max_epochs=10,
                     factor=3,
                     directory="test3")

tuner.search(X_train_transformed,
             y_train,
             epochs=20,
             validation_data=(X_val_transformed, y_val),
             callbacks=[early])

Trial 30 Complete [00h 00m 02s]
val_accuracy: 0.8181818127632141

Best val_accuracy So Far: 0.8461538553237915
Total elapsed time: 00h 01m 22s
