<a href="https://colab.research.google.com/github/MStamirski/Spaceship-Titanic/blob/main/Model_CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [None]:
# Mount Google Drive into the Colab runtime, then make the project folder the
# working directory so relative paths used later in the notebook resolve there.
from google.colab import drive
drive.mount("/content/drive")
%cd "/content/drive/MyDrive/Colab_Notebooks/SDA_upskill/Spaceship"

In [None]:
!pip install catboost

# Model with parameters

In [None]:
import catboost
from sklearn.metrics import accuracy_score

In [None]:
# Baseline classifier: CatBoost defaults, with per-iteration training output silenced.
model = catboost.CatBoostClassifier(verbose=False)

In [None]:
def objective(trial):
    """Optuna objective: sample CatBoost hyperparameters and score them.

    A classifier is built from the values suggested by ``trial``, fitted on
    ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``. These four
    names are read from the notebook's global scope, so they must be assigned
    (see the ``get_subsets`` calls) before a study is started.

    Args:
        trial: Optuna trial object used to draw each hyperparameter.

    Returns:
        Accuracy of the fitted classifier on the validation split.
    """
    # Suggestions are drawn in this exact order; keep it stable so a seeded
    # sampler yields the same sequence of trials.
    sampled = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
        # NOTE(review): od_type / od_wait configure the overfitting detector, but
        # fit() below receives no eval_set — they may have no effect here; confirm.
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 50),
    }

    candidate = catboost.CatBoostClassifier(verbose=False, **sampled)
    candidate.fit(X_train, y_train)

    return accuracy_score(y_val, candidate.predict(X_val))

# Feature datasets

In [None]:
!pip install import-ipynb

In [None]:
import import_ipynb

In [None]:
from FeaturesEngineering import get_features, categories_one_hot_encoding, categories_target_encoding, categories_leave_one_out_encoding

In [None]:
# Build the base frame with the FeaturesEngineering helper, requesting the 'train' data.
df = get_features('train')

In [None]:
# CatBoost raises an error on the original target values, so recode each one
# to 1 (truthy) or 0 (falsy).
df['Transported'] = df['Transported'].apply(lambda flag: int(bool(flag)))

In [None]:
# Three alternative encodings of the same base frame, applied in this order:
# one-hot, target, leave-one-out.
encoders = (
    categories_one_hot_encoding,
    categories_target_encoding,
    categories_leave_one_out_encoding,
)
df_ohe, df_te, df_looe = [encode(df) for encode in encoders]

# Initial verification

In [None]:
from Optimization import verify_feature_dataset, get_subsets, model_optimization, test_tuned_model

In [None]:
# Score `model` on the one-hot-encoded frame; the result is later saved as the initial accuracy.
acc_ohe_ini = verify_feature_dataset(df_ohe, model)

Accuracy: 78.79601226993866%


In [None]:
# Score `model` on the target-encoded frame; the result is later saved as the initial accuracy.
acc_te_ini = verify_feature_dataset(df_te, model)

Accuracy: 78.75766871165644%


In [None]:
# Score `model` on the leave-one-out-encoded frame; the result is later saved as the initial accuracy.
acc_looe_ini = verify_feature_dataset(df_looe, model)

Accuracy: 77.30061349693251%


# Hyperparameter tuning

In [None]:
from optuna.samplers import TPESampler

# Fixed seed for the TPE sampler that is handed to the Optuna search.
TPE_SEED = 1
sampler = TPESampler(seed=TPE_SEED)

In [None]:
# These split names are read as globals by objective(), so they must be
# rebound before each study.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_ohe)

# Re-create the seeded sampler for this study: a sampler shared across studies
# keeps advancing its random state, so results would depend on which cells ran
# before (and how often). A fresh one makes this cell repeatable on its own.
sampler = TPESampler(seed=1)
params = model_optimization(20, objective, samp=sampler)

# Use a dedicated name so the baseline `model` used by the initial-verification
# cells is not silently replaced by a tuned one.
model_ohe_tuned = catboost.CatBoostClassifier(**params, verbose=False)
result_ohe_tuned = test_tuned_model(model_ohe_tuned, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 19:25:12,443][0m A new study created in memory with name: no-name-9fffdb36-4cf6-4417-adea-762f1d64f0eb[0m
[32m[I 2023-04-08 19:25:13,474][0m Trial 0 finished with value: 0.8080517613227893 and parameters: {'iterations': 475, 'learning_rate': 0.027583475549166746, 'depth': 4, 'l2_leaf_reg': 1.0551779964424746e-05, 'bootstrap_type': 'Bayesian', 'random_strength': 2.0931628460945333e-07, 'bagging_temperature': 0.923385947687978, 'od_type': 'Iter', 'od_wait': 26}. Best is trial 0 with value: 0.8080517613227893.[0m
[32m[I 2023-04-08 19:25:16,283][0m Trial 1 finished with value: 0.798705966930266 and parameters: {'iterations': 585, 'learning_rate': 0.006892694481137703, 'depth': 8, 'l2_leaf_reg': 1.10795595820296e-06, 'bootstrap_type': 'Bayesian', 'random_strength': 0.7999391045172093, 'bagging_temperature': 0.27387593197926163, 'od_type': 'IncToDec', 'od_wait': 32}. Best is trial 0 with value: 0.8080517613227893.[0m
[32m[I 2023-04-08 19:25:17,933][0m Trial 2 fin


Number of finished trials: 20
Best trial:
Value: 0.809489575844716
  Params: 
    iterations: 906
    learning_rate: 0.0192665824000967
    depth: 4
    l2_leaf_reg: 0.00020979075425099598
    bootstrap_type: Bayesian
    random_strength: 4.7913336454146464e-08
    bagging_temperature: 2.211499214434934
    od_type: Iter
    od_wait: 26

Accuracy after tuning: 78.66589994249568%

Classification report
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       861
           1       0.78      0.81      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739


Confusion matrix
[[659 202]
 [169 709]]

All features number: 90, Selected features number: 25
Selected_features:

HomePlanet_Earth
HomePlanet_Europa
CryoSleep_False
CryoSleep_True
Cabin_deck_C
Cabin_deck_E
Cabin_deck_F
Cabin_deck_G
Cabin_side_P
Cabin_side_S
Cabin_n

In [None]:
# These split names are read as globals by objective(), so they must be
# rebound before each study.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_te)

# Re-create the seeded sampler for this study: a sampler shared across studies
# keeps advancing its random state, so results would depend on which cells ran
# before (and how often). A fresh one makes this cell repeatable on its own.
sampler = TPESampler(seed=1)
params = model_optimization(20, objective, samp=sampler)

# Use a dedicated name so the baseline `model` used by the initial-verification
# cells is not silently replaced by a tuned one.
model_te_tuned = catboost.CatBoostClassifier(**params, verbose=False)
result_te_tuned = test_tuned_model(model_te_tuned, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 19:25:57,485][0m A new study created in memory with name: no-name-ed4ff92e-797f-4edb-a524-e1294ffdf0da[0m
[32m[I 2023-04-08 19:25:58,054][0m Trial 0 finished with value: 0.7936736161035226 and parameters: {'iterations': 203, 'learning_rate': 0.0792462129989754, 'depth': 7, 'l2_leaf_reg': 0.006079895907372734, 'bootstrap_type': 'Bayesian', 'random_strength': 4.712281485061178e-05, 'bagging_temperature': 2.3702698024302773, 'od_type': 'IncToDec', 'od_wait': 10}. Best is trial 0 with value: 0.7936736161035226.[0m
[32m[I 2023-04-08 19:25:59,665][0m Trial 1 finished with value: 0.7742631200575126 and parameters: {'iterations': 656, 'learning_rate': 0.004500800919479376, 'depth': 7, 'l2_leaf_reg': 7.234707764374606, 'bootstrap_type': 'Bayesian', 'random_strength': 1.6422067769778728e-05, 'bagging_temperature': 9.085351509197992, 'od_type': 'IncToDec', 'od_wait': 48}. Best is trial 0 with value: 0.7936736161035226.[0m
[32m[I 2023-04-08 19:26:01,127][0m Trial 2 fin


Number of finished trials: 20
Best trial:
Value: 0.8058950395398994
  Params: 
    iterations: 931
    learning_rate: 0.026488021317107736
    depth: 4
    l2_leaf_reg: 1.5805248860991158e-08
    bootstrap_type: Bayesian
    random_strength: 1.7214677137755465e-08
    bagging_temperature: 0.28306488020794607
    od_type: Iter
    od_wait: 32

Accuracy after tuning: 79.12593444508337%

Classification report
              precision    recall  f1-score   support

           0       0.80      0.77      0.79       861
           1       0.78      0.81      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739


Confusion matrix
[[665 196]
 [167 711]]

All features number: 14, Selected features number: 6
Selected_features:

HomePlanet_transformed
Cabin_deck_transformed
Cabin_num/100_transformed
RService_deciles_transformed
Spa_deciles_transformed
VRD_deciles_transforme

In [None]:
# These split names are read as globals by objective(), so they must be
# rebound before each study.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_looe)

# Re-create the seeded sampler for this study: a sampler shared across studies
# keeps advancing its random state, so results would depend on which cells ran
# before (and how often). A fresh one makes this cell repeatable on its own.
sampler = TPESampler(seed=1)
params = model_optimization(20, objective, samp=sampler)

# Use a dedicated name so the baseline `model` used by the initial-verification
# cells is not silently replaced by a tuned one.
model_looe_tuned = catboost.CatBoostClassifier(**params, verbose=False)
result_looe_tuned = test_tuned_model(model_looe_tuned, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 19:26:38,223][0m A new study created in memory with name: no-name-cc8824b8-9993-42fc-8a6f-46fb01320208[0m
[32m[I 2023-04-08 19:26:53,127][0m Trial 0 finished with value: 0.7555715312724659 and parameters: {'iterations': 667, 'learning_rate': 0.002632376589458986, 'depth': 9, 'l2_leaf_reg': 4.627695912122567e-08, 'bootstrap_type': 'Bayesian', 'random_strength': 2.2020941703793847e-06, 'bagging_temperature': 8.047545637433455, 'od_type': 'Iter', 'od_wait': 31}. Best is trial 0 with value: 0.7555715312724659.[0m
[32m[I 2023-04-08 19:26:55,692][0m Trial 1 finished with value: 0.7526959022286125 and parameters: {'iterations': 933, 'learning_rate': 0.0033619677334531975, 'depth': 4, 'l2_leaf_reg': 0.22421240280673352, 'bootstrap_type': 'Bayesian', 'random_strength': 0.08904350893401006, 'bagging_temperature': 9.07815852503524, 'od_type': 'IncToDec', 'od_wait': 19}. Best is trial 0 with value: 0.7555715312724659.[0m
[32m[I 2023-04-08 19:27:20,271][0m Trial 2 finis


Number of finished trials: 20
Best trial:
Value: 0.7886412652767792
  Params: 
    iterations: 241
    learning_rate: 0.04661404511513248
    depth: 6
    l2_leaf_reg: 1.0612016111464591e-06
    bootstrap_type: Bayesian
    random_strength: 0.00832252456244437
    bagging_temperature: 3.023929581123987
    od_type: Iter
    od_wait: 40

Accuracy after tuning: 76.36572742955721%

Classification report
              precision    recall  f1-score   support

           0       0.76      0.76      0.76       861
           1       0.76      0.77      0.77       878

    accuracy                           0.76      1739
   macro avg       0.76      0.76      0.76      1739
weighted avg       0.76      0.76      0.76      1739


Confusion matrix
[[653 208]
 [203 675]]

All features number: 14, Selected features number: 7
Selected_features:

HomePlanet_transformed
CryoSleep_transformed
Cabin_deck_transformed
Cabin_num/100_transformed
RService_deciles_transformed
Spa_deciles_transformed
VRD_de

# Save results

In [None]:
# One row per encoding, in the column order of the results CSV header:
# model name, encoding tag, initial accuracy, tuned accuracy, feature summary.
# NOTE(review): the totals 90 / 14 are hard-coded; they match the
# "All features number" lines printed by the tuning cells — confirm if features change.
data = [
    ['CatBoostClassifier', 'OHE', acc_ohe_ini, result_ohe_tuned[0], f"{result_ohe_tuned[1]!s} / 90"],
    ['CatBoostClassifier', 'TE', acc_te_ini, result_te_tuned[0], f"{result_te_tuned[1]!s} / 14"],
    ['CatBoostClassifier', 'LOOE', acc_looe_ini, result_looe_tuned[0], f"{result_looe_tuned[1]!s} / 14"],
]

In [None]:
import csv
from os.path import exists
# Results CSV; a relative path, so it is resolved against the current working directory.
resfile = 'spaceship_results.csv'

In [None]:
# Append this notebook's rows to the shared results file, creating it (with a
# header row) on first use.
header = ['Model', 'Categories_encoding', 'Initial_accuracy', 'Tuned_Accuracy', 'Important_Features']

# Decide before opening: append mode creates a missing file, so checking
# afterwards would always report that it exists.
write_header = not exists(resfile)

# newline='' hands line-ending control to the csv module in both the create and
# the append case (otherwise blank rows appear on Windows). The context manager
# closes the file even if a write raises.
with open(resfile, 'a', newline='') as f:
  writer = csv.writer(f)
  if write_header:
    writer.writerow(header)
  writer.writerows(data)