<a href="https://colab.research.google.com/github/MStamirski/Spaceship-Titanic/blob/main/Model_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount("/content/drive")
# Make the project folder the working directory: the results CSV written at the
# end of the notebook uses a relative path, and the FeaturesEngineering /
# Optimization notebooks are imported by bare name (presumably resolved from
# this folder -- confirm).
%cd "/content/drive/MyDrive/Colab_Notebooks/SDA_upskill/Spaceship"

# Model with parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Baseline forest with scikit-learn's default hyperparameters. The fixed seed
# removes the forest's own randomness from the "initial" accuracies, so they
# can be compared with the tuned results instead of differing on every run.
model = RandomForestClassifier(random_state=42)

In [None]:
def objective(trial):
  """Optuna objective: validation accuracy of a random forest for one trial.

  Samples four hyperparameters from ``trial``, fits a forest on the
  notebook-level ``X_train`` / ``y_train`` and scores it on ``X_val`` /
  ``y_val``. Those globals are looked up when the function is called, so they
  must be assigned (see the ``get_subsets`` calls in the tuning cells) before
  the study runs.

  Args:
    trial: object providing ``suggest_int`` and ``suggest_float``
      (an Optuna trial).

  Returns:
    The value of ``accuracy_score`` on the validation subset.
  """

  # Depth is sampled on a log scale, so shallow trees are explored more densely.
  rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
  rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000)
  rf_min_samples_leaf = trial.suggest_int("rf_min_samples_leaf", 1, 50)
  # A float max_features is interpreted by scikit-learn as a fraction of the features.
  rf_max_features = trial.suggest_float("rf_max_features", 0.1, 0.7)

  optuna_model = RandomForestClassifier(
      max_depth = rf_max_depth,
      n_estimators = rf_n_estimators,
      min_samples_leaf = rf_min_samples_leaf,
      max_features = rf_max_features,
      # Fixed seed: without it two trials with identical hyperparameters can
      # score differently, and the study ends up ranking seed noise.
      random_state = 42
    )
  optuna_model.fit(X_train, y_train)

  y_pred = optuna_model.predict(X_val)
  accuracy = accuracy_score(y_val, y_pred)

  return accuracy

# Feature datasets

In [None]:
!pip install import-ipynb

In [None]:
import import_ipynb

In [None]:
from FeaturesEngineering import get_features, categories_one_hot_encoding, categories_target_encoding, categories_leave_one_out_encoding

In [None]:
# Load the features for the 'train' split, then derive one frame per
# categorical-encoding helper. The helpers run in the listed order:
# one-hot, target, leave-one-out.
df = get_features('train')
df_ohe, df_te, df_looe = (
    encode(df)
    for encode in (
        categories_one_hot_encoding,
        categories_target_encoding,
        categories_leave_one_out_encoding,
    )
)

# Initial verification

In [None]:
from Optimization import verify_feature_dataset, get_subsets, model_optimization, test_tuned_model

In [None]:
# Baseline check of the untuned model on the one-hot-encoded features. The call
# prints an "Accuracy: ...%" line; the returned value is later written to the
# 'Initial_accuracy' column of the results CSV.
acc_ohe_ini = verify_feature_dataset(df_ohe, model)

Accuracy: 77.79907975460122%


In [None]:
# Baseline check of the untuned model on the target-encoded features. The call
# prints an "Accuracy: ...%" line; the returned value is later written to the
# 'Initial_accuracy' column of the results CSV.
acc_te_ini = verify_feature_dataset(df_te, model)

Accuracy: 77.99079754601227%


In [None]:
# Baseline check of the untuned model on the leave-one-out-encoded features.
# The call prints an "Accuracy: ...%" line; the returned value is later written
# to the 'Initial_accuracy' column of the results CSV.
acc_looe_ini = verify_feature_dataset(df_looe, model)

Accuracy: 77.60736196319019%


# Parameter tuning

In [None]:
# Split the one-hot-encoded data. X_train / X_val / y_train / y_val must stay
# notebook-level names: objective() reads them while the study runs.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_ohe)
# Hyperparameter search driven by objective(); 20 is passed as the first
# argument and the log reports 20 finished trials.
params = model_optimization(20, objective)
# Rebuild the forest from the best parameters. The fixed seed keeps the
# forest's own randomness out of the reported test accuracy, so re-running
# with the same parameters gives the same model.
model = RandomForestClassifier(
    max_depth = params['rf_max_depth'],
    n_estimators = params['rf_n_estimators'],
    min_samples_leaf = params['rf_min_samples_leaf'],
    max_features = params['rf_max_features'],
    random_state = 42,
)
# Elements [0] and [1] of the result are saved later as 'Tuned_Accuracy' and
# the selected-feature count in the results CSV.
result_ohe_tuned = test_tuned_model(model, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 18:54:41,290][0m A new study created in memory with name: no-name-eb000096-6fe9-4191-8196-a6455cbf5df6[0m
[32m[I 2023-04-08 18:54:45,183][0m Trial 0 finished with value: 0.7260963335729691 and parameters: {'rf_max_depth': 2, 'rf_n_estimators': 630, 'rf_min_samples_leaf': 18, 'rf_max_features': 0.44137939248081737}. Best is trial 0 with value: 0.7260963335729691.[0m
[32m[I 2023-04-08 18:54:52,991][0m Trial 1 finished with value: 0.7246585190510424 and parameters: {'rf_max_depth': 2, 'rf_n_estimators': 859, 'rf_min_samples_leaf': 14, 'rf_max_features': 0.4603352926936495}. Best is trial 0 with value: 0.7260963335729691.[0m
[32m[I 2023-04-08 18:55:02,365][0m Trial 2 finished with value: 0.7728253055355859 and parameters: {'rf_max_depth': 5, 'rf_n_estimators': 937, 'rf_min_samples_leaf': 38, 'rf_max_features': 0.5545034243584891}. Best is trial 2 with value: 0.7728253055355859.[0m
[32m[I 2023-04-08 18:55:11,227][0m Trial 3 finished with value: 0.793673616103


Number of finished trials: 20
Best trial:
Value: 0.8037383177570093
  Params: 
    rf_max_depth: 20
    rf_n_estimators: 185
    rf_min_samples_leaf: 3
    rf_max_features: 0.6921844980708455

Accuracy after tuning: 77.80333525014376%

Classification report
              precision    recall  f1-score   support

       False       0.77      0.78      0.78       861
        True       0.78      0.77      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739


Confusion matrix
[[674 187]
 [199 679]]

All features number: 90, Selected features number: 22
Selected_features:

HomePlanet_Earth
HomePlanet_Europa
CryoSleep_False
CryoSleep_True
Destination_TRAPPIST-1e
Cabin_deck_E
Cabin_deck_F
Cabin_deck_G
Cabin_side_P
Cabin_side_S
Cabin_persons_1
RService_deciles_1
RService_deciles_9
RService_deciles_10
FCourt_deciles_10
ShMall_deciles_10
Spa_deciles_1
Spa_deciles_9
Spa_d

In [None]:
# Split the target-encoded data. X_train / X_val / y_train / y_val must stay
# notebook-level names: objective() reads them while the study runs.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_te)
# Hyperparameter search driven by objective(); 20 is passed as the first
# argument and the log reports 20 finished trials.
params = model_optimization(20, objective)
# Rebuild the forest from the best parameters. The fixed seed keeps the
# forest's own randomness out of the reported test accuracy, so re-running
# with the same parameters gives the same model.
model = RandomForestClassifier(
    max_depth = params['rf_max_depth'],
    n_estimators = params['rf_n_estimators'],
    min_samples_leaf = params['rf_min_samples_leaf'],
    max_features = params['rf_max_features'],
    random_state = 42,
)
# Elements [0] and [1] of the result are saved later as 'Tuned_Accuracy' and
# the selected-feature count in the results CSV.
result_te_tuned = test_tuned_model(model, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 18:56:41,708][0m A new study created in memory with name: no-name-42a5099e-04be-4465-b978-3d62d3ec7eba[0m
[32m[I 2023-04-08 18:56:45,344][0m Trial 0 finished with value: 0.7843278217109992 and parameters: {'rf_max_depth': 5, 'rf_n_estimators': 754, 'rf_min_samples_leaf': 16, 'rf_max_features': 0.4493803521918105}. Best is trial 0 with value: 0.7843278217109992.[0m
[32m[I 2023-04-08 18:56:47,380][0m Trial 1 finished with value: 0.7821710999281093 and parameters: {'rf_max_depth': 9, 'rf_n_estimators': 451, 'rf_min_samples_leaf': 44, 'rf_max_features': 0.5648373186664888}. Best is trial 0 with value: 0.7843278217109992.[0m
[32m[I 2023-04-08 18:56:50,194][0m Trial 2 finished with value: 0.7469446441409058 and parameters: {'rf_max_depth': 3, 'rf_n_estimators': 857, 'rf_min_samples_leaf': 17, 'rf_max_features': 0.5666431868424594}. Best is trial 0 with value: 0.7843278217109992.[0m
[32m[I 2023-04-08 18:56:53,222][0m Trial 3 finished with value: 0.7886412652767


Number of finished trials: 20
Best trial:
Value: 0.798705966930266
  Params: 
    rf_max_depth: 12
    rf_n_estimators: 289
    rf_min_samples_leaf: 11
    rf_max_features: 0.5328623603813379

Accuracy after tuning: 77.68832662449684%

Classification report
              precision    recall  f1-score   support

       False       0.79      0.75      0.77       861
        True       0.77      0.80      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739


Confusion matrix
[[645 216]
 [172 706]]

All features number: 14, Selected features number: 6
Selected_features:

HomePlanet_transformed
CryoSleep_transformed
Cabin_num/100_transformed
RService_deciles_transformed
Spa_deciles_transformed
VRD_deciles_transformed


In [None]:
# Split the leave-one-out-encoded data. X_train / X_val / y_train / y_val must
# stay notebook-level names: objective() reads them while the study runs.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(df_looe)
# Hyperparameter search driven by objective(); 20 is passed as the first
# argument and the log reports 20 finished trials.
params = model_optimization(20, objective)
# Rebuild the forest from the best parameters. The fixed seed keeps the
# forest's own randomness out of the reported test accuracy, so re-running
# with the same parameters gives the same model.
model = RandomForestClassifier(
    max_depth = params['rf_max_depth'],
    n_estimators = params['rf_n_estimators'],
    min_samples_leaf = params['rf_min_samples_leaf'],
    max_features = params['rf_max_features'],
    random_state = 42,
)
# Elements [0] and [1] of the result are saved later as 'Tuned_Accuracy' and
# the selected-feature count in the results CSV.
result_looe_tuned = test_tuned_model(model, X_train, y_train, X_test, y_test)

[32m[I 2023-04-08 18:57:29,643][0m A new study created in memory with name: no-name-90a4cd1d-08f4-4f46-ae6f-9821f2620c6a[0m
[32m[I 2023-04-08 18:57:33,428][0m Trial 0 finished with value: 0.7606038820992091 and parameters: {'rf_max_depth': 8, 'rf_n_estimators': 758, 'rf_min_samples_leaf': 35, 'rf_max_features': 0.13634492713555754}. Best is trial 0 with value: 0.7606038820992091.[0m
[32m[I 2023-04-08 18:57:38,227][0m Trial 1 finished with value: 0.7821710999281093 and parameters: {'rf_max_depth': 25, 'rf_n_estimators': 277, 'rf_min_samples_leaf': 41, 'rf_max_features': 0.48126744631041196}. Best is trial 1 with value: 0.7821710999281093.[0m
[32m[I 2023-04-08 18:57:46,805][0m Trial 2 finished with value: 0.7828900071890726 and parameters: {'rf_max_depth': 5, 'rf_n_estimators': 490, 'rf_min_samples_leaf': 6, 'rf_max_features': 0.48066951436709693}. Best is trial 2 with value: 0.7828900071890726.[0m
[32m[I 2023-04-08 18:58:19,472][0m Trial 3 finished with value: 0.7893601725


Number of finished trials: 20
Best trial:
Value: 0.7893601725377426
  Params: 
    rf_max_depth: 25
    rf_n_estimators: 1000
    rf_min_samples_leaf: 8
    rf_max_features: 0.6369771273823448

Accuracy after tuning: 76.88326624496837%

Classification report
              precision    recall  f1-score   support

       False       0.78      0.75      0.76       861
        True       0.76      0.79      0.77       878

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739


Confusion matrix
[[645 216]
 [186 692]]

All features number: 14, Selected features number: 5
Selected_features:

HomePlanet_transformed
CryoSleep_transformed
RService_deciles_transformed
Spa_deciles_transformed
VRD_deciles_transformed


# Save results

In [None]:
# One CSV row per categorical encoding, in the column order used by the results
# file: model name, encoding tag, initial accuracy, tuned accuracy, and a
# "<count> / <total>" string. The count is presumably the number of selected
# features; the totals are the hard-coded feature counts printed during tuning.
data = [
  ['RandomForestClassifier', encoding, acc_ini, tuned[0], str(tuned[1]) + " / " + total]
  for encoding, acc_ini, tuned, total in (
    ('OHE', acc_ohe_ini, result_ohe_tuned, "90"),
    ('TE', acc_te_ini, result_te_tuned, "14"),
    ('LOOE', acc_looe_ini, result_looe_tuned, "14"),
  )
]

In [None]:
import csv
from os.path import exists
# Results file that rows are appended to when it already exists. The path is
# relative, so it is resolved against the current working directory.
resfile = 'spaceship_results.csv'

In [None]:
# Append this notebook's rows to the results file, writing the header row only
# when the file is being created. Append mode creates a missing file, so a
# single open() covers both cases. newline='' is what the csv module expects
# for files it writes (the previous append branch omitted it, which produces
# blank rows between records on Windows). The with-block closes the file even
# if a write raises.
header = ['Model', 'Categories_encoding', 'Initial_accuracy', 'Tuned_Accuracy', 'Important_Features']
write_header = not exists(resfile)

with open(resfile, 'a', newline='') as f:
  writer = csv.writer(f)
  if write_header:
    writer.writerow(header)
  writer.writerows(data)