# Kaggle - Spaceship Titanic
With RANDOM BOOSTED FORESTS

In [1]:
#import libraries
from http.client import ImproperConnectionState
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn 
from sklearn.model_selection import train_test_split
import time
import graphviz
import os
import pickle

In [7]:
#load data
version_data = "v2_0"
path_data = f'../data/preprocessed/{version_data}/' 
(x_test, x_train, y_test, y_train, x_predict) = [np.load(f'{path_data}{f}.npy', allow_pickle=True) 
                                                for f in ('X_test', 'X_train', 'y_test', 'y_train',  'X_predict')]

print(f'x_train.shape: {x_train.shape}')
print(f'x_test.shape: {x_test.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'y_test.shape: {y_test.shape}')
print(f'x_predict.shape: {x_predict.shape}')

x_train.shape: (6954, 10)
x_test.shape: (1739, 10)
y_train.shape: (6954,)
y_test.shape: (1739,)
x_predict.shape: (4277, 10)


### XGBoost
--> random boosted forests

Train model

In [10]:
#train model
xgbo = xgb.XGBClassifier(
    max_depth=5,learning_rate=0.0001,n_estimators=8000,n_jobs=500, tree_method="gpu_hist", enable_categorical=True, 
    objective='binary:logistic', eval_metric='logloss', verbosity=1, random_state=42) 

start_time = time.time()  #track the model development time
xgbo.fit(x_train,y_train)
end_time = time.time()
duration = end_time - start_time

print("Training duration: %s sec" % (duration)) 

#predict
y_predict = xgbo.predict(x_test)
categorical_accuracy = np.sum(y_predict == y_test) / len(y_test)
print("Categorical accuracy: %s" % categorical_accuracy)

Training duration: 8.859705924987793 sec
Categorical accuracy: 0.7441058079355952


SAVE MODEL

In [38]:
from statistics import mode


model_id = 'xgboost__'+ version_data + '__01'
print(f"Saving model {model_id} ...")

#save model
if not os.path.exists(f'../models/{model_id}'):
        os.mkdir(f'../models/{model_id}')
pickle.dump(xgbo, open(f'../models/{model_id}/model.pkl', 'wb'))

#save results
timestamp = time.strftime("%Y_%m_%d at %H_%M")
print(timestamp)
print(f'categorical_accuracy: {categorical_accuracy}')

overview = pd.read_csv('../models/prediction_overview.csv')
new_row = {'model_id': model_id, 'timestamp': timestamp, 'categorical_accuracy': categorical_accuracy}
overview = pd.concat([overview, pd.DataFrame(new_row, index=[0])])
overview = overview[{'model_id', 'timestamp', 'categorical_accuracy'}]
overview.drop_duplicates(inplace=True)
print(overview)
overview.to_csv('../models/prediction_overview.csv')

Saving model xgboost__v1_4__01 ...
2022_09_22 at 05_02
categorical_accuracy: 0.7878090856814262
   categorical_accuracy           model_id            timestamp
0              0.794710  xgboost__v1_0__01  2022_09_21 at 22_46
1              0.774583       NN__v1_2__01  2022_09_21 at 22_48
2              0.779758       NN__v1_2__02  2022_09_21 at 23_02
3              0.793560       NN__v1_2__03  2022_09_21 at 23_38
4              0.782059  xgboost__v1_3__01  2022_09_22 at 04_09
5              0.780334  xgboost__v1_3__01  2022_09_22 at 04_12
6              0.787234  xgboost__v1_3__01  2022_09_22 at 04_19
7              0.741231  xgboost__v1_4__01  2022_09_22 at 04_58
8              0.729155       NN__v1_4__05  2022_09_22 at 05_00
0              0.787809  xgboost__v1_4__01  2022_09_22 at 05_02


  overview = overview[{'model_id', 'timestamp', 'categorical_accuracy'}]


In [39]:
############# use to create new version and DELETE OLD VERSIONS #############

#overview = pd.DataFrame(columns=['model_id', 'timestamp', 'categorical_accuracy'])
#overview.to_csv('../models/prediction_overview.csv')