## Imports

In [1]:
!pip -q install optuna

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.3/224.3 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

import seaborn as sns

import lightgbm as lgb
import xgboost as xgb
import optuna

## 1. Data conversion

### 1.1 EDA enter

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

In [7]:
!kaggle competitions download -c spaceship-titanic

Downloading spaceship-titanic.zip to /content
  0% 0.00/299k [00:00<?, ?B/s]
100% 299k/299k [00:00<00:00, 108MB/s]


In [8]:
!unzip 'spaceship-titanic.zip'

Archive:  spaceship-titanic.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [9]:
train = pd.read_csv('train.csv').drop(['PassengerId'], axis=1)
test = pd.read_csv('test.csv').drop(['PassengerId'], axis=1)
sample = pd.read_csv('sample_submission.csv')

train

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [10]:
#split Cabin info

train['Cabin'].str.split('/')

0          [B, 0, P]
1          [F, 0, S]
2          [A, 0, S]
3          [A, 0, S]
4          [F, 1, S]
            ...     
8688      [A, 98, P]
8689    [G, 1499, S]
8690    [G, 1500, S]
8691     [E, 608, S]
8692     [E, 608, S]
Name: Cabin, Length: 8693, dtype: object

In [11]:
train[['Deck', 'Num', 'Side']] = train['Cabin'].str.split('/', expand=True)
test[['Deck', 'Num', 'Side']] = test['Cabin'].str.split('/', expand=True)
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [12]:
# replace T on G (becaues A-B-C-D-E-F-G-...-T)

for unique in train['Deck'].unique():
    print('%s: %s' % (unique, train['Deck'][train['Deck'] == unique].count()))

B: 779
F: 2794
A: 256
G: 2559
nan: 0
E: 876
D: 478
C: 747
T: 5


### 1.2 Outliers detection / Fill NaN / Encode

In [13]:
numCol = train.keys()[train.dtypes == 'float']
train[numCol].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


#### 1.2.1 Drop outliers

In [14]:
train[numCol].quantile(0.99)

Age               65.00
RoomService     3096.23
FoodCourt       8033.31
ShoppingMall    2333.44
Spa             5390.10
VRDeck          5646.68
Name: 0.99, dtype: float64

In [15]:
quantile_1 = train[numCol].quantile(0.05)
quantile_3 = train[numCol].quantile(0.95)

#Interquantile Range
IQR = quantile_3 - quantile_1
lower_cond = quantile_1 - 1.5 * IQR
upper_cond = quantile_3 + 1.5 * IQR

sample_train = train[numCol].copy()

sample_train = train[numCol][~((train[numCol] < lower_cond) | (train[numCol] > upper_cond))]
sample_train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8431.0,8392.0,8398.0,8355.0,8347.0
mean,28.82793,179.821136,310.464013,132.040367,187.672292,176.371511
std,14.489021,448.550361,883.751182,337.288062,517.950931,486.153593
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,38.5,53.0,21.0,43.0,30.0
max,79.0,3148.0,6819.0,2316.0,4016.0,3831.0


#### 1.2.2 Lower some outliers

In [16]:
quantile_1 = train[numCol].quantile(0.01)
quantile_3 = train[numCol].quantile(0.99)

IQR = quantile_3 - quantile_1
lower_cond = train[numCol][~(train[numCol] < (quantile_1 - 1.5 * IQR))].min()
upper_cond = train[numCol][~(train[numCol] > (quantile_3 + 1.5 * IQR))].max()

print('[UP LIM] \n%s \n\n[LOW LIM] \n%s' % (upper_cond, lower_cond))

sample_train = train[numCol].copy()

lower_data_cond = np.where(train[numCol] < lower_cond, lower_cond, train[numCol])
upper_data_cond = np.where(train[numCol] > upper_cond, upper_cond, lower_data_cond)

sample_train[numCol] = upper_data_cond

sample_train.describe()

[UP LIM] 
Age                79.0
RoomService      7406.0
FoodCourt       18481.0
ShoppingMall     5635.0
Spa             13208.0
VRDeck          12708.0
dtype: float64 

[LOW LIM] 
Age             0.0
RoomService     0.0
FoodCourt       0.0
ShoppingMall    0.0
Spa             0.0
VRDeck          0.0
dtype: float64


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,222.911067,453.365335,168.104891,307.536545,300.924985
std,14.489021,641.915273,1544.096278,502.654856,1086.87728,1089.182231
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,7406.0,18481.0,5635.0,13208.0,12708.0


#### 1.2.3 Both methods

In [122]:
class DataConversion:
    def __init__(self, data: pd.DataFrame, test: bool = False, drop: bool = True):
        self.test = test
        self.drop = drop

        if self.test:
            self.features = data.drop(['Cabin', 'Name', 'Num'], axis=1)
        else:
            self.features = data.drop(['Cabin', 'Name', 'Transported', 'Num'], axis=1)
            self.targets = data['Transported']

        self.features['Deck'] = self.features['Deck'].replace('T', 'G')
    
    # for drop outliers
    def get_outliers(self, num_data: pd.DataFrame) -> pd.DataFrame:
        quantile_1 = num_data.quantile(0.01)
        quantile_3 = num_data.quantile(0.99)

        IQR = quantile_3 - quantile_1
        lower_cond = quantile_1 - 1.5 * IQR
        upper_cond = quantile_3 + 1.5 * IQR

        num_data = num_data[~((num_data < lower_cond) | (num_data > upper_cond))]

        return num_data

    # for replace outliers
    def lower_outliers(self, num_data: pd.DataFrame) -> pd.DataFrame:
        quantile_1 = num_data.quantile(0.01)
        quantile_3 = num_data.quantile(0.99)

        IQR = quantile_3 - quantile_1
        lower_cond = num_data[~(num_data < (quantile_1 - 1.5 * IQR))].min()
        upper_cond = num_data[~(num_data > (quantile_3 + 1.5 * IQR))].max()

        lower_data_cond = np.where(num_data < lower_cond, num_data.mean(), num_data)
        upper_data_cond = np.where(num_data > upper_cond, num_data.mean(), lower_data_cond)

        return upper_data_cond
    

    def drop_encode(self):
        features = self.features.copy()

        numCol = features.keys()[features.dtypes == 'float']
        features[numCol] = SimpleImputer(strategy='mean').fit_transform(features[numCol])

        if not self.test:
            min_max_obj = MinMaxScaler().fit(features[numCol])
            features[numCol] = min_max_obj.transform(features[numCol])
            features[numCol] = self.get_outliers(features[numCol])
            features[numCol] = min_max_obj.inverse_transform(features[numCol])
        
        features[numCol] = MinMaxScaler().fit_transform(features[numCol])

        catCol = features.keys()[features.dtypes != 'float']
        features[catCol] = SimpleImputer(strategy='most_frequent').fit_transform(features[catCol])
        for key in catCol:
            features[key] = LabelEncoder().fit_transform(features[key])

        features['Shop+Food'] = features['ShoppingMall'] + features['FoodCourt']
        features.drop(['ShoppingMall', 'FoodCourt'], axis=1, inplace=True)

        if not self.test:
            df = features
            df['Transported'] = self.targets

        return features if self.test else df.dropna()


    def lower_encode(self) -> pd.DataFrame:
        features = self.features.copy()

        numCol = features.keys()[features.dtypes == 'float']
        
        if not self.test:
            features[numCol] = self.lower_outliers(features[numCol])
        
        features[numCol] = SimpleImputer(strategy='mean').fit_transform(features[numCol])
        features[numCol] = StandardScaler().fit_transform(features[numCol])

        catCol = features.keys()[features.dtypes != 'float']
        features[catCol] = SimpleImputer(strategy='most_frequent').fit_transform(features[catCol])
        for key in catCol:
            features[key] = LabelEncoder().fit_transform(features[key])

        features['Shop+Food'] = features['ShoppingMall'] + features['FoodCourt']
        features.drop(['ShoppingMall', 'FoodCourt'], axis=1, inplace=True)

        return features if self.test else (features, self.targets)

In [123]:
obj_train = DataConversion(train.copy(), test=False)
obj_test = DataConversion(test.copy(), test=True)

In [147]:
def some_scores(X, Y):
    xTrain, xValid, yTrain, yValid = train_test_split(
        X, Y, test_size=0.2, random_state=10
    )

    scores = pd.DataFrame(index=['train', 'valid'])

    gbc = GradientBoostingClassifier(random_state=10).fit(xTrain, yTrain)
    hgbc = HistGradientBoostingClassifier(random_state=10).fit(xTrain, yTrain)
    lgbm = lgb.LGBMClassifier(random_state=10).fit(xTrain, yTrain)

    gbc_scores = gbc.score(xTrain, yTrain), gbc.score(xValid, yValid)
    hgbc_scores = hgbc.score(xTrain, yTrain), hgbc.score(xValid, yValid)
    lgbm_scores = lgbm.score(xTrain, yTrain), lgbm.score(xValid, yValid)

    scores['GBsklearn'] = gbc_scores
    scores['HistGBsklearn'] = hgbc_scores
    scores['LGBM'] = lgbm_scores

    return scores

### Drop

In [134]:
train_drop = obj_train.drop_encode()
test_drop = obj_test.drop_encode()
train_drop

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,Spa,VRDeck,Deck,Side,Shop+Food,Transported
0,1,0,2,0.493671,0,0.000000,0.000000,0.000000,1,0,0.000000,False
1,0,0,2,0.303797,0,0.014718,0.041566,0.003462,5,1,0.004924,True
2,1,0,2,0.734177,1,0.005806,0.508404,0.003856,0,1,0.193496,False
3,1,0,2,0.417722,0,0.000000,0.252044,0.015187,0,1,0.135261,False
4,0,0,2,0.202532,0,0.040913,0.042777,0.000157,5,1,0.030584,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,0.518987,1,0.000000,0.124394,0.005823,0,0,0.368974,False
8689,0,1,1,0.227848,0,0.000000,0.000000,0.000000,6,1,0.000000,False
8690,0,0,2,0.329114,0,0.000000,0.000076,0.000000,6,1,0.332209,True
8691,1,0,0,0.405063,0,0.000000,0.026726,0.254564,4,1,0.056761,False


In [148]:
some_scores(train_drop.drop(['Transported'], axis=1), train_drop['Transported'])

Unnamed: 0,GBsklearn,HistGBsklearn,LGBM
train,0.809944,0.852002,0.857494
valid,0.813295,0.816185,0.812717


### Lower

In [149]:
features, targets = obj_train.lower_encode()
test_lower = obj_test.lower_encode()
features

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,Spa,VRDeck,Deck,Side,Shop+Food
0,1,0,2,0.709437,0,-0.363978,-0.297694,-0.284565,1,0,-0.659399
1,0,0,2,-0.336717,0,-0.179722,0.261281,-0.241711,5,1,-0.597127
2,1,0,2,2.034566,1,-0.291290,6.539308,-0.236841,0,1,1.781270
3,1,0,2,0.290975,0,-0.363978,3.091789,-0.096592,0,1,1.049237
4,0,0,2,-0.894666,0,0.148220,0.277571,-0.282617,5,1,-0.272597
...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,0.848924,1,-0.363978,1.375157,-0.212492,0,0,3.994662
8689,0,1,1,-0.755179,0,-0.363978,-0.297694,-0.284565,6,1,-0.659399
8690,0,0,2,-0.197230,0,-0.363978,-0.296676,-0.284565,6,1,3.543625
8691,1,0,0,0.221232,0,-0.363978,0.061720,2.866173,4,1,0.056558


In [150]:
some_scores(features, targets)

Unnamed: 0,GBsklearn,HistGBsklearn,LGBM
train,0.805148,0.856629,0.857636
valid,0.795285,0.802185,0.80276


## 2. Train

In [162]:
class OptimalParams:
    def __init__(self, sklearn_ensemble: bool = True):
        self.sklearn_ensemble = sklearn_ensemble
    

    def objective(self, trial):
        if self.sklearn_ensemble:
            params = {
                'subsample': trial.suggest_float('subsample', 1e-3, 1.0, log=True), 
                'max_depth': trial.suggest_int('max_depth', 3, 15, log=True), 
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True), 
                'n_estimators': trial.suggest_int('n_estimators', 200, 400)
            }
            params['n_iter_no_change'] = 8

            model = GradientBoostingClassifier(**params).fit(self.X, self.Y)

        else:
            params = {
                'num_leaves': trial.suggest_int('num_leaves', 5, 15),
                'learning_rate': trial.suggest_float('learning_rate', 5e-4, 5e-2, log=True), 
                'n_estimators': trial.suggest_int('n_estimators', 300, 500), 
                'min_child_samples': trial.suggest_int('min_child_samples', 10, 30), 
                'subsample': trial.suggest_float('subsample', 1e-3, 1e-1, log=True), 
                'colsample_bytree': trial.suggest_float('colsample_bytree', 1e-3, 1e-1, log=True)
            }
            
            model = lgb.LGBMClassifier(**params).fit(self.X, self.Y)
        
        trainScore = model.score(self.X, self.Y)
        validScore = model.score(self.vd[0], self.vd[1])

        diff = abs(trainScore - validScore)
        return 2 / (1 / trainScore + 1 / validScore) if diff < 0.05 else -diff


    def optimize(self, data: np.ndarray, label: np.ndarray, 
                 validation_data: tuple = None, n_trials: int = 10):
        self.X, self.Y = data, label
        self.vd = validation_data
        
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=n_trials, show_progress_bar=True)
        
        self.best_params = study.best_trial.params
        return self.best_params

    def score(self):
        if self.sklearn_ensemble:
            model = GradientBoostingClassifier(**self.best_params).fit(self.X, self.Y)

            print('[SCORES] \non train: %.4f \non valid: %.4f' % 
                  (model.score(self.X, self.Y), 
                   model.score(self.vd[0], self.vd[1]))
            )
        else:
            model = lgb.LGBMClassifier(**self.best_params).fit(self.X, self.Y)

            print('[SCORES] \non train: %.4f \non valid: %.4f' % 
                  (model.score(self.X, self.Y), 
                   model.score(self.vd[0], self.vd[1]))
            )

### drop

In [152]:
xTrainDrop, xValidDrop, yTrainDrop, yValidDrop = train_test_split(
    train_drop.drop(['Transported'], axis=1), train_drop['Transported'], test_size=0.2, random_state=10
)

In [163]:
obj_opt = OptimalParams()
optParams = obj_opt.optimize(xTrainDrop, yTrainDrop, n_trials=50, validation_data=(xValidDrop, yValidDrop))
optParams

  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

{'subsample': 0.4561495700185195,
 'max_depth': 9,
 'learning_rate': 0.03943188884512477,
 'n_estimators': 329}

In [164]:
obj_opt.score()

[SCORES] 
on train: 0.9500 
on valid: 0.8104


In [165]:
obj_opt_lgbm = OptimalParams(sklearn_ensemble=False)
optParamsLGBM = obj_opt_lgbm.optimize(xTrainDrop, yTrainDrop, n_trials=100, validation_data=(xValidDrop, yValidDrop))
optParamsLGBM

  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

{'num_leaves': 15,
 'learning_rate': 0.048355790797789885,
 'n_estimators': 420,
 'min_child_samples': 13,
 'subsample': 0.015536580244624038,
 'colsample_bytree': 0.0010915080759197862}

In [166]:
obj_opt_lgbm.score()

[SCORES] 
on train: 0.8040 
on valid: 0.8023
