In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-apr-2021/train.csv
/kaggle/input/tabular-playground-series-apr-2021/test.csv


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from xgboost import XGBClassifier

In [3]:
train = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")

In [4]:
display(train.head())
display(test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C


In [5]:
train.isnull().sum()

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age             3292
SibSp              0
Parch              0
Ticket          4623
Fare             134
Cabin          67866
Embarked         250
dtype: int64

In [6]:
train_len = len(train)

df = pd.concat([train, test], axis=0, ignore_index=True)

In [7]:
fare_map = df[['Fare','Pclass']].dropna().groupby('Pclass').mean().to_dict()
df['Fare'] = df['Fare'].fillna(df['Pclass'].map(fare_map['Fare']))

df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

df['Name'] = df['Name'].map(lambda x: x.split(',')[0])

df['isAlone'] = df['Family_Size'].map(lambda x: 1 if x==1 else 0)

# FE : https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-cv

age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
df['Age'] = df['Age'].fillna(df['Pclass'].map(age_map['Age']))

df['Cabin'] = df['Cabin'].fillna('X').map(lambda x: x[0].strip())

df['Ticket'] = df['Ticket'].fillna('X').map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')

df['Embarked'] = df['Embarked'].fillna('X')

In [8]:
label_cols = ['Name', 'Ticket', 'Sex']
onehot_cols = ['Cabin', 'Embarked']
numerical_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [9]:
def label_encoder(c):
    le = LabelEncoder()
    return le.fit_transform(c)

scaler = StandardScaler()

In [10]:
onehot_encoded_df = pd.get_dummies(df[onehot_cols])

In [11]:
label_encoded_df = df[label_cols].apply(label_encoder)

In [12]:
numerical_df = pd.DataFrame(scaler.fit_transform(df[numerical_cols]), columns=numerical_cols)
target_df = df['Survived']

df = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df, target_df], axis=1)

In [13]:
df.head(10)
df_cols = df.columns

In [14]:
preprocessed_train = df[df_cols].iloc[:train_len,:]
preprocessed_test = df[df_cols].iloc[train_len:,:]

In [15]:
display(preprocessed_train.tail())

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name,Ticket,Sex,Cabin_A,Cabin_B,...,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Embarked_C,Embarked_Q,Embarked_S,Embarked_X,Survived
99995,-0.274016,1.667037,-0.539572,-0.505478,-0.441937,1590,21,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
99996,-0.274016,1.909184,-0.539572,-0.505478,-0.496979,2992,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0
99997,0.877699,0.153624,-0.539572,-0.505478,-0.514782,4219,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0
99998,0.877699,1.001135,-0.539572,0.561618,-0.203671,3941,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0
99999,0.877699,1.243282,-0.539572,-0.505478,-0.45529,7055,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0


In [16]:
# scaler = MinMaxScaler()
# preprocessed_train[['Fare']] = scaler.fit_transform(preprocessed_train[['Fare']])
# preprocessed_test[['Fare']] = scaler.transform(preprocessed_test[['Fare']])

In [17]:
y = train['Survived']
X = preprocessed_train.drop(['Survived'], axis=1)
X.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name,Ticket,Sex,Cabin_A,Cabin_B,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Embarked_C,Embarked_Q,Embarked_S,Embarked_X
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,-0.150886,0.229349,-0.054223,-0.020419,-0.010615,13348.75637,42.3269,0.56114,0.06307,0.07439,...,0.03637,0.01749,0.00663,0.00482,0.00032,0.67866,0.22187,0.05424,0.72139,0.0025
std,0.964823,1.091884,1.052693,1.013823,1.031926,7675.447137,13.841825,0.49625,0.24309,0.262406,...,0.18721,0.131089,0.081155,0.069259,0.017886,0.466993,0.415506,0.226492,0.448317,0.049938
min,-1.42573,-2.081386,-0.539572,-0.505478,-0.652312,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-1.42573,-0.572815,-0.539572,-0.505478,-0.513447,6818.0,49.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.274016,0.21416,-0.539572,-0.505478,-0.299363,13191.0,49.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,0.877699,1.122208,0.680848,0.561618,-0.165394,20055.0,49.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,0.877699,3.180451,9.223789,9.098392,10.385374,26469.0,49.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
preprocessed_test.drop(['Survived'], axis=1, inplace=True)
preprocessed_test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name,Ticket,Sex,Cabin_A,Cabin_B,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Embarked_C,Embarked_Q,Embarked_S,Embarked_X
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.150886,-0.229349,0.054223,0.020419,0.010615,13365.97028,41.6103,0.69757,0.07212,0.08113,...,0.02521,0.01837,0.02323,0.00144,0.00026,0.70831,0.22308,0.08573,0.68842,0.00277
std,1.011733,0.838216,0.941264,0.985571,0.966914,7701.204024,14.238149,0.459313,0.258688,0.273036,...,0.156763,0.134286,0.150634,0.03792,0.016122,0.454543,0.416314,0.279966,0.463141,0.052558
min,-1.42573,-2.081386,-0.539572,-0.505478,-0.661659,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-1.42573,-0.814961,-0.539572,-0.505478,-0.512112,6822.0,49.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.877699,-0.451742,-0.539572,-0.505478,-0.454696,13280.0,49.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,0.877699,0.335233,0.680848,0.561618,-0.108127,20063.0,49.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,0.877699,2.817232,9.223789,9.098392,9.436464,26468.0,49.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
def objective(trial, data=X, target=y):
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=0)
    
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': 4000,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 1e-5, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 1e-5, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 1e-5, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'auc'
    }
    
    xgb_model = XGBClassifier(**params)
    
    xgb_model.fit(X_train, y_train,
                 early_stopping_rounds = 200,
                 eval_set=[(X_val, y_val)],
                 verbose=False)
    pred = xgb_model.predict(X_val)
    
    roc_auc = roc_auc_score(y_val, pred)
    
    return roc_auc

In [20]:
%%time

study = optuna.create_study(direction='maximize', study_name='xgbclassifier')
study.optimize(objective, n_trials=30)


[32m[I 2021-04-26 06:47:33,548][0m A new study created in memory with name: xgbclassifier[0m
[32m[I 2021-04-26 06:47:34,869][0m Trial 0 finished with value: 0.7324761031472936 and parameters: {'max_depth': 8, 'learning_rate': 0.005, 'min_child_weight': 249, 'gamma': 1.972992762128871e-05, 'alpha': 4.1802630146878854e-05, 'lambda': 0.06026873513102999, 'colsample_bytree': 0.1974841584881631, 'subsample': 0.7479455092612857}. Best is trial 0 with value: 0.7324761031472936.[0m
[32m[I 2021-04-26 06:47:41,902][0m Trial 1 finished with value: 0.7734852421772365 and parameters: {'max_depth': 9, 'learning_rate': 0.02, 'min_child_weight': 65, 'gamma': 0.0012019746513666197, 'alpha': 0.009277394141105571, 'lambda': 1.966067735793631e-05, 'colsample_bytree': 0.3088468228283583, 'subsample': 0.5379096580393276}. Best is trial 1 with value: 0.7734852421772365.[0m
[32m[I 2021-04-26 06:47:43,757][0m Trial 2 finished with value: 0.774521298809522 and parameters: {'max_depth': 5, 'learning_r

CPU times: user 1min 53s, sys: 837 ms, total: 1min 54s
Wall time: 1min 46s


In [21]:
display(study.best_trial.params)
display(study.best_value)

{'max_depth': 9,
 'learning_rate': 0.1,
 'min_child_weight': 31,
 'gamma': 0.014115168140332394,
 'alpha': 4.55659030463373,
 'lambda': 1.4308794967873937,
 'colsample_bytree': 0.7002510222638035,
 'subsample': 0.7337481397750236}

0.7757452743027021

In [22]:
optuna.visualization.plot_optimization_history(study)

In [23]:
optuna.visualization.plot_param_importances(study)

In [24]:
best_params = study.best_trial.params
best_params['tree_method'] = 'gpu_hist'
best_params['booster'] = 'gbtree'
best_params['eval_metric'] = 'auc'
best_params['random_state'] = 42
best_params['use_label_encoder'] = False
best_params

{'max_depth': 9,
 'learning_rate': 0.1,
 'min_child_weight': 31,
 'gamma': 0.014115168140332394,
 'alpha': 4.55659030463373,
 'lambda': 1.4308794967873937,
 'colsample_bytree': 0.7002510222638035,
 'subsample': 0.7337481397750236,
 'tree_method': 'gpu_hist',
 'booster': 'gbtree',
 'eval_metric': 'auc',
 'random_state': 42,
 'use_label_encoder': False}


```{
    'max_depth': 7,
     'learning_rate': 0.1,
     'min_child_weight': 179,
     'gamma': 0.6747422058386815,
     'alpha': 1.1290074464275892e-05,
     'lambda': 1.1279167223864525e-05,
     'colsample_bytree': 0.7953376369683129,
     'subsample': 0.7812635007356518,
     'tree_method': 'gpu_hist',
     'booster': 'gbtree',
     'eval_metric': 'auc',
     'random_state': 42,
     'use_label_encoder': False
}```

Best Score: 0.7756580428440715

In [25]:
preds = np.zeros(test.shape[0])
# aucs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = XGBClassifier(**best_params)
    
    model.fit(X_train, y_train,
             early_stopping_rounds=200,
             eval_set=[(X_val, y_val)],
             verbose=False)
    
    preds += model.predict(preprocessed_test) / skf.n_splits

In [26]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

# model = XGBClassifier(**best_params)

# model.fit(X_train, y_train,
#              early_stopping_rounds=200,
#              eval_set=[(X_val, y_val)],
#              verbose=False)

# predictions = model.predict(preprocessed_test)

In [27]:
submission = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv", index_col='PassengerId')
submission['Survived'] = np.round(preds).astype(int)

In [28]:
submission.to_csv("xgb_optuna_new.csv")