In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-apr-2021/train.csv
/kaggle/input/tabular-playground-series-apr-2021/test.csv


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from catboost import CatBoostClassifier

In [3]:
train = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")

In [4]:
N_TRIALS = 30
N_SPLITS = 10
OPTUNA = True

In [5]:
train.isnull().sum()

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age             3292
SibSp              0
Parch              0
Ticket          4623
Fare             134
Cabin          67866
Embarked         250
dtype: int64

In [6]:
train_len = len(train)

df = pd.concat([train, test], axis=0, ignore_index=True)

In [7]:
fare_map = df[['Fare','Pclass']].dropna().groupby('Pclass').mean().to_dict()
df['Fare'] = df['Fare'].fillna(df['Pclass'].map(fare_map['Fare']))

df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

df['Name'] = df['Name'].map(lambda x: x.split(',')[0])

df['isAlone'] = df['Family_Size'].map(lambda x: 1 if x==1 else 0)

# FE : https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-cv

age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
df['Age'] = df['Age'].fillna(df['Pclass'].map(age_map['Age']))

df['Cabin'] = df['Cabin'].fillna('X').map(lambda x: x[0].strip())

df['Ticket'] = df['Ticket'].fillna('X').map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')

df['Embarked'] = df['Embarked'].fillna('X')

In [8]:
label_cols = ['Name', 'Ticket', 'Sex']
onehot_cols = ['Cabin', 'Embarked']
numerical_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [9]:
def label_encoder(c):
    le = LabelEncoder()
    return le.fit_transform(c)

scaler = StandardScaler()

In [10]:
onehot_encoded_df = pd.get_dummies(df[onehot_cols])

In [11]:
label_encoded_df = df[label_cols].apply(label_encoder)

In [12]:
numerical_df = pd.DataFrame(scaler.fit_transform(df[numerical_cols]), columns=numerical_cols)
target_df = df['Survived']

df = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df, target_df], axis=1)

In [13]:
preprocessed_train = df[df.columns].iloc[:train_len,:]
preprocessed_test = df[df.columns].iloc[train_len:,:]

In [14]:
display(preprocessed_train.tail())

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name,Ticket,Sex,Cabin_A,Cabin_B,...,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Embarked_C,Embarked_Q,Embarked_S,Embarked_X,Survived
99995,-0.274016,1.667037,-0.539572,-0.505478,-0.441937,1590,21,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
99996,-0.274016,1.909184,-0.539572,-0.505478,-0.496979,2992,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0
99997,0.877699,0.153624,-0.539572,-0.505478,-0.514782,4219,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0
99998,0.877699,1.001135,-0.539572,0.561618,-0.203671,3941,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0
99999,0.877699,1.243282,-0.539572,-0.505478,-0.45529,7055,49,1,0,0,...,0,0,0,0,1,0,0,1,0,0.0


In [15]:
y = train['Survived']
X = preprocessed_train.drop(['Survived'], axis=1)
# X.describe()

In [16]:
preprocessed_test.drop(['Survived'], axis=1, inplace=True)

In [17]:
def objective(trial, data=X, target=y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.002, 0.05, 0.08, 0.1]),
        'n_estimators': 4000,
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'random_seed': 42,
        'task_type': 'GPU',
        'loss_function':'Logloss', # objective function
        'eval_metric':'AUC', # metric
        'bootstrap_type': 'Poisson',
        'early_stopping_rounds': 222,
        'verbose': False
    }

    cbc_model = CatBoostClassifier(**params)

    cbc_model.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
         
    )
    
    y_pred = cbc_model.predict_proba(X_test)[:,1]
    
    roc_auc = roc_auc_score(y_test, y_pred)
    
    return roc_auc

In [18]:
%%time

study = optuna.create_study(direction='maximize', study_name='catboostclassifier')
study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2021-04-26 07:46:38,881][0m A new study created in memory with name: catboostclassifier[0m
[32m[I 2021-04-26 07:48:05,754][0m Trial 0 finished with value: 0.8500932766829091 and parameters: {'max_depth': 6, 'learning_rate': 0.08, 'max_bin': 256, 'min_data_in_leaf': 36, 'l2_leaf_reg': 0.04704010064951091, 'subsample': 0.6626280105851026}. Best is trial 0 with value: 0.8500932766829091.[0m
[32m[I 2021-04-26 07:49:06,988][0m Trial 1 finished with value: 0.84722558904299 and parameters: {'max_depth': 4, 'learning_rate': 0.002, 'max_bin': 337, 'min_data_in_leaf': 75, 'l2_leaf_reg': 0.8843451179412305, 'subsample': 0.43513933489980405}. Best is trial 0 with value: 0.8500932766829091.[0m
[32m[I 2021-04-26 07:49:17,084][0m Trial 2 finished with value: 0.8485753268368086 and parameters: {'max_depth': 10, 'learning_rate': 0.08, 'max_bin': 328, 'min_data_in_leaf': 60, 'l2_leaf_reg': 0.17508997044605934, 'subsample': 0.7600976350589687}. Best is trial 0 with value: 0.850093276682

CPU times: user 15min 48s, sys: 8min 19s, total: 24min 8s
Wall time: 14min 51s


In [19]:
best_params = study.best_trial.params
best_params['task_type'] = 'GPU'
best_params['loss_function'] = 'Logloss'
best_params['eval_metric'] = 'AUC'
best_params['random_seed'] = 42
best_params['bootstrap_type'] = 'Poisson'
best_params['early_stopping_rounds'] = 200
best_params['verbose'] = False

In [20]:
study.best_value

0.850666376395207

```{'max_depth': 8,
 'learning_rate': 0.005,
 'max_bin': 365,
 'min_data_in_leaf': 46,
 'l2_leaf_reg': 0.28648239231053535,
 'subsample': 0.7192966519984039,
 'task_type': 'GPU',
 'loss_function': 'Logloss',
 'eval_metric': 'AUC',
 'random_seed': 42,
 'bootstrap_type': 'Poisson',
 'early_stopping_rounds': 200,
 'verbose': False}```
 
 Best Score: 0.8505297962710942

In [21]:
# from sklearn.model_selection import KFold

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

predictions = np.zeros(len(test))

for fold, (train_id, test_id) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_id], X.iloc[test_id]
    y_train, y_val = y.iloc[train_id], y.iloc[test_id]
    
    model = CatBoostClassifier(**best_params)
    
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    
    predictions += model.predict_proba(preprocessed_test)[:,1] / skf.n_splits

In [22]:
submission = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv", index_col='PassengerId')
submission['Survived'] = np.round(predictions).astype(int)
submission.to_csv("cat_optuna.csv")