In [204]:
import optuna
import numpy as np
import pandas as pd
from optuna.integration.lightgbm import LightGBMTunerCV as lgbcv
from lightgbm import LGBMClassifier
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation, Dataset
from optuna.integration import LightGBMPruningCallback
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and copy warnings are hidden as well.
warnings.filterwarnings('ignore')
# Only let Optuna emit log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)
from sklearn.model_selection import train_test_split

In [189]:
# Read the pre-cleaned train and test tables from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_cleaned.csv', 'data/test_cleaned.csv')
)

In [190]:
# Display the dtype of every column in the training frame.
train.dtypes

id                 int64
N_Days             int64
Drug               int64
Age                int64
Sex                int64
Ascites            int64
Hepatomegaly       int64
Spiders            int64
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float64
Stage              int64
Status            object
dtype: object

In [191]:
# Drop the `id` column so it is not passed to the model as a feature.
# errors='ignore' keeps this cell re-runnable: `train` is reassigned from itself,
# so a second execution would otherwise raise KeyError on the already-removed column.
train = train.drop(columns=['id'], errors='ignore')

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['Status']),
    train['Status'],
    test_size=0.2,
    random_state=66,
)

In [192]:
# Collect the categorical feature columns. 'category' is selected alongside 'object'
# so that re-running this cell after the conversion below still yields the same list
# (an object-only check would return [] on the second run).
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Convert to pandas categoricals. The validation column reuses the dtype built from the
# training column, so both frames share a single category -> code mapping; a value that
# never occurs in X_train becomes NaN in X_val instead of receiving its own code.
for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype(X_train[col].dtype)
categorical_features

['Edema']

In [193]:
# Encode the string target as integers.
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the training labels only,
# then apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train, y_val = le.transform(y_train), le.transform(y_val)

# Position in this array is the encoded value (0, 1, 2).
le.classes_

array(['C', 'CL', 'D'], dtype=object)

In [194]:
# Check target balance: returns the distinct encoded classes in y_train
# together with the number of training rows in each.
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([3962,  226, 2136], dtype=int64))

## LightGBM

### Auto Tuning

In [158]:
# Fixed base parameters for Optuna's LightGBM cross-validation tuner.
#
# NOTE: the former 'auto_class_weights': 'balanced' entry was removed. It is a CatBoost
# option, not a LightGBM parameter; LightGBM ignored it and verbosity=-1 hid the
# unknown-parameter warning, so no class re-weighting was ever applied. To actually
# re-weight classes, pass per-row weights through Dataset(weight=...) here, or use
# class_weight='balanced' on LGBMClassifier.
auto_tuner_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'boosting': 'gbdt',
    'verbosity': -1,
    'num_class': 3,
}

# Native LightGBM dataset built from the training split only.
train_data = Dataset(X_train, label=y_train, categorical_feature=categorical_features)

# The study minimises the CV metric (multi-class log loss).
study_auto_lgb = optuna.create_study(direction='minimize', study_name='LightGBM Auto Tuner CV')

auto_tuner = lgbcv(
    params=auto_tuner_params,
    train_set=train_data,
    study=study_auto_lgb,
    # Stop a CV run once the metric has not improved for 2 consecutive rounds.
    callbacks=[early_stopping(2)],
    nfold=10,
    # Both seeds are fixed so the tuning run is repeatable.
    optuna_seed=666,
    seed=666,
)

In [None]:
# Run the tuner configured above; it was constructed with `study_auto_lgb` as its study.
auto_tuner.run()

In [195]:
# Refit one classifier on the training split using the parameters the tuner selected.
best_params = auto_tuner.best_params

best_model = LGBMClassifier(
    **best_params,
    random_state=666,
)
best_model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_features,
)


In [199]:
from sklearn.metrics import log_loss

# Class-probability predictions for the validation split.
y_prob = best_model.predict_proba(X_val)

# Multi-class log loss on the training split and on the validation split.
print(f'Train Log Loss: {log_loss(y_train, best_model.predict_proba(X_train))}')
print(f'Val Log Loss: {log_loss(y_val, y_prob)}')

Train Log Loss: 0.22732762814688576
Val Log Loss: 0.4172626803280405


### Manual Tuning

In [219]:
# Manually defined search space, scored with pruned 10-fold cross-validation.
def objective(trial):
    """Optuna objective: cross-validated multi-class log loss for one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the boosting type and the number of boosting rounds, and is
        handed to the pruning callback so unpromising trials can be stopped.

    Returns
    -------
    float
        Mean validation ``multi_logloss`` across the 10 folds, taken from the
        last entry of the history returned by ``lgb.cv``.

    Raises
    ------
    KeyError
        If the CV result contains no mean ``multi_logloss`` entry.

    Notes
    -----
    Reads ``X_train``, ``y_train`` and ``categorical_features`` from the
    notebook's global scope.
    """
    boosting = trial.suggest_categorical('boosting', ['gbdt', 'dart', 'rf'])
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting': boosting,
        'verbosity': -1,
        'num_class': 3,
        # Search dimensions that are currently switched off (kept for reference).
        # The former 'auto_class_weights' candidate was dropped: it is not a LightGBM parameter.
        # 'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        # 'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        # 'max_depth': trial.suggest_int('max_depth', 2, 256),
        # 'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 256),
        # 'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        # 'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        # 'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # 'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        # 'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        # 'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'random_state': 666,
    }
    if boosting == 'rf':
        # LightGBM's random-forest mode refuses to train unless row bagging and/or
        # feature subsampling is enabled; with the defaults the library raises and the
        # whole study aborts. Enable both with fixed, moderate values for rf trials only.
        params.update({
            'bagging_freq': 1,
            'bagging_fraction': 0.8,
            'feature_fraction': 0.8,
        })

    train_data = Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    pruning_callback = LightGBMPruningCallback(trial, 'multi_logloss')
    cv_results = lgb.cv(
        params=params,
        train_set=train_data,
        num_boost_round=trial.suggest_int('num_boost_round', 100, 1000),
        nfold=10,
        seed=666,
        stratified=True,
        # NOTE(review): LightGBM documents early stopping as unavailable for 'dart';
        # for those trials the full num_boost_round is presumably trained - confirm.
        callbacks=[pruning_callback, early_stopping(10)],
    )

    # LightGBM >= 4.0 prefixes CV metric names with 'valid '; older releases do not.
    for key in ('valid multi_logloss-mean', 'multi_logloss-mean'):
        if key in cv_results:
            return cv_results[key][-1]
    raise KeyError('valid multi_logloss-mean')


In [None]:
# Seed the sampler so the sequence of suggested trials is repeatable after a kernel
# restart. TPESampler is Optuna's default sampler, so only the seeding is new here.
study = optuna.create_study(
    direction='minimize',
    study_name='LightGBM Manual Tuner CV',
    sampler=optuna.samplers.TPESampler(seed=666),
    # Median pruning only starts after the first 5 reported steps of a trial.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study.optimize(objective, n_trials=150)

In [None]:
...

In [None]:
...

### XGBoost