In [1]:
import gc
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Load the pre-computed feature table and split it into features / labels.
data_path = "../processed_data/train-data-300d-sum.txt"
data = pd.read_csv(data_path)

# Everything except the `class` column is used as a model feature.
X = data.drop(['class'], axis=1)
# Shift the labels down by one; presumably the raw labels are 1-based and this
# makes them the 0-based ids a multiclass objective expects -- TODO confirm.
y = data['class'] - 1
num_feats = X.shape[1]
# Series.max() is vectorised (the builtin max() walks the Series element by
# element in Python); int() turns the NumPy scalar into a plain Python int
# before it is used as the `num_class` model parameter.
num_classes = int(y.max()) + 1

# 80% of the rows go to training, the rest to validation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)



In [3]:
# Hyper-parameter search for the LightGBM multiclass model.
# Every row of lgb-params.csv is one configuration. Each configuration is
# trained with early stopping, its loss and weighted F1 on the training and
# validation splits are printed, and one summary record per configuration is
# collected into `df_results` so the runs can be compared afterwards.
df_params = pd.read_csv("lgb-params.csv")

tuning_results = []
for row in df_params.to_dict('records'):
    params = {
        'boosting_type': row['type'],
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss',

        'learning_rate': row['lr'],

        # Tree-shape constraints.
        'num_leaves': row['n_leaf'],
        'max_depth': row['n_depth'],
        'min_data_in_leaf': row['min_data'],

        # Row / column subsampling.
        'feature_fraction': row['feat_frac'],
        'bagging_fraction': row['bagging_frac'],
        'bagging_freq': row['bagging_freq'],

        # Regularisation.
        'lambda_l1': row['l1'],
        'lambda_l2': row['l2'],
        'min_gain_to_split': row['min_gain'],
        'min_sum_hessian_in_leaf': row['hessian'],

        'num_threads': 16,
        'verbose': 0,
        'is_training_metric': True
    }

    print("Hyper-parameters:")
    print(params)

    # Build fresh Datasets for every configuration. Per the LightGBM parameter
    # docs a Dataset is initialised only once, and features are pre-filtered
    # with the `min_data_in_leaf` in effect at that moment; reusing one Dataset
    # while `min_data_in_leaf` varies would therefore apply a stale filter (or
    # be rejected by newer LightGBM releases).
    lgb_train = lgb.Dataset(X_train.values, y_train.values)
    lgb_val = lgb.Dataset(X_val.values, y_val.values, reference=lgb_train)

    evals_result = {}
    gbm = lgb.train(params=params,
                    train_set=lgb_train,
                    num_boost_round=100,
                    valid_sets=[lgb_train, lgb_val],
                    valid_names=['train', 'val'],
                    evals_result=evals_result,
                    early_stopping_rounds=50,
                    verbose_eval=1)

    train_history = evals_result['train']['multi_logloss']
    val_history = evals_result['val']['multi_logloss']

    # `best_iteration` is the 1-based number of boosting rounds to keep. It is
    # not positive when early stopping never recorded a best round (for
    # example in dart mode); fall back to the last round that was evaluated.
    best_iteration = gbm.best_iteration
    if not best_iteration or best_iteration <= 0:
        best_iteration = len(val_history)

    # 0-based index into the recorded metric history (the training log above
    # numbers rounds from 1, so this is one less than the logged round).
    best_round = best_iteration - 1
    loss_train = train_history[best_round]
    loss_val = val_history[best_round]

    # Predict with exactly `best_iteration` rounds so the F1 scores describe
    # the same model as the losses above. Passing the 0-based index here would
    # silently drop the best round itself.
    probs_train = gbm.predict(X_train, num_iteration=best_iteration)
    preds_train = np.argmax(probs_train, axis=1)
    f1_train = f1_score(y_train, preds_train, average='weighted')

    probs_val = gbm.predict(X_val, num_iteration=best_iteration)
    preds_val = np.argmax(probs_val, axis=1)
    f1_val = f1_score(y_val, preds_val, average='weighted')

    print("Best round: %d" % best_round)
    print("Training Loss: %.5f, Validation Loss: %.5f" % (loss_train, loss_val))
    print("Training F1 Score: %.5f, Validation F1 Score: %.5f" % (f1_train, f1_val))

    # Keep the configuration together with its scores for later comparison.
    tuning_results.append(dict(row,
                               best_round=best_round,
                               best_iteration=best_iteration,
                               loss_train=loss_train,
                               loss_val=loss_val,
                               f1_train=f1_train,
                               f1_val=f1_val))

# One row per configuration: the CSV columns plus the scores computed above.
df_results = pd.DataFrame(tuning_results)

Hyper-parameters:
{'boosting_type': 'gbdt', 'objective': 'multiclass', 'num_class': 19, 'metric': 'multi_logloss', 'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 6, 'min_data_in_leaf': 512, 'feature_fraction': 0.6, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'lambda_l1': 6.37, 'lambda_l2': 65200, 'min_gain_to_split': 0, 'min_sum_hessian_in_leaf': 0.1, 'num_threads': 16, 'verbose': 0, 'is_training_metric': 'True'}
[1]	train's multi_logloss: 2.94427	val's multi_logloss: 2.94428
Training until validation scores don't improve for 50 rounds.
[2]	train's multi_logloss: 2.94411	val's multi_logloss: 2.94413
[3]	train's multi_logloss: 2.94394	val's multi_logloss: 2.94397
[4]	train's multi_logloss: 2.94377	val's multi_logloss: 2.94381
[5]	train's multi_logloss: 2.94361	val's multi_logloss: 2.94366
[6]	train's multi_logloss: 2.94345	val's multi_logloss: 2.94351
[7]	train's multi_logloss: 2.94328	val's multi_logloss: 2.94336
[8]	train's multi_logloss: 2.94311	val's multi_logloss: 2.94321
[9]

  'precision', 'predicted', average, warn_for)
