<a href="https://colab.research.google.com/github/HelenLumi/XGBoost-LightGBM/blob/main/Comparison_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time

import lightgbm as lgb
import xgboost as xgb

from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.metrics import roc_auc_score

In [None]:
!git clone --recursive https://github.com/Microsoft/LightGBM

Cloning into 'LightGBM'...
remote: Enumerating objects: 27839, done.[K
remote: Counting objects: 100% (4017/4017), done.[K
remote: Compressing objects: 100% (496/496), done.[K
remote: Total 27839 (delta 3733), reused 3655 (delta 3510), pack-reused 23822[K
Receiving objects: 100% (27839/27839), 19.76 MiB | 15.71 MiB/s, done.
Resolving deltas: 100% (20665/20665), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/external_libs/compute'...
remote: Enumerating objects: 21733, done.        
remote: Counting objects: 100% (5/5), done.  

### Prepare data for training

In [None]:
# Load the LambdaRank example files (svmlight format) from the cloned repository.
# Each call yields a feature matrix together with its label vector.
rank_data_dir_path = "LightGBM/examples/lambdarank/"
rank_X_train_all, rank_y_train_all = load_svmlight_file(f"{rank_data_dir_path}rank.train")
rank_X_test, rank_y_test = load_svmlight_file(f"{rank_data_dir_path}rank.test")
# Display the shapes as a quick sanity check.
(
    rank_X_train_all.shape,
    rank_y_train_all.shape,
    rank_X_test.shape,
    rank_y_test.shape,
)

((3005, 300), (3005,), (768, 300), (768,))

In [None]:
# Load the per-query row counts that accompany the ranking files.
# These arrays are later passed as `group` when building the ranking datasets.
q_train_all = np.loadtxt(f"{rank_data_dir_path}rank.train.query")
q_test = np.loadtxt(f"{rank_data_dir_path}rank.test.query")
# Display the number of queries in each file.
(q_train_all.shape, q_test.shape)

((201,), (50,))

In [None]:
# Split the ranking training data roughly 75/25 without cutting a query in half.
# The running total of rows per query tells us, for each query, the row index at
# which it ends; searchsorted finds the first query whose end reaches 75% of rows.
q_train_cumsum = np.cumsum(q_train_all)
q_idx = int(np.searchsorted(q_train_cumsum, q_train_all.sum() * 0.75))
# Row index just past the last row of query `q_idx` (that query stays in train).
X_idx = int(q_train_cumsum[q_idx])

# Rows before X_idx go to training, the rest to validation.
rank_X_train = rank_X_train_all[:X_idx]
rank_X_valid = rank_X_train_all[X_idx:]
rank_y_train = rank_y_train_all[:X_idx]
rank_y_valid = rank_y_train_all[X_idx:]

# Queries 0..q_idx (inclusive) belong to training, matching the row split above.
q_train = q_train_all[:q_idx+1]
q_valid = q_train_all[q_idx+1:]

# Sanity check: the group sizes must sum to the number of rows in each split.
(
    rank_X_train.shape,
    rank_X_valid.shape,
    rank_y_train.shape,
    rank_y_valid.shape,
    q_train.sum(),
    q_valid.sum(),
)

((2258, 300), (747, 300), (2258,), (747,), 2258.0, 747.0)

In [None]:
# Prepare classification dataset.
# The files are read as tab-separated with no header row; column 0 is used as
# the label and the remaining columns as features.
class_data_dir_path = "LightGBM/examples/binary_classification/"
df_train = pd.read_csv(class_data_dir_path + "binary.train", sep='\t', header=None)
class_X_train_all = df_train.loc[:, 1:]
class_y_train_all = df_train.loc[:, 0]

# Hold out 20% of the training rows for validation. The split is seeded so the
# train/validation partition (and every metric computed downstream) is
# reproducible across kernel restarts.
class_X_train, class_X_valid, class_y_train, class_y_valid = train_test_split(
    class_X_train_all, class_y_train_all, test_size=0.2, random_state=42)

df_test = pd.read_csv(class_data_dir_path + "binary.test", sep='\t', header=None)
class_X_test = df_test.loc[:, 1:]
class_y_test = df_test.loc[:, 0]

# Display the shapes as a quick sanity check.
class_X_train.shape, class_y_train.shape, class_X_valid.shape, class_y_valid.shape, class_X_test.shape, class_y_test.shape

((5600, 28), (5600,), (1400, 28), (1400,), (500, 28), (500,))

In [None]:
# Create datasets for LightGBM.
# Validation datasets reference their training dataset so that both share the
# same feature binning; labels are passed by keyword in both pairs for
# consistency.
rank_train = lgb.Dataset(rank_X_train, label=rank_y_train, group=q_train)
rank_valid = lgb.Dataset(rank_X_valid, label=rank_y_valid, reference=rank_train, group=q_valid)

class_train = lgb.Dataset(class_X_train, label=class_y_train)
class_valid = lgb.Dataset(class_X_valid, label=class_y_valid, reference=class_train)

### LightGBM Trainer

In [None]:
# Hyper-parameters for the LightGBM ranking model (LambdaRank objective,
# evaluated with NDCG at cut-off 10).
rank_params = dict(
    objective='lambdarank',
    metric='ndcg',
    boosting_type='gbdt',
    ndcg_eval_at=10,
    max_bin=255,
    num_leaves=255,
    random_state=42,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=20,
    learning_rate=0.01,
    verbose=-1,
)

In [None]:
%%time
# Create LightGBM model for ranking dataset.
# Early stopping (50 rounds without improvement on the validation set) is
# configured through the callback API: the `early_stopping_rounds` keyword of
# `lgb.train` was removed in LightGBM 4.0, while the callback form is accepted
# by both older and newer releases.
rank_lgbmodel = lgb.train(
    rank_params,
    rank_train,
    valid_sets=[rank_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

[1]	valid_0's ndcg@10: 0.698063
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's ndcg@10: 0.755844
[3]	valid_0's ndcg@10: 0.759158
[4]	valid_0's ndcg@10: 0.776628
[5]	valid_0's ndcg@10: 0.777835
[6]	valid_0's ndcg@10: 0.780509
[7]	valid_0's ndcg@10: 0.774211
[8]	valid_0's ndcg@10: 0.780412
[9]	valid_0's ndcg@10: 0.774488
[10]	valid_0's ndcg@10: 0.779807
[11]	valid_0's ndcg@10: 0.782879
[12]	valid_0's ndcg@10: 0.786288
[13]	valid_0's ndcg@10: 0.778132
[14]	valid_0's ndcg@10: 0.779591
[15]	valid_0's ndcg@10: 0.778939
[16]	valid_0's ndcg@10: 0.780631
[17]	valid_0's ndcg@10: 0.781085
[18]	valid_0's ndcg@10: 0.774346
[19]	valid_0's ndcg@10: 0.776686
[20]	valid_0's ndcg@10: 0.771888
[21]	valid_0's ndcg@10: 0.775118
[22]	valid_0's ndcg@10: 0.775464
[23]	valid_0's ndcg@10: 0.77226
[24]	valid_0's ndcg@10: 0.772024
[25]	valid_0's ndcg@10: 0.775669
[26]	valid_0's ndcg@10: 0.775867
[27]	valid_0's ndcg@10: 0.776637
[28]	valid_0's ndcg@10: 0.779425
[29]	valid_0's ndcg@10: 

In [None]:
# LightGBM parameters for classification dataset (binary objective, AUC metric).
class_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    # The LightGBM parameter is spelled `is_unbalance`; the previous key
    # `is_unbalanced` is not a recognised name, so the setting had no effect.
    'is_unbalance': True,
    'max_bin': 255,
    'num_leaves': 255,
    'random_state': 42,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.01,
    'verbose': -1
}

In [None]:
%%time
# Create LightGBM model for classification dataset.
# Early stopping (50 rounds without improvement on the validation set) is
# configured through the callback API: the `early_stopping_rounds` keyword of
# `lgb.train` was removed in LightGBM 4.0, while the callback form is accepted
# by both older and newer releases.
class_lgbmodel = lgb.train(
    class_params,
    class_train,
    valid_sets=[class_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

[1]	valid_0's auc: 0.634477
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.653484
[3]	valid_0's auc: 0.686039
[4]	valid_0's auc: 0.694799
[5]	valid_0's auc: 0.693691
[6]	valid_0's auc: 0.690913
[7]	valid_0's auc: 0.691209
[8]	valid_0's auc: 0.693391
[9]	valid_0's auc: 0.706425
[10]	valid_0's auc: 0.709195
[11]	valid_0's auc: 0.708533
[12]	valid_0's auc: 0.717239
[13]	valid_0's auc: 0.720326
[14]	valid_0's auc: 0.720972
[15]	valid_0's auc: 0.723549
[16]	valid_0's auc: 0.723232
[17]	valid_0's auc: 0.722717
[18]	valid_0's auc: 0.72125
[19]	valid_0's auc: 0.722899
[20]	valid_0's auc: 0.721369
[21]	valid_0's auc: 0.724665
[22]	valid_0's auc: 0.727516
[23]	valid_0's auc: 0.730668
[24]	valid_0's auc: 0.731755
[25]	valid_0's auc: 0.73205
[26]	valid_0's auc: 0.73576
[27]	valid_0's auc: 0.739185
[28]	valid_0's auc: 0.737405
[29]	valid_0's auc: 0.741439
[30]	valid_0's auc: 0.739312
[31]	valid_0's auc: 0.738699
[32]	valid_0's auc: 0.736778
[33]	valid_0's auc: 0.

In [None]:
# Measure LightGBM performance on the test sets.
rank_pred = rank_lgbmodel.predict(rank_X_test)
# Expand the per-query row counts into one query id per test row so that
# predictions can be grouped by query.
# The builtin `int` is used for the cast: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
pred_df = pd.DataFrame({
    "query_id": np.repeat(np.arange(q_test.shape[0]), q_test.astype(int)),
    "pred": rank_pred,
    "true": rank_y_test,
})

class_y_pred = class_lgbmodel.predict(class_X_test)

# NDCG@10 is computed separately for every query and then averaged.
lgb_rank_ndcg = pred_df.groupby("query_id").apply(
    lambda d: ndcg_score([d["true"]], [d["pred"]], k=10)).mean()
lgb_class_auc = roc_auc_score(class_y_test, class_y_pred)

print("Ranking NDCG Test: {:.4f}\nClassification AUC Test: {:.4f}".format(
    lgb_rank_ndcg, lgb_class_auc))

Ranking NDCG Test: 0.7555
Classification AUC Test: 0.8134


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


### XGBoost Trainer

In [None]:
# XGBoost parameters for ranking dataset.
# The learning rate is set through the scikit-learn wrapper's documented
# `learning_rate` argument instead of the native alias `eta`, which was only
# forwarded through **kwargs and could coexist with the wrapper's own
# `learning_rate` default.
rank_xgbmodel = xgb.XGBRanker(
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42,
    learning_rate=0.01,
    colsample_bytree=0.5,
    max_depth=8,
    subsample=0.5,
)

# The query sizes were loaded with np.loadtxt (floats); cast them to integers
# before they are passed as `group` to the ranker.
q_train_all = q_train_all.astype('int')

In [None]:
%%time
# Fit the XGBoost ranker on the ranking data.
# NOTE(review): this trains on the full training set (`*_all`), whereas the
# LightGBM ranker was trained on the 75% split with the remainder held out for
# early stopping - confirm this asymmetry is intended for the comparison.
# NOTE(review): no `tree_method` is set on the estimator, so whether the
# histogram algorithm is used depends on the installed XGBoost default - confirm.
rank_xgbmodel.fit(
    rank_X_train_all,
    rank_y_train_all,
    group=q_train_all,
    verbose=True,
)

CPU times: user 4.43 s, sys: 12 ms, total: 4.44 s
Wall time: 2.39 s


XGBRanker(colsample_bytree=0.5, eta=0.01, max_depth=8, random_state=42,
          subsample=0.5)

In [None]:
# XGBoost parameters for classification dataset.
# The learning rate is set through the scikit-learn wrapper's documented
# `learning_rate` argument instead of the native alias `eta`, which was only
# forwarded through **kwargs and could coexist with the wrapper's own
# `learning_rate` default.
class_xgbmodel = xgb.XGBClassifier(
    booster='gbtree',
    random_state=42,
    learning_rate=0.01,
    colsample_bytree=0.5,
    max_depth=8,
    subsample=0.5,
)

In [None]:
%%time
# Fit the XGBoost classifier on the training split of the classification data.
# NOTE(review): no `tree_method` is set on the estimator, so whether the
# histogram algorithm is used depends on the installed XGBoost default - confirm.
class_xgbmodel.fit(
    class_X_train,
    class_y_train,
)

CPU times: user 1.56 s, sys: 6.7 ms, total: 1.56 s
Wall time: 1.56 s


XGBClassifier(colsample_bytree=0.5, eta=0.01, max_depth=8, random_state=42,
              subsample=0.5)

In [None]:
# Measure XGBoost performance on the test sets.
rank_pred1 = rank_xgbmodel.predict(rank_X_test)
# Expand the per-query row counts into one query id per test row so that
# predictions can be grouped by query.
# The builtin `int` is used for the cast: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
pred_df1 = pd.DataFrame({
    "query_id": np.repeat(np.arange(q_test.shape[0]), q_test.astype(int)),
    "pred": rank_pred1,
    "true": rank_y_test,
})

# ROC AUC needs a continuous score. `predict()` on the classifier returns hard
# class labels, which collapses the ROC curve to a single operating point and
# makes the number incomparable with the LightGBM AUC (computed from
# probabilities). Use the positive-class probability instead.
class_y_pred1 = class_xgbmodel.predict_proba(class_X_test)[:, 1]

# NDCG@10 is computed separately for every query and then averaged.
xgb_rank_ndcg = pred_df1.groupby("query_id").apply(
    lambda d: ndcg_score([d["true"]], [d["pred"]], k=10)).mean()
xgb_class_auc = roc_auc_score(class_y_test, class_y_pred1)

print("Ranking NDCG Test: {:.4f}\nClassification AUC Test: {:.4f}".format(
    xgb_rank_ndcg, xgb_class_auc))

Ranking NDCG Test: 0.7756
Classification AUC Test: 0.7357


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.
