# Gradient Bossting Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m379.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
!pip install lightgbm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp311-cp311-macosx_11_0_universal2.whl (25.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.7/25.7 MB[0m [31m233.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:04[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Load dataset

In [5]:
dataobj = load_iris()
X = pd.DataFrame(dataobj.data[50:, [1,2]], columns=dataobj.feature_names[:2])
y = dataobj.target[50:]
y = np.where(y == 2, 1, 0)
#np.unique(y)

df = X
df['target'] = y

df

Unnamed: 0,sepal length (cm),sepal width (cm),target
0,3.2,4.7,0
1,3.2,4.5,0
2,3.1,4.9,0
3,2.3,4.0,0
4,2.8,4.6,0
...,...,...,...
95,3.0,5.2,1
96,2.5,5.0,1
97,3.0,5.2,1
98,3.4,5.4,1


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=1, stratify=y)

# Scoring

In [26]:
from sklearn.metrics import make_scorer

## accuracy
acc = make_scorer(metrics.accuracy_score)

## precision
pre = make_scorer(metrics.precision_score, zero_division=0, pos_label=1, average='weighted')

## recall
rec = make_scorer(metrics.recall_score, zero_division=0, pos_label=1, average='weighted')

## F1
f1 = make_scorer(metrics.f1_score, pos_label=1, average='weighted')

## aucroc
aucroc = make_scorer(metrics.roc_auc_score)

scoring = {
    'accuracy' : acc,
    'precision' : pre,
    'recall' : rec,
    'f1' : f1,
    'aucroc' : aucroc
}

# Gradient Boosting

In [39]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

## Bagging
base = DecisionTreeClassifier(max_depth=None, criterion='entropy', random_state=1)
bag = BaggingClassifier(estimator=base, n_estimators=100, bootstrap=True, 
                        bootstrap_features=True,max_samples=20,  max_features=3,n_jobs=-1,random_state=1)


## Ada Boost
tree = DecisionTreeClassifier(max_depth=1, random_state=1, criterion='entropy')
ada = AdaBoostClassifier(estimator=tree, learning_rate=.01, random_state=1, n_estimators=100)


## Xgboost
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False, n_estimators=100)
pipe_xgb = Pipeline([
    ('scl', StandardScaler()), 
    ('pca', PCA(n_components=.9)),
    ('clf', xgb)
])

## Light GBM
lgb = LGBMClassifier(n_estimators=100)
pipe_lgb = Pipeline([
    ('scl', StandardScaler()), 
    ('pca', PCA(n_components=.9)),
    ('clf', lgb)
])
## Cat Boost
cat = CatBoostClassifier(verbose=0, n_estimators=100)
pipe_cat = Pipeline([
    ('scl', StandardScaler()), 
    ('pca', PCA(n_components=.9)),
    ('clf', cat)
])

models = [('BaggingClassifier', bag), ('AdaBoostClassifier', ada), ('AdaBoost', ada),('XGBClassifier', pipe_xgb), ('LGBMClassifier', pipe_lgb), ('CatBoostClassifier', pipe_cat)]

# Cross validation

In [40]:
from sklearn.model_selection import cross_validate

result = []

for (name, pipe) in models:
    cv = cross_validate(
        estimator=pipe,
        X=X_train,
        y=y_train,
        scoring=scoring,
        cv=10,
        n_jobs=-1
    )
    
    ACC=cv['test_accuracy']
    PRE=cv['test_precision']
    REC=cv['test_recall']
    F1=cv['test_f1']
    AUCROC=cv['test_aucroc']
    
    re = {
        'model' : name,
        'accuracy' : f'{np.mean(ACC):6.3f}',
        'precision' : f'{np.mean(PRE):6.3f}',
        'recall' : f'{np.mean(REC):6.3f}',
        'f1' : f'{np.mean(F1):6.3f}',
        'aucroc' : f'{np.mean(AUCROC):6.3f}',
    }
    result.append(re)
    
pd.DataFrame(result).set_index('model')

Unnamed: 0_level_0,accuracy,precision,recall,f1,aucroc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BaggingClassifier,1.0,1.0,1.0,1.0,1.0
AdaBoostClassifier,1.0,1.0,1.0,1.0,1.0
AdaBoost,1.0,1.0,1.0,1.0,1.0
XGBClassifier,0.988,0.99,0.988,0.987,0.988
LGBMClassifier,0.988,0.99,0.988,0.987,0.988
CatBoostClassifier,0.988,0.99,0.988,0.987,0.988
