HGBDT vs GBDT:
Both are working by training multiple DTs and combining them.
GBDT used gradient based optimization algo, while HGBDT uses histogram based optimization algo (essentially numerical, discrete)
GBDT is sensitive to scale of I/P features.

Therefore, HGBDT is more robust, and is ideal for handling very large datasets.

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

from Convert import convert_to_dataframe, single_y_test_pred

In [2]:
iris = convert_to_dataframe(load_iris())

In [3]:
iris.sample(10)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,target
2,4.7,3.2,1.3,0.2,setosa
26,5.0,3.4,1.6,0.4,setosa
30,4.8,3.1,1.6,0.2,setosa
87,6.3,2.3,4.4,1.3,versicolor
64,5.6,2.9,3.6,1.3,versicolor
81,5.5,2.4,3.7,1.0,versicolor
79,5.7,2.6,3.5,1.0,versicolor
112,6.8,3.0,5.5,2.1,virginica
127,6.1,3.0,4.9,1.8,virginica
136,6.3,3.4,5.6,2.4,virginica


In [4]:
X = iris.drop(['target'], axis=1)
Y = iris['target']
feature_names = iris.columns.values.tolist()[:-1]
class_names = Y.unique().tolist()
print(feature_names)
print(class_names)

['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
['setosa', 'versicolor', 'virginica']


In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)

In [6]:
x_train.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
77,6.7,3.0,5.0,1.7
67,5.8,2.7,4.1,1.0
128,6.4,2.8,5.6,2.1
25,5.0,3.0,1.6,0.2
133,6.3,2.8,5.1,1.5


In [7]:
clf = HistGradientBoostingClassifier()

In [8]:
clf = clf.fit(x_train, y_train)

In [9]:
y_pred = clf.predict(x_test)

In [10]:
print(single_y_test_pred(y_test, y_pred))

    index      target      y_pred
0     144   virginica   virginica
1      79  versicolor  versicolor
2      71  versicolor  versicolor
3      84  versicolor  versicolor
4      59  versicolor  versicolor
5      75  versicolor  versicolor
6     146   virginica   virginica
7     121   virginica   virginica
8      40      setosa      setosa
9     124   virginica   virginica
10     10      setosa      setosa
11     45      setosa      setosa
12    135   virginica   virginica
13      7      setosa      setosa
14     17      setosa      setosa
15    141   virginica   virginica
16     23      setosa      setosa
17     85  versicolor  versicolor
18     50  versicolor  versicolor
19     32      setosa      setosa
20     27      setosa      setosa
21     26      setosa      setosa
22    108   virginica   virginica
23    118   virginica   virginica
24     63  versicolor  versicolor
25     41      setosa      setosa
26     20      setosa      setosa
27     22      setosa      setosa
28    148   vi

In [11]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        13
  versicolor       1.00      1.00      1.00         8
   virginica       1.00      1.00      1.00         9

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [12]:
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred, labels=class_names))

Confusion matrix:
[[13  0  0]
 [ 0  8  0]
 [ 0  0  9]]


In [13]:
accuracy_test = metrics.accuracy_score(y_test, y_pred) * 100
accuracy_train = metrics.accuracy_score(y_train, clf.predict(x_train)) * 100

print(f"Accuracy: {round(accuracy_test, 2)}% on Test Data")
print(f"Accuracy: {round(accuracy_train, 2)}% on Training Data")

Accuracy: 100.0% on Test Data
Accuracy: 100.0% on Training Data


In [14]:
clf.score(x_test, y_test)

1.0