In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

from Convert import convert_to_dataframe, single_y_test_pred

In [2]:
iris = convert_to_dataframe(load_iris())

In [3]:
iris.sample(10)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,target
43,5.0,3.5,1.6,0.6,setosa
147,6.5,3.0,5.2,2.0,virginica
88,5.6,3.0,4.1,1.3,versicolor
101,5.8,2.7,5.1,1.9,virginica
76,6.8,2.8,4.8,1.4,versicolor
58,6.6,2.9,4.6,1.3,versicolor
103,6.3,2.9,5.6,1.8,virginica
74,6.4,2.9,4.3,1.3,versicolor
122,7.7,2.8,6.7,2.0,virginica
70,5.9,3.2,4.8,1.8,versicolor


In [4]:
X = iris.drop(['target'], axis=1)
Y = iris['target']
feature_names = iris.columns.values.tolist()[:-1]
class_names = Y.unique().tolist()
print(feature_names)
print(class_names)

['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
['setosa', 'versicolor', 'virginica']


In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)

In [6]:
x_train.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
88,5.6,3.0,4.1,1.3
97,6.2,2.9,4.3,1.3
6,4.6,3.4,1.4,0.3
79,5.7,2.6,3.5,1.0
133,6.3,2.8,5.1,1.5


In [7]:
clf = GradientBoostingClassifier()

In [8]:
clf = clf.fit(x_train, y_train)

In [9]:
y_pred = clf.predict(x_test)

In [10]:
print(single_y_test_pred(y_test, y_pred))

    index      target      y_pred
0     139   virginica   virginica
1     127   virginica   virginica
2     137   virginica   virginica
3      18      setosa      setosa
4      33      setosa      setosa
5      94  versicolor  versicolor
6      38      setosa      setosa
7      70  versicolor   virginica
8     135   virginica   virginica
9     131   virginica   virginica
10    102   virginica   virginica
11     19      setosa      setosa
12     86  versicolor  versicolor
13     92  versicolor  versicolor
14     59  versicolor  versicolor
15    129   virginica  versicolor
16     11      setosa      setosa
17     56  versicolor  versicolor
18     61  versicolor  versicolor
19    111   virginica   virginica
20    141   virginica   virginica
21     35      setosa      setosa
22    124   virginica   virginica
23    143   virginica   virginica
24     89  versicolor  versicolor
25      8      setosa      setosa
26      1      setosa      setosa
27    107   virginica   virginica
28    121   vi

In [11]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       0.88      0.88      0.88         8
   virginica       0.92      0.92      0.92        13

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



In [12]:
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred, labels=class_names))

Confusion matrix:
[[ 9  0  0]
 [ 0  7  1]
 [ 0  1 12]]


In [13]:
accuracy_test = metrics.accuracy_score(y_test, y_pred) * 100
accuracy_train = metrics.accuracy_score(y_train, clf.predict(x_train)) * 100

print(f"Accuracy: {round(accuracy_test, 2)}% on Test Data")
print(f"Accuracy: {round(accuracy_train, 2)}% on Training Data")

Accuracy: 93.33% on Test Data
Accuracy: 100.0% on Training Data


In [14]:
clf.score(x_test, y_test)

0.9333333333333333

In [15]:
feature_imp = pd.Series(clf.feature_importances_, index=feature_names)
feature_imp

sepallength    0.001894
sepalwidth     0.003297
petallength    0.210124
petalwidth     0.784686
dtype: float64