In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from Convert import convert_to_dataframe, single_y_test_pred


In [3]:
iris = convert_to_dataframe(load_iris())

In [4]:
iris.sample(10)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,target
22,4.6,3.6,1.0,0.2,setosa
69,5.6,2.5,3.9,1.1,versicolor
110,6.5,3.2,5.1,2.0,virginica
108,6.7,2.5,5.8,1.8,virginica
68,6.2,2.2,4.5,1.5,versicolor
93,5.0,2.3,3.3,1.0,versicolor
59,5.2,2.7,3.9,1.4,versicolor
14,5.8,4.0,1.2,0.2,setosa
129,7.2,3.0,5.8,1.6,virginica
133,6.3,2.8,5.1,1.5,virginica


In [5]:
X = iris.drop(['target'], axis=1)
Y = iris['target']
feature_names = iris.columns.values.tolist()[:-1]
class_names = Y.unique().tolist()
print(feature_names)
print(class_names)

['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
['setosa', 'versicolor', 'virginica']


In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)

In [7]:
x_train.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
3,4.6,3.1,1.5,0.2
32,5.2,4.1,1.5,0.1
75,6.6,3.0,4.4,1.4
64,5.6,2.9,3.6,1.3
22,4.6,3.6,1.0,0.2


In [8]:
clf = RandomForestClassifier(n_estimators=5)

In [9]:
clf = clf.fit(x_train, y_train)

In [10]:
y_pred = clf.predict(x_test)

In [11]:
print(single_y_test_pred(y_test, y_pred))

    index      target      y_pred
0     109   virginica   virginica
1     116   virginica   virginica
2     139   virginica   virginica
3     142   virginica   virginica
4      88  versicolor  versicolor
5      15      setosa      setosa
6      80  versicolor  versicolor
7     121   virginica   virginica
8     130   virginica   virginica
9      21      setosa      setosa
10     63  versicolor  versicolor
11     54  versicolor  versicolor
12     60  versicolor  versicolor
13    133   virginica  versicolor
14    129   virginica   virginica
15    140   virginica   virginica
16      1      setosa      setosa
17     18      setosa      setosa
18     59  versicolor  versicolor
19     41      setosa      setosa
20      5      setosa      setosa
21    115   virginica   virginica
22      8      setosa      setosa
23    138   virginica   virginica
24     79  versicolor  versicolor
25    119   virginica  versicolor
26     11      setosa      setosa
27    145   virginica   virginica
28     82  ver

In [12]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         8
  versicolor       0.82      1.00      0.90         9
   virginica       1.00      0.85      0.92        13

    accuracy                           0.93        30
   macro avg       0.94      0.95      0.94        30
weighted avg       0.95      0.93      0.93        30



In [13]:
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred, labels=class_names))

Confusion matrix:
[[ 8  0  0]
 [ 0  9  0]
 [ 0  2 11]]


In [14]:
accuracy_test = metrics.accuracy_score(y_test, y_pred) * 100
accuracy_train = metrics.accuracy_score(y_train, clf.predict(x_train)) * 100

print(f"Accuracy: {round(accuracy_test, 2)}% on Test Data")
print(f"Accuracy: {round(accuracy_train, 2)}% on Training Data")

Accuracy: 93.33% on Test Data
Accuracy: 99.17% on Training Data


In [15]:
feature_imp = pd.Series(clf.feature_importances_, index=feature_names)
feature_imp

sepallength    0.201462
sepalwidth     0.040966
petallength    0.329642
petalwidth     0.427930
dtype: float64

In [16]:
clf.score(x_test, y_test)

0.9333333333333333

In [17]:
clf.estimators_

[DecisionTreeClassifier(max_features='sqrt', random_state=720993222),
 DecisionTreeClassifier(max_features='sqrt', random_state=1222894588),
 DecisionTreeClassifier(max_features='sqrt', random_state=209425721),
 DecisionTreeClassifier(max_features='sqrt', random_state=1920088220),
 DecisionTreeClassifier(max_features='sqrt', random_state=1709795138)]