In [1]:
NAME = "Leonardo Passos Fida"

# Comparison between decision tree and random forest classification models

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn import metrics

## Data set used: iris data set

In [3]:
# Data download from sklearn
from sklearn.datasets import load_iris
data=load_iris().data
target=load_iris().target
df_data=pd.DataFrame(data,columns=['sepal_length','sepal_width','petal_length','petal_width'])
df_target=pd.DataFrame(target,columns=['target'])
df_merged=df_data.join(df_target)
df_merged['class']=df_merged['target'].apply(lambda x: "setosa" if x==0 else ("versicolor" if x==1 else "virginica"))
df_merged.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,class
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


## Choosing the independent (x) and dependent (y) variables:

In [4]:
X = data
y = target

## Performing a train/test split:

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## First model: Decision Tree Classifier

In [6]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [7]:
predictions = dtree.predict(X_test)

In [8]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      0.94      0.97        16
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.97        45
weighted avg       0.98      0.98      0.98        45



In [9]:
print(confusion_matrix(y_test,predictions))

[[18  0  0]
 [ 0 15  1]
 [ 0  0 11]]


In [10]:
true=0
false=0
for i in range(len(y_test)):
    if predictions[i]==y_test[i]:
        true=true+1
    else:
        false=false+1
print("right:",true,"wrong:",false)

right: 44 wrong: 1


## This model shows a fantastic precision. Applying it to the original data set to test the predictions:

In [11]:
df_testar=df_merged.drop('class',axis=1)
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
x_teste=[]
for i in range(len(df_testar)):
    x_teste.append(df_testar.drop('target',axis=1).iloc[i].values)
pred1=dtree.predict(x_teste)
df_testar['target_predicted']=pred1
df_testar

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,target_predicted
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,2
146,6.3,2.5,5.0,1.9,2,2
147,6.5,3.0,5.2,2.0,2,2
148,6.2,3.4,5.4,2.3,2,2


## Showing the prediction(s) that was(were) incorrect:

In [12]:
df_testar[df_testar['target']!=df_testar['target_predicted']]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,target_predicted
70,5.9,3.2,4.8,1.8,1,2


In [13]:
if len(df_testar[df_testar['target']!=df_testar['target_predicted']])==1:
       print(len(df_testar[df_testar['target']!=df_testar['target_predicted']]),'value predicted wrong in the dataset')
else:
    print(len(df_testar[df_testar['target']!=df_testar['target_predicted']]),'values predicted wrong in the dataset')

1 value predicted wrong in the dataset


In [14]:
print('Accuracy:',len(df_testar[df_testar['target']==df_testar['target_predicted']])/(len(df_testar[df_testar['target']!=df_testar['target_predicted']])+len(df_testar[df_testar['target']==df_testar['target_predicted']])))

Accuracy: 0.9933333333333333


## Testing with a Random Forest Classifier model:

In [15]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [16]:
rfc_pred = rfc.predict(X_test)

In [17]:
print(confusion_matrix(y_test,rfc_pred))

[[18  0  0]
 [ 0 15  1]
 [ 0  0 11]]


In [18]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      0.94      0.97        16
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.97        45
weighted avg       0.98      0.98      0.98        45



In [19]:
true=0
false=0
for i in range(len(y_test)):
    if rfc_pred[i]==y_test[i]:
        true=true+1
    else:
        false=false+1
print("right:",true,"wrong:",false)

right: 44 wrong: 1


## We can also observe a very good precision. Applying it to the original data set to test the predictions:

In [20]:
df_testar=df_merged.drop('class',axis=1)
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
x_teste=[]
for i in range(len(df_testar)):
    x_teste.append(df_testar.drop('target',axis=1).iloc[i].values)
pred1=rfc.predict(x_teste)
df_testar['target_predicted']=pred1
df_testar

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,target_predicted
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,2
146,6.3,2.5,5.0,1.9,2,2
147,6.5,3.0,5.2,2.0,2,2
148,6.2,3.4,5.4,2.3,2,2


## Showing the prediction(s) that was(were) incorrect:

In [21]:
df_testar[df_testar['target']!=df_testar['target_predicted']]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,target_predicted
70,5.9,3.2,4.8,1.8,1,2
134,6.1,2.6,5.6,1.4,2,1


In [22]:
if len(df_testar[df_testar['target']!=df_testar['target_predicted']])==1:
       print(len(df_testar[df_testar['target']!=df_testar['target_predicted']]),'value predicted wrong in the dataset')
else:
    print(len(df_testar[df_testar['target']!=df_testar['target_predicted']]),'values predicted wrong in the dataset')

2 values predicted wrong in the dataset


In [23]:
print('Accuracy:',len(df_testar[df_testar['target']==df_testar['target_predicted']])/(len(df_testar[df_testar['target']!=df_testar['target_predicted']])+len(df_testar[df_testar['target']==df_testar['target_predicted']])))

Accuracy: 0.9866666666666667


## Both models showed very good precision on the iris data set