# Decision Tree from Scratch

This notebook compares the performance of a custom decision-tree implementation with scikit-learn's `DecisionTreeClassifier` on the iris dataset.

# 1- Importing key Modules

In [1]:
# Support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings;
# consider narrowing it to specific warning categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import random
from pprint import pprint
# For visuals
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling.
sns.set()
from matplotlib import rcParams
# Default figure size as (width, height).
rcParams['figure.figsize'] = 11, 8
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Display matplotlib figures inside the notebook.
%matplotlib inline

In [3]:
# helper file i.e .py
import decision_tree_algorithm

# 2- Load and prepare data

In [32]:
# Read the iris data and rename the target column "species" to "label"
# in a single chained expression, then show the frame's dimensions.
df = pd.read_csv('iris.csv').rename(columns={"species": "label"})
df.shape

(150, 5)

In [33]:
# Preview the first five rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [34]:
# Summarize column dtypes, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   label         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [35]:
# Count the missing values in each column.
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
label           0
dtype: int64

### 2.1. Train-test split

In [36]:
# Load the local helper module and its train_test_split function.
# NOTE(review): the next cell defines its own train_test_split, which replaces
# the name imported here.
import decision_tree_algorithm
from decision_tree_algorithm import train_test_split

In [37]:
def train_test_split(df, test_size):
    """Randomly split a DataFrame into train and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Rows are selected by index label, so the index
        should be unique.
    test_size : int or float
        An int is the absolute number of test rows; a float is the fraction
        of ``len(df)`` to hold out, rounded to the nearest whole row.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        ``test_df`` holds the sampled rows in sampled order; ``train_df``
        holds the remaining rows in their original order.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when the resolved test size is negative
        or larger than the number of rows.

    Notes
    -----
    Sampling uses the global ``random`` module state; call
    ``random.seed(...)`` beforehand for a reproducible split.
    """
    if isinstance(test_size, float):
        # round() returns a float on Python 2, and random.sample requires an
        # integer k, so cast explicitly to keep both interpreters working.
        test_size = int(round(test_size * len(df)))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)

    return train_df, test_df

In [38]:
# Fix the RNG seed so the 20-row hold-out is the same on every run,
# then report the dimensions of both partitions, one per line.
random.seed(0)
train_df, test_df = train_test_split(df, test_size=20)
print(train_df.shape, test_df.shape, sep="\n")

(130, 5)
(20, 5)


In [39]:
# Peek at the first rows of the training partition.
train_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [40]:
# Peek at the first rows of the test partition (rows appear in sampled order).
test_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
98,5.1,2.5,3.0,1.1,versicolor
107,7.3,2.9,6.3,1.8,virginica
10,5.4,3.7,1.5,0.2,setosa
66,5.6,3.0,4.5,1.5,versicolor
130,7.4,2.8,6.1,1.9,virginica


# 3- Model using sklearn

In [41]:
# Re-display the full dataset before building the feature/target split for sklearn.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [42]:
# Feature matrix: the four measurement columns. Target vector: the label column.
X = df.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df.loc[:, 'label']

In [43]:
# Import scikit-learn's splitter under an alias so it does not shadow the
# custom train_test_split defined earlier in this notebook.
from sklearn.model_selection import train_test_split as sk_train_test_split

# Hold out 20% of the rows; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = sk_train_test_split(X, y, test_size=0.20, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 4)
(30, 4)
(120,)
(30,)


In [44]:
# Import the estimator class directly: binding the name `tree` to the sklearn
# module would collide with the `tree` variable a later cell uses for the
# custom decision tree.
from sklearn.tree import DecisionTreeClassifier

# Entropy criterion, at least 3 samples required to split a node, and class
# weights balanced. random_state pins scikit-learn's internal feature
# shuffling so repeated fits produce the same tree.
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=3,
    class_weight='balanced',
    random_state=0,
)

In [45]:
# fit() trains the estimator in place and returns that same estimator, so no
# reassignment is needed; the trailing semicolon keeps the cell output empty.
dt_clf.fit(X_train, y_train);

In [46]:
# Predict a label for every row of the held-out test features.
y_pred=dt_clf.predict(X_test)

In [47]:
# Show the first five predictions for a quick visual comparison with y_test.
print(y_pred[:5])

['virginica' 'versicolor' 'setosa' 'virginica' 'setosa']


In [48]:
# Show the first five true labels (positional slice) next to the predictions above.
print(y_test.iloc[:5])

114     virginica
62     versicolor
33         setosa
107     virginica
7          setosa
Name: label, dtype: object


In [49]:
from sklearn.metrics import accuracy_score
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
accuracy_score(y_test, y_pred)

0.9666666666666667

# 4- Using custom model


The functions used below are imported from the helper module `decision_tree_algorithm.py`.

In [50]:
# Bring in the helper module and the main functions defined in its .py file.
# The names are bound in the order listed, so `decision_tree_algorithm` ends up
# referring to the function rather than the module.
import decision_tree_algorithm
from decision_tree_algorithm import (
    train_test_split,
    check_purity,
    classify_data,
    get_potential_splits,
    calculate_entropy,
    calculate_overall_entropy,
    split_data,
    determine_best_split,
    determine_type_of_feature,
    decision_tree_algorithm,
    classify_example,
    calculate_accuracy,
)

In [60]:
# Seed the global RNG so the random split, and therefore the learned tree and
# its accuracy, are the same on every run.
# NOTE(review): assumes the imported train_test_split samples with the `random`
# module, as the inline definition in this notebook does; confirm in the .py file.
random.seed(0)

# Hold out 20% of the rows, grow a tree of depth at most 3 on the rest, and
# score it on the held-out rows.
train_df, test_df = train_test_split(df, test_size=0.2)
tree = decision_tree_algorithm(train_df, max_depth=3)
accuracy = calculate_accuracy(test_df, tree)

In [61]:
# The custom tree is a nested dict: each key is a split question and its value
# is a two-item list of sub-trees or leaf labels (presumably
# [branch if true, branch if false]; confirm in decision_tree_algorithm.py).
print(tree)

{'petal_width <= 0.6': ['setosa', {'petal_length <= 4.8': [{'petal_width <= 1.6': ['versicolor', 'virginica']}, 'virginica']}]}


In [62]:
# Show the value calculate_accuracy returned for the custom tree on the test split.
print(accuracy)

0.9
