# DecisionTree Classifier
This notebooks presents the decision tree classifier implemented in this repo.

In [1]:
import numpy as np
import pandas as pd

In [2]:
from MyDecisionTree import build_tree

## Datasets
Some standard datasets are loaded in order to test the algorithm.

In [3]:
iris_df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
mpg_df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv')
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [5]:
flights_df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/flights.csv')
flights_df.head()

Unnamed: 0,year,month,passengers
0,1949,January,112
1,1949,February,118
2,1949,March,132
3,1949,April,129
4,1949,May,121


In [6]:
# Here I perform the shuffling since some datasets are arranged by labels

iris_df = iris_df.sample(frac=1).reset_index(drop=True)
mpg_df = mpg_df.sample(frac=1).reset_index(drop=True)[:150]
flights_df = flights_df.sample(frac=1).reset_index(drop=True)

## Splitting Train and Test sets


In [7]:
train_test_ratio = 0.8

In [8]:
n_train_iris = int(train_test_ratio*len(iris_df))
train_iris_df, test_iris_df = iris_df[:n_train_iris], iris_df[n_train_iris:]

print(f"N iris samples for train: {len(train_iris_df)}; N iris samples for testing: {len(test_iris_df)}")

N iris samples for train: 120; N iris samples for testing: 30


In [9]:
n_train_mpg = int(train_test_ratio*len(mpg_df))
train_mpg_df, test_mpg_df = mpg_df[:n_train_mpg], mpg_df[n_train_mpg:]

print(f"N mpg samples for train: {len(train_mpg_df)}; N mpg samples for testing: {len(test_mpg_df)}")

N mpg samples for train: 120; N mpg samples for testing: 30


In [10]:
n_train_flights = int(train_test_ratio*len(flights_df))
train_flights_df, test_flights_df = flights_df[:n_train_flights], flights_df[n_train_flights:]

print(f"N flights samples for train: {len(train_flights_df)}; N flights samples for testing: {len(test_flights_df)}")

N flights samples for train: 115; N flights samples for testing: 29


In [11]:
iris_label, mpg_label, flights_label = "species", "mpg", "passengers"

In [12]:
iris_feature_cols = list(iris_df.columns)
iris_feature_cols.remove(iris_label)

mpg_feature_cols = list(train_mpg_df.columns)
mpg_feature_cols.remove(mpg_label)

flights_feature_cols = list(flights_df.columns)
flights_feature_cols.remove(flights_label)

## Building The Tree
The module is very straightforward to use: it is sufficient to call the `build_tree` function, and to pass a dataframe containing the dataset as a first argument and the name of the column that contains the labels as a second argument. The function will return a `Node` object that is the root of the tree.

In [13]:
%%time
iris_tree = build_tree(train_iris_df, iris_label)

CPU times: user 2.39 s, sys: 13.4 ms, total: 2.41 s
Wall time: 2.39 s


In [14]:
%%time
mpg_tree = build_tree(train_mpg_df, mpg_label)

CPU times: user 33.5 s, sys: 21.8 ms, total: 33.5 s
Wall time: 33.5 s


In [15]:
%%time
flights_tree = build_tree(train_flights_df, flights_label)

CPU times: user 5.75 s, sys: 7.88 ms, total: 5.75 s
Wall time: 5.75 s


## Making Predictions on Training Set
In order to make predictions, once a tree has been built, it is sufficient to call the `predict` method, passing as argument a dataframe containing the rows one wants to get predictions for. For each input row, the method will return the predicted class as well as the value of gini impurity associated with the node that led to that prediction.

In [16]:
train_iris_df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.4,3.2,1.3,0.2,setosa
1,5.7,2.6,3.5,1.0,versicolor
2,4.6,3.1,1.5,0.2,setosa
3,7.0,3.2,4.7,1.4,versicolor
4,6.7,3.1,4.4,1.4,versicolor


In [17]:
iris_tree.predict(train_iris_df[iris_feature_cols].head(5))

[('setosa', 0.0),
 ('versicolor', 0.0),
 ('setosa', 0.0),
 ('versicolor', 0.0),
 ('versicolor', 0.0)]

In [18]:
train_mpg_df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,19.2,8,267.0,125.0,3605,15.0,79,usa,chevrolet malibu classic (sw)
1,19.2,8,305.0,145.0,3425,13.2,78,usa,chevrolet monte carlo landau
2,18.0,6,225.0,95.0,3785,19.0,75,usa,plymouth fury
3,21.0,4,122.0,86.0,2226,16.5,72,usa,ford pinto runabout
4,31.0,4,112.0,85.0,2575,16.2,82,usa,pontiac j2000 se hatchback


In [19]:
mpg_tree.predict(train_mpg_df[mpg_feature_cols].head(5))

[(19.2, 0.0), (19.2, 0.0), (18.0, 0.0), (21.0, 0.0), (31.0, 0.0)]

In [20]:
train_flights_df.head(5)

Unnamed: 0,year,month,passengers
0,1952,May,183
1,1950,August,170
2,1951,June,178
3,1959,December,405
4,1957,June,422


In [21]:
flights_tree.predict(flights_df[flights_feature_cols].head(5))

[(183, 0.0), (170, 0.0), (178, 0.0), (405, 0.0), (422, 0.0)]

## Making Predictions on the Test Set

In [22]:
test_iris_df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
120,5.0,3.5,1.6,0.6,setosa
121,5.7,2.9,4.2,1.3,versicolor
122,6.6,3.0,4.4,1.4,versicolor
123,5.5,2.6,4.4,1.2,versicolor
124,4.6,3.4,1.4,0.3,setosa


In [23]:
iris_tree.predict(test_iris_df[iris_feature_cols].head(5))

[('virginica', 0.0),
 ('versicolor', 0.0),
 ('versicolor', 0.0),
 ('versicolor', 0.0),
 ('setosa', 0.0)]

In [24]:
test_mpg_df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
120,23.2,4,156.0,105.0,2745,16.7,78,usa,plymouth sapporo
121,28.0,4,116.0,90.0,2123,14.0,71,europe,opel 1900
122,17.0,8,260.0,110.0,4060,19.0,77,usa,oldsmobile cutlass supreme
123,26.0,4,97.0,78.0,2300,14.5,74,europe,opel manta
124,25.5,4,140.0,89.0,2755,15.8,77,usa,ford mustang ii 2+2


In [25]:
mpg_tree.predict(test_mpg_df[mpg_feature_cols].head(5))

[(23.8, 0.0), (28.0, 0.0), (13.0, 0.0), (31.0, 0.0), (24.5, 0.0)]

In [26]:
test_flights_df.head(5)

Unnamed: 0,year,month,passengers
115,1949,June,135
116,1952,January,171
117,1957,September,404
118,1954,May,234
119,1960,March,419


In [27]:
flights_tree.predict(test_flights_df[flights_feature_cols].head(5))

[(118, 0.0), (242, 0.0), (463, 0.0), (229, 0.0), (406, 0.0)]

## Evaluating Accuracy on Train and Test Set

In [28]:
iris_train_predictions = np.array(iris_tree.predict(train_iris_df[iris_feature_cols]))
iris_train_accuracy = sum(iris_train_predictions[:, 0] == train_iris_df[iris_label])/len(train_iris_df)

print(f"Iris train accuracy: {round(iris_train_accuracy*100, 3)}%")

Iris train accuracy: 100.0%


In [29]:
iris_test_predictions = np.array(iris_tree.predict(test_iris_df[iris_feature_cols]))
iris_test_accuracy = sum(iris_test_predictions[:, 0] == test_iris_df[iris_label])/len(test_iris_df)

print(f"Iris test accuracy: {round(iris_test_accuracy*100, 3)}%")

Iris test accuracy: 86.667%


In [30]:
mpg_train_predictions = np.array(mpg_tree.predict(train_mpg_df[mpg_feature_cols]))
mpg_train_error = sum(mpg_train_predictions[:, 0] - train_mpg_df[mpg_label])**2/len(train_mpg_df)

print(f"MPG train error (MSE): {round(mpg_train_error, 3)}")

MPG train error (MSE): 0.0


In [31]:
mpg_test_predictions = np.array(mpg_tree.predict(test_mpg_df[mpg_feature_cols]))
mpg_test_error = sum(mpg_test_predictions[:, 0] - test_mpg_df[mpg_label])**2/len(test_mpg_df)

print(f"MPG test error (MSE): {round(mpg_test_error, 3)}")

MPG test error (MSE): 126.075


In [32]:
flights_train_predictions = np.array(flights_tree.predict(train_flights_df[flights_feature_cols]))
flights_train_error = sum(flights_train_predictions[:, 0] - train_flights_df[flights_label])**2/len(train_flights_df)

print(f"Flights train error (MSE): {round(flights_train_error, 3)}")

Flights train error (MSE): 0.0


In [33]:
flights_test_predictions = np.array(flights_tree.predict(test_flights_df[flights_feature_cols]))
flights_test_error = sum(flights_test_predictions[:, 0] - test_flights_df[flights_label])**2/len(test_flights_df)

print(f"Flights test error (MSE): {round(flights_test_error, 3)}")

Flights test error (MSE): 31978.241


## Visualising The Trees
It is possible to have a visual representation of the tree by calling the `print_tree` method. For each non-leaf node, an attribute and a value are specified. These values are the ones selected by the algorithm to split the data, and, in particular, the left child of each node represents the data for which the value "is attribute == value" is True, and the right child represents the data for which that value is false.

In [34]:
iris_tree.print_tree()

 (R) attribute: petal_width value: 0.2 gini: 0.6640277777777779
       (l) gini: 0.0 decision: setosa (Leaf)
       (r) attribute: petal_width value: 0.4 gini: 0.6248121170077466
             (l) gini: 0.0 decision: setosa (Leaf)
             (r) attribute: petal_width value: 0.1 gini: 0.5849107625743646
                   (l) gini: 0.0 decision: setosa (Leaf)
                   (r) attribute: petal_width value: 0.3 gini: 0.541990550221003
                         (l) gini: 0.0 decision: setosa (Leaf)
                         (r) attribute: petal_width value: 1.3 gini: 0.49586776859504145
                               (l) gini: 0.0 decision: versicolor (Leaf)
                               (r) attribute: petal_width value: 1.0 gini: 0.4763705103969754
                                     (l) gini: 0.0 decision: versicolor (Leaf)
                                     (r) attribute: petal_width value: 1.5 gini: 0.4370447450572321
                                           (l) attribute: 

In [35]:
mpg_tree.print_tree()

 (R) attribute: cylinders value: 8 gini: 0.974166666666666
       (l) attribute: model_year value: 70 gini: 0.8750000000000002
             (l) attribute: displacement value: 304.0 gini: 0.7199999999999999
                   (l) gini: 0.0 decision: 9.0 (Leaf)
                   (r) attribute: displacement value: 307.0 gini: 0.625
                         (l) gini: 0.0 decision: 10.0 (Leaf)
                         (r) attribute: displacement value: 440.0 gini: 0.4444444444444444
                               (l) gini: 0.0 decision: 14.0 (Leaf)
                               (r) gini: 0.0 decision: 15.0 (Leaf)
             (r) attribute: horsepower value: 180.0 gini: 0.8511966701352757
                   (l) attribute: weight value: 3664 gini: 0.4444444444444444
                         (l) gini: 0.0 decision: 11.0 (Leaf)
                         (r) gini: 0.0 decision: 12.0 (Leaf)
                   (r) attribute: model_year value: 71 gini: 0.8418367346938774
                         

In [36]:
flights_tree.print_tree()

 (R) attribute: year value: 1949 gini: 0.9882797731569024
       (l) attribute: month value: November gini: 0.7222222222222221
             (l) gini: 0.0 decision: 104 (Leaf)
             (r) attribute: month value: March gini: 0.6399999999999999
                   (l) gini: 0.0 decision: 132 (Leaf)
                   (r) attribute: month value: July gini: 0.5
                         (l) gini: 0.0 decision: 148 (Leaf)
                         (r) attribute: month value: August gini: 0.4444444444444444
                               (l) gini: 0.0 decision: 148 (Leaf)
                               (r) gini: 0.0 decision: 118 (Leaf)
       (r) attribute: year value: 1951 gini: 0.9877956400976363
             (l) attribute: month value: January gini: 0.8599999999999999
                   (l) gini: 0.0 decision: 145 (Leaf)
                   (r) attribute: month value: November gini: 0.8395061728395059
                         (l) gini: 0.0 decision: 146 (Leaf)
                         (r