## Decision Tree

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import load_iris


In [3]:
# Load the Iris data set bundled with scikit-learn.
# D holds the feature values (iris.data), y the class labels (iris.target).
iris = load_iris()
D = iris.data
y = iris.target

# Show the data set's own description text for reference.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [5]:
# Combine the feature array and the label array into a single pandas DataFrame.
# The feature columns keep their default integer names; the labels go in column 'y'.
df = pd.DataFrame(D).assign(y=y)
print(df)

       0    1    2    3  y
0    5.1  3.5  1.4  0.2  0
1    4.9  3.0  1.4  0.2  0
2    4.7  3.2  1.3  0.2  0
3    4.6  3.1  1.5  0.2  0
4    5.0  3.6  1.4  0.2  0
..   ...  ...  ...  ... ..
145  6.7  3.0  5.2  2.3  2
146  6.3  2.5  5.0  1.9  2
147  6.5  3.0  5.2  2.0  2
148  6.2  3.4  5.4  2.3  2
149  5.9  3.0  5.1  1.8  2

[150 rows x 5 columns]


In [6]:
# First attempt at computing the Gini impurity, counting labels by hand.
# Kept for reference only; impurity2 below does the same job with pandas.
def impurity(df):
    """Return the Gini impurity of the class labels stored in ``df['y']``.

    The Gini impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction
    of rows that carry label ``k``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a column named ``'y'`` holding one class label per row.

    Returns
    -------
    float
        The impurity: 0.0 when every row has the same label, larger when the
        labels are more mixed. For a frame with no rows no term is subtracted
        and the initial value ``1`` is returned unchanged.
    """
    n_rows = df.shape[0]

    # Tally how many rows carry each label (dict keeps first-seen order).
    counts = {}
    for label in df['y']:
        counts[label] = counts.get(label, 0) + 1

    # Subtract the squared class fractions one at a time, starting from 1.
    gini = 1
    for n_label in counts.values():
        gini -= (n_label / n_rows) ** 2

    return gini
    

In [7]:
# Second attempt at the Gini impurity, letting pandas do the label counting.
def impurity2(df):
    """Return the Gini impurity of the class labels stored in ``df['y']``.

    Computes ``1 - sum(p_k ** 2)``, where ``p_k`` is the number of rows with
    label ``k`` divided by the total number of rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a column named ``'y'`` holding one class label per row.

    Returns
    -------
    float
        0.0 for a frame whose rows all share one label; ``1`` (unchanged
        starting value) for a frame with no rows.

    Notes
    -----
    ``value_counts()`` skips missing labels by default, while the denominator
    is the full row count, so rows with a missing label only dilute the
    fractions.
    """
    n_rows = df.shape[0]
    gini = 1
    # Each iteration yields the row count of one distinct label.
    for class_count in df['y'].value_counts():
        gini -= (class_count / n_rows) ** 2
    return gini


0.6666666666666665


In [14]:
def cost(df, df0, df1):
    """Return the change in Gini impurity caused by splitting ``df``.

    The result is the size-weighted average impurity of the two children
    minus the impurity of the parent. A negative value therefore means the
    children are, on average, purer than the parent.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame before the split (the parent node).
    df0, df1 : pandas.DataFrame
        The two frames produced by the split (the child nodes).

    Returns
    -------
    float
        Weighted child impurity minus parent impurity.

    Raises
    ------
    ZeroDivisionError
        If ``df`` has no rows.
    """
    n_parent = df.shape[0]

    # Weight each child's impurity by its share of the parent's rows.
    term0 = impurity2(df0) * (df0.shape[0] / n_parent)
    term1 = impurity2(df1) * (df1.shape[0] / n_parent)

    return (term0 + term1) - impurity2(df)


## Exercise 1

In [18]:
# Gini impurity of the full, unsplit data set (the root node).
root_gini = impurity2(df)
print(f"the Gini impurity of the root node is {root_gini}")

the Gini impurity of the root node is 0.6666666666666665


## Exercise 2

In [21]:
# Split the data on sepal length (column 0) at its mean: `short` gets the rows
# at or below the mean, `long` the rows above it. The threshold is computed
# from the data rather than typed in as a rounded constant, so the code does
# what the description says.
sepal_length_mean = df[0].mean()
short = df[df[0] <= sepal_length_mean]
long = df[df[0] > sepal_length_mean]

print(f"The cost of splitting the dataset based on whether the sepal length is shorter or longer than the mean is: {cost(df, short, long)}")


The cost of splitting the dataset based on whether the sepal length is shorter or longer than the mean is: -0.17476190476190456
