In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
def entropy(a):
    """Shannon entropy, in bits, of a 1-D array of class counts.

    Parameters
    ----------
    a : array-like of shape (n_classes,)
        Count of samples per class.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` with ``p = a / a.sum()``. Classes with a zero
        count contribute nothing (the limit of ``p * log2(p)`` as ``p -> 0``
        is 0), and an empty or all-zero array yields 0.0.

    Raises
    ------
    AssertionError
        If ``a`` is not one-dimensional.
    """
    a = np.asarray(a, dtype=float)
    assert a.ndim == 1, "should have exactly 1 row"

    total = a.sum()
    if total == 0:
        # No samples at all: treat the node as pure instead of dividing by zero.
        return np.float64(0.0)

    probs = a / total
    # Drop zero-probability classes; 0 * log2(0) would otherwise evaluate to NaN.
    probs = probs[probs > 0]

    # Adding 0.0 turns a possible -0.0 (single-class node) into +0.0.
    return -np.sum(probs * np.log2(probs)) + 0.0

In [3]:
def gini_index(a):
    """Gini impurity of a 1-D array of class counts.

    Parameters
    ----------
    a : array-like of shape (n_classes,)
        Count of samples per class.

    Returns
    -------
    numpy.float64
        ``1 - sum(p ** 2)`` with ``p = a / a.sum()``. An empty or all-zero
        array yields 0.0.

    Raises
    ------
    AssertionError
        If ``a`` is not one-dimensional.
    """
    a = np.asarray(a, dtype=float)
    assert a.ndim == 1, "should have exactly 1 row"

    total = a.sum()
    if total == 0:
        # No samples at all: treat the node as pure instead of dividing by zero.
        return np.float64(0.0)

    probs = a / total
    # np.square works on every NumPy version; np.pow only exists from NumPy 2.0.
    return 1.0 - np.sum(np.square(probs))

In [4]:
def information_gain(a, b, c, fn):
    """Impurity reduction obtained by splitting node ``a`` into ``b`` and ``c``.

    Parameters
    ----------
    a : array of counts for the parent node.
    b, c : arrays of counts for the two child nodes.
    fn : callable
        Impurity measure applied to each array of counts
        (for example ``entropy`` or ``gini_index``).

    Returns
    -------
    ``fn(a)`` minus the size-weighted sum of ``fn(b)`` and ``fn(c)``.

    Raises
    ------
    AssertionError
        If the child totals do not add up to the parent total.
    """
    parent_total, left_total, right_total = a.sum(), b.sum(), c.sum()

    assert parent_total == left_total + right_total, "all the samples should be present in either of the split"

    parent_impurity = fn(a)
    # Each child's impurity is weighted by its share of the parent's samples.
    left_term = (left_total / parent_total) * fn(b)
    right_term = (right_total / parent_total) * fn(c)

    return parent_impurity - (left_term + right_term)

In [5]:
# Worked example: entropy-based gain for a [16, 14] node split into [12, 1] and [4, 13].
information_gain(
    np.array([16, 14]),  # parent counts
    np.array([12, 1]),   # first child counts
    np.array([4, 13]),   # second child counts
    entropy,
)

np.float64(0.38121435556157335)

In [6]:
# Same split as the entropy example, scored with Gini impurity instead.
information_gain(
    np.array([16, 14]),  # parent counts
    np.array([12, 1]),   # first child counts
    np.array([4, 13]),   # second child counts
    gini_index,
)

np.float64(0.23231774761186524)

# Decision trees on the Titanic dataset

In [7]:
# Load the passenger table; the path is relative to the notebook's working directory.
df = pd.read_csv(
    '../data/Titanic-Dataset.csv',
)
# Preview the first five rows.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Cleaning data

In [8]:
# Most Cabin values are missing, so the column is reduced to a presence flag.
print("Ratio of NaN in Cabin column ", int(df["Cabin"].isna().sum()) / len(df))
# 1 when a cabin was recorded, 0 when it was missing.
# NOTE: Cabin is overwritten in place, so re-running this cell would set every row to 1.
df['Cabin'] = df['Cabin'].notna().astype('int64')

Ratio of NaN in Cabin column  0.7710437710437711


In [13]:
# Encoding categorical data
# Each encoder is kept so the integer codes can be mapped back to labels
# later via `inverse_transform` / `classes_`.
sex_le = LabelEncoder()
df['Sex'] = sex_le.fit_transform(df['Sex'])

# Fill missing ports with the most frequent one before encoding.
# NOTE(review): this assumes Embarked may contain NaN (confirm against the CSV).
# Unfilled NaN would either make LabelEncoder fail or become a category of its
# own, depending on the scikit-learn version. The fill is a no-op when nothing
# is missing.
embarked_le = LabelEncoder()
df['Embarked'] = embarked_le.fit_transform(
    df['Embarked'].fillna(df['Embarked'].mode().iloc[0])
)

## Splitting data

In [21]:
# Target column and the feature columns fed to the model.
label_col = 'Survived'
feat_cols = [
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Cabin',
    'Embarked',
]

# Label vector and feature matrix taken from the cleaned frame.
y = df[label_col]
X = df[feat_cols]

In [22]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# shuffled split, and everything computed from it, reproducible across
# kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [23]:
# Show the training features together with their labels.
(X_train, y_train)

(     Pclass  Sex  SibSp  Parch  Cabin  Embarked
 645       1    1      1      0      1         0
 680       3    0      0      0      0         1
 296       3    1      0      0      0         0
 548       3    1      1      1      0         2
 639       3    1      1      0      0         2
 ..      ...  ...    ...    ...    ...       ...
 771       3    1      0      0      0         2
 390       1    1      1      2      1         2
 888       3    0      1      2      0         2
 738       3    1      0      0      0         2
 791       2    1      0      0      0         2
 
 [712 rows x 6 columns],
 645    1
 680    0
 296    0
 548    0
 639    0
       ..
 771    0
 390    1
 888    0
 738    0
 791    0
 Name: Survived, Length: 712, dtype: int64)

## Training and inference