In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("../datasets/titanic/train.csv")

In [4]:
data.head() # Head gets first five items in the list

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
mod = data.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis=1)

In [7]:
mod.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594
std,0.486592,0.836071,14.526497,1.102743,0.806057
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,38.0,1.0,0.0
max,1.0,3.0,80.0,8.0,6.0


In [8]:
mod["Sex"] = LabelEncoder().fit_transform(mod["Sex"])

In [9]:
mod.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,1,22.0,1,0
1,1,1,0,38.0,1,0
2,1,3,0,26.0,0,0
3,1,1,0,35.0,1,0
4,0,3,1,35.0,0,0


In [10]:
mod.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
count,891.0,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594
std,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057
min,0.0,1.0,0.0,0.42,0.0,0.0
25%,0.0,2.0,0.0,20.125,0.0,0.0
50%,0.0,3.0,1.0,28.0,0.0,0.0
75%,1.0,3.0,1.0,38.0,1.0,0.0
max,1.0,3.0,1.0,80.0,8.0,6.0


In [11]:
processed = mod.dropna() # dropna drops all Nan Values

In [12]:
table = processed.groupby(["Pclass", "Sex"]).mean()[["Age"]].reset_index()

In [13]:
blank = mod.loc[mod["Age"].isna()]

In [14]:
row = blank.iloc[0]

table.loc[(table["Pclass"]==row["Pclass"]) & (table["Sex"]==row["Sex"])].describe().loc["mean",
                                                                                        "Age"]

26.507588932806325

In [15]:
def fillAge(row):
    row["Age"] = table.loc[(table["Pclass"]==row["Pclass"]) & (table["Sex"]==row["Sex"])].describe().loc["mean", "Age"]
  
    return row

In [16]:
blank_mod = blank.apply(fillAge, axis=1)

In [18]:
total = pd.concat([processed, blank_mod])

In [19]:
X = total.drop(["Survived"], axis=1)

In [20]:
y = total["Survived"]

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [24]:
tree = DecisionTreeClassifier()

In [25]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [27]:
tree.score(X_test, y_test)

0.7864406779661017

In [32]:
def entropy(data):
    items, count = np.unique(data, return_counts=True)
    p = count/len(data)
    return np.sum(-p * np.log(p))

In [34]:
entropy(y)

0.6659119735267652

In [31]:
def info_gain(X, y, label):
    
    x_col = X[label]
    pivot = np.mean(x_col)
    
    left_index = x_col < pivot
    right_index = x_col >= pivot
    
    y_left, y_right = y[left_index], y[right_index]
    
    ent = entropy(y)
    left_ent = entropy(y_left)
    right_ent = entropy(y_right)
    
    p_left = len(y_left)/len(y)
    p_right = 1 - p_left
    
    return ent - p_left*left_ent - p_right*right_ent, pivot

In [29]:
y[:10][X[:10]["Age"] < X[:10]["Age"].mean()]

0     0.0
7     0.0
9     1.0
10    1.0
Name: Survived, dtype: float64

In [33]:
for col in X.columns:
    
    print(col, info_gain(X, y, col))

Pclass (0.05253613916173083, 2.308641975308642)
Sex (0.15087048925218172, 0.6475869809203143)
Age (0.0007663481843047659, 29.318642716644153)
SibSp (0.006643498134917131, 0.5230078563411896)
Parch (0.010661126611803073, 0.38159371492704824)
