# Pruning of the C4.5 algorithm with the Titanic dataset
This notebook fits a C4.5 decision tree on the Titanic dataset, using the features Pclass, Sex and Age, and then prunes it with reduced error pruning.

Additional packages necessary to run this notebook:
 - Pandas

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from decision_mining.core import c45
from decision_mining.core.dmn import rule_c45, rule, dmn_generation as dmn

## Loading data
Titanic dataset. We use the columns "Pclass", "Sex" and "Age" as input, and "Survived" as output.
- Pclass is the passenger class. This column contains the classes 1, 2 and 3.
- Sex is the gender listed for the passenger. This column contains the classes "male" and "female".
- Age is the age of the passenger. It is truncated to a whole number when the data is prepared below.
- Survived indicates whether the passenger survived the disaster. It contains the classes 1 (survived) and 0 (did not survive).

In [2]:
# Download only the four columns the notebook needs and order them
# features-first, with the target column last.
data = pd.read_csv(
    r"https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv",
    usecols=["Sex", "Pclass", "Survived", "Age"],
)[["Pclass", "Sex", "Age", "Survived"]]

# Target vector and feature matrix (feature columns: 0 = Pclass, 1 = Sex, 2 = Age).
y = data["Survived"].to_numpy()
X = data.drop(columns="Survived").to_numpy()

# Encode Sex as a boolean flag: True for "female", False otherwise.
X[:, 1] = X[:, 1] == "female"
# Cast the whole matrix to integers. This turns the Sex flag into 0/1 and also
# truncates fractional ages -- NOTE(review): presumably intended; confirm.
X = X.astype(int)

In [3]:
# Create the train/validation/test datasets based on the following ratios.
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Step 1: separate the training data from everything else (validation + test).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    random_state=42,
)

# Step 2: divide the hold-out part into validation and test. The test share is
# expressed relative to the hold-out size, not relative to the full dataset.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=42,
)


In [4]:
# Build the decision tree.
# Feature column 2 is Age; it is the only index passed as the classifier's
# first argument -- presumably the columns to treat as continuous (TODO confirm
# against C45Classifier).
age_column = np.array([2])
predictor = c45.C45Classifier(age_column, min_objs=1)
predictor.fit(X_train, y_train)

# Score of the unpruned tree on the test split (shown as the cell output).
predictor.score(X_test, y_test)

0.7078651685393258

Create DMN before pruning

In [5]:
# Column names handed to the DMN helpers: the three features followed by the target.
cols = [["Pclass", "Sex", "Age", "Survived"]]
drd_objects = dmn.create_node_objects(cols)
decision_nodes = dmn.create_dependencies(cols, drd_objects)

# Build the rules from the traversal of the current (unpruned) tree;
# [0, 1, 2] are the feature column indices of X.
tree_paths = c45.traverse_c45(predictor)
rules = rule_c45.make_c45_rules([0, 1, 2], tree_paths)
decision_nodes[0].rules = rules

# Write the resulting DMN document one directory above the notebook.
tree = dmn.create_xml(drd_objects, decision_nodes)
tree.write("../before_pruning.dmn")

<img src="./images/C4.5-Titanic_with_pruning/DesicionTable_before_pruning.png"/>

## Testing Pruning

In [6]:
# Reference values before pruning: the score on the validation split and the
# number of items yielded by traversing the tree.
original_score = predictor.score(X_val, y_val)
path = [*c45.traverse_c45(predictor)]
original_length = len(path)
(original_score, original_length)

(0.7894736842105263, 110)

In [7]:
# Prune the fitted tree using the validation split. The return value is not
# captured: the predictor itself is changed, as the re-scoring in the next cell
# shows. With verbose=True the call reports how many branches were pruned.
predictor.reduced_error_pruning(X_val, y_val, verbose=True)

305 amount of branches are pruned!


In [8]:
# Same two measurements as before pruning, now on the pruned tree.
# Note: the validation split was also used to drive the pruning, so this score
# is optimistic; the test split gives the independent estimate.
score = predictor.score(X_val, y_val)
path = [*c45.traverse_c45(predictor)]
length = len(path)
(score, length)

(0.8646616541353384, 43)

In [9]:
# Column names handed to the DMN helpers: the three features followed by the target.
cols = [["Pclass", "Sex", "Age", "Survived"]]
drd_objects = dmn.create_node_objects(cols)
decision_nodes = dmn.create_dependencies(cols, drd_objects)

# Build the rules from the traversal of the tree in its current state;
# [0, 1, 2] are the feature column indices of X.
tree_paths = c45.traverse_c45(predictor)
rules = rule_c45.make_c45_rules([0, 1, 2], tree_paths)
decision_nodes[0].rules = rules

# Write the resulting DMN document one directory above the notebook.
tree = dmn.create_xml(drd_objects, decision_nodes)
tree.write("../after_pruning.dmn")

<img src="./images/C4.5-Titanic_with_pruning/DesicionTable_after_pruning.png"/>

In [11]:
# Print the score of the current tree on each split, one per line,
# in the order: validation, train, test.
for features, labels in ((X_val, y_val), (X_train, y_train), (X_test, y_test)):
    print(predictor.score(features, labels))


0.8646616541353384
0.8721804511278195
0.7640449438202247
