## Titanic Decision Tree: Step-by-Step with Entropy/Information Gain\n",
##    "This notebook will guide you through importing the Titanic dataset from Kaggle, preprocessing it, and building a Decision Tree using entropy (information gain), displaying each step.

### using classic titanic dataset

In [38]:
import pandas as pd

# Load the dataset
df = pd.read_csv('../Datasets/train.csv')
df.head()




Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
import pandas as pd
import numpy as np
import math
from pprint import pprint

# Load the dataset (assumes it's in the same folder)
# df = pd.read_csv("titanic.csv")

# Simplify dataset: drop unnecessary columns
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Handle missing values
df.dropna(inplace=True)

# Convert categorical columns to numeric
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Discretize continuous features for clarity
df['Age'] = pd.cut(df['Age'], bins=4, labels=[0, 1, 2,3])
df['Fare'] = pd.qcut(df['Fare'], 2, labels=[0, 1])

# Reset index
df.reset_index(drop=True, inplace=True)

# -------------------------
# Helper Functions
# -------------------------

def entropy(data):
    """Calculate entropy of the target column (Survived)."""
    values, counts = np.unique(data, return_counts=True)
    ent = 0
    for i in range(len(values)):
        p = counts[i] / np.sum(counts)
        ent -= p * np.log2(p)
    return ent

def info_gain(data, feature, target_name="Survived"):
    """Calculate information gain of a feature."""
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[feature], return_counts=True)

    weighted_entropy = 0
    for i in range(len(vals)):
        subset = data[data[feature] == vals[i]]
        weighted_entropy += (counts[i] / np.sum(counts)) * entropy(subset[target_name])

    return total_entropy - weighted_entropy

def best_split(data, features):
    """Find the best feature to split on."""
    gains = {}
    for feature in features:
        gains[feature] = info_gain(data, feature)
    best = max(gains, key=gains.get)
    return best, gains

# -------------------------
# Decision Tree Algorithm
# -------------------------

def build_tree(data, features, depth=0):
    target = data['Survived']
    
    # If all examples have the same label
    if len(np.unique(target)) == 1:
        return np.unique(target)[0]
    
    # If no more features to split
    if len(features) == 0:
        return target.mode()[0]

    # Choose the best feature
    best_feat, gains = best_split(data, features)
    
    # Print progress at this level
    print(f"{'|   ' * depth}Split on: {best_feat}")
    for f in gains:
        print(f"{'|   ' * depth} - Info Gain of {f}: {gains[f]:.4f}")

    tree = {best_feat: {}}
    remaining_features = [f for f in features if f != best_feat]

    for val in sorted(data[best_feat].unique()):
        subset = data[data[best_feat] == val]
        print(f"{'|   ' * depth}-> Branch [{best_feat} = {val}] with {len(subset)} rows")

        if subset.empty:
            tree[best_feat][val] = target.mode()[0]
        else:
            subtree = build_tree(subset, remaining_features, depth + 1)
            tree[best_feat][val] = subtree

    return tree


# -------------------------
# Run and Visualize the Tree
# -------------------------

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
decision_tree = build_tree(df, features)

print("\nFinal Decision Tree:")
pprint(decision_tree)


Split on: Sex
 - Info Gain of Pclass: 0.0940
 - Info Gain of Sex: 0.2141
 - Info Gain of Age: 0.0068
 - Info Gain of SibSp: 0.0249
 - Info Gain of Parch: 0.0307
 - Info Gain of Fare: 0.0650
 - Info Gain of Embarked: 0.0279
-> Branch [Sex = 0] with 453 rows
|   Split on: Pclass
|    - Info Gain of Pclass: 0.0415
|    - Info Gain of Age: 0.0091
|    - Info Gain of SibSp: 0.0210
|    - Info Gain of Parch: 0.0176
|    - Info Gain of Fare: 0.0365
|    - Info Gain of Embarked: 0.0176
|   -> Branch [Pclass = 1] with 101 rows
|   |   Split on: Age
|   |    - Info Gain of Age: 0.0668
|   |    - Info Gain of SibSp: 0.0177
|   |    - Info Gain of Parch: 0.0247
|   |    - Info Gain of Fare: 0.0296
|   |    - Info Gain of Embarked: 0.0105
|   |   -> Branch [Age = 0] with 7 rows
|   |   |   Split on: Parch
|   |   |    - Info Gain of SibSp: 0.4138
|   |   |    - Info Gain of Parch: 0.4696
|   |   |    - Info Gain of Fare: 0.0000
|   |   |    - Info Gain of Embarked: 0.0060
|   |   |   -> Branch [Par

In [40]:

def predict(tree, sample):
    """
    Recursively traverse the tree for a single test sample (row).
    """
    if not isinstance(tree, dict):
        return tree  # If it's a leaf node, return the label

    # Get the current feature to split on
    feature = next(iter(tree))

    # Get the feature value for this sample
    feature_value = int(sample[feature])

    # Traverse to the subtree
    if feature_value in tree[feature]:
        return predict(tree[feature][feature_value], sample)
    else:
        # Fallback: unknown feature value (not seen in training)
        return 0  # or return majority class
    

testdata = pd.read_csv("../Datasets/test.csv")

# Simplify dataset: drop unnecessary columns
testdata = testdata[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Handle missing values
testdata.dropna(inplace=True)

# Convert categorical columns to numeric
testdata['Sex'] = testdata['Sex'].map({'male': 0, 'female': 1})
testdata['Embarked'] = testdata['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Discretize continuous features for clarity
testdata['Age'] = pd.cut(testdata['Age'], bins=3, labels=[0, 1, 2])
testdata['Fare'] = pd.qcut(testdata['Fare'], 3, labels=[0, 1, 2])

pred_y = []

for _, row in testdata.iterrows():
    pred_y.append(predict(decision_tree, row))

print(pred_y)



[np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), 0, np.int64(0), np.int64(0), 0, 0, np.int64(1), 0, 0, np.int64(0), np.int64(1), np.int64(0), 0, np.int64(0), 0, 0, 0, np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), 0, 0, np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), 0, np.int64(1), 0, 0, 0, np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), 0, np.int64(1), np.int64(0), np.int64(1), np.int64(1), 0, np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(0), 0, np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(

In [42]:

real_y = pd.read_csv("../Datasets/gender_submission.csv")

aligned_ground_truth = real_y.loc[testdata.index]
y_true = aligned_ground_truth['Survived'].values
y_pred = np.array(pred_y)


print(len(pred_y), len(y_true))

from sklearn.metrics import accuracy_score
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")


331 331
Accuracy: 0.8278


In [15]:
!pip install scikit-learn

