In [83]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.disable_max_rows()

from scipy.stats import entropy

In [2]:
# Load the raw passenger table, derive the engineered columns, drop the
# columns that are not used as features, then discard incomplete rows.
titanic_df = (
    pd.read_csv("data/titanic_train.csv")
    .assign(
        # Relatives aboard: siblings/spouses plus parents/children.
        Family_Size=lambda df: df["SibSp"] + df["Parch"],
        # Human-readable label for the numeric passenger class.
        Class=lambda df: df["Pclass"].replace({1: "First", 2: "Second", 3: "Third"}),
        # Whole-year ages so integer thresholds line up with the data.
        Age=lambda df: df["Age"].round(0),
    )
    .drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"])
    .dropna(axis=0, how="any")
)

In [3]:
# Preview the first rows of the prepared frame to check the remaining columns.
titanic_df.head()

Unnamed: 0,Survived,Sex,Age,Embarked,Family_Size,Class
0,0,male,22.0,S,1,Third
1,1,female,38.0,C,1,First
2,1,female,26.0,S,0,Third
3,1,female,35.0,S,1,First
4,0,male,35.0,S,0,Third


In [4]:
# Features split by equality (one category vs. all others); every other
# feature column is split by a numeric threshold (<= vs. >).
categorical_features = ["Class", "Sex", "Embarked"]
# Label column whose entropy is measured on each side of a split.
target = "Survived"

In [105]:
def get_split_entropy(feature, split_val, side, data=None, target_col=None, categorical=None):
    """Return the target entropy and row count on one side of a candidate split.

    Categorical features are split by equality (left: ``feature == split_val``,
    right: ``feature != split_val``); every other feature is split by threshold
    (left: ``feature <= split_val``, right: ``feature > split_val``).

    Parameters
    ----------
    feature : str
        Column to split on.
    split_val : scalar
        Category to isolate (categorical features) or threshold (other features).
    side : str
        ``"left"`` selects the left partition; any other value selects the right.
    data : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``titanic_df``.
    target_col : str, optional
        Name of the 0/1 label column. Defaults to the notebook-level ``target``.
    categorical : collection of str, optional
        Names of the features split by equality. Defaults to the
        notebook-level ``categorical_features``.

    Returns
    -------
    tuple
        ``(side_entropy, subset_size)``: base-2 entropy of the label on the
        chosen side and the number of rows on that side. An empty side yields
        ``(0.0, 0)`` instead of NaN so it cannot poison a weighted sum.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if data is None:
        data = titanic_df
    if target_col is None:
        target_col = target
    if categorical is None:
        categorical = categorical_features

    column = data[feature]
    is_left = side == "left"
    # Boolean masks instead of a string-built query: works for any column name.
    if feature in categorical:
        mask = (column == split_val) if is_left else (column != split_val)
    else:
        mask = (column <= split_val) if is_left else (column > split_val)

    side_targets = data.loc[mask, target_col]
    subset_size = len(side_targets)
    if subset_size == 0:
        # No rows: avoid 0/0. Return a numpy scalar so callers can still use
        # numpy scalar methods (e.g. ``.round``) on the result.
        return np.float64(0.0), 0

    side_percent_target = side_targets.sum() / subset_size
    side_entropy = entropy([side_percent_target, 1. - side_percent_target], base=2)
    return side_entropy, subset_size

In [133]:
def build_IG_Table(feature):
    """Tabulate entropy and information gain for every candidate split of a feature.

    Candidate splits are each distinct value for features listed in
    ``categorical_features``, and every integer from the column minimum up to
    (but excluding) the column maximum for all other features.

    Parameters
    ----------
    feature : str
        Column of ``titanic_df`` to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per candidate split with columns ``Split_Val``,
        ``Left_Entropy``, ``Right_Entropy`` and ``Information_Gain``. The three
        numeric columns are multiplied by 100 and rounded to 2 decimals.
    """
    labels = titanic_df[target]
    percent_target = labels.sum() / len(labels)
    dataset_entropy = entropy([percent_target, 1. - percent_target], base=2)
    total_rows = len(titanic_df)

    if feature in categorical_features:
        split_val_range = np.unique(titanic_df[feature])
    else:
        # Stop one short of the maximum: a threshold at the maximum would
        # leave the right partition empty.
        split_val_range = np.arange(int(titanic_df[feature].min()), int(titanic_df[feature].max()))

    entropies = []
    for split_val in split_val_range:
        left_entropy, left_size = get_split_entropy(feature, split_val, "left")
        right_entropy, right_size = get_split_entropy(feature, split_val, "right")

        # An empty partition carries zero weight; skip it so an undefined
        # (NaN) entropy for that side cannot propagate into the gain.
        weighted_entropy = 0.0
        for side_entropy, side_size in ((left_entropy, left_size), (right_entropy, right_size)):
            if side_size:
                weighted_entropy += side_size * side_entropy

        information_gain = dataset_entropy - weighted_entropy / total_rows
        # np.round works for both numpy and plain Python floats.
        entropies.append((
            split_val,
            np.round(100. * left_entropy, 2),
            np.round(100. * right_entropy, 2),
            np.round(100. * information_gain, 2),
        ))

    entropy_df = pd.DataFrame(entropies, columns = ["Split_Val", "Left_Entropy", "Right_Entropy", "Information_Gain"])
    return entropy_df

In [138]:
# Interactive split explorer: one panel per feature, concatenated side by side.
#   * Categorical features: selecting a bar picks the category that forms the
#     "Left" partition; all other categories form "Right".
#   * Other features: hovering over the density plot picks a threshold; rows
#     with value <= threshold form "Left", the rest "Right".
# NOTE(review): selection_single/add_selection are deprecated in Altair 5 in
# favour of selection_point/add_params -- confirm the installed Altair version
# before migrating.
chart = alt.hconcat()
CATEGORICAL_WIDTH = 100
QUANTITATIVE_WIDTH = 300
SPLIT_BAR_WIDTH = 100   # width of the Left/Right bar chart in threshold panels
IG_TEXT_WIDTH = 150     # width of the information-gain text readout
for feature in titanic_df.columns:
    if feature == target:
        # The label column drives the colour encoding; it gets no panel of its own.
        continue

    if feature in categorical_features:
        data = titanic_df.loc[:,[feature, target]]

        selector = alt.selection_single(fields=[feature])

        # Stacked counts per category, coloured by the target; unselected
        # categories are dimmed.
        feature_component = alt.Chart(data).mark_bar().encode(
            x = alt.X(feature, axis=alt.Axis(labelAngle=0)),
            y = "count()",
            color = alt.Color(target, type="nominal", sort=[1,0]),
            opacity = alt.condition(selector, alt.value(1), alt.value(0.3))
        ).properties(
            width = CATEGORICAL_WIDTH
        ).add_selection(
            selector
        )

        # Row counts on each side of the split implied by the selected category.
        split_component = alt.Chart(data).mark_bar().encode(
            y = "Split:N",
            x = alt.X("count()", scale=alt.Scale(domain = [0, len(titanic_df)])),
            color = alt.Color(target, type="nominal", sort=[1,0])
        ).transform_calculate(
            split_val = selector[feature]
        ).transform_calculate(
            Split = "datum." + feature + " == datum.split_val?'Left':'Right'"
        ).properties(
            width = CATEGORICAL_WIDTH
        )

        chart |= (feature_component & split_component)
    else:
        # Long format (target, variable, value) so the density transform can
        # group by the feature name and the target.
        data = titanic_df.loc[:,[feature, target]].melt(id_vars=target)
        selector = alt.selection_single(fields = [feature], on="mouseover", nearest=True)

        # Precomputed information gain for every integer threshold of this feature.
        ig_df = build_IG_Table(feature)

        # Density of the feature per target class.
        feature_component = alt.Chart(data).transform_density(
            density='value',
            bandwidth=0.5,
            groupby=['variable', target],
            steps=20
        ).mark_line().encode(
            alt.X('value:Q', title=feature),
            alt.Y('density:Q', axis=alt.Axis(labelAngle=0, titleAngle=0)),
            # Same category ordering as the bar panels.
            alt.Color(target, type="nominal", sort=[1,0])
        ).properties(
            width=QUANTITATIVE_WIDTH,
        )

        # One bar per integer threshold (same range as build_IG_Table); these
        # carry the hover selection and shade everything at or below it.
        selection_bar_df = pd.DataFrame(
            np.arange(np.min(titanic_df[feature]).astype(int), np.max(titanic_df[feature]).astype(int)),
            columns = [feature]
        )
        selection_bar_df["Height"] = 1

        selection_bars = alt.Chart(selection_bar_df).mark_bar(
            # max(..., 1) avoids a ZeroDivisionError when the feature has a
            # single integer value and the threshold range is empty.
            size=.8*QUANTITATIVE_WIDTH/max(len(selection_bar_df), 1),
            binSpacing=0
        ).transform_calculate(
            key=selector[feature]
        ).transform_calculate(
            val='datum.'+feature
        ).encode(
            x = feature,
            y = alt.Y("Height", axis=None),
            color = alt.value("lightgrey"),
            opacity = alt.condition(alt.datum.val <= alt.datum.key, alt.value(0.3), alt.value(0))
        ).add_selection(
            selector
        )

        # Row counts on each side of the hovered threshold.
        split_component = alt.Chart(data).mark_bar().encode(
            y = "Split:N",
            x = alt.X("count()", scale=alt.Scale(domain = [0, len(titanic_df)])),
            color = alt.Color(target, type="nominal", sort=[1,0])
        ).transform_calculate(
            split_val = selector[feature]
        ).transform_calculate(
            Split = "datum.value <= datum.split_val?'Left':'Right'"
        ).properties(
            width = SPLIT_BAR_WIDTH
        )

        # Text readout of the information gain at the hovered threshold. The
        # row is matched with a <= / >= pair, kept exactly as in the original
        # filter, rather than a single equality test.
        ig_text = alt.Chart(ig_df).mark_text().encode(
            text = "chart_text:N",
        ).transform_calculate(
            selected_val = selector[feature]
        ).transform_filter(
            (alt.datum.Split_Val <= alt.datum.selected_val) & (alt.datum.Split_Val >= alt.datum.selected_val)
        ).transform_calculate(
            chart_text = "Information Gain: " + alt.datum.Information_Gain
        ).properties(
            width = IG_TEXT_WIDTH
        )

        chart |= ((selection_bars + feature_component).resolve_scale(y="independent") & (split_component | ig_text))

chart

In [96]:
# Stand-alone information-gain sweep over every integer Age threshold
# (values are raw entropies in bits, neither scaled nor rounded).
feature = "Age"
survived = titanic_df["Survived"]
percent_yes_target = survived.sum() / len(survived)
dataset_entropy = entropy([percent_yes_target, 1. - percent_yes_target], base=2)

lowest_split = int(titanic_df[feature].min())
highest_split = int(titanic_df[feature].max())   # exclusive upper bound

entropies = []
for split in np.arange(lowest_split, highest_split):
    # Partition the rows at the threshold.
    left_data = titanic_df[titanic_df[feature] <= split]
    right_data = titanic_df[titanic_df[feature] > split]

    # Share of survivors, then binary entropy, on each side.
    left_percent_target = left_data["Survived"].sum() / len(left_data)
    right_percent_target = right_data["Survived"].sum() / len(right_data)
    left_entropy = entropy([left_percent_target, 1. - left_percent_target], base=2)
    right_entropy = entropy([right_percent_target, 1. - right_percent_target], base=2)

    # Parent entropy minus the size-weighted entropy of the two children.
    weighted_child_entropy = (
        (len(left_data) * left_entropy) + (len(right_data) * right_entropy)
    ) / len(titanic_df)
    information_gain = dataset_entropy - weighted_child_entropy

    entropies.append((split, left_entropy, right_entropy, information_gain))

entropy_df = pd.DataFrame(
    entropies,
    columns=["Split", "Left_Entropy", "Right_Entropy", "Information_Gain"],
)

In [98]:
# Information gain as a function of the split threshold.
alt.Chart(entropy_df).mark_line().encode(
    alt.X("Split"),
    alt.Y("Information_Gain"),
)