In [22]:
import pandas as pd
import numpy as np
import altair as alt
# Turn off Altair's max-rows check on chart data (the charts below embed the
# full DataFrame). NOTE(review): exact row threshold depends on the installed
# Altair version - confirm against its data-transformers documentation.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [122]:
# Load the training CSV and derive the modelling frame in a single chain:
#   * Family_Size  = SibSp + Parch
#   * Class        = Pclass relabelled as First / Second / Third
#   * Age          = Age rounded to whole numbers
# The raw identifier / helper columns are then dropped, followed by every row
# that still contains a missing value.
titanic_df = (
    pd.read_csv("data/titanic_train.csv")
    .assign(
        Family_Size=lambda frame: frame["SibSp"] + frame["Parch"],
        Class=lambda frame: frame["Pclass"].replace({1: "First", 2: "Second", 3: "Third"}),
        Age=lambda frame: frame["Age"].round(0),
    )
    .drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"])
    .dropna(axis=0, how="any")
)

In [44]:
# Preview the first five rows of the cleaned frame.
titanic_df.head(n=5)

Unnamed: 0,Survived,Sex,Age,Embarked,Family_Size,Class
0,0,male,22.0,S,1,Third
1,1,female,38.0,C,1,First
2,1,female,26.0,S,0,Third
3,1,female,35.0,S,1,First
4,0,male,35.0,S,0,Third


In [45]:
# Column used to colour / split every chart below.
target = "Survived"

# Columns drawn as bar charts; every other non-target column is treated as numeric.
categorical_features = [
    "Class",
    "Sex",
    "Embarked",
]

In [266]:
# Build one interactive panel per feature and concatenate the panels horizontally.
# Every panel stacks two views vertically:
#   * top    - the feature itself (bars for categorical features, density lines
#              plus invisible-until-selected threshold bars for numeric ones);
#   * bottom - a "Left"/"Right" bar chart counting rows on either side of the
#              value currently selected in the top view, coloured by `target`.
chart = alt.hconcat()
for feature in titanic_df.columns:
    if feature == target:
        # The target only drives colour; it does not get a panel of its own.
        continue

    if feature in categorical_features:
        data = titanic_df.loc[:, [feature, target]]

        # Clicking a bar selects that category.
        selector = alt.selection_single(fields=[feature])

        feature_component = alt.Chart(data).mark_bar().encode(
            x=alt.X(feature, axis=alt.Axis(labelAngle=0)),
            y="count()",
            color=alt.Color(target, type="nominal", sort=[1, 0]),
            # Fade the categories that are not selected.
            opacity=alt.condition(selector, alt.value(1), alt.value(0.3))
        ).properties(
            width=100
        ).add_selection(
            selector
        )

        # Rows equal to the selected category go "Left", all others "Right".
        split_component = alt.Chart(data).mark_bar().encode(
            y="Split:N",
            x="count()",
            color=alt.Color(target, type="nominal", sort=[1, 0])
        ).transform_calculate(
            split_val=selector[feature]
        ).transform_calculate(
            Split="datum." + feature + " == datum.split_val?'Left':'Right'"
        ).properties(
            width=100
        )

        chart |= (feature_component & split_component)
    else:
        # Long format (target, variable, value) for the density and split views.
        data = titanic_df.loc[:, [feature, target]].melt(id_vars=target)

        # Start the threshold at this feature's own median instead of a single
        # hard-coded number shared by every numeric column. float() keeps the
        # value a plain Python number for the chart spec.
        initial_split = float(titanic_df[feature].median())
        selector = alt.selection_single(
            fields=[feature],
            on="mouseover",
            nearest=True,
            init={feature: initial_split},
        )

        base = alt.Chart(data).transform_density(
            density='value',
            bandwidth=0.5,
            groupby=['variable', target],
            steps=20
        )

        feature_component = base.mark_line().encode(
            alt.X('value:Q', title=feature),
            alt.Y('density:Q', axis=alt.Axis(labelAngle=0, titleAngle=0)),
            alt.Color(target, type="nominal")
        ).properties(
            width=300
        )

        # Light-grey bars over the un-melted data carry the hover selection;
        # only bars at or below the selected value are made visible.
        selection_bars = alt.Chart(titanic_df.loc[:, [feature, target]]).mark_bar(
        ).transform_calculate(
            key=selector[feature]
        ).transform_calculate(
            val='datum.' + feature
        ).encode(
            x=feature,
            y=target,
            color=alt.value("lightgrey"),
            opacity=alt.condition(alt.datum.val <= alt.datum.key, alt.value(0.3), alt.value(0))
        ).add_selection(
            selector
        )

        # Rows at or below the selected threshold go "Left", the rest "Right".
        split_component = alt.Chart(data).mark_bar().encode(
            y="Split:N",
            x="count()",
            color=alt.Color(target, type="nominal", sort=[1, 0])
        ).transform_calculate(
            split_val=selector[feature]
        ).transform_calculate(
            Split="datum.value <= datum.split_val?'Left':'Right'"
        ).properties(
            width=100
        )

        chart |= ((selection_bars + feature_component).resolve_scale(y="independent") & split_component)

chart

In [275]:
# Multiples-of-5 steps spanning the observed Age range (upper bound padded by
# one step so the maximum age is covered).
age_start = np.min(titanic_df["Age"]).astype(int)
age_stop = np.max(titanic_df["Age"]).astype(int) + 5
list(range(age_start, age_stop, 5))

[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

In [130]:
# Stacked histogram of Age (up to 50 bins), coloured by the target column.
alt.Chart(titanic_df).mark_bar().encode(
    alt.X("Age", bin=alt.Bin(maxbins=50), axis=alt.Axis(labelAngle=0)),
    alt.Y("count()", sort=alt.EncodingSortField(field=target), stack=True),
    alt.Color(target, sort=[1, 0], type="nominal"),
    tooltip=["Age", target],
).properties(width=250)

In [26]:
# Density estimate of each numeric feature, one row per feature, with one line
# per value of the target column.
# Only numeric columns are melted: the density transform is encoded as
# quantitative ('value:Q'), so string-valued columns such as Sex, Embarked and
# Class must not end up in the melted `value` column.
alt.Chart(
    titanic_df.select_dtypes(include="number").melt(id_vars=target)
).transform_density(
    density='value',
    bandwidth=0.3,
    groupby=['variable', target],
    steps=20
).mark_line().encode(
    alt.X('value:Q'),
    alt.Y('density:Q', axis=alt.Axis(labelAngle=0, titleAngle=0)),
    alt.Row('variable:N', header=alt.Header(labelAngle=0, titleAngle=0)),
    alt.Color(target, type="nominal")
).properties(width=400, height=100).resolve_scale(y="independent", x="independent")