In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import plotly

In [None]:
# Set up plotly's offline notebook mode so figures can render inline.
# NOTE(review): connected=True presumably loads plotly.js from the network
# instead of embedding it in the notebook - confirm against the plotly docs.
plotly.offline.init_notebook_mode(connected=True)

In [None]:
import pandas as pd

In [None]:
# Read the tab-separated passenger table (first column is the index), keep
# only the four columns used in this analysis, and drop every row that is
# missing any of them.
dataframe = (
    pd.read_csv("titanic.tsv", sep="\t", index_col=0)
    .loc[:, ["sex", "age", "fare", "survived"]]
    .dropna()
)

In [None]:
import numpy as np

In [None]:
import kraft

In [None]:
# Build the four named dimensions used throughout the notebook.
#
# Each one is created with `.rename(...)` / `name=` so that a *new* named
# Series is produced. Assigning `.name` on a Series pulled straight out of
# `dataframe` (as was done for age and survival) mutates that object in place,
# which can leak into the frame's cached column on pandas without
# copy-on-write.

# Gender: 1 where sex is "male", 0 otherwise.
g = (dataframe["sex"] == "male").astype(int).rename("Gender")

# Age: the raw age column.
a = dataframe["age"].rename("Age")

# Fare: the fare values passed through kraft.log_nd_array, re-attached to the
# passenger index.
# NOTE(review): the "0<" option presumably shifts the values so their minimum
# is above 0 before taking the log - confirm against kraft.
f = pd.Series(
    kraft.log_nd_array(
        dataframe["fare"].values, shift_as_necessary_to_achieve_min_before_logging="0<"
    ),
    index=dataframe.index,
    name="Fare",
)

# Survival: the raw survived column (compared against 1 further down).
s = dataframe["survived"].rename("Survival")

kraft.plot_histogram((g, a, f, s), title="Dimensions")

In [None]:
# Baseline P(S = 1): the share of passengers whose survival value equals 1.
p_s1 = s.eq(1).sum() / len(s)

p_s1

In [None]:
# Settings shared by every kraft.infer / kraft.infer_assuming_independence
# call below.

# Passed as target_dimension_value.
# NOTE(review): presumably selects the "survived == 1" outcome of the last
# dimension - confirm against kraft.
target_dimension_value = 1

# Passed as fraction_grid_extension.
# NOTE(review): presumably how far the grid extends past the observed range,
# as a fraction of that range - confirm against kraft.
fraction_grid_extension = 1 / 8

# Passed as n_grid; also reused as the number of grid points per dimension
# when observations are mapped to grid indices for the ROC analysis.
n_grid = 128

In [None]:
# P(S = 1 | G): one row per passenger, columns are (Gender, Survival).
observation_x_dimension = np.array([g, s]).transpose()

n_dimension = observation_x_dimension.shape[-1]

p_s1__g = kraft.infer(
    observation_x_dimension,
    dimension_names=(g.name, s.name),
    n_grid=n_grid,
    fraction_grid_extension=fraction_grid_extension,
    target_dimension_value=target_dimension_value,
)

In [None]:
# P(S = 1 | A): one row per passenger, columns are (Age, Survival).
observation_x_dimension = np.array([a, s]).transpose()

n_dimension = observation_x_dimension.shape[-1]

p_s1__a = kraft.infer(
    observation_x_dimension,
    dimension_names=(a.name, s.name),
    n_grid=n_grid,
    fraction_grid_extension=fraction_grid_extension,
    target_dimension_value=target_dimension_value,
)

In [None]:
# P(S = 1 | F): one row per passenger, columns are (Fare, Survival).
observation_x_dimension = np.array([f, s]).transpose()

n_dimension = observation_x_dimension.shape[-1]

p_s1__f = kraft.infer(
    observation_x_dimension,
    dimension_names=(f.name, s.name),
    n_grid=n_grid,
    fraction_grid_extension=fraction_grid_extension,
    target_dimension_value=target_dimension_value,
)

In [None]:
# P(S = 1 | A, F): one row per passenger, columns are (Age, Fare, Survival).
observation_x_dimension = np.array([a, f, s]).transpose()

n_dimension = observation_x_dimension.shape[-1]

p_s1__a_f = kraft.infer(
    observation_x_dimension,
    dimension_names=(a.name, f.name, s.name),
    n_grid=n_grid,
    fraction_grid_extension=fraction_grid_extension,
    target_dimension_value=target_dimension_value,
)

In [None]:
# P(S = 1 | A, F) via kraft.infer_assuming_independence: same
# (Age, Fare, Survival) matrix as the joint model, rebuilt here so the cell
# can be run on its own.
observation_x_dimension = np.array([a, f, s]).transpose()

n_dimension = observation_x_dimension.shape[-1]

p_s1__a_f_naive = kraft.infer_assuming_independence(
    observation_x_dimension,
    dimension_names=(a.name, f.name, s.name),
    n_grid=n_grid,
    fraction_grid_extension=fraction_grid_extension,
    target_dimension_value=target_dimension_value,
)

In [None]:
from sklearn.metrics import auc, roc_curve

In [None]:
# Score every inferred model by how well its per-passenger probability ranks
# the observed survival labels (ROC curve and AUC), and attach an empirical
# p-value obtained by recomputing the AUC against shuffled labels.
#
# Result: math_roc[label] -> {"fpr", "tpr", "auc", "p-value"}.

maths = (
    "P(S = 1 | G)",
    "P(S = 1 | A)",
    "P(S = 1 | F)",
    "P(S = 1 | A, F)",
    "P(S = 1 | A, F) (naive)",
)

math_roc = {math: {} for math in maths}

# Number of label shuffles behind each empirical p-value.
n_permutation_for_roc = 100

# Dedicated, seeded generator: the permutation p-values are identical on every
# run and the global numpy random state is left untouched.
random_state = np.random.RandomState(seed=0)

for math, p_s1__v, vs in zip(
    maths,
    (p_s1__g, p_s1__a, p_s1__f, p_s1__a_f, p_s1__a_f_naive),
    ((g,), (a,), (f,), (a, f), (a, f)),
):

    # Keep the second item returned by kraft.unmesh; it is indexed below with
    # one grid index per conditioning dimension in vs.
    p_s1__v = np.asarray(kraft.unmesh(*p_s1__v)[1])

    # For each conditioning dimension, the index of the grid point nearest to
    # every observation (one integer array of length s.size per dimension).
    # The grid is built once per dimension and compared against all
    # observations at once.
    # NOTE(review): this grid spans the observed min..max only and ignores
    # fraction_grid_extension; if kraft.infer evaluated on an extended grid,
    # the indices are slightly misaligned - confirm against kraft.
    index_grid_values = tuple(
        np.absolute(
            np.linspace(v.min(), v.max(), num=n_grid)[np.newaxis, :]
            - v.values[:, np.newaxis]
        ).argmin(axis=1)
        for v in vs
    )

    # One probability per observation. Indexing with integer arrays yields
    # plain scalars per element, avoiding the deprecated assignment of
    # size-1 arrays into scalar slots.
    p_s1__vv = np.asarray(p_s1__v[index_grid_values], dtype=float)

    # Only the false / true positive rates are needed; thresholds are dropped.
    fpr, tpr = roc_curve(s, kraft.normalize_nd_array(p_s1__vv, None, "0-1"))[:2]

    math_roc[math]["fpr"] = fpr

    math_roc[math]["tpr"] = tpr

    auc_ = auc(fpr, tpr)

    math_roc[math]["auc"] = auc_

    # Null distribution: AUCs of the same scores against shuffled labels.
    permuting_aucs = np.full(n_permutation_for_roc, np.nan)

    # Shuffle a copy so the survival Series itself is never modified.
    permuting_s = s.values.copy()

    for i in range(n_permutation_for_roc):

        random_state.shuffle(permuting_s)

        permuting_fpr, permuting_tpr = roc_curve(permuting_s, p_s1__vv)[:2]

        permuting_aucs[i] = auc(permuting_fpr, permuting_tpr)

    # NOTE(review): ">" presumably asks how often a permuted AUC exceeds the
    # observed one - confirm against kraft.compute_empirical_p_value.
    math_roc[math]["p-value"] = kraft.compute_empirical_p_value(
        auc_, permuting_aucs, ">"
    )

In [None]:
# Disabled call, kept for reference.
# NOTE(review): it refers to p_s__g, p_s__a and p_s__f, which no visible cell
# assigns (the inferred results are named p_s1__g, p_s1__a and p_s1__f), so it
# would raise NameError if uncommented as is.
# kraft.plot_bayesian_nomogram(
#     s.values, 1, 0, n_grid, (p_s__g, p_s__a, p_s__f), (g.name, a.name, f.name)
# )

In [None]:
# Diagonal reference line: x == y from 0 to 1.
random_roc = np.linspace(0, 1, num=16)

# Draw the reference diagonal followed by one ROC trace per model; each legend
# entry reads "<model> | <AUC> | <p-value>".
kraft.plot_and_save(
    dict(
        layout=dict(
            title=dict(text="ROC<br>{}".format(", ".join((g.name, a.name, f.name)))),
            xaxis=dict(title="False Positive Rate"),
            yaxis=dict(title="True Positive Rate"),
        ),
        data=[
            dict(
                type="scatter",
                name="Random",
                x=random_roc,
                y=random_roc,
                mode="lines",
                marker=dict(color="#d8d8d8"),
            )
        ]
        + [
            dict(
                type="scatter",
                name="{} | {:0.3f} | {:0.1e}".format(
                    math, math_roc[math]["auc"], math_roc[math]["p-value"]
                ),
                x=math_roc[math]["fpr"],
                y=math_roc[math]["tpr"],
                mode="markers + lines",
            )
            for math in maths
        ],
    ),
    None,
)