In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import plotly as pl

In [None]:
# Put the parent directory first on the module search path before importing
# ccal (presumably so the local checkout is the one imported -- confirm).
sys.path.insert(0, "..")

import ccal

# Seed NumPy's global RNG so the np.random.shuffle-based label permutations
# in this notebook are repeatable. The previous call, np.random.random(N),
# only drew N random floats and threw them away; it never seeded anything.
np.random.seed(20121020)

# Initialise plotly's offline notebook mode so figures render inline.
pl.offline.init_notebook_mode(connected=True)

In [None]:
# Read the Titanic table (first column is the index), keep only the four
# columns used in this analysis, and drop every row with a missing value.
df = (
    pd.read_table("titanic.tsv", index_col=0)
    .loc[:, ["sex", "age", "fare", "survived"]]
    .dropna()
)

df

In [None]:
# Also put the nd_array directory at the front of the module search path.
sys.path.insert(0, "../../nd_array")

# Display names for the four variables.
g_name, a_name, f_name, s_name = "Gender", "Age", "Fare", "Survival"

# Gender as an integer indicator: 1 where sex == "male", 0 otherwise.
g = np.asarray(df["sex"] == "male", dtype=int)

# Age, as-is.
a = np.asarray(df["age"])

# Fare, passed through ccal.log_nd_array.
# NOTE(review): the "0<" shift option presumably makes all values positive
# before taking the log -- confirm against ccal.
f = ccal.log_nd_array(
    df["fare"].values,
    shift_as_necessary_to_achieve_min_before_logging="0<",
)

# Survival labels, as-is.
s = np.asarray(df["survived"])

ccal.plot_histogram(
    (g, a, f, s),
    names=(g_name, a_name, f_name, s_name),
    title="Variable Distributions",
    xaxis_title="Variable Value",
)

In [None]:
# Fraction of samples whose survival label equals 1.
p_s1 = (s == 1).mean()

p_s1

In [None]:
# Number of grid points per variable axis: passed as grid_size to the ccal
# inference calls and used to rebuild each variable's linspace grid when
# looking up per-sample probabilities.
grid_size = 32

In [None]:
# Run ccal.infer once per single predictor (gender, age, fare), each paired
# with the survival labels, in that order. Each call returns a pair that is
# unpacked into (p_s__<x>, p_s1__<x>).
(p_s__g, p_s1__g), (p_s__a, p_s1__a), (p_s__f, p_s1__f) = (
    ccal.infer(
        (variable, s),
        grid_size=grid_size,
        target=1,
        names=(variable_name, s_name),
    )
    for variable, variable_name in ((g, g_name), (a, a_name), (f, f_name))
)

In [None]:
# Infer survival from age and fare together, twice with identical arguments:
# first with ccal.infer, then with ccal.infer_assuming_independence (the
# "naive" variant).
(p_s__a_f, p_s1__a_f), (p_s__a_f_naive, p_s1__a_f_naive) = (
    infer_function(
        (a, f, s),
        grid_size=grid_size,
        target=1,
        names=(a_name, f_name, s_name),
    )
    for infer_function in (ccal.infer, ccal.infer_assuming_independence)
)

In [None]:
from sklearn.metrics import auc, roc_curve

# One label per model whose ROC is evaluated; the order matches the tuples
# zipped together in the loop below.
maths = (
    "P(S = 1 | G)",
    "P(S = 1 | A)",
    "P(S = 1 | F)",
    "P(S = 1 | A, F)",
    "P(S = 1 | A, F) (naive)",
)

# Number of label shuffles used to build each model's null AUC distribution.
n_permutation_for_roc = 1000

# Per-model results: "fpr", "tpr", "auc" and "p-value".
math_roc = {math: {} for math in maths}

for math, p_s1__v, vs in zip(
    maths,
    (p_s1__g, p_s1__a, p_s1__f, p_s1__a_f, p_s1__a_f_naive),
    ((g,), (a,), (f,), (a, f), (a, f)),
):

    # For every sample, find the index of the nearest point on each
    # variable's grid axis (grid_size evenly spaced points spanning the
    # variable's observed range). One integer array per variable.
    # NOTE(review): assumes p_s1__v has one axis of length grid_size per
    # variable in vs, laid out on this same linspace -- confirm against ccal.
    coordinate = tuple(
        np.abs(
            np.linspace(v.min(), v.max(), grid_size)[np.newaxis, :]
            - np.asarray(v)[:, np.newaxis]
        ).argmin(axis=1)
        for v in vs
    )

    # Index with a *tuple* (one index array per axis) to read one grid cell
    # per sample. A list of lists is no longer treated as a tuple index by
    # NumPy; it is a fancy index on the first axis only, which breaks the
    # two-variable models.
    p_s1__vv = np.asarray(p_s1__v[coordinate], dtype=float)

    # ROC of the per-sample scores against the true labels; the thresholds
    # returned by roc_curve are not needed.
    fpr, tpr = roc_curve(s, ccal.normalize_nd_array(p_s1__vv, None, "0-1"))[:2]

    auc_ = auc(fpr, tpr)

    # Null distribution: AUC of the same scores against shuffled labels.
    permuting_aucs = np.full(n_permutation_for_roc, np.nan)

    # Shuffle a copy so the real labels in s are left untouched.
    permuting_s = s.copy()

    for i in range(n_permutation_for_roc):

        np.random.shuffle(permuting_s)

        permuting_fpr, permuting_tpr = roc_curve(permuting_s, p_s1__vv)[:2]

        permuting_aucs[i] = auc(permuting_fpr, permuting_tpr)

    math_roc[math] = {
        "fpr": fpr,
        "tpr": tpr,
        "auc": auc_,
        "p-value": ccal.compute_empirical_p_value(auc_, permuting_aucs, "great"),
    }

In [None]:
# Draw ccal's Bayesian nomogram from the survival labels, the grid size, and
# the first result of each single-variable inference (gender, age, fare),
# labelled with the matching variable names.
# NOTE(review): the positional 1 and 0 are presumably the target and
# non-target values of s -- confirm against ccal.plot_bayesian_nomogram.
ccal.plot_bayesian_nomogram(
    s, 1, 0, grid_size, (p_s__g, p_s__a, p_s__f), (g_name, a_name, f_name)
)

In [None]:
# Diagonal reference: 16 evenly spaced points from 0 to 1, used as both the
# x and y coordinates of the "Random ROC" trace.
random_roc = np.linspace(0, 1, 16)

# One trace per model after the reference trace. Each legend entry reads
# "<model> | <AUC> | <p-value>".
ccal.plot_points(
    (random_roc, *(math_roc[math]["fpr"] for math in maths)),
    (random_roc, *(math_roc[math]["tpr"] for math in maths)),
    names=(
        "Random ROC",
        *(
            "{} | {:0.3f} | {:0.1e}".format(
                math, math_roc[math]["auc"], math_roc[math]["p-value"]
            )
            for math in maths
        ),
    ),
    modes=("markers", *(("markers + lines",) * len(maths))),
    title="ROC: G={}, A={}, F={}".format(g_name, a_name, f_name),
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    legend_orientation="h",
)