## Validation: External (ONCOTHROMB 12-01)

This notebook validates the model on the external cohort from Spain (ONCOTHROMB 12-01).

References: 

1. Prediction error estimation: a comparison of resampling methods https://academic.oup.com/bioinformatics/article/21/15/3301/195433


In [None]:
import os

import numpy as np
import pandas as pd
from joblib import load
import sys
from pathlib import Path
import random
import torchtuples as tt
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from functools import wraps
import lifelines
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lifelines.utils import concordance_index
import torch

from dotenv import load_dotenv
load_dotenv()

In [None]:
# Seed every RNG used in this notebook (NumPy, PyTorch, stdlib `random`) so the
# resampling and fine-tuning steps are repeatable.
# RANDOM_SEED is read from the environment; fall back to 42 (the value already
# used for `random_state` elsewhere in this notebook) instead of failing with a
# TypeError from int(None) when the variable is unset or empty.
seed = int(os.getenv("RANDOM_SEED") or "42")
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

In [None]:
# Never truncate cell contents, columns or rows when displaying DataFrames.
for _option in ("max_colwidth", "display.max_columns", "display.max_rows"):
    pd.set_option(_option, None)

# Put ../scripts on the import path (once) so the helper modules imported
# right after this can be found.
module_path = str(Path("../scripts").resolve())
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from vte_deephit import c_stat, get_target, get_best_params, LabTransform, get_datasets
from utils import get_parent_dir, calc_ci, plot_roc, plot_calibration, bootstrap_ci, VTEDataLoader, get_estimated_cif, plot_grouped_risks, get_pair_counts_and_vte

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cycler

# Ten colours as #RRGGBBAA hex strings, used as the default line/marker cycle.
color_list = [
    "#E64B35FF", "#4DBBD5FF",
    "#00A087FF", "#3C5488FF",
    "#F39B7FFF", "#8491B4FF",
    "#91D1C2FF", "#DC0000FF",
    "#7E6148FF", "#B09C85FF",
]

# matplotlib.rcParams and plt.rcParams are the same object, so all figure
# defaults are set through one handle in a single call.
matplotlib.rcParams.update(
    {
        "font.family": "Arial",
        "axes.prop_cycle": cycler(color=color_list),
        "font.size": 18,
        "axes.linewidth": 2,
    }
)

### Missing Analysis

In [None]:
# missing_analysis_df_full_data.to_csv(get_parent_dir() / "assets/data_asset/missing_analysis_dataset_spain_full.csv", index=None)

In [None]:
# missing_analysis_df_missing_data.to_csv(get_parent_dir() / "assets/data_asset/missing_analysis_dataset_spain_missing.csv", index=None)

In [None]:
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.impute import SimpleImputer
# def rmse(y_true, y_pred):
#     return np.sqrt(mean_squared_error(y_true, y_pred))


# def missing_analysis():
#     full_data = pd.read_csv(get_parent_dir() / "assets/data_asset/missing_analysis_dataset_spain_full.csv")
#     missing_data = pd.read_csv(get_parent_dir() / "assets/data_asset/missing_analysis_dataset_spain_missing.csv")
#     NUMERICAL_COLS=["AGE", "DX_delta", "ALBUMIN", "HB", "CREATININE", "TBILI", "ALKPHOS", "WBC", "PLT"]
#     other_cols = ["SEX", "CANCER_TYPE_FINAL", "SAMPLE_TYPE", "EVENT", "OBS_TIME", "KS", "HAD_CHEMO"]
#     encoder = OneHotEncoder(sparse=False, drop="first")
#     one_hot_encoded = encoder.fit_transform(missing_data[["SEX", "CANCER_TYPE_FINAL", "SAMPLE_TYPE"]])
#     one_hot_col_names = encoder.get_feature_names_out(["SEX", "CANCER_TYPE_FINAL", "SAMPLE_TYPE"])
#     one_hot_df = pd.DataFrame(one_hot_encoded, columns=one_hot_col_names, index=missing_data.index)
#     res = pd.concat([missing_data[NUMERICAL_COLS + ['HAD_CHEMO']], one_hot_df], axis=1)
#     imputers = [
#                 SimpleImputer(strategy="mean"),
#                 IterativeImputer(random_state=42, verbose=True, tol=1e-3),
#                 IterativeImputer(estimator=KNeighborsRegressor(), random_state=42),
#                 IterativeImputer(estimator=LinearRegression(), random_state=42, verbose=True, max_iter=100, tol=1e-3),
#                 IterativeImputer(estimator=RandomForestRegressor(), random_state=42, verbose=True, max_iter=10, tol=1e-3),
#                 IterativeImputer(estimator=ExtraTreesRegressor(), random_state=42, verbose=True, max_iter=10, tol=1e-3),
#                 KNNImputer()
#                ]
#     results = []
#     for imputer in imputers:
#         print(imputer)
#         # iterative_imputer = IterativeImputer(random_state=42, verbose=True, tol=1e-5)
#         imputed = pd.DataFrame(imputer.fit_transform(res),
#                                     columns=imputer.get_feature_names_out())
    
#         rmses = {}
#         for col in ["ALBUMIN", "HB", "CREATININE", "TBILI", "ALKPHOS"]:
#             rmses[col] = rmse(full_data[col], imputed[col])
#         results.append({imputer: rmses})
#     print(results)
#     return results

In [None]:
# results = missing_analysis()

In [None]:
results[0].keys()

In [None]:
# Build an imputer-by-column RMSE table from `results`, a list of
# {imputer: {column: rmse}} dicts (the shape produced by the commented-out
# missing_analysis() function).
# NOTE(review): `results` is only assigned by the commented-out
# `results = missing_analysis()` cell; re-enable it before running this cell.
# type(...).__name__ yields the class name directly instead of slicing it out
# of the "<class '...'>" repr string.
imputer_names = [type(imputer).__name__ for d in results for imputer in d.keys()]
scores = [list(d.values())[0] for d in results]

# Create a DataFrame
df = pd.DataFrame(scores, index=imputer_names)
df.index.name = "Imputer"
print(df)

In [None]:
# Pasted result row kept for reference; commented out because it is not valid
# Python and made this cell raise a SyntaxError.
# NOTE(review): presumably one row of the imputer RMSE table printed above - confirm.
# MICE:             0.329602   0     0.089131  0.24065   53.55933


In [None]:
# Scratch comparison of several imputers, scored by RMSE against a reference frame.
# NOTE(review): `iris_missing` (input with missing values) and `res` (presumably
# the complete reference values) are not created by any earlier cell; define
# them before running this cell.
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Root-mean-squared error, same definition as in the commented-out missing-analysis cell."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Impute data using IterativeImputer with its default estimator
iterative_imputer = IterativeImputer(random_state=42, verbose=True, tol=1e-5)
iris_imputed_iterative = pd.DataFrame(iterative_imputer.fit_transform(iris_missing),
                            columns=iterative_imputer.get_feature_names_out())

# Impute data using IterativeImputer with a LinearRegression estimator
lr_estimator = LinearRegression()
ii_1 = IterativeImputer(estimator=lr_estimator, random_state=42, verbose=True, max_iter=100, tol=1e-5)
iris_imputed_ii_1 = pd.DataFrame(ii_1.fit_transform(iris_missing),
                            columns=ii_1.get_feature_names_out())


# Impute data using IterativeImputer with a RandomForestRegressor estimator
rf_estimator = RandomForestRegressor()
ii_2 = IterativeImputer(estimator=rf_estimator, random_state=42, verbose=True, max_iter=10, tol=1e-5)
iris_imputed_ii_2 = pd.DataFrame(ii_2.fit_transform(iris_missing),
                            columns=ii_2.get_feature_names_out())


# Impute data using IterativeImputer with an ExtraTreesRegressor estimator
etr_estimator = ExtraTreesRegressor()
ii_3 = IterativeImputer(estimator=etr_estimator, random_state=42, verbose=True, max_iter=10, tol=1e-5)
iris_imputed_ii_3 = pd.DataFrame(ii_3.fit_transform(iris_missing),
                            columns=ii_3.get_feature_names_out())

# Impute data using KNNImputer
# (the assignment was split across two lines, which was a SyntaxError)
knn_imputer = KNNImputer()
iris_imputed_knn = pd.DataFrame(knn_imputer.fit_transform(iris_missing),
                            columns=knn_imputer.get_feature_names_out())


# rmse_by_column = {}
# for col in res.columns:
#     rmse_by_column[col] = rmse(res[col], iris_imputed_iterative[col])

# # Print the RMSE values by column
# for col, rmse_value in rmse_by_column.items():
#     print(f"RMSE for {col}: {rmse_value}")

# rmse_by_column = {}
# for col in res.columns:
#     rmse_by_column[col] = rmse(res[col], iris_imputed_knn[col])

# # Print the RMSE values by column
# for col, rmse_value in rmse_by_column.items():
#     print(f"RMSE for {col}: {rmse_value}")

# Calculate RMSE for each imputation method
rmse_iterative = rmse(res, iris_imputed_iterative)
rmse_ii_1 = rmse(res, iris_imputed_ii_1)
rmse_ii_2 = rmse(res, iris_imputed_ii_2)
rmse_ii_3 = rmse(res, iris_imputed_ii_3)
rmse_knn = rmse(res, iris_imputed_knn)


# Print the RMSE values
print("RMSE for Iterative Imputer: ", rmse_iterative)
print("RMSE for Iterative Imputer 1: ", rmse_ii_1)
print("RMSE for Iterative Imputer 2: ", rmse_ii_2)
print("RMSE for Iterative Imputer 3: ", rmse_ii_3)
print("RMSE for KNN Imputer: ", rmse_knn)

In [None]:
# Per-column RMSE against `res`, first for the imputed frame produced by ii_1
# and then for the one produced by ii_3.
for imputed_frame in (iris_imputed_ii_1, iris_imputed_ii_3):
    rmse_by_column = {col: rmse(res[col], imputed_frame[col]) for col in res.columns}

    # Print the RMSE values by column
    for col, rmse_value in rmse_by_column.items():
        print(f"RMSE for {col}: {rmse_value}")

### Main Analysis

In [None]:
# !pip install  scikit-learn==1.1.3

In [None]:
def logg(func):
    """Decorator that prints ``.shape`` of the first argument and of the result.

    The wrapped callable receives its input as the first positional argument
    and must return an object exposing ``.shape``. The input shape is printed
    before the call and the output shape after it, which makes row/column
    changes visible when steps are chained with ``.pipe``.
    """

    @wraps(func)
    def _shape_logged(df, *args, **kwargs):
        shape_before = df.shape
        print(shape_before)
        out = func(df, *args, **kwargs)
        shape_after = out.shape
        print(shape_after)
        return out

    return _shape_logged

In [None]:
@logg
def copy_df(df):
    """Return a copy of ``df`` so later pipeline steps work on their own frame."""
    duplicate = df.copy()
    return duplicate

@logg
def clean_spain(df):
    """Filter the cohort on follow-up time and rescale the blood-count columns.

    Keeps rows whose ``OBS_TIME`` is present and strictly between 0 and 1065,
    then divides ``PLT`` and ``WBC`` by 1000.
    NOTE(review): the 1065 cut-off and the /1000 rescaling (presumably a unit
    conversion to match the training data) are not explained in this notebook -
    confirm against the training pipeline.

    Returns a new frame; the input is left untouched. Working on an explicit
    copy avoids assigning into a filtered slice (SettingWithCopyWarning).
    """
    obs_time = df["OBS_TIME"]
    # NaN compares False, so notna() is implied; it is spelled out for clarity.
    has_valid_follow_up = obs_time.notna() & (obs_time > 0) & (obs_time < 1065)
    cleaned = df.loc[has_valid_follow_up].copy()
    cleaned["PLT"] = cleaned["PLT"] / 1000
    cleaned["WBC"] = cleaned["WBC"] / 1000
    return cleaned

@logg
def add_test_columns(df):
    """Add 6-month (180-day) administratively censored outcome columns.

    ``EVENT_6``    - ``EVENT`` if it was observed within 180 days, else 0.
    ``OBS_TIME_6`` - ``OBS_TIME`` capped at 180 days.

    Returns a new frame (via ``assign``) rather than writing into ``df``, so
    the caller's frame is not mutated and no chained-assignment warning is
    raised when ``df`` is itself a filtered slice.

    Raises AssertionError if capping did not increase the number of
    event-free rows, or if any capped time exceeds 180.
    """
    within_horizon = df["OBS_TIME"] <= 180
    out = df.assign(
        EVENT_6=np.where(within_horizon, df["EVENT"], 0),
        OBS_TIME_6=np.where(within_horizon, df["OBS_TIME"], 180),
    )
    assert (out.EVENT == 0).sum() < (out.EVENT_6 == 0).sum(), "more patients should get censored at 6 months"
    assert out.OBS_TIME_6.max() <= 180, "max observed for test must be 180 days"
    return out

@logg
def remove_missing(df):
    """Drop the rows whose ``ALBUMIN`` value is missing."""
    has_albumin = df["ALBUMIN"].notna()
    return df[has_albumin]

@logg
def only_deep_vte(df):
    """Only deep venous thromboembolism events.

    Keeps every row that is not a VTE event (``EVENT != 1``) and, among VTE
    events, only those where at least one of ``vte_location1..3`` holds one of
    the accepted location codes.
    """
    # Accepted location codes. The previously listed (and never used) ignore
    # list was 3, 5, 6, 7, 8; those codes are excluded simply by not being here.
    locations = [2, 2.7, 4, 4.6]
    location_columns = ["vte_location1", "vte_location2", "vte_location3"]
    at_accepted_location = df[location_columns].isin(locations).any(axis=1)
    # (EVENT != 1) | (EVENT == 1 & X) is equivalent to (EVENT != 1) | X.
    return df[(df.EVENT != 1) | at_accepted_location]

In [None]:
# Since the external cohort's variable set does not match
# any variable set in our list of variable sets, we trained on the feature set
# available in the external cohort.
feature = "ext_spain"

test_cohort_data = pd.read_csv(
    get_parent_dir() / os.getenv("DATA_DIR") / os.getenv(feature.upper())
)

In [None]:
test_cohort_data.head()

In [None]:
spain_dt = test_cohort_data.pipe(copy_df).pipe(add_test_columns)

In [None]:
# c-index on all patients at 6 months
lower, upper, mean, _ = bootstrap_ci(spain_dt, concordance_index, "EVENT_6", "KS", "OBS_TIME_6")

print(f"{mean} ({lower}, {upper})")

In [None]:
compare = test_cohort_data.pipe(copy_df).pipe(clean_spain).pipe(add_test_columns)

In [None]:
lower, upper, mean, _ = bootstrap_ci(compare, concordance_index, "EVENT_6", "KS", "OBS_TIME_6")

print(f"{mean} ({lower}, {upper})")

In [None]:
spain_data = test_cohort_data.pipe(copy_df).pipe(clean_spain).pipe(add_test_columns)


In [None]:
spain_data.ALBUMIN.describe()

In [None]:
spain_data.EVENT.value_counts()

In [None]:
lower, upper, mean, _ = bootstrap_ci(spain_data, concordance_index, "EVENT_6", "KS", "OBS_TIME_6")
print(f"{mean} ({lower}, {upper})")

In [None]:
plot_roc(spain_data, "KS", "EVENT_6")

In [None]:
spain_data["EVENT"].value_counts()

In [None]:
sns.histplot(spain_data["OBS_TIME"])

In [None]:
# Number of cut-points: largest observed follow-up time (truncated to int) plus one.
longest_follow_up = spain_data["OBS_TIME"].max()
num_durations = int(longest_follow_up) + 1  # for cut-points
# num_durations = np.arange(0, float(spain_data["OBS_TIME"].max()), 30)
labtrans = LabTransform(num_durations)
# labtrans_6 = LabTransform(np.arange(0, 181.0, 30))
# Separate transform with 181 cut-points for the 6-month evaluation targets.
labtrans_6 = LabTransform(181)

In [None]:
# Pack the 6-month event code and time into a structured array
# (fields "event" and "times", both cast to int), then discretise it.
event_type = int
target_frame = pd.DataFrame({"event": spain_data.EVENT_6, "times": spain_data.OBS_TIME_6})
record_dtype = [
    (column, kind) for column, kind in zip(target_frame.columns, (event_type, int))
]
structured_target = np.array(
    list(map(tuple, target_frame.to_numpy())),
    dtype=record_dtype,
)

# `y` ends up as the pair returned by labtrans_6.fit_transform; it is indexed
# as y[0] / y[1] in the evaluation cells.
y = labtrans_6.fit_transform(*get_target(structured_target))

In [None]:
transformation_pipeline = load(
    get_parent_dir() / f"models/{feature}/preprocessing_fit.joblib"
)

features_full = transformation_pipeline.transform(spain_data).astype('float32')
# features_full = transformation_pipeline.transform(spain_data_num_imputed).astype('float32')

In [None]:
features_full.shape

In [None]:
transformation_pipeline.get_feature_names_out()

In [None]:
spain_data[["AGE", "ALBUMIN", "CANCER_TYPE_FINAL"]].head()

In [None]:
imputed = pd.DataFrame(features_full, columns=transformation_pipeline.get_feature_names_out())

In [None]:
imputed.head()

In [None]:
spain_data.HB.describe()

In [None]:
hyper_params = get_best_params(feature)

In [None]:
hyper_params

In [None]:
labtrans_6.cuts

In [None]:
import torch
from pycox.models import DeepHit
from vte_deephit import CauseSpecificNet

# Load the ensemble of n saved DeepHit models for `feature`, average their
# predicted cumulative incidence functions (CIFs) over the external cohort and
# compute the c-statistic on the 6-month targets `y`.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

print(device)
# Number of saved ensemble members (model_0.pt .. model_{n-1}.pt).
n = 30

# Keyword arguments used to rebuild CauseSpecificNet before loading weights.
params = load(get_parent_dir() / f"models/{feature}/params.pkl")
models = []
# The optimizer is only needed to construct the DeepHit wrapper here; no
# training happens in this cell.
optimizer = tt.optim.AdamWR(
    lr=.1*hyper_params["lr"],
    decoupled_weight_decay=hyper_params["L2_par"],
    cycle_eta_multiplier=hyper_params["eta_par"]
)
for i in range(n):
    net = CauseSpecificNet(**params)
    # NOTE(review): `labtrans` is constructed but not fitted before this cell;
    # confirm that `labtrans.cuts` is already populated at this point.
    m = DeepHit(net,
                optimizer=optimizer,
                alpha=hyper_params["alpha_par"],
                sigma=hyper_params["sigma_par"],
                device=device,
                duration_index=labtrans.cuts)
    m.load_model_weights(get_parent_dir() / f"models/{feature}/model_{i}.pt")
    # m.net.eval()
    models.append(m)

# One CIF array per ensemble member, all predicted on the same feature matrix.
cifs_full = []

for sm in models:
    cifs_full.append(sm.predict_cif(features_full))

# Ensemble prediction: element-wise mean over the members, accumulated in float64.
cif_full = np.mean(cifs_full, dtype=np.float64, axis=0)
# Only the first 181 entries of the second axis are scored, paired with the
# 181 cut-points of labtrans_6.
c_stat_test_full = c_stat(
    cif_full[:, :181, :], y[0], y[1], labtrans_6.cuts, suffix="ext_spain_full_ks",
)
c_stat_test_full

In [None]:
# Bootstrap the c-statistic of the pre-trained ensemble (no transfer learning).
#
# The ensemble prediction for a resampled patient is that patient's slice of
# `cif_full` (last axis = patients), so patient *indices* are resampled and
# used to index `cif_full`, instead of pushing every resample through all
# models again (samples x len(models) forward passes).
# resample() draws the same rows for a given `stratify` / `random_state`
# regardless of the array contents, so the draws match resampling
# `features_full` directly.
# NOTE(review): relies on predict_cif being deterministic per row (it runs the
# nets in eval mode by default); results may differ from per-resample
# prediction only by floating-point noise - confirm this is acceptable.
samples = 1000
scores = []
patient_idx = np.arange(features_full.shape[0])
for j in range(samples):
    boot_idx, sub_test_y_0, sub_test_y_1 = resample(
        patient_idx, y[0], y[1], stratify=y[1], random_state=j
    )

    cif_non_tl = cif_full[:, :, boot_idx]
    cstat = c_stat(
        cif_non_tl[:, :181, :],
        sub_test_y_0,
        sub_test_y_1,
        models[0].duration_index,
        suffix="test_onco_no_miss_wo_tl",
    )

    scores.append(cstat)

assert len(scores) == samples

In [None]:
res = pd.concat([pd.DataFrame(df) for df in scores])
res["feature"] = feature

In [None]:
res.groupby("feature").agg(["mean", calc_ci]).to_csv("oncothromb_c_index.csv")

In [None]:
res.groupby("feature").agg(["mean", calc_ci])

## transfer learning

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
params

In [None]:
spain_data.OBS_TIME.describe()

In [None]:
spain_data.OBS_TIME_6.describe()

In [None]:
spain_data.head()

In [None]:
spain_data.EVENT.value_counts()

In [None]:
params

In [None]:
tl_models[0].net

In [None]:
# Setup for the cross-validated transfer-learning experiment.
# A stray `for k in range(2):` header with no indented body used to sit here
# and made the whole cell an IndentationError; it has been removed. Because the
# splitter below uses a fixed random_state, repeating the loop would have
# reproduced the same folds anyway.
params = load(get_parent_dir() / f"models/{feature}/params.pkl")
n = 30  # number of saved ensemble members to fine-tune
lr = hyper_params["lr"]  # factor applied in training

from sklearn.model_selection import LeaveOneOut
# 10-fold CV stratified on the event column, with a fixed shuffle for repeatability.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Accumulators for pooled out-of-fold predictions.
# NOTE(review): the code that fills cifs / cifs_non_tl / risk_cif_* / indices is
# commented out in the loop below, so these keep their initial values.
cifs = None
cifs_non_tl = None
risk_cif_tl = []
risk_cif_non_tl = []
indices = []
debug=True  # print the trainable / frozen parameter counts once
test_sets = []
scores = []
def _to_structured_target(events, times):
    """Pack event codes and times into the structured array passed to get_target()."""
    frame = pd.DataFrame({"event": events, "times": times})
    return np.array(
        [tuple(row) for row in frame.values],
        dtype=list(zip(frame.dtypes.index, [event_type, int])),
    )


# Parameter names of the network (matched against tm.net.named_parameters() below):
#   shared_net.net.0.linear.{weight,bias}
#   shared_net.net.0.batch_norm.{weight,bias}
#   shared_net.net.1.{weight,bias}
#   risk_nets.{0,1}.net.0.linear.{weight,bias}
#   risk_nets.{0,1}.net.0.batch_norm.{weight,bias}
#   risk_nets.{0,1}.net.1.linear.{weight,bias}
#   risk_nets.{0,1}.net.1.batch_norm.{weight,bias}
#   risk_nets.{0,1}.net.2.{weight,bias}
# Only the parameters listed here are fine-tuned; every other parameter is frozen.
layers_to_tune = [
                  "risk_nets.0.net.2.weight",
                  "risk_nets.0.net.2.bias",
                  "risk_nets.1.net.2.weight",
                  "risk_nets.1.net.2.bias",
                  # "shared_net.net.0.linear.weight",
                  # "shared_net.net.0.linear.bias",
                  "shared_net.net.1.weight",
                  "shared_net.net.1.bias",
                  "risk_nets.0.net.1.linear.weight",
                  "risk_nets.0.net.1.linear.bias",
                  "risk_nets.1.net.1.linear.weight",
                  "risk_nets.1.net.1.linear.bias"
                 ]

# Fine-tuning schedule: batch size equal to the whole cohort size, 30 epochs.
fine_tune_batch_size = spain_data.shape[0]
fine_tune_epochs = 30

# The fitted preprocessing pipeline is identical for every fold: load it once.
transformation_pipeline = load(
    get_parent_dir() / f"models/{feature}/preprocessing_fit.joblib"
)

for i, (train_index, test_index) in enumerate(skf.split(spain_data, spain_data.EVENT)):
    print(f"Fold {i}")
    train = spain_data.iloc[train_index]
    test = spain_data.iloc[test_index]
    test_sets.append(test)
    print(len(train))
    print(len(test))

    # Fine-tune on the full follow-up of the training fold ...
    y_train = _to_structured_target(train.EVENT, train.OBS_TIME)
    labtrans = LabTransform(num_durations)
    y_train = labtrans.fit_transform(*get_target(y_train))

    # ... and evaluate on the 6-month outcome of the held-out fold.
    y_test = _to_structured_target(test.EVENT_6, test.OBS_TIME_6)
    y_test = labtrans.transform(*get_target(y_test))

    feature_train = transformation_pipeline.transform(train).astype("float32")
    feature_test = transformation_pipeline.transform(test).astype("float32")

    # Fresh copies of the pre-trained ensemble for this fold. The loop variable
    # is deliberately not `i`, so the fold index is not overwritten.
    tl_models = []
    for model_idx in range(n):
        net = CauseSpecificNet(**params)
        optimizer = tt.optim.AdamWR(
            lr=0.1*lr,
            decoupled_weight_decay=hyper_params["L2_par"],
            cycle_eta_multiplier=hyper_params["eta_par"]
        )
        tm = DeepHit(net,
                     optimizer,
                     alpha=hyper_params["alpha_par"],
                     sigma=hyper_params["sigma_par"],
                     duration_index=labtrans.cuts)
        tm.load_model_weights(get_parent_dir() / f"models/{feature}/model_{model_idx}.pt")
        tl_models.append(tm)

    print(f"Models loaded: {len(tl_models)}")
    cifs_tl = []

    for tm in tl_models:
        # Freeze everything except the selected layers.
        for name, prms in tm.net.named_parameters():
            prms.requires_grad = name in layers_to_tune

        log = tm.fit(feature_train,
                     (y_train[0], y_train[1]),
                     fine_tune_batch_size,
                     fine_tune_epochs,
                     verbose=False)
        cifs_tl.append(tm.predict_cif(feature_test))
        if debug:
            # Reported once, for the first fine-tuned model only.
            total_trainable_params = sum(
                p.numel() for p in tm.net.parameters() if p.requires_grad
            )
            total_non_trainable_params = sum(
                p.numel() for p in tm.net.parameters() if not p.requires_grad
            )
            print(f"Total Trainable params = {total_trainable_params}\nTotal non-trainable params = {total_non_trainable_params}")
            debug=False

    # Ensemble prediction of the fine-tuned models on the held-out fold.
    cif_tl = np.mean(cifs_tl, dtype=np.float64, axis=0)
    # NOTE(review): pooling of out-of-fold predictions is disabled, so `cifs`,
    # `cifs_non_tl` and `indices` are never filled by this loop.
    # if cifs is not None:
    #     cifs = np.append(cifs, cif_tl, axis=2)
    # else:
    #     cifs = cif_tl
    print(cif_tl.shape)
    # indices += list(test_index)
    # risk_at_180 = cif_tl[0][181, :]
    # risk_cif_tl += list(risk_at_180)

    # Same held-out fold scored with the original (not fine-tuned) ensemble.
    ss_cifs_non_tl = []
    for sm in models:
        ss_cifs_non_tl.append(sm.predict_cif(feature_test))

    ss_cif_non_tl = np.mean(ss_cifs_non_tl, dtype=np.float64, axis=0)
    # if cifs_non_tl is not None:
    #     cifs_non_tl = np.append(cifs_non_tl, ss_cif_non_tl, axis=2)
    # else:
    #     cifs_non_tl = ss_cif_non_tl
    # risk_at_180_non_tl = ss_cif_non_tl[0][181, :]
    # risk_cif_non_tl += list(risk_at_180_non_tl)
    cstat = c_stat(
            cif_tl[:, :181, :],
            y_test[0],
            y_test[1],
            labtrans.cuts[:181],
            suffix="test_spain_tl_ks",
        )

    cstat.update(c_stat(
                ss_cif_non_tl[:, :181, :],
                y_test[0],
                y_test[1],
                labtrans.cuts[:181],
                suffix="test_spain_non_tl_ks"))
    scores.append(cstat)

In [None]:
res = pd.concat([pd.DataFrame(df) for df in scores])
res["feature"] = feature

In [None]:
res.groupby("feature").agg(["mean", calc_ci])

In [None]:
cif_tl.shape

In [None]:
cifs.shape

In [None]:
# C-statistics on the pooled out-of-fold predictions, with (`cifs`) and
# without (`cifs_non_tl`) transfer learning, restricted to the first 181 time points.
# NOTE(review): `cifs`, `cifs_non_tl` and `indices` are initialised to
# None / [] in the transfer-learning cell and the code that fills them is
# commented out there, so this cell fails until that accumulation is restored.
cstat = c_stat(
            cifs[:, :181, :],
            spain_data.iloc[indices]["OBS_TIME_6"].values,
            spain_data.iloc[indices]["EVENT_6"].values,
            labtrans.cuts[:181],
            suffix="test_spain_tl_ks",
        )

cstat.update(c_stat(
            cifs_non_tl[:, :181, :],
            spain_data.iloc[indices]["OBS_TIME_6"].values,
            spain_data.iloc[indices]["EVENT_6"].values,
            labtrans.cuts[:181],
            suffix="test_spain_non_tl_ks"))

In [None]:
cstat

In [None]:
cifs.shape

In [None]:
cifs[0].T.shape

In [None]:
np.array([cifs[0].T, cifs[1].T], ).shape

## Calibration

In [None]:
# Per-patient table for the risk-group comparison: the ensemble's predicted
# CIF (no transfer learning) taken at first-axis entry 0 and time index 181,
# the 6-month outcome columns, and the KS column.
# NOTE(review): first-axis entry 0 is presumably the VTE cause - confirm. The
# evaluation cells slice `[:, :181, :]` (indices 0..180); confirm that index
# 181 is the intended 6-month time point.
cif_df = pd.DataFrame(
    {
        "cif": cif_full[0][181],
        "event": spain_data["EVENT_6"],
        "obs_time": spain_data["OBS_TIME_6"],
        # "event": spain_data["EVENT"],
        # "obs_time": spain_data["OBS_TIME"],
        "KS": spain_data["KS"]
    }
)

In [None]:
from utils import bootstrap_prevalence_vte
def get_pair_counts_and_vte(df, ks_condition, cif_condition, alpha=0.05, time_of_interest=180.0,
                            n_bootstrap=2000):
    """Count the patients in one risk-table cell and bootstrap their VTE estimate.

    NOTE: this definition shadows the function of the same name imported from
    ``utils`` at the top of the notebook.

    Parameters
    ----------
    df : DataFrame with ``obs_time`` and ``event`` columns.
    ks_condition, cif_condition : boolean masks aligned with ``df``; rows where
        both are True form the cell.
    alpha : two-sided level of the percentile interval (0.05 -> 2.5th / 97.5th
        percentiles).
    time_of_interest : forwarded to ``bootstrap_prevalence_vte``.
    n_bootstrap : number of bootstrap replicates, forwarded to
        ``bootstrap_prevalence_vte`` (previously hard-coded to 2000).

    Returns
    -------
    (pair_count, mean, lower, upper, vte_estimates) where mean / lower / upper
    are rounded to two decimals and ``vte_estimates`` holds the raw bootstrap
    estimates.
    """
    filtered_df = df[ks_condition & cif_condition]
    pair_count = len(filtered_df)
    vte_estimates = bootstrap_prevalence_vte(filtered_df["obs_time"],
                                             filtered_df["event"],
                                             n_bootstrap=n_bootstrap,
                                             time_of_interest=time_of_interest)
    lower = np.percentile(vte_estimates, 100 * (alpha / 2))
    upper = np.percentile(vte_estimates, 100 * (1 - alpha / 2))
    mean = np.mean(vte_estimates)
    return pair_count, round(mean, 2), round(lower, 2), round(upper, 2), vte_estimates


In [None]:
# Cross-tabulate the KS risk group (KS >= 2 vs KS < 2) against the model's risk
# group (predicted CIF >= int_risk_ppv vs below it) and estimate VTE in each cell.
# NOTE(review): the origin of the 0.09 threshold is not shown in this notebook.
int_risk_ppv=.09
high_risk_condition = (cif_df.KS >= 2)
low_risk_condition = (cif_df.KS < 2)
high_cif_condition = (cif_df.cif >= int_risk_ppv)
low_cif_condition = (cif_df.cif < int_risk_ppv)

# get_pair_counts_and_vte (as redefined in this notebook) returns a fifth
# element, the raw bootstrap estimates. Unpacking only four names raised
# "ValueError: too many values to unpack"; the starred name absorbs any extras.
(concordant_pairs_high_risk,
 concordant_high_risk_vte,
 concordant_high_risk_vte_lower,
 concordant_high_risk_vte_upper,
 *_) = get_pair_counts_and_vte(cif_df, high_risk_condition, high_cif_condition)

(discordant_pairs_high_risk,
 discordant_high_risk_vte,
 discordant_high_risk_vte_lower,
 discordant_high_risk_vte_upper,
 *_) = get_pair_counts_and_vte(cif_df, low_risk_condition, high_cif_condition)

(concordant_pairs_low_risk,
 concordant_low_risk_vte,
 concordant_low_risk_vte_lower,
 concordant_low_risk_vte_upper,
 *_) = get_pair_counts_and_vte(cif_df, low_risk_condition, low_cif_condition)

(discordant_pairs_low_risk,
 discordant_low_risk_vte,
 discordant_low_risk_vte_lower,
 discordant_low_risk_vte_upper,
 *_) = get_pair_counts_and_vte(cif_df, high_risk_condition, low_cif_condition)


In [None]:
data = {
    "Concordant Pairs": {
        "KS": ["High Risk", "Low Risk"],
        "DeepVTE": ["High Risk", "Low Risk"],
        "No": [
           concordant_pairs_high_risk,
           concordant_pairs_low_risk,
        ],
        "Incidence VTE": [concordant_high_risk_vte, concordant_low_risk_vte],
        "CI": [f"({concordant_high_risk_vte_lower}, {concordant_high_risk_vte_upper})",
               f"({concordant_low_risk_vte_lower}, {concordant_low_risk_vte_upper})"]
    },
    "Discordant Pairs": {
        "KS": ["Low Risk", "High Risk"],
        "DeepVTE": ["High Risk", "Low Risk"],
        "No": [
            discordant_pairs_high_risk,
            discordant_pairs_low_risk,
        ],
        "Incidence VTE": [discordant_high_risk_vte, discordant_low_risk_vte],
        "CI": [f"({discordant_high_risk_vte_lower}, {discordant_high_risk_vte_upper})",
               f"({discordant_low_risk_vte_lower}, {discordant_low_risk_vte_upper})"]
    },
}

In [None]:
pd.concat([pd.DataFrame.from_dict(data["Concordant Pairs"]), pd.DataFrame.from_dict(data["Discordant Pairs"])])

In [None]:
pd.concat([pd.DataFrame.from_dict(data["Concordant Pairs"]),
           pd.DataFrame.from_dict(data["Discordant Pairs"])]).to_csv("classifiction_table_oncothromb.csv")

In [None]:
plot_grouped_risks(cif_full, y[0], y[1],
                   time_of_interest=181,
                   save=True,
                   name="External Cohort B Validation Set")

In [None]:
# Calibration of the ensemble without transfer learning.
# cif_df only has the columns cif / event / obs_time / KS; the former
# cif_non_tl / event_6 / obs_time_6 names raised AttributeError. `cif` holds the
# non-transfer-learning prediction and event / obs_time the 6-month outcome.
plot_calibration(cif_df["cif"],
                 events=np.array(cif_df["event"]),
                 durations=np.array(cif_df["obs_time"]),
                 feature="ext_spain",
                 name="ONCOTHROMB",
                 save=True)

In [None]:
# Bootstrapped c-index of the transfer-learning prediction; printed as "c (a, b)".
# NOTE(review): cif_df as built in this notebook has only the columns
# cif / event / obs_time / KS; `cif_tl`, `event_6` and `obs_time_6` are never
# created, so this cell fails until pooled transfer-learning predictions are
# added to cif_df.
a,b,c,_=bootstrap_ci(cif_df, concordance_index, "event_6", "cif_tl", "obs_time_6")

print(f"{c} ({a}, {b})")

In [None]:
# Bootstrapped c-index of the KS column on the 6-month outcome.
# Uses the column names cif_df actually has (event / obs_time); the former
# event_6 / obs_time_6 names do not exist in cif_df. Result names follow the
# earlier bootstrap_ci cells (lower, upper, mean).
lower, upper, mean, _ = bootstrap_ci(cif_df, concordance_index, "event", "KS", "obs_time")
print(f"{mean} ({lower}, {upper})")

In [None]:
# Bootstrapped c-index of the ensemble prediction without transfer learning.
# Uses the column names cif_df actually has (cif / event / obs_time); the
# former cif_non_tl / event_6 / obs_time_6 names do not exist in cif_df.
lower, upper, mean, _ = bootstrap_ci(cif_df, concordance_index, "event", "cif", "obs_time")
print(f"{mean} ({lower}, {upper})")

In [None]:
# ROC of the ensemble prediction without transfer learning, using the columns
# cif_df actually has (`cif`, `event`) instead of the missing cif_non_tl / event_6.
plot_roc(cif_df, "cif", "event", "df", "df")

In [None]:
# ROC of the transfer-learning prediction.
# NOTE(review): cif_df has no `cif_tl` or `event_6` column in this notebook;
# this cell fails until pooled transfer-learning predictions are added to cif_df.
plot_roc(cif_df, "cif_tl", "event_6", "df", "df")

In [None]:
from sklearn.calibration import calibration_curve
# Quantile-binned calibration curve (5 bins) for the prediction without
# transfer learning. Uses the columns cif_df actually has (`event`, `cif`);
# the former event_6 / cif_non_tl names do not exist in cif_df.
fraction_of_positives, mean_predicted_value = calibration_curve(
    cif_df["event"] == 1, cif_df["cif"], n_bins=5, strategy="quantile"
)

# Plot the calibration curve
plt.plot(mean_predicted_value, fraction_of_positives, 's-', label='Model')
plt.plot([0, .25], [0, .25], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted value')
plt.ylabel('Fraction of positives')
plt.legend()
plt.show()

In [None]:
# Quantile-binned calibration curve (5 bins) for the transfer-learning prediction.
# NOTE(review): cif_df has no `cif_tl` or `event_6` column in this notebook;
# this cell fails until pooled transfer-learning predictions are added to cif_df.
from sklearn.calibration import calibration_curve
fraction_of_positives, mean_predicted_value = calibration_curve(cif_df.event_6==1, cif_df.cif_tl, n_bins=5, strategy="quantile")

# Plot the calibration curve
plt.plot(mean_predicted_value, fraction_of_positives, 's-', label='Model')
plt.plot([0, .25], [0, .25], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted value')
plt.ylabel('Fraction of positives')
plt.legend()
plt.show()

In [None]:
# Calibration plot for the ensemble without transfer learning.
# Uses the columns cif_df actually has (cif / event / obs_time); the former
# cif_non_tl / event_6 / obs_time_6 names raised AttributeError.
plot_calibration(cif_df["cif"],
                 events=np.array(cif_df["event"]),
                 durations=np.array(cif_df["obs_time"]),
                 feature="ext_spain",
                 name="ONCOTHROMB - Without Transfer Learning",
                 save=True)

In [None]:
# Calibration plot for the transfer-learning prediction.
# NOTE(review): cif_df has no `cif_tl`, `event_6` or `obs_time_6` column in this
# notebook; this cell fails until pooled transfer-learning predictions are
# added to cif_df.
plot_calibration(cif_df.cif_tl,
                 events=np.array(cif_df.event_6), 
                 durations=np.array(cif_df.obs_time_6),
                 feature="ext_spain",
                 name="ONCOTHROMB - With Transfer Learning",
                 save=True)