# Nomograms

This notebook is designed to compute and save the clinical data used for nomograms.

In [None]:
import env_apps

import os
import pandas as pd
import numpy as np

from constants import *
from src.data.datasets import Mask, TableDataset
from src.data.processing.sampling import extract_masks


We define our clinical features.

In [None]:
# FEATURES
# Clinical features handed to TableDataset further down. `Feature` and `MappingEncoding` are not imported
# explicitly in this notebook -- presumably they come from `from constants import *` (TODO confirm).
AGE = Feature(column="AGE")
# Two-level clinical stage: "T1-T2" is encoded as 0 and "T3a" as 1.
CLINICAL_STAGE = Feature(column="CLINICAL_STAGE", transform=MappingEncoding({"T1-T2": 0, "T3a": 1}))
# Finer-grained stage read from the "CLINICAL_STAGE_MSKCC_STYLE" column. The encoding is not one-to-one:
# "T2" and "T2a" share the code 0.2, and "T3" and "T3a" share the code 0.8.
CLINICAL_STAGE_MSKCC = Feature(
    column="CLINICAL_STAGE_MSKCC_STYLE",
    transform=MappingEncoding(
        {"T1c": 0, "T2": 0.2, "T2a": 0.2, "T2b": 0.4, "T2c": 0.6, "T3": 0.8, "T3a": 0.8, "T3b": 1}
    )
)
# Features without a transform: the column is used as found in the table.
GLEASON_GLOBAL = Feature(column="GLEASON_GLOBAL")
GLEASON_PRIMARY = Feature(column="GLEASON_PRIMARY")
GLEASON_SECONDARY = Feature(column="GLEASON_SECONDARY")
PSA = Feature(column="PSA")


We extract the masks.

In [None]:
# Notebook-level dictionary of split masks; `save_outer_splits_dataframes` reads this global directly.
# NOTE(review): k and l are presumably the numbers of outer and inner splits -- confirm against `extract_masks`.
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

We create the function that will be used to save the imputed dataframe of each outer split.

In [None]:
def save_outer_splits_dataframes(
    path_to_df: str,
    path_to_folder: str,
    stage_feature: Feature,
    mapping: dict
) -> None:
    """
    Saves one imputed dataframe per outer split as 'outer_split_{k}.csv'.

    For every entry of the notebook-level `masks` dictionary, the train, valid and test masks of the split are set
    on a `TableDataset` built from the learning table, the dataset's `imputed_df` is copied, the encoded clinical
    stage is mapped back to its labels and every row is tagged with the set it belongs to in a 'SETS' column.

    Parameters
    ----------
    path_to_df : str
        Path to the CSV file containing the learning table.
    path_to_folder : str
        Folder in which the CSV files are written. It is created if it does not exist yet.
    stage_feature : Feature
        Clinical stage feature, used as a categorical feature of the dataset.
    mapping : dict
        Maps the encoded stage values back to their labels. Encoded values that are not keys of this dictionary
        become NaN (behavior of `pandas.Series.map` with a dictionary).
    """
    learning_df = pd.read_csv(path_to_df)

    table_dataset = TableDataset(
        df=learning_df,
        ids_col=ID,
        tasks=TABLE_TASKS,
        cont_features=[AGE, PSA],
        cat_features=[stage_feature, GLEASON_GLOBAL, GLEASON_PRIMARY, GLEASON_SECONDARY]
    )

    # `DataFrame.to_csv` does not create missing directories, so make sure the target folder exists first.
    # An empty path means "current directory" for `os.path.join` and must not be passed to `os.makedirs`.
    if path_to_folder:
        os.makedirs(path_to_folder, exist_ok=True)

    for split_key, split_masks in masks.items():
        train_mask, valid_mask, test_mask = split_masks[Mask.TRAIN], split_masks[Mask.VALID], split_masks[Mask.TEST]

        # NOTE(review): `imputed_df` presumably depends on the masks currently set -- confirm in TableDataset.
        table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
        dataframe = table_dataset.imputed_df.copy()

        # Decode the stage back to its labels.
        dataframe[stage_feature.column] = dataframe[stage_feature.column].map(mapping)

        # Only rows selected by one of the three masks are kept; each one is labelled with its set name.
        named_masks = {"train": train_mask, "valid": valid_mask, "test": test_mask}
        dataframe = pd.concat(
            objs=[dataframe.iloc[mask].assign(SETS=name) for name, mask in named_masks.items()],
            ignore_index=True
        )

        dataframe.to_csv(os.path.join(path_to_folder, f"outer_split_{split_key}.csv"), index=False)


We create the function that will be used to save the final dataframe.

In [None]:
def save_final_dataframe(
    path_to_learning_df: str,
    path_to_holdout_df: str,
    path_to_folder: str,
    stage_feature: Feature,
    mapping: dict
) -> None:
    """
    Saves the imputed learning + holdout table as 'final_set.csv'.

    The learning and holdout tables are stacked (learning rows first), the learning rows are used as the train mask
    and the holdout rows as the test mask of a `TableDataset`. The dataset's `imputed_df` is copied, the encoded
    clinical stage is mapped back to its labels and every row is tagged "train" or "test" in a 'SETS' column.

    Parameters
    ----------
    path_to_learning_df : str
        Path to the CSV file containing the learning table.
    path_to_holdout_df : str
        Path to the CSV file containing the holdout table.
    path_to_folder : str
        Folder in which 'final_set.csv' is written. It is created if it does not exist yet.
    stage_feature : Feature
        Clinical stage feature, used as a categorical feature of the dataset.
    mapping : dict
        Maps the encoded stage values back to their labels. Encoded values that are not keys of this dictionary
        become NaN (behavior of `pandas.Series.map` with a dictionary).
    """
    learning_df = pd.read_csv(path_to_learning_df)
    holdout_df = pd.read_csv(path_to_holdout_df)

    # With `ignore_index=True`, positions [0, len(learning_df)) are learning rows and the rest are holdout rows.
    df = pd.concat([learning_df, holdout_df], ignore_index=True)

    table_dataset = TableDataset(
        df=df,
        ids_col=ID,
        tasks=TABLE_TASKS,
        cont_features=[AGE, PSA],
        cat_features=[stage_feature, GLEASON_GLOBAL, GLEASON_PRIMARY, GLEASON_SECONDARY]
    )

    n_learning, n_holdout = len(learning_df), len(holdout_df)
    train_mask = list(range(n_learning))
    test_mask = list(range(n_learning, n_learning + n_holdout))

    # NOTE(review): `imputed_df` presumably depends on the masks currently set -- confirm in TableDataset.
    table_dataset.update_masks(
        train_mask=train_mask,
        test_mask=test_mask
    )

    dataframe = table_dataset.imputed_df.copy()
    dataframe[stage_feature.column] = dataframe[stage_feature.column].map(mapping)

    named_masks = {"train": train_mask, "test": test_mask}
    dataframe = pd.concat(
        objs=[dataframe.iloc[mask].assign(SETS=name) for name, mask in named_masks.items()],
        ignore_index=True
    )

    # `DataFrame.to_csv` does not create missing directories, so make sure the target folder exists first.
    # An empty path means "current directory" for `os.path.join` and must not be passed to `os.makedirs`.
    if path_to_folder:
        os.makedirs(path_to_folder, exist_ok=True)

    dataframe.to_csv(os.path.join(path_to_folder, "final_set.csv"), index=False)


## CAPRA

In [None]:
# Write one CSV per outer split for the CAPRA nomogram. The mapping is the inverse of the encoding of
# CLINICAL_STAGE (0 -> "T1-T2", 1 -> "T3a"), so the saved stage column holds labels again.
save_outer_splits_dataframes(
    path_to_df="local_data/learning_table.csv", 
    path_to_folder="local_data/nomograms/CAPRA/",
    stage_feature=CLINICAL_STAGE,
    mapping={0: "T1-T2", 1: "T3a"}
)

In [None]:
# Write the learning + holdout table for the CAPRA nomogram, decoding the stage with the inverse of the
# encoding of CLINICAL_STAGE.
save_final_dataframe(
    path_to_learning_df="local_data/learning_table.csv",
    path_to_holdout_df="local_data/holdout_table.csv",
    path_to_folder="local_data/nomograms/CAPRA/",
    stage_feature=CLINICAL_STAGE,
    mapping={0: "T1-T2", 1: "T3a"}
)

## MSKCC

In [None]:
# Write one CSV per outer split for the MSKCC nomogram. The decoding is not the exact inverse of the encoding of
# CLINICAL_STAGE_MSKCC: "T2" and "T2a" are both encoded as 0.2 and "T3" and "T3a" as 0.8, so they are written
# back as "T2a" and "T3a" respectively.
save_outer_splits_dataframes(
    path_to_df="local_data/mskcc_learning_table.csv", 
    path_to_folder="local_data/nomograms/MSKCC/",
    stage_feature=CLINICAL_STAGE_MSKCC,
    mapping={0: "T1c", 0.2: "T2a", 0.4: "T2b", 0.6: "T2c", 0.8: "T3a", 1: "T3b"}
)

In [None]:
# Write the learning + holdout table for the MSKCC nomogram. As the encoding of CLINICAL_STAGE_MSKCC gives the
# same code to "T2"/"T2a" (0.2) and to "T3"/"T3a" (0.8), these stages are written back as "T2a" and "T3a".
save_final_dataframe(
    path_to_learning_df="local_data/mskcc_learning_table.csv",
    path_to_holdout_df="local_data/mskcc_holdout_table.csv",
    path_to_folder="local_data/nomograms/MSKCC/",
    stage_feature=CLINICAL_STAGE_MSKCC,
    mapping={0: "T1c", 0.2: "T2a", 0.4: "T2b", 0.6: "T2c", 0.8: "T3a", 1: "T3b"}
)

## Custom

In [None]:
# Dataset shared by all the cells of this section: each of them calls `update_masks` on it for every outer
# split, so the masks held by this object are those of the last split processed by the last executed cell.
learning_df = pd.read_csv("local_data/learning_table.csv")

# Same feature set as the CAPRA tables (two-level CLINICAL_STAGE encoding).
table_dataset = TableDataset(
    df=learning_df,
    ids_col=ID,
    tasks=TABLE_TASKS,
    cont_features=[AGE, PSA],
    cat_features=[CLINICAL_STAGE, GLEASON_GLOBAL, GLEASON_PRIMARY, GLEASON_SECONDARY]
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import balanced_accuracy_score

# Baseline classifiers for the "PN" target: a logistic regression and an MLP are fitted on every outer split and
# scored on its test mask (ROC AUC and balanced accuracy).
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

# Single definition of the covariates, shared by the train and the test design matrices.
feature_columns = ["AGE", "CLINICAL_STAGE", "GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY", "PSA"]

logistic_auc = []
logistic_balanced = []

mlp_auc = []
mlp_balanced = []

# IDs of the test patients of each outer split, kept so that the overlap between test sets can be inspected.
patients = []
for split_masks in masks.values():
    train_mask, valid_mask, test_mask = split_masks[Mask.TRAIN], split_masks[Mask.VALID], split_masks[Mask.TEST]
    table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
    dataframe = table_dataset.imputed_df.copy()

    # The validation rows are appended to the training rows; `+` is a concatenation only if both masks are
    # lists -- NOTE(review): confirm that `extract_masks` returns lists and not arrays.
    train_dataframe = dataframe.iloc[train_mask + valid_mask]
    test_dataframe = dataframe.iloc[test_mask]

    patients.append(test_dataframe[ID].to_list())

    X_train = train_dataframe[feature_columns].to_numpy()
    y_train = train_dataframe[["PN"]].to_numpy().ravel()

    X_test = test_dataframe[feature_columns].to_numpy()
    y_test = test_dataframe[["PN"]].to_numpy().ravel()

    # Logistic regression; column 1 of `predict_proba` is the probability of the second class of `clf.classes_`.
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)
    logistic_auc.append(roc_auc_score(y_test, y_pred_proba))
    logistic_balanced.append(balanced_accuracy_score(y_test, y_pred))

    # Multi-layer perceptron, scored exactly like the logistic regression.
    clf = MLPClassifier(random_state=0, max_iter=500).fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)
    mlp_auc.append(roc_auc_score(y_test, y_pred_proba))
    mlp_balanced.append(balanced_accuracy_score(y_test, y_pred))
    
def find_intersection(list_of_lists):
    """
    Returns the elements that are present in every list of `list_of_lists`.

    Parameters
    ----------
    list_of_lists : list
        Lists of hashable elements.

    Returns
    -------
    intersection_list : list
        Elements common to all the lists, without duplicates and in no particular order. An empty list is
        returned when `list_of_lists` is empty.
    """
    if not list_of_lists:
        return []

    # Seed the intersection with the first list and narrow it down with all the other lists in a single call.
    first_list, other_lists = list_of_lists[0], list_of_lists[1:]
    return list(set(first_list).intersection(*other_lists))

# Number of patients shared by the test sets of every pair of outer splits, printed in the order
# (0, 1), (0, 2), ..., (0, n-1), (1, 2), ... so that any number of splits is covered.
for first in range(len(patients)):
    for second in range(first + 1, len(patients)):
        print(len(find_intersection([patients[first], patients[second]])))

# Per-split AUC of the logistic regression, followed by the mean and standard deviation of every score.
print(logistic_auc)
print(f"Logistic AUC: ({np.mean(logistic_auc)}+-{np.std(logistic_auc)})")
print(f"Logistic Balanced: ({np.mean(logistic_balanced)}+-{np.std(logistic_balanced)})")
print(f"MLP AUC: ({np.mean(mlp_auc)}+-{np.std(mlp_auc)})")
print(f"MLP Balanced: ({np.mean(mlp_balanced)}+-{np.std(mlp_balanced)})")


In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# Coxnet baseline for the BCR task: one model is fitted per outer split and scored with the concordance index
# on the test mask of the split.
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

# Column names are defined once instead of being repeated for every train/test selection.
feature_columns = ["AGE", "CLINICAL_STAGE", "GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY", "PSA"]
event_column, time_column = "BCR", "BCR_TIME"
selected_columns = feature_columns + [event_column, time_column]

bcr_cindex = []

for split_masks in masks.values():
    train_mask, valid_mask, test_mask = split_masks[Mask.TRAIN], split_masks[Mask.VALID], split_masks[Mask.TEST]
    table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
    dataframe = table_dataset.imputed_df.copy()

    # Validation rows are appended to the training rows; rows with a missing value in any selected column
    # (covariates, event indicator or event time) are discarded.
    train_dataframe = dataframe.iloc[train_mask + valid_mask][selected_columns].dropna()
    test_dataframe = dataframe.iloc[test_mask][selected_columns].dropna()

    X_train = train_dataframe[feature_columns].to_numpy()
    X_test = test_dataframe[feature_columns].to_numpy()

    # Targets are turned into structured arrays with a boolean event field followed by the time field.
    train_targets = train_dataframe[[event_column, time_column]].astype({event_column: bool})
    train_records = train_targets.to_records(index=False)
    y_train = np.array(train_records, dtype=train_records.dtype.descr)

    test_targets = test_dataframe[[event_column, time_column]].astype({event_column: bool})
    test_records = test_targets.to_records(index=False)
    y_test = np.array(test_records, dtype=test_records.dtype.descr)

    clf = CoxnetSurvivalAnalysis().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # The first element of the tuple returned by `concordance_index_censored` is the concordance index.
    bcr_cindex.append(concordance_index_censored(y_test[event_column], y_test[time_column], y_pred)[0])

print(f"BCR C-index: ({np.mean(bcr_cindex)}+-{np.std(bcr_cindex)})")


In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# Coxnet baseline for the CRPC task: one model is fitted per outer split and scored with the concordance index
# on the test mask of the split.
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

# Column names are defined once instead of being repeated for every train/test selection.
feature_columns = ["AGE", "CLINICAL_STAGE", "GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY", "PSA"]
event_column, time_column = "CRPC", "CRPC_TIME"
selected_columns = feature_columns + [event_column, time_column]

crpc_cindex = []

for split_masks in masks.values():
    train_mask, valid_mask, test_mask = split_masks[Mask.TRAIN], split_masks[Mask.VALID], split_masks[Mask.TEST]
    table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
    dataframe = table_dataset.imputed_df.copy()

    # Validation rows are appended to the training rows; rows with a missing value in any selected column
    # (covariates, event indicator or event time) are discarded.
    train_dataframe = dataframe.iloc[train_mask + valid_mask][selected_columns].dropna()
    test_dataframe = dataframe.iloc[test_mask][selected_columns].dropna()

    X_train = train_dataframe[feature_columns].to_numpy()
    X_test = test_dataframe[feature_columns].to_numpy()

    # Targets are turned into structured arrays with a boolean event field followed by the time field.
    train_targets = train_dataframe[[event_column, time_column]].astype({event_column: bool})
    train_records = train_targets.to_records(index=False)
    y_train = np.array(train_records, dtype=train_records.dtype.descr)

    test_targets = test_dataframe[[event_column, time_column]].astype({event_column: bool})
    test_records = test_targets.to_records(index=False)
    y_test = np.array(test_records, dtype=test_records.dtype.descr)

    clf = CoxnetSurvivalAnalysis().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # The first element of the tuple returned by `concordance_index_censored` is the concordance index.
    crpc_cindex.append(concordance_index_censored(y_test[event_column], y_test[time_column], y_pred)[0])

print(f"CRPC C-index: ({np.mean(crpc_cindex)}+-{np.std(crpc_cindex)})")


In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# Coxnet baseline for the DEATH task: one model is fitted per outer split and scored with the concordance index
# on the test mask of the split. Splits that cannot be scored are reported instead of being silently ignored.
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

# Column names are defined once instead of being repeated for every train/test selection.
feature_columns = ["AGE", "CLINICAL_STAGE", "GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY", "PSA"]
event_column, time_column = "DEATH", "DEATH_TIME"
selected_columns = feature_columns + [event_column, time_column]

death_cindex = []

for split_key, split_masks in masks.items():
    train_mask, valid_mask, test_mask = split_masks[Mask.TRAIN], split_masks[Mask.VALID], split_masks[Mask.TEST]
    table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
    dataframe = table_dataset.imputed_df.copy()

    # Validation rows are appended to the training rows; rows with a missing value in any selected column
    # (covariates, event indicator or event time) are discarded.
    train_dataframe = dataframe.iloc[train_mask + valid_mask][selected_columns].dropna()
    test_dataframe = dataframe.iloc[test_mask][selected_columns].dropna()

    X_train = train_dataframe[feature_columns].to_numpy()
    X_test = test_dataframe[feature_columns].to_numpy()

    # Targets are turned into structured arrays with a boolean event field followed by the time field.
    train_targets = train_dataframe[[event_column, time_column]].astype({event_column: bool})
    train_records = train_targets.to_records(index=False)
    y_train = np.array(train_records, dtype=train_records.dtype.descr)

    test_targets = test_dataframe[[event_column, time_column]].astype({event_column: bool})
    test_records = test_targets.to_records(index=False)
    y_test = np.array(test_records, dtype=test_records.dtype.descr)

    clf = CoxnetSurvivalAnalysis().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    try:
        death_cindex.append(concordance_index_censored(y_test[event_column], y_test[time_column], y_pred)[0])
    except ValueError as error:
        # NOTE(review): presumably raised when the test rows contain no observed death -- confirm. The split is
        # still left out of the statistics below, but the omission is now visible in the cell output.
        print(f"Outer split {split_key} skipped for the DEATH C-index: {error}")

print(f"DEATH C-index: ({np.mean(death_cindex)}+-{np.std(death_cindex)})")


In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# Coxnet baseline for the METASTASIS task: one model is fitted per outer split and scored with the concordance
# index on the test mask of the split.
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

# Column names are defined once instead of being repeated for every train/test selection.
feature_columns = ["AGE", "CLINICAL_STAGE", "GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY", "PSA"]
event_column, time_column = "METASTASIS", "METASTASIS_TIME"
selected_columns = feature_columns + [event_column, time_column]

metastasis_cindex = []

for split_masks in masks.values():
    train_mask, valid_mask, test_mask = split_masks[Mask.TRAIN], split_masks[Mask.VALID], split_masks[Mask.TEST]
    table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
    dataframe = table_dataset.imputed_df.copy()

    # Validation rows are appended to the training rows; rows with a missing value in any selected column
    # (covariates, event indicator or event time) are discarded.
    train_dataframe = dataframe.iloc[train_mask + valid_mask][selected_columns].dropna()
    test_dataframe = dataframe.iloc[test_mask][selected_columns].dropna()

    X_train = train_dataframe[feature_columns].to_numpy()
    X_test = test_dataframe[feature_columns].to_numpy()

    # Targets are turned into structured arrays with a boolean event field followed by the time field.
    train_targets = train_dataframe[[event_column, time_column]].astype({event_column: bool})
    train_records = train_targets.to_records(index=False)
    y_train = np.array(train_records, dtype=train_records.dtype.descr)

    test_targets = test_dataframe[[event_column, time_column]].astype({event_column: bool})
    test_records = test_targets.to_records(index=False)
    y_test = np.array(test_records, dtype=test_records.dtype.descr)

    clf = CoxnetSurvivalAnalysis().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # The first element of the tuple returned by `concordance_index_censored` is the concordance index.
    metastasis_cindex.append(concordance_index_censored(y_test[event_column], y_test[time_column], y_pred)[0])

print(f"METASTASIS C-index: ({np.mean(metastasis_cindex)}+-{np.std(metastasis_cindex)})")


In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# Coxnet baseline for the HTX task: one model is fitted per outer split and scored with the concordance index
# on the test mask of the split.
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

# Column names are defined once instead of being repeated for every train/test selection.
feature_columns = ["AGE", "CLINICAL_STAGE", "GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY", "PSA"]
event_column, time_column = "HTX", "HTX_TIME"
selected_columns = feature_columns + [event_column, time_column]

htx_cindex = []

for split_masks in masks.values():
    train_mask, valid_mask, test_mask = split_masks[Mask.TRAIN], split_masks[Mask.VALID], split_masks[Mask.TEST]
    table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
    dataframe = table_dataset.imputed_df.copy()

    # Validation rows are appended to the training rows; rows with a missing value in any selected column
    # (covariates, event indicator or event time) are discarded.
    train_dataframe = dataframe.iloc[train_mask + valid_mask][selected_columns].dropna()
    test_dataframe = dataframe.iloc[test_mask][selected_columns].dropna()

    X_train = train_dataframe[feature_columns].to_numpy()
    X_test = test_dataframe[feature_columns].to_numpy()

    # Targets are turned into structured arrays with a boolean event field followed by the time field.
    train_targets = train_dataframe[[event_column, time_column]].astype({event_column: bool})
    train_records = train_targets.to_records(index=False)
    y_train = np.array(train_records, dtype=train_records.dtype.descr)

    test_targets = test_dataframe[[event_column, time_column]].astype({event_column: bool})
    test_records = test_targets.to_records(index=False)
    y_test = np.array(test_records, dtype=test_records.dtype.descr)

    clf = CoxnetSurvivalAnalysis().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # The first element of the tuple returned by `concordance_index_censored` is the concordance index.
    htx_cindex.append(concordance_index_censored(y_test[event_column], y_test[time_column], y_pred)[0])

print(f"HTX C-index: ({np.mean(htx_cindex)}+-{np.std(htx_cindex)})")
