# Radiomic feature selection

This notebook is designed to compute and save the data used for radiomics analysis.

In [None]:
import os

import env_apps

import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from constants import *
from src.data.datasets import Mask, TableDataset
from src.data.processing.sampling import extract_masks


We define our clinical features.

In [None]:
# Clinical variables that are passed to the table dataset as continuous features.
AGE = Feature(column="AGE")
PSA = Feature(column="PSA")

# Clinical variables that are passed to the table dataset as categorical features.
# The clinical stage is stored as text and encoded to integers on the way in.
CLINICAL_STAGE = Feature(
    column="CLINICAL_STAGE",
    transform=MappingEncoding({"T1-T2": 0, "T3a": 1}),
)
GLEASON_GLOBAL, GLEASON_PRIMARY, GLEASON_SECONDARY = (
    Feature(column=name)
    for name in ("GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY")
)


We extract the masks.

In [None]:
# Load the split masks from the JSON file stored under MASKS_PATH. The result is
# consumed below as a mapping whose values are indexed with Mask.TRAIN / Mask.VALID /
# Mask.TEST / Mask.INNER.
# NOTE(review): k and l presumably are the numbers of outer and inner splits - confirm
# against extract_masks.
masks = extract_masks(
    os.path.join(MASKS_PATH, "masks.json"),
    k=5,
    l=5,
)

We create a plotting function.

In [None]:
def plot_features_importance(forest, columns, figsize=(12, 6), top_n=10):
    """
    Plot the most important features of a fitted forest as a horizontal bar chart.

    Bars show the impurity-based importance (``forest.feature_importances_``) and the
    error bars show the standard deviation of that importance across the individual
    trees in ``forest.estimators_``.

    Parameters
    ----------
    forest
        Fitted ensemble exposing ``feature_importances_`` and ``estimators_``.
    columns
        Feature names, in the same order as the columns the forest was fitted on.
        May be a pandas Index, a list/array of names, or the DataFrame itself (its
        column labels are then used).
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.
    top_n : int
        Maximum number of features to draw (the ``top_n`` largest importances).
    """
    # Indexing a DataFrame with integer positions selects columns by *label* and fails,
    # so fall back to its column labels when the whole frame is passed.
    if isinstance(columns, pd.DataFrame):
        columns = columns.columns
    labels = np.asarray(columns)

    importances = np.asarray(forest.feature_importances_)
    per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])

    # Never ask for more bars than there are features (or fewer than zero).
    n_bars = max(0, min(int(top_n), len(importances)))

    # argsort is ascending, so the tail holds the largest importances and the most
    # important feature ends up as the top bar.
    ascending_order = np.argsort(importances)
    top_idx = ascending_order[len(ascending_order) - n_bars:]

    std = np.std(per_tree_importances, axis=0)[top_idx]
    bar_positions = np.arange(n_bars) + 0.5

    fig, ax = plt.subplots(1, 1, figsize=figsize)
    ax.barh(bar_positions, importances[top_idx], height=0.7, xerr=std)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(labels[top_idx])
    ax.set_title("Feature importances using MDI")
    # The bars are horizontal, so the importance values run along the x axis.
    ax.set_xlabel("Mean decrease in impurity")
    ax.set_ylim((0, n_bars))
    fig.tight_layout()
    plt.show()


We create a function to retrieve targets.

In [None]:
def get_targets(path: str, target_column: str):
    """
    Read one column of a CSV file and return it as a flat numpy array.

    Parameters
    ----------
    path : str
        Path to the CSV file.
    target_column : str
        Name of the column holding the targets.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with one entry per row of the CSV file.
    """
    table = pd.read_csv(path)
    # Select as a single-column frame, then flatten to 1-D.
    return table.loc[:, [target_column]].to_numpy().ravel()


We create a function to extract the radiomic features from a CSV file.

In [None]:
def get_radiomics_df(path: str, modality: str):
    """
    Load the radiomic features of one imaging modality from a CSV file.

    Rows are ordered by the "ID" column and re-indexed 0..n-1. The "ID" column and every
    column starting with "diagnostics" are dropped; for the "PT" modality, columns
    starting with "original_shape" are dropped as well. Every remaining column is
    prefixed with ``"{modality}_"``.

    Parameters
    ----------
    path : str
        Path to the radiomics CSV file. It must contain an "ID" column.
    modality : str
        Modality tag used as the column prefix (e.g. "CT" or "PT").

    Returns
    -------
    pandas.DataFrame
        Feature table sorted by ID, with a fresh positional index.
    """
    # sort_values keeps the original row labels. Without resetting them, a later
    # label-aligned pd.concat(axis=1) would pair rows by their position in the CSV
    # file and silently undo the ID ordering.
    radiomics_df = pd.read_csv(path).sort_values(["ID"]).reset_index(drop=True)

    def is_feature(column: str) -> bool:
        if column == "ID" or column.startswith("diagnostics"):
            return False
        # NOTE(review): shape features are presumably skipped for PT because they
        # would duplicate the CT ones - confirm.
        if modality == "PT" and column.startswith("original_shape"):
            return False
        return True

    kept_columns = [column for column in radiomics_df.columns if is_feature(column)]

    radiomics_df = radiomics_df[kept_columns]
    return radiomics_df.rename(columns=lambda x: f"{modality}_{x}")


We create a function to save dataframes containing both clinical and radiomics data.

In [None]:
def _select_top_radiomics(radiomics_df, targets, fit_indices, n_features, n_estimators, random_state):
    """Return the names of the `n_features` radiomic columns a random forest ranks highest."""
    fit_targets = targets[fit_indices]
    # Rows whose target is missing cannot be used to fit the classifier.
    labelled = ~np.isnan(fit_targets)

    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(radiomics_df.iloc[fit_indices][labelled], fit_targets[labelled])

    ranking = pd.Series(forest.feature_importances_, index=radiomics_df.columns)
    return list(ranking.nlargest(n=n_features).index)


def save_outer_splits_dataframes(
    path_to_clinical_df: str,
    path_to_ct_radiomics_df: str,
    path_to_pt_radiomics_df: str,
    path_to_folder: str,
    masks: dict,
    mapping: dict = None,
    n_features: int = 6,
    n_estimators: int = 10000,
    random_state: int = 0,
    save: bool = True
):
    """
    Build, for every outer split, one table with clinical data and selected radiomics,
    and write it to ``<path_to_folder>/outer_split_<k>.csv``.

    For each outer split and each task in TABLE_TASKS, a random forest is fitted on the
    radiomic features of the train + valid rows only (rows with a missing target are
    skipped), and the ``n_features`` most important radiomic columns are appended to the
    imputed clinical table, prefixed with the task's target column name. Rows are then
    stacked in train / valid / test order with a "SETS" column naming their set.

    Parameters
    ----------
    path_to_clinical_df : str
        CSV file with the clinical table.
    path_to_ct_radiomics_df : str
        CSV file with the CT radiomic features.
    path_to_pt_radiomics_df : str
        CSV file with the PT radiomic features.
    path_to_folder : str
        Folder in which the per-split CSV files are written (created if missing).
    masks : dict
        Mapping from outer-split key to a mapping holding the Mask.TRAIN, Mask.VALID
        and Mask.TEST row positions.
    mapping : dict, optional
        If given, applied with ``Series.map`` to the clinical stage column of the
        imputed table (e.g. to turn the encoded values back into labels).
    n_features : int
        Number of radiomic features kept per task.
    n_estimators : int
        Number of trees of the forest used for the ranking.
    random_state : int
        Seed of the forest used for the ranking.
    save : bool
        Set to False to build and display the tables without writing any file.
    """
    learning_df = pd.read_csv(path_to_clinical_df)

    table_dataset = TableDataset(
        df=learning_df,
        ids_col=ID,
        tasks=TABLE_TASKS,
        cont_features=[AGE, PSA],
        cat_features=[CLINICAL_STAGE, GLEASON_GLOBAL, GLEASON_PRIMARY, GLEASON_SECONDARY]
    )

    ct_radiomics_df = get_radiomics_df(path_to_ct_radiomics_df, "CT")
    pt_radiomics_df = get_radiomics_df(path_to_pt_radiomics_df, "PT")
    radiomics_df = pd.concat([ct_radiomics_df, pt_radiomics_df], axis=1)

    if save and path_to_folder:
        os.makedirs(path_to_folder, exist_ok=True)

    for k, v in masks.items():
        print(f"Outer split {k}")
        train_mask, valid_mask, test_mask = v[Mask.TRAIN], v[Mask.VALID], v[Mask.TEST]
        table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
        dataframe = table_dataset.imputed_df.copy()

        if mapping:
            dataframe[CLINICAL_STAGE.column] = dataframe[CLINICAL_STAGE.column].map(mapping)

        # Feature selection only ever sees the train + valid rows of the split.
        # list(...) keeps this a concatenation even if the masks are arrays, for which
        # "+" would add element-wise.
        fit_indices = list(train_mask) + list(valid_mask)

        dataframes = [dataframe]
        for task in TABLE_TASKS:
            targets = np.array(dataframe.loc[:, [task.target_column]]).ravel()
            most_important_features = _select_top_radiomics(
                radiomics_df, targets, fit_indices, n_features, n_estimators, random_state
            )
            radiomics = radiomics_df[most_important_features].copy()
            radiomics = radiomics.rename(columns=lambda x: f"{task.target_column}_{x}")
            dataframes.append(radiomics)

        named_masks = {"train": train_mask, "valid": valid_mask, "test": test_mask}
        # NOTE(review): this concat aligns rows on index labels while the masks are
        # positional - verify that the imputed table and the radiomics table share the
        # same index.
        dataframe = pd.concat(dataframes, axis=1)
        display(dataframe)
        dataframe = pd.concat(
            objs=[dataframe.iloc[mask].assign(SETS=name) for name, mask in named_masks.items()],
            ignore_index=True
        )

        if save:
            dataframe.to_csv(os.path.join(path_to_folder, f"outer_split_{k}.csv"), index=False)


In [None]:
save_outer_splits_dataframes(
    path_to_clinical_df="local_data/learning_table.csv",
    path_to_ct_radiomics_df="local_data/ct_radiomics.csv",
    path_to_pt_radiomics_df="local_data/pt_radiomics.csv",
    path_to_folder="local_data/radiomics/",
    masks=masks,
    mapping={0: "T1-T2", 1: "T3a"}
)

In [None]:
# NOTE(review): in the cells visible here, `radiomics_df` and `targets` are only assigned
# inside function bodies, never at notebook level - make sure a cell defines both before
# this one, otherwise Restart & Run All fails here.
forest = RandomForestClassifier(n_estimators=10000, random_state=0)
forest.fit(radiomics_df, targets)

# Pass the column labels rather than the DataFrame: the plotting function indexes
# `columns` with integer positions, which on a DataFrame is a column lookup by label.
plot_features_importance(forest, radiomics_df.columns, (12, 28))


In [None]:
# Rank every radiomic column by the importance the fitted forest assigned to it,
# then keep the six highest-ranked columns.
importances = forest.feature_importances_
forest_importances = pd.Series(data=importances, index=radiomics_df.columns)
most_important_features = forest_importances.nlargest(6).index.tolist()
radiomics = radiomics_df.loc[:, most_important_features]
display(radiomics)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import balanced_accuracy_score

# Reload the split masks, then fit one class-balanced logistic regression per outer
# split on the selected radiomic features and score it on that split's test rows.
# NOTE(review): `radiomics` comes from a forest that was fitted on all of `targets`,
# test rows included, so these scores may be optimistic - confirm.
masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

logistic_auc, logistic_balanced, patients = [], [], []
for k, v in masks.items():
    train_mask, valid_mask, test_mask, inner_masks = (
        v[key] for key in (Mask.TRAIN, Mask.VALID, Mask.TEST, Mask.INNER)
    )

    # Train and validation rows are pooled for fitting.
    fit_rows = train_mask + valid_mask
    X_train, y_train = radiomics.iloc[fit_rows].to_numpy(), targets[fit_rows]
    X_test, y_test = radiomics.iloc[test_mask].to_numpy(), targets[test_mask]

    clf = LogisticRegression(random_state=0, class_weight="balanced")
    clf.fit(X_train, y_train)

    # Probability of the positive class feeds the AUC; hard labels feed the accuracy.
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)
    logistic_auc.append(roc_auc_score(y_test, y_pred_proba))
    logistic_balanced.append(balanced_accuracy_score(y_test, y_pred))

print(logistic_auc)
print(f"Logistic AUC: ({np.mean(logistic_auc)}+-{np.std(logistic_auc)})")
print(f"Logistic Balanced: ({np.mean(logistic_balanced)}+-{np.std(logistic_balanced)})")
