In [1]:
import csv
import pickle

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, f1_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit

from domino.data.cxr import get_dp, build_cxr_df, get_cxr_activations, rle2mask


## Extract train/test features from the image-only model

In [2]:
# Get a mosaic DataPanel with the data; `dp` is the input to every later cell.
# `build_cxr_df.out(load=True)` presumably loads the stored output of an earlier
# `build_cxr_df` run instead of rebuilding it — TODO confirm.
df = build_cxr_df.out(load=True)
dp = get_dp(df)
# Preview the first rows of the panel.
dp.head()

Unnamed: 0,image_id (NumpyArrayColumn),encoded_pixels (NumpyArrayColumn),pmx (NumpyArrayColumn),filepath (NumpyArrayColumn),chest_tube (NumpyArrayColumn),split (NumpyArrayColumn),gaze_seq (NumpyArrayColumn),gaze_heatmap (NumpyArrayColumn),gaze_max_visit (NumpyArrayColumn),gaze_unique (NumpyArrayColumn),gaze_time (NumpyArrayColumn),gaze_diffusivity (NumpyArrayColumn),expert_label (NumpyArrayColumn),index (ListColumn),input (CellColumn),img (CellColumn)
0,1.2.276.0.7230010.3.1.4.8323329.6904.151787520...,-1,0,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'0',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
1,1.2.276.0.7230010.3.1.4.8323329.13666.15178752...,557374 2 1015 8 1009 14 1002 20 997 26 990 32 ...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'1',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
2,1.2.276.0.7230010.3.1.4.8323329.11028.15178752...,-1,0,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'2',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
3,1.2.276.0.7230010.3.1.4.8323329.10366.15178752...,514175 10 1008 29 994 30 993 32 991 33 990 34 ...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'3',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
4,1.2.276.0.7230010.3.1.4.8323329.10016.15178752...,592184 33 976 58 956 73 941 88 926 102 917 109...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'4',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...


In [3]:
# Checkpoint of the trained model whose activations are extracted below.
# NOTE(review): absolute, machine-specific path — it will not resolve on other hosts.
model_pth = "/media/nvme_data/observational_results_10_2020/original/cxr/emmental_cam/cam_0/seed_0/best_model_target_cxr_val_accuracy.pth"

# Run the model over `dp`. The preview of `act_dp` shown further down carries the
# extra columns `pred`, `probs` and `activation_block4`.
act_dp = get_cxr_activations(dp=dp, model_path=model_pth)


HBox(children=(FloatProgress(value=0.0, max=95.0), HTML(value='')))




In [4]:
# `train_val_mask` selects the rows that have gaze data: every row whose
# `gaze_seq` entry differs from the literal string "nan". These rows form the
# train/val pool used further down.
# NOTE(review): the mask is built from `dp` but applied to `act_dp`; this assumes
# both panels share the same row order — confirm.
train_val_mask = dp["gaze_seq"].data != "nan"
act_dp[train_val_mask].head()

Unnamed: 0,image_id (NumpyArrayColumn),encoded_pixels (NumpyArrayColumn),pmx (NumpyArrayColumn),filepath (NumpyArrayColumn),chest_tube (NumpyArrayColumn),split (NumpyArrayColumn),gaze_seq (NumpyArrayColumn),gaze_heatmap (NumpyArrayColumn),gaze_max_visit (NumpyArrayColumn),gaze_unique (NumpyArrayColumn),gaze_time (NumpyArrayColumn),gaze_diffusivity (NumpyArrayColumn),expert_label (NumpyArrayColumn),index (ListColumn),input (TensorColumn),img (ListColumn),pred (NumpyArrayColumn),probs (NumpyArrayColumn),activation_block4 (NumpyArrayColumn)
0,1.2.276.0.7230010.3.1.4.8323329.1857.151787516...,378999 2 1018 8 1013 13 1009 15 1007 17 1006 1...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,1.0,train,"[(0.3002531741212215, 0.9348943074817682, 2), ...",[[ 0. 0. 0. 14. 2. 2. 1. 0.]\n [ 0. 0. ...,14.0,12.0,33.0,0.545455,1.0,'29',"np.ndarray(shape=torch.Size([3, 224, 224]))",<PIL.Image.Image image mode=L size=1024x1024 a...,0,"np.ndarray(shape=(2,))","np.ndarray(shape=(2048, 7, 7))"
1,1.2.276.0.7230010.3.1.4.8323329.1219.151787516...,-1,0,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,1.0,train,"[(0.3135288701615107, 0.5596042556980056, 2), ...",[[0. 0. 0. 1. 3. 0. 0. 0.]\n [0. 0. 0. 3. 3. 2...,7.0,19.0,46.0,0.369565,1.0,'39',"np.ndarray(shape=torch.Size([3, 224, 224]))",<PIL.Image.Image image mode=L size=1024x1024 a...,0,"np.ndarray(shape=(2,))","np.ndarray(shape=(2048, 7, 7))"
2,1.2.276.0.7230010.3.1.4.8323329.2118.151787517...,591104 8 1011 18 1001 26 993 32 988 36 986 37 ...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,1.0,train,"[(0.1227472372112641, 0.5808183519907184, 3), ...",[[ 0. 0. 0. 0. 4. 1. 0. 1.]\n [ 0. 0. ...,10.0,23.0,81.0,0.395062,1.0,'43',"np.ndarray(shape=torch.Size([3, 224, 224]))",<PIL.Image.Image image mode=L size=1024x1024 a...,0,"np.ndarray(shape=(2,))","np.ndarray(shape=(2048, 7, 7))"
3,1.2.276.0.7230010.3.1.4.8323329.2027.151787517...,500845 25 992 38 983 41 980 44 978 46 975 49 9...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,0.0,train,"[(0.3294461311053661, 0.5748967530438639, 4), ...",[[ 0. 0. 4. 1. 10. 8. 0. 0.]\n [ 0. 0. ...,11.0,21.0,78.0,0.641026,1.0,'50',"np.ndarray(shape=torch.Size([3, 224, 224]))",<PIL.Image.Image image mode=L size=1024x1024 a...,0,"np.ndarray(shape=(2,))","np.ndarray(shape=(2048, 7, 7))"
4,1.2.276.0.7230010.3.1.4.8323329.1972.151787517...,-1,0,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,0.0,train,"[(0.25439717874245193, 0.6421419414319375, 2),...",[[ 0. 0. 5. 0. 0. 2. 0. 0.]\n [ 0. 1. ...,10.0,22.0,75.0,0.266667,0.0,'53',"np.ndarray(shape=torch.Size([3, 224, 224]))",<PIL.Image.Image image mode=L size=1024x1024 a...,0,"np.ndarray(shape=(2,))","np.ndarray(shape=(2048, 7, 7))"


In [5]:
# act_dp_ = get_cxr_activations.out(run_id=35,load=True)
# act_dp_.head()

## Train a logistic regression (LR) model to predict mistakes from gaze features

In [109]:
# Gaze columns stacked (in this order) when gaze_key == "combined".
_COMBINED_GAZE_KEYS = ("gaze_time", "gaze_max_visit", "gaze_unique", "gaze_diffusivity")


def _gaze_features_and_targets(act_dp, mask, ndxs, gaze_key):
    """Build the design matrix and targets for one side of the split.

    Rows are selected as ``column[mask][ndxs]``: ``ndxs`` are positions inside
    the masked subset, not inside the full panel.

    Returns:
        (X, y): ``X`` has four columns for ``gaze_key == "combined"`` and one
        column otherwise; ``y`` is True where ``expert_label`` differs from
        ``pmx``.
    """
    def select(column):
        return act_dp[column][mask][ndxs]

    targets = select("expert_label") != select("pmx")
    if gaze_key == "combined":
        feats = np.stack([select(name) for name in _COMBINED_GAZE_KEYS]).T
    else:
        feats = select(gaze_key).reshape(-1, 1)
    return feats, targets


def gaze_mistake_predictor(act_dp,pmx_labels_trainval,val_scale,seed,gaze_key="gaze_time",verbose=False,mask=None):
    """Validation AUROC of a logistic regression predicting mistakes from gaze features.

    A "mistake" is a row whose ``expert_label`` differs from its ``pmx`` label.
    The masked rows are split once into train/val, stratified on
    ``pmx_labels_trainval``; the classifier is fit on the train part and scored
    on the val part.

    Args:
        act_dp: panel providing the ``pmx``, ``expert_label`` and ``gaze_*`` columns.
        pmx_labels_trainval: ``pmx`` labels of the masked rows; used only to
            size and stratify the split.
        val_scale: passed to ``StratifiedShuffleSplit`` as ``test_size``.
        seed: random state for both the split and the classifier.
        gaze_key: name of a single gaze column, or ``"combined"`` to use
            ``gaze_time``, ``gaze_max_visit``, ``gaze_unique`` and
            ``gaze_diffusivity`` together.
        verbose: if True, print the mistake counts per split and the AUROC.
        mask: row mask applied to every column before the split indices.
            Defaults to the notebook-level ``train_val_mask``.

    Returns:
        The AUROC on the validation split.
    """
    if mask is None:
        # Backward-compatible default: the mask defined in an earlier notebook cell.
        mask = train_val_mask

    sss = StratifiedShuffleSplit(
        n_splits=1, test_size=val_scale, random_state=seed
    )
    # n_splits=1, so the generator yields exactly one (train, val) index pair.
    train_ndxs, val_ndxs = next(
        sss.split(np.zeros(len(pmx_labels_trainval)), pmx_labels_trainval)
    )

    # Only gaze features are used. Mean-pooled `activation_block4` features were
    # previously computed here but never fed to the model, and they were indexed
    # without the mask (i.e. for the wrong rows). If they are reintroduced they
    # must be selected as column[mask][ndxs] like everything else.
    X_train, incorrect_labels_train = _gaze_features_and_targets(act_dp, mask, train_ndxs, gaze_key)
    X_val, incorrect_labels_val = _gaze_features_and_targets(act_dp, mask, val_ndxs, gaze_key)

    clf = LogisticRegression(random_state=seed).fit(X_train, incorrect_labels_train)
    val_probs = clf.predict_proba(X_val)
    # Column 1 of predict_proba is the probability of the positive ("mistake") class.
    auroc = roc_auc_score(incorrect_labels_val,val_probs[:,1])

    if verbose:
        print(f"Num. of incorrect points in train: {incorrect_labels_train.sum()}/{len(incorrect_labels_train)}")
        print(f"Num. of incorrect points in val: {incorrect_labels_val.sum()}/{len(incorrect_labels_val)}")
        print(f"AUROC: {auroc:.3f}")

    return auroc



In [111]:
# For every gaze feature, repeat a stratified train/val split of the masked rows
# once per seed and report the mean and standard deviation of the val AUROC.
# (StratifiedShuffleSplit is imported in the notebook's top import cell.)
val_scale = 0.3  # passed through as the split's test_size
n_seeds = 100    # seeds 0..n_seeds-1, one split + fit per seed

pmx_labels_trainval = act_dp["pmx"][train_val_mask]

gaze_keys = ["gaze_time","gaze_max_visit","gaze_unique","gaze_diffusivity","combined"]

for gaze_key in gaze_keys:
    auc_list = []
    for seed in range(n_seeds):
        auc = gaze_mistake_predictor(act_dp,pmx_labels_trainval,val_scale,seed,gaze_key=gaze_key)
        auc_list.append(auc)
    auc_list = np.array(auc_list)

    print(f"{gaze_key}:\t\tAUROC {auc_list.mean():.3f} +- {auc_list.std():.4f}")


gaze_time:		AUROC 0.765 +- 0.0385
gaze_max_visit:		AUROC 0.752 +- 0.0386
gaze_unique:		AUROC 0.682 +- 0.0394
gaze_diffusivity:		AUROC 0.565 +- 0.0507
combined:		AUROC 0.762 +- 0.0384
