In [1]:
import meerkat as mk
import numpy as np
from scipy.stats import ttest_ind

from domino.data.cxr import get_dp, build_cxr_df, rle2mask, get_cxr_activations

In [2]:
# Build the DataPanel used throughout this notebook and preview its first rows.
# NOTE(review): the earlier comment called this a "mosaic" DataPanel; the notebook
# imports meerkat as mk and later saves with mk.DataPanel.write.
# `load=True` presumably reads a previously built dataframe instead of rebuilding it -- confirm.
df = build_cxr_df.out(load=True)
dp = get_dp(df)
dp.head()

Unnamed: 0,image_id (NumpyArrayColumn),encoded_pixels (NumpyArrayColumn),pmx (NumpyArrayColumn),filepath (NumpyArrayColumn),chest_tube (NumpyArrayColumn),split (NumpyArrayColumn),gaze_seq (NumpyArrayColumn),gaze_heatmap (NumpyArrayColumn),gaze_max_visit (NumpyArrayColumn),gaze_unique (NumpyArrayColumn),gaze_time (NumpyArrayColumn),gaze_diffusivity (NumpyArrayColumn),expert_label (NumpyArrayColumn),index (ListColumn),input (CellColumn),img (CellColumn)
0,1.2.276.0.7230010.3.1.4.8323329.6904.151787520...,-1,0,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'0',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
1,1.2.276.0.7230010.3.1.4.8323329.13666.15178752...,557374 2 1015 8 1009 14 1002 20 997 26 990 32 ...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'1',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
2,1.2.276.0.7230010.3.1.4.8323329.11028.15178752...,-1,0,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'2',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
3,1.2.276.0.7230010.3.1.4.8323329.10366.15178752...,514175 10 1008 29 994 30 993 32 991 33 990 34 ...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'3',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...
4,1.2.276.0.7230010.3.1.4.8323329.10016.15178752...,592184 33 976 58 956 73 941 88 926 102 917 109...,1,/media/4tb_hdd/siim/dicom-images-train/1.2.276...,,train,,,,,,,,'4',MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...,MedicalVolumeCell([PosixPath('/media/4tb_hdd/s...


In [3]:
# Restrict both splits to rows that carry a chest-tube annotation
# (a NaN in "chest_tube" means the image was never labelled for tubes).
tube_labelled = ~np.isnan(dp["chest_tube"].data)
split_names = dp["split"].data

train_mask = np.logical_and(tube_labelled, split_names == "train")
test_mask = np.logical_and(tube_labelled, split_names == "test")

for split_label, split_mask in (("train", train_mask), ("test", test_mask)):
    print(f"Number in {split_label}: {split_mask.sum()}")

train_dp = dp[train_mask]
test_dp = dp[test_mask]

Number in train: 951
Number in test: 1000


In [4]:
# print number of tubes in each pmx class for a given DataPanel

def print_tube_subclass_count(train_dp):
    """Print the row count of each (pmx, chest_tube) subclass.

    Args:
        train_dp: any mapping-like panel whose "pmx" and "chest_tube" entries
            expose a `.data` array of 0/1 labels. Despite the parameter name,
            the function is also called on resampled panels.

    Returns:
        None. Four lines are written to stdout, in the order
        positive/tube, positive/no tube, negative/tube, negative/no tube.
    """
    pmx = train_dp["pmx"].data
    tube = train_dp["chest_tube"].data

    def count(pmx_value, tube_value):
        # Number of rows whose two labels equal the requested pair exactly.
        return np.logical_and(pmx == pmx_value, tube == tube_value).sum()

    print(f"Positive, Tubes: {count(1, 1)}")
    print(f"Positive, No Tubes: {count(1, 0)}")
    print(f"Negative, Tubes: {count(0, 1)}")
    print(f"Negative, No Tubes: {count(0, 0)}")

# Subclass balance of the tube-labelled train split, before any resampling.
print_tube_subclass_count(train_dp)

Positive, Tubes: 151
Positive, No Tubes: 54
Negative, Tubes: 107
Negative, No Tubes: 639


## Undersample majority classes

In [5]:
# randomly sample such that number of images is roughly the same in each subclass
np.random.seed(0)

# Probability of KEEPING a row from each majority subclass; rows of the two
# minority subclasses are always kept. In the recorded run these values shrink
# (pmx=1, tube=1) from 151 to 52 rows and (pmx=0, tube=0) from 639 to 105 rows.
KEEP_PROB_POS_TUBE = 0.3
KEEP_PROB_NEG_NOTUBE = 0.15

# Read the two label columns once instead of building a full row (train_dp[ndx])
# on every iteration just to look at two scalars.
pmx_col = train_dp["pmx"].data
tube_col = train_dp["chest_tube"].data

retained_ndxs = []
for ndx, (pmx, tube) in enumerate(zip(pmx_col, tube_col)):
    if pmx and tube:
        keep_prob = KEEP_PROB_POS_TUBE
    elif not pmx and not tube:
        keep_prob = KEEP_PROB_NEG_NOTUBE
    else:
        # Minority subclass: keep unconditionally and draw no random number,
        # so the random stream is consumed exactly as before.
        retained_ndxs.append(ndx)
        continue

    # One draw per majority row; the row is dropped when the draw exceeds keep_prob.
    if np.random.rand() <= keep_prob:
        retained_ndxs.append(ndx)

resampled_train_dp = train_dp[retained_ndxs]
print_tube_subclass_count(resampled_train_dp)

Positive, Tubes: 52
Positive, No Tubes: 54
Negative, Tubes: 107
Negative, No Tubes: 105


In [6]:
# Merge the undersampled train rows with the untouched test split.
# resampled_dp is also the panel passed to get_cxr_activations further down.
resampled_dp = resampled_train_dp.append(test_dp)
print(f"length of resampled_dp: {len(resampled_dp)}")

# Saving is currently disabled; uncomment to write the balanced panel to disk.
#mk.DataPanel.write(resampled_dp,path="/media/4tb_hdd/siim/balanced_dp_08-06-21.dp")

length of resampled_dp: 1318


## Oversample minority classes

In [24]:
# upsample such that number of images is roughly the same in each subclass
# No random draws happen in this cell; the seed call is kept only so the global
# NumPy RNG state seen by later cells is the same as before.
np.random.seed(0)

# Extra copies appended per row of each minority subclass. In the recorded run
# (pmx=1, tube=0) grows from 54 to 162 rows and (pmx=0, tube=1) from 107 to 642.
EXTRA_COPIES_POS_NOTUBE = 2
EXTRA_COPIES_NEG_TUBE = 5

# Read the label columns once rather than building a full row per iteration.
pmx_col = train_dp["pmx"].data
tube_col = train_dp["chest_tube"].data

# Collect the original panel plus every duplicated one-row slice, then
# concatenate once. Concatenating inside the loop re-copies the growing panel
# on every duplicate, which is quadratic in the number of rows.
pieces = [train_dp]
for ndx, (pmx, tube) in enumerate(zip(pmx_col, tube_col)):
    if pmx and not tube:
        n_extra = EXTRA_COPIES_POS_NOTUBE
    elif not pmx and tube:
        n_extra = EXTRA_COPIES_NEG_TUBE
    else:
        continue
    pieces.extend(train_dp[ndx:ndx+1] for _ in range(n_extra))

# Row order is unchanged: all original rows first, then the duplicates in row order.
upsampled_train_dp = mk.concat(pieces)

print_tube_subclass_count(upsampled_train_dp)

Positive, Tubes: 151
Positive, No Tubes: 162
Negative, Tubes: 642
Negative, No Tubes: 639


In [26]:
# merge upsampled train and test, then save dp
upsampled_dp = upsampled_train_dp.append(test_dp)
# The label now names the panel whose length is actually printed (train + test).
print(f"length of upsampled_dp: {len(upsampled_dp)}")

# NOTE(review): absolute, machine-specific output path; writes to disk on every run.
mk.DataPanel.write(upsampled_dp,path="/media/4tb_hdd/siim/upsampled_dp_08-24-21.dp")

length of upsampled_train_dp: 2594


## Correlation between tubes and abnormal area

In [7]:
# Check whether the size of the annotated abnormal region differs between
# pmx-positive images with and without a chest tube (undersampled train split).
dp_ = resampled_train_dp
pos_tube_mask = np.logical_and(dp_["pmx"].data==1,dp_["chest_tube"].data==1)
pos_notube_mask = np.logical_and(dp_["pmx"].data==1,dp_["chest_tube"].data==0)
rle_1 = dp_["encoded_pixels"][pos_tube_mask]
rle_2 = dp_["encoded_pixels"][pos_notube_mask]

# Decode each run-length encoding into a 1024x1024 mask. Rows whose encoding is
# the string '-1' are skipped (in the preview above, '-1' appears on pmx == 0 rows).
# NOTE(review): the mask size is hard-coded -- confirm every image is 1024x1024.
segmasks_1 = np.array([rle2mask(rle,1024,1024).T for rle in rle_1 if rle!='-1'])
segmasks_2 = np.array([rle2mask(rle,1024,1024).T for rle in rle_2 if rle!='-1'])

# Mean mask value per image (axes 1 and 2 are the two mask axes). This is the
# fraction of abnormal pixels if rle2mask returns 0/1 masks -- TODO confirm.
abn_size1 = segmasks_1.mean((1,2))
abn_size2 = segmasks_2.mean((1,2))

print(f"Average abnormal area for pos tube: {abn_size1.mean():.4f}")
print(f"Average abnormal area for pos no tube: {abn_size2.mean():.4f}")

# Two-sample t-test on the per-image areas; ttest_ind is imported in the
# notebook's top import cell rather than in the middle of this one.
print(f"p-value: {ttest_ind(abn_size1,abn_size2)[1]:.4f}")

Average abnormal area for pos tube: 0.0107
Average abnormal area for pos no tube: 0.0116
p-value: 0.7565


# Remove correlations with mimic predictions

In [17]:
# get mimic activations
# Runs the checkpoint below over resampled_dp (undersampled train + test) with
# run_type="mimic"; the "output" column of the returned panel is used in the next cell.
# NOTE(review): absolute, user-specific checkpoint path -- this cell only runs on
# the original author's machine unless the path is made configurable.
model_pth = "/home/ksaab/Documents/domino/scratch/khaled/outputs/08-10_sabri_mimic/model_chkpt_runid4495.pt"
dp_mimic = get_cxr_activations(dp=resampled_dp, model_path=model_pth, run_type="mimic")

100%|██████████| 83/83 [00:02<00:00, 37.88it/s]


In [18]:
# Names of the 14 model outputs; position in this list is used as the class index.
# NOTE(review): presumably matches the output order of the MIMIC checkpoint -- confirm.
mimic_labels = [
    "atelectasis",
    "cardiomegaly",
    "consolidation",
    "edema",
    "enlarged_cardio",
    "fracture",
    "lung_opacity",
    "pleural_effusion",
    "pleural_other",
    "pneumonia",
    "pneumothorax",
    "support_devices",
    "lung_lesion",
    "no_finding",
]

# Classes that may never be picked as the predicted finding: their probability
# column is zeroed before the argmax. Looked up by name (indices 11 and 10).
# Earlier experiments also masked "atelectasis" (0) and "pleural_effusion" (7),
# using -1e10 instead of 0.
excluded_labels = ["support_devices", "pneumothorax"]

mimic_probs = dp_mimic["output"].data.softmax(1)
for excluded_name in excluded_labels:
    mimic_probs[:, mimic_labels.index(excluded_name)] = 0

# Most probable remaining class per row, stored as a new column on dp_mimic.
mimic_preds = mimic_probs.argmax(1)
dp_mimic["mimic_pred"] = mimic_preds

In [23]:
# print number of rows in each (pmx, predicted MIMIC class) subclass

def print_mimic_subclass_count(dp, class_name, labels=None):
    """Print the row count of each (pmx, MIMIC prediction == class_name) subclass.

    Args:
        dp: mapping-like panel whose "pmx" and "mimic_pred" entries expose a
            `.data` array (0/1 labels and predicted class indices respectively).
        class_name: name of the MIMIC class to compare predictions against.
        labels: ordered class names defining the index of each class. Defaults
            to the notebook-level `mimic_labels` list.

    Returns:
        None. Four lines are written to stdout, in the order
        positive/class, positive/not class, negative/class, negative/not class.

    Raises:
        ValueError: if `class_name` is not in `labels`. The previous
            `np.argmax(labels == class_name)` lookup silently returned index 0
            for an unknown name and reported counts for the first class instead.
    """
    if labels is None:
        labels = mimic_labels
    labels = list(labels)
    if class_name not in labels:
        raise ValueError(f"Unknown class name {class_name!r}; expected one of {labels}")
    class_ndx = labels.index(class_name)

    pmx = dp["pmx"].data
    pred = dp["mimic_pred"].data

    pos_mimic_mask = np.logical_and(pmx==1, pred==class_ndx)
    pos_nomimic_mask = np.logical_and(pmx==1, pred!=class_ndx)
    neg_mimic_mask = np.logical_and(pmx==0, pred==class_ndx)
    neg_nomimic_mask = np.logical_and(pmx==0, pred!=class_ndx)

    print(f"Positive, {class_name}: {pos_mimic_mask.sum()}")
    print(f"Positive, No {class_name}: {pos_nomimic_mask.sum()}")
    print(f"Negative, {class_name}: {neg_mimic_mask.sum()}")
    print(f"Negative, No {class_name}: {neg_nomimic_mask.sum()}")

# Select the train rows of dp_mimic using the "split" column of resampled_dp.
# NOTE(review): this assumes dp_mimic keeps the same row order as resampled_dp -- confirm.
# The final merge cell reads the split from dp_mimic["split"] directly instead.
resampled_train_mask = resampled_dp["split"].data=="train"
train_dp_mimic = dp_mimic[resampled_train_mask]
# MIMIC class whose predictions are balanced against pmx in the following cell.
class_name = "pleural_effusion"
print_mimic_subclass_count(train_dp_mimic,class_name)

Positive, pleural_effusion: 45
Positive, No pleural_effusion: 61
Negative, pleural_effusion: 43
Negative, No pleural_effusion: 169


In [38]:
# randomly sample such that number of images is roughly the same in each subclass
np.random.seed(0)

# Probability of KEEPING a row whose MIMIC prediction is NOT `class_name`; rows
# predicted as `class_name` are always kept. In the recorded run these values
# shrink the pmx-positive group from 61 to 45 rows and the pmx-negative group
# from 169 to 43 rows.
KEEP_PROB_POS_NOMIMIC = 0.72
KEEP_PROB_NEG_NOMIMIC = 0.25

# list.index raises ValueError for an unknown class name; the previous
# np.argmax(...) lookup silently fell back to index 0 in that case.
class_ndx = mimic_labels.index(class_name)

# Read the two columns once instead of building a full row on every iteration.
pmx_col = train_dp_mimic["pmx"].data
is_mimic_col = train_dp_mimic["mimic_pred"].data == class_ndx

retained_ndxs = []
for ndx, (pmx, mimic) in enumerate(zip(pmx_col, is_mimic_col)):
    if mimic:
        # Predicted as class_name: keep unconditionally and draw no random number,
        # so the random stream is consumed exactly as before.
        retained_ndxs.append(ndx)
        continue

    keep_prob = KEEP_PROB_POS_NOMIMIC if pmx else KEEP_PROB_NEG_NOMIMIC
    # One draw per non-class row; the row is dropped when the draw exceeds keep_prob.
    if np.random.rand() <= keep_prob:
        retained_ndxs.append(ndx)

reresampled_train_dp = train_dp_mimic[retained_ndxs]
print_mimic_subclass_count(reresampled_train_dp,class_name)

Positive, pleural_effusion: 45
Positive, No pleural_effusion: 45
Negative, pleural_effusion: 43
Negative, No pleural_effusion: 43


In [39]:
# merge the re-balanced train rows with the test split of dp_mimic
# The merged panel gets its own name, as resampled_dp and upsampled_dp do. Assigning
# it back to reresampled_train_dp made the cell non-idempotent (each re-run appended
# the test split again) and left test rows under a "_train_" name.
# NOTE(review): relies on .append returning a new panel, as the other merge cells do.
reresampled_dp = reresampled_train_dp.append(dp_mimic[dp_mimic["split"].data=="test"])
print(f"length of reresampled_dp: {len(reresampled_dp)}")

# Saving is currently disabled; uncomment to write the balanced panel to disk.
#mk.DataPanel.write(reresampled_dp,path="/media/4tb_hdd/siim/pleural_effusion_tube_balanced_dp_08-06-21.dp")

length of reresampled_train_dp: 1176
