In [1]:
from fastai.vision.all import *

In [2]:
# Let pandas show up to 100 columns when rendering wide DataFrames.
pd.set_option("display.max_columns", 100)

In [3]:
# Root data folder; every other path in this notebook is built from it.
datapath = Path("/../rsna_data/")
train_csv = datapath/'train.csv'
train_df = pd.read_csv(train_csv)
# Mean of the per-image `pe_present_on_image` column (share of positive image rows).
train_df['pe_present_on_image'].mean()

0.053915069524414806

#### Load All Image Files

In [4]:
# Folder holding the image files, listed recursively by fastai's `get_image_files`.
imgdatapath = datapath/'full_raw_512'
files = get_image_files(imgdatapath)

In [5]:
# Group image paths by the name of their parent folder
# (later indexed with study ids from `train_pids` / `valid_pids`).
filesdict = defaultdict(list)
for img_path in files:
    filesdict[img_path.parent.name].append(img_path)

In [6]:
# Number of distinct parent folders that contain at least one image.
len(filesdict)

7279

In [7]:
# Map every image id (`SOPInstanceUID`) to its `pe_present_on_image` label.
labels_dict = {
    sop_uid: label
    for sop_uid, label in zip(train_df['SOPInstanceUID'], train_df['pe_present_on_image'])
}

In [8]:
# Sanity check: the number of image files should match the number of labelled image ids.
len(files), len(labels_dict)

(1790594, 1790594)

In [9]:
def get_label(o):
    "Return the `labels_dict` entry for image file `o`, keyed by the second `_`-separated token of its stem."
    stem_tokens = o.stem.split("_")
    image_key = stem_tokens[1]
    return labels_dict[image_key]

#### Load Metadata

In [10]:
# Folder with the per-study metadata CSV files (one `<id>.csv` per study, see the listing below).
metadata_path = datapath/'metadata'

In [11]:
# Collect every `.csv` file under the metadata folder.
metadata_files = get_files(metadata_path, extensions='.csv')

In [12]:
# Peek at the discovered metadata files (count and first few paths).
metadata_files

(#7279) [Path('/../rsna_data/metadata/de6cf946cacb.csv'),Path('/../rsna_data/metadata/0d548bca2844.csv'),Path('/../rsna_data/metadata/6d6432ccbeac.csv'),Path('/../rsna_data/metadata/f8b0b66c706f.csv'),Path('/../rsna_data/metadata/064153f367c0.csv'),Path('/../rsna_data/metadata/ea930c47a345.csv'),Path('/../rsna_data/metadata/0f1d90963f0f.csv'),Path('/../rsna_data/metadata/6c217cee8898.csv'),Path('/../rsna_data/metadata/4d7a4494d46d.csv'),Path('/../rsna_data/metadata/1346256ea0e8.csv')...]

In [13]:
# One metadata DataFrame per CSV, keyed by the file stem
# (looked up with study ids from `train_pids` / `valid_pids` further down).
pid2metadata = {}
for csv_path in metadata_files:
    pid2metadata[csv_path.stem] = pd.read_csv(csv_path)

#### Load Fold PIDs

In [14]:
# Image-size tag; it is interpolated into the exported model filename and the output folder name below.
resize = 512

In [15]:
# Cross-validation switch and the fold whose ids are held out for validation.
do_cv = True
FOLD = 4

if do_cv:
    # Folder with one pickled list of validation study ids per fold.
    cv_pids_dir = datapath/'cv_pids'
    cv_pids_dir.mkdir(parents=True, exist_ok=True)

    # One row per study; only the id column is used to enumerate all studies.
    cv_df = train_df[['StudyInstanceUID', 'negative_exam_for_pe']].drop_duplicates().reset_index(drop=True)
    all_pids = cv_df['StudyInstanceUID'].values

    # NOTE: `read_pickle` executes pickle data; only load fold files you produced yourself.
    valid_pids = pd.read_pickle(cv_pids_dir/f'pids_fold{FOLD}.pkl')

    # Everything not held out is training data. Sorting makes the order stable across
    # kernel restarts (plain set iteration order can change between Python processes).
    train_pids = sorted(set(all_pids).difference(valid_pids))

In [16]:
# Train / validation study counts and their sum (should equal the total number of studies).
len(train_pids), len(valid_pids), len(train_pids+valid_pids)

(5824, 1455, 7279)

In [17]:
# Stack the per-study metadata frames of each split into a single frame with a fresh index.
train_frames = [pid2metadata[pid] for pid in train_pids]
valid_frames = [pid2metadata[pid] for pid in valid_pids]
train_metadf = pd.concat(train_frames).reset_index(drop=True)
valid_metadf = pd.concat(valid_frames).reset_index(drop=True)

#### Get Train and Valid Files

In [18]:
# Flatten the per-study image lists into one list per split, keeping the study order of each id list.
train_files = []
valid_files = []
for pid in train_pids:
    train_files.extend(filesdict[pid])
for pid in valid_pids:
    valid_files.extend(filesdict[pid])

In [19]:
# Image counts per split and their sum (should equal the total number of image files).
len(train_files), len(valid_files), len(train_files+valid_files)

(1434610, 355984, 1790594)

#### Load Model

In [20]:
# Exported learner for the current image size and fold.
# NOTE(review): `load_learner` unpickles the file — only load exports you trust.
export_path = f"./models/xresnet34-{resize}-PR-fold{FOLD}-export.pkl"
learn = load_learner(export_path, cpu=False)

#### Get preds & Visual Embeddings

In [21]:
class EmbeddingHook:
    """Collect the outputs of module `m` during forward passes.

    Outputs are concatenated row-wise into `self.embeddings`. Whenever more than `csz`
    rows have accumulated, the buffer is written to
    `savedir/<filename>_part<k>.pth` and emptied; rows still in the buffer at the end
    are NOT written automatically — read `self.embeddings` (or call `save()`) yourself.

    m:        module whose forward output is captured
    savedir:  folder for chunk files (created, including parents, if missing)
    filename: base name of the chunk files; a trailing ".pth" is dropped before the
              "_part<k>.pth" suffix is appended
    csz:      row threshold above which the buffer is flushed to disk
    """
    def __init__(self, m, savedir, filename, csz=5000000):
        self.m, self.filename, self.csz = m, filename, csz

        # Keep the directory on the instance so `save()` can use it, and create it
        # before touching the module so a bad path does not leave a hook behind.
        self.savedir = Path(savedir)
        self.savedir.mkdir(parents=True, exist_ok=True)

        # Drop hooks left over from a previous instance (e.g. a re-run of the cell).
        if len(m._forward_hooks) > 0: self.reset()

        self.embeddings = torch.tensor([])
        self.save_iter = 0
        self.hook = Hook(m, self.hook_fn, cpu=True)

    def hook_fn(self, m, inp, out):
        "Append `out` to the buffer and flush it to disk once it exceeds `csz` rows"
        self.embeddings = torch.cat([self.embeddings, out])
        if self.embeddings.shape[0] > self.csz:
            self.save()
            self.embeddings = torch.tensor([])

    def reset(self):
        "Remove ALL forward hooks registered on `m`, not only the ones created by this class"
        self.m._forward_hooks.clear()

    def save(self):
        "Write the current buffer to the next numbered chunk file and advance the counter"
        base = str(self.filename)
        if base.endswith(".pth"): base = base[:-len(".pth")]
        torch.save(self.embeddings, self.savedir/f"{base}_part{self.save_iter}.pth")
        self.save_iter += 1

In [22]:
# Validation DataLoader built from the learner's own data pipeline; labels are kept
# (`with_labels=True`) so `get_preds` below can return targets alongside predictions.
valid_dl = learn.dls.test_dl(valid_files, with_labels=True, bs=64)

In [23]:
# Capture the output of `learn.model[1][1]` for every batch (1024 values per image, see the
# shapes printed below). NOTE(review): the index is specific to this exported model — confirm
# it still points at the intended layer if the architecture changes.
embhook = EmbeddingHook(learn.model[1][1], datapath/f'cnn_embs/full_{resize}_FOLD{FOLD}', 'xresnet34_embeddings.pth')

In [24]:
# `act=noop` keeps the raw model outputs (no activation) so a temperature can be applied later.
# Running this also fills `embhook.embeddings` through the forward hook.
preds, targs = learn.get_preds(dl=valid_dl, act=noop)

In [25]:
# All three should have one row per validation image.
preds.shape, targs.shape, embhook.embeddings.shape

(torch.Size([355984, 2]), torch.Size([355984]), torch.Size([355984, 1024]))

In [26]:
# Save preds, embeddings and ordered valid filenames
torch.save(embhook.embeddings,  datapath/f'cnn_embs/full_{resize}_FOLD{FOLD}'/'embeddings.pth')
torch.save(preds,  datapath/f'cnn_embs/full_{resize}_FOLD{FOLD}'/'preds.pth')
torch.save(valid_dl.dataset.items,  datapath/f'cnn_embs/full_{resize}_FOLD{FOLD}'/'files.pth')

### Image Weighted Log Loss

**sz 256**

Fold 0, sz=256, temp = 1.3, 0.3881

Fold 1, sz=256, temp = 1.3, 0.3684

**sz 512**

Fold 0, sz=512, temp = 0.8, 0.2639

Fold 1, sz=512, temp = 1.5, 0.2679

Fold 2, sz=512, temp = 1.4, 0.2686

Fold 3, sz=512, temp = 1.1, 0.2373

Fold 4, sz=512, temp = 1.1, 0.2533

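$q_i$ = proportion of positive images in the exam (study) that image $i$ belongs to; it is used as the weight of that image's log loss.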

In [27]:
# Ground-truth image label for every validation file, in the same order as `valid_files`.
valid_labels = L(valid_files).map(get_label)

In [28]:
# Share of positive validation images; `1-valid_p` is the accuracy of always predicting "negative".
valid_p = np.mean(valid_labels)
1-valid_p

0.9451379837296058

In [29]:
# Accuracy of the raw predictions; compare with the all-negative baseline `1-valid_p` above.
accuracy(preds, targs)

tensor(0.9395)

In [30]:
# Study id of every validation image, taken from the name of the file's parent folder.
sids = L(valid_files).map(lambda o: o.parent.name)

In [31]:
# Per-study fraction of positive images: the mean image label within each study id.
valid_label_df = pd.DataFrame({"sid":sids, "labels": valid_labels})
qi_by_sid = valid_label_df.groupby("sid")['labels'].mean()
sid2qi = dict(qi_by_sid)

In [32]:
# Per-image weight: every image gets the positive fraction of its own study.
qis = tensor([sid2qi[sid] for sid in sids])

In [33]:
# Temperature sweep: divide the raw outputs by `temp` before the cross entropy and report
# the q_i-weighted average image loss for each value.
for temp in np.linspace(0.1, 2, 20):
    scaled_logits = preds.float() / temp
    per_img_loss = F.cross_entropy(scaled_logits, targs, reduction='none')
    avg_logloss = (per_img_loss * qis).sum() / qis.sum()
    print(temp, avg_logloss.item())

0.1 1.4429523944854736
0.2 0.7361778020858765
0.3 0.5075522661209106
0.4 0.39870545268058777
0.5 0.33796584606170654
0.6 0.30145129561424255
0.7 0.27892276644706726
0.7999999999999999 0.26523709297180176
0.8999999999999999 0.25750088691711426
0.9999999999999999 0.25393974781036377
1.0999999999999999 0.2533901631832123
1.2 0.2550487816333771
1.3 0.2583395540714264
1.4 0.26283755898475647
1.5 0.2682230472564697
1.5999999999999999 0.2742520272731781
1.7 0.28073635697364807
1.8 0.28753021359443665
1.9 0.29452037811279297
2.0 0.3016183078289032


In [34]:
# Total image weight, i.e. the denominator of the weighted image loss.
qis.sum()

tensor(18657.0137)

In [None]:
# Distribution of the temperature-scaled probability of the positive class (column 1).
pos_probs = (preds.float() / .8).softmax(1)[:, 1]
plt.hist(pos_probs)

In [None]:
# NOTE(review): 0.8 is the temperature listed for fold 0 in the notes above; with FOLD = 4
# the sweep above bottoms out near 1.1 — confirm which value is intended here.
img_temp = 0.8
img_losses = F.cross_entropy(preds.float() / img_temp, targs, reduction='none')
# Numerator and denominator are kept separately; the combined metric below reuses them.
tot_img_loss = (img_losses * qis).sum()
tot_img_wgts = qis.sum()
avg_logloss = tot_img_loss / tot_img_wgts
avg_logloss

In [None]:
# Weighted image-loss numerator and denominator.
tot_img_loss, tot_img_wgts

### Exam Weighted Log Loss

**Mean baseline**

Fold 1: 0.3518


In [None]:
# Exam-level label columns scored below. The order matters: `exam_wgts` is aligned
# with this list position by position. Blank lines separate related groups of labels.
exam_targets = L([
#           'positive_exam_for_pe'   (left out of the scored list)
            'negative_exam_for_pe',
            'indeterminate',

            'rv_lv_ratio_gte_1',
            'rv_lv_ratio_lt_1',
    # (implicit third case: neither of the two ratio labels)

            'leftsided_pe',
            'rightsided_pe',
            'central_pe',

            'chronic_pe',
            'acute_and_chronic_pe',
            # (implicit third case: neither chronic nor acute_and_chronic)



# Columns below are left out of the scored list:
#             'qa_motion',
#             'qa_contrast',
#             'flow_artifact',
#             'true_filling_defect_not_pe',
             ]); exam_targets

In [None]:
# Per-label weights for the exam-level loss, listed in the same order as `exam_targets`.
# NOTE(review): presumably the label weights of the competition metric — confirm the source.
neg_pe_wgt = 0.0736196319
indeterminate_wgt = 0.09202453988

rv_lv_gte_1_wgt = 0.2346625767
rv_lv_lt_1_wgt = 0.0782208589

left_pe_wgt = 0.06257668712
right_pe_wgt = 0.06257668712
central_pe_wgt = 0.1877300613

chronic_wgt = 0.1042944785
acute_chronic_wgt = 0.1042944785

In [None]:
# Weight vector aligned with `exam_targets`, assembled from the named constants defined
# in the previous cell rather than from re-typed literals.
exam_wgts = tensor([
    neg_pe_wgt, indeterminate_wgt,
    rv_lv_gte_1_wgt, rv_lv_lt_1_wgt,
    left_pe_wgt, right_pe_wgt, central_pe_wgt,
    chronic_wgt, acute_chronic_wgt,
])

In [None]:
# One row per study with its exam-level labels, split by fold membership.
exam_cols = ["StudyInstanceUID"]+exam_targets
in_train = train_df.StudyInstanceUID.isin(train_pids)
in_valid = train_df.StudyInstanceUID.isin(valid_pids)
train_targsdf = train_df[in_train][exam_cols].drop_duplicates()
valid_targsdf = train_df[in_valid][exam_cols].drop_duplicates()

In [None]:
# Mean baseline: predict, for every label, its average value over the training studies.
exam_mean_preds = dict(train_targsdf[exam_targets].mean())

In [None]:
# Inspect the constant per-label predictions of the mean baseline.
exam_mean_preds

In [None]:
# Score the mean baseline: the same row of per-label means is repeated for every validation
# exam and compared with the true labels; losses are kept per exam and per label.
mean_pred_row = tensor(list(exam_mean_preds.values()))[None,...]
baseline_preds = mean_pred_row.repeat(len(valid_pids),1)
exam_true = tensor(valid_targsdf[exam_targets].values).float()
exam_losses = F.binary_cross_entropy(baseline_preds, exam_true, reduction='none')

In [None]:
# Weighted sum of the per-label losses over all validation exams.
weighted_exam_losses = exam_losses * exam_wgts
tot_exam_loss = weighted_exam_losses.sum()
# Every exam contributes the full set of label weights once.
tot_exam_wgts = exam_wgts.sum() * len(valid_pids)
avg_exam_loss = tot_exam_loss / tot_exam_wgts
avg_exam_loss

### Combine both

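The total image-level weight and the total exam-level weight are almost equal, so the combined loss is close to the plain mean of the image loss and the exam loss.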

In [None]:
# Scale applied to the image-level loss and weights when combining with the exam-level part.
# NOTE(review): matches `neg_pe_wgt` (0.0736196319) to the 8 decimals given here — confirm
# whether the two are meant to be the same constant.
img_wgt = 0.07361963

In [None]:
# Combined metric: pool the weighted loss totals and the weight totals of both parts, then divide.
combined_loss_total = tot_img_loss*img_wgt + tot_exam_loss
combined_wgt_total = tot_img_wgts*img_wgt + tot_exam_wgts
combined_loss_total / combined_wgt_total