In [None]:
from fastai.vision.all import *

In [None]:
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [None]:
# Root folder of the data, relative to the notebook's working directory.
datapath = Path("../rsna_data/")
# Training table; later cells read its SOPInstanceUID, StudyInstanceUID and label columns.
train_df = pd.read_csv(datapath/'train.csv')
# Mean of the image-level label column (the positive rate if the label is 0/1 — TODO confirm encoding).
train_df.pe_present_on_image.mean()

#### Load All Image Files

In [None]:
# Folder holding the image files; `files` is the list of image paths found under it.
imgdatapath = (datapath/'full_raw_512')
files = get_image_files(imgdatapath)

In [None]:
# Group the image paths by the name of their parent folder
# (the folder names are looked up with study ids further down).
filesdict = defaultdict(list)
for img_path in files:
    filesdict[img_path.parent.name].append(img_path)

In [None]:
len(filesdict)

In [None]:
# Image-level lookup: SOPInstanceUID -> pe_present_on_image.
labels_dict = dict(zip(train_df['SOPInstanceUID'], train_df['pe_present_on_image']))

In [None]:
len(files), len(labels_dict)

In [None]:
def get_label(o):
    "Return the `labels_dict` entry for image path `o`, keyed by the 2nd `_`-separated field of its stem."
    sop_uid = o.stem.split("_")[1]
    return labels_dict[sop_uid]

#### Load Metadata

In [None]:
# Folder with the per-study metadata CSV files.
metadata_path = datapath/'metadata'

In [None]:
# All `.csv` files found under the metadata folder.
metadata_files = get_files(metadata_path, extensions='.csv')

In [None]:
metadata_files

In [None]:
# One DataFrame per metadata CSV, keyed by the file stem
# (the keys are looked up with the train/valid study ids below).
pid2metadata = {o.stem:pd.read_csv(o) for o in metadata_files}

#### Load Fold PIDs

In [None]:
# Image-size tag; it is interpolated into the model file name and the embeddings folder name below.
resize = 512
# resize = 256

In [None]:
# Cross-validation split: the studies stored in the fold's pickle form the
# validation set, every other study in train.csv is used for training.
do_cv = True
FOLD = 0

if do_cv:
    cv_pids_dir = datapath/'cv_pids'
    # parents/exist_ok: safe when the data folder is fresh and on notebook re-runs.
    cv_pids_dir.mkdir(parents=True, exist_ok=True)
    cv_df = train_df[['StudyInstanceUID', 'negative_exam_for_pe']].drop_duplicates().reset_index(drop=True)
    all_pids = cv_df['StudyInstanceUID'].values
    # NOTE(review): read_pickle executes whatever the pickle contains; only load fold files you created.
    # Normalised to a plain list so `train_pids+valid_pids` concatenates.
    valid_pids = list(pd.read_pickle(cv_pids_dir/f'pids_fold{FOLD}.pkl'))
    # Keep first-seen train.csv order (de-duplicated) instead of set order: set iteration
    # over strings changes between kernel restarts, which would reshuffle the train file
    # list and therefore the order of the saved predictions/embeddings.
    valid_pid_set = set(valid_pids)
    train_pids = [pid for pid in dict.fromkeys(all_pids) if pid not in valid_pid_set]

In [None]:
len(train_pids), len(valid_pids), len(train_pids+valid_pids)

In [None]:
# Stack the per-study metadata frames of each split into a single frame with a fresh 0..n-1 index.
train_frames = [pid2metadata[pid] for pid in train_pids]
train_metadf = pd.concat(train_frames).reset_index(drop=True)
valid_frames = [pid2metadata[pid] for pid in valid_pids]
valid_metadf = pd.concat(valid_frames).reset_index(drop=True)

#### Get Train and Valid Files

In [None]:
# Flatten the per-study file lists into one list per split, following the order of the pids.
train_files, valid_files = [], []
for pid in train_pids:
    train_files.extend(filesdict[pid])
for pid in valid_pids:
    valid_files.extend(filesdict[pid])

In [None]:
len(train_files), len(valid_files), len(train_files+valid_files)

#### Load Model

In [None]:
# Load the exported learner matching the current `resize` and `FOLD`.
# NOTE(review): load_learner unpickles the file, so only load exports you trust.
# learn = load_learner(f"./models/xresnet34-{resize}-PR-fold{FOLD}-export.pkl", cpu=False)
learn = load_learner(f"./models/effb3-{resize}-PR-fold{FOLD}-export.pkl", cpu=False)

#### Get preds & Visual Embeddings

In [None]:
class EmbeddingHook:
    """Capture the forward outputs of module `m` and spill them to disk in chunks.

    m:        module whose forward output is captured
    savedir:  directory for the chunk files (str or Path; created if missing)
    filename: chunks are written as `{filename}_part{i}.pth`
    csz:      once more than `csz` rows are buffered, they are saved and the buffer is emptied

    Rows still buffered after the last batch are NOT written automatically:
    read them from `.embeddings` and save them yourself.
    """
    def __init__(self, m, savedir, filename, csz=500000):
        store_attr("m,savedir,filename,csz")
        # Store the Path on the instance: `save` builds file names with `/`,
        # which would fail if a plain string had been passed in.
        self.savedir = Path(savedir)
        # The target is typically nested (e.g. cnn_embs/<run>), and the cell may be re-run.
        self.savedir.mkdir(parents=True, exist_ok=True)

        # Drop hooks already registered on `m` (e.g. from an earlier instance in this kernel)
        # so that each batch is not captured more than once.
        if len(m._forward_hooks) > 0: self.reset()

        self.embeddings = tensor([])
        self.hook = Hook(m, self.hook_fn, cpu=True)
        self.save_iter = 0

    def hook_fn(self, m, inp, out):
        "Stack and save computed embeddings"
        self.embeddings = torch.cat([self.embeddings, out])
        # Flush once the buffer holds more than `csz` rows, then start a new buffer.
        if self.embeddings.shape[0] > self.csz:
            self.save()
            self.embeddings = tensor([])

    def reset(self):
        "Remove every forward hook registered on `m` (not only the ones created by this class)"
        self.m._forward_hooks = OrderedDict()

    def remove(self):
        "Detach only this instance's hook from `m`; call it once the embeddings have been collected"
        self.hook.remove()

    def save(self):
        "Write the current buffer to `{filename}_part{save_iter}.pth` and advance the part counter"
        torch.save(self.embeddings, self.savedir/f"{self.filename}_part{self.save_iter}.pth")
        self.save_iter += 1

In [None]:
len(train_files), len(valid_files)

In [None]:
# Training files first, validation files last; the validation items are therefore the trailing len(valid_files) entries.
all_files = train_files + valid_files

In [None]:
len(all_files)

In [None]:
# Inference dataloader over all files, batch size 64; with_labels=True so targets are available to get_preds.
all_dl = learn.dls.test_dl(all_files, with_labels=True, bs=64)

In [None]:
# Name of the output sub-folder under `cnn_embs`, encoding image size and fold.
folder = f"full_EFFNETB3_{resize}_ALL_FROM_FOLD{FOLD}"; folder

In [None]:
# Hook the model's `_avg_pooling` module; chunks are written to cnn_embs/<folder> as effb3_embeddings_part<i>.pth.
# embhook = EmbeddingHook(learn.model[1][1], datapath/f'cnn_embs/{folder}', 'xresnet34_embeddings')
embhook = EmbeddingHook(learn.model._avg_pooling, datapath/f'cnn_embs/{folder}', 'effb3_embeddings')

In [None]:
# Run the model over `all_dl`; the hook above collects embeddings as a side effect.
# act=noop: no activation is applied here — later cells divide `preds` by a temperature and pass them to F.cross_entropy.
preds, targs = learn.get_preds(dl=all_dl, act=noop)

In [None]:
# # Save preds, embeddings and ordered valid filenames
# torch.save(embhook.embeddings,  datapath/f'cnn_embs/{folder}'/'xresnet34_embeddings_finalpart.pth')
# torch.save(preds,  datapath/f'cnn_embs/{folder}'/'preds.pth')
# torch.save(all_dl.dataset.items,  datapath/f'cnn_embs/{folder}'/'files.pth')

In [None]:
# Persist the embeddings still buffered in the hook, the predictions,
# and the dataloader's item list (the file order the predictions refer to).
emb_dir = datapath/f'cnn_embs/{folder}'
torch.save(embhook.embeddings, emb_dir/'effb3_embeddings_finalpart.pth')
torch.save(preds, emb_dir/'preds.pth')
torch.save(all_dl.dataset.items, emb_dir/'files.pth')

In [None]:
# embeddings = torch.cat([torch.load(o) for o in [o for o in (datapath/f'cnn_embs/{folder}').ls() if 'embeddings' in str(o)]])

In [None]:
# embeddings.shape, preds.shape

embeddings

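`q_i` = proportion of positive images in the exam (study) that image `i` belongs to; it is the per-image weight in the image-level log loss.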

### Image Weighted Log Loss (Competition Metric) - 2D CNN models

sz 256

Xresnet34 Fold 0, sz=256, temp=1.3, 0.3881 / Effnetb3 Fold 0, sz=256 temp=1.2 0.3356

Xresnet34 Fold 1, sz=256, temp = 1.3, 0.3684

sz 512

Xresnet34 Fold 0, sz=512, temp =0.8 0.2639 / Effnetb3 Fold 0, sz=512, temp=1.5 0.2655

Xresnet34 Fold 1, sz=512, temp = 1.5, 0.2679

Xresnet34 Fold 2 sz=512, temp = 1.4, 0.2686

Xresnet34 Fold 3 sz=512, temp = 1.1, 0.2373

Xresnet34 Fold 4 sz=512, temp = 1.1, 0.2533

In [None]:
# Image-level label of every validation file, in `valid_files` order.
valid_labels = L(valid_files).map(get_label)

In [None]:
# Mean validation label; 1-valid_p is the accuracy of always predicting class 0 (assumes 0/1 labels — TODO confirm).
valid_p = np.mean(valid_labels)
1-valid_p

In [None]:
# `preds`/`targs` come from `all_dl` (training files followed by validation files), while the
# `1-valid_p` baseline is validation-only; score just the trailing validation rows so the
# two numbers are comparable.
val_start = len(preds) - len(valid_files)
accuracy(preds[val_start:], targs[val_start:])

In [None]:
# Study id of every validation file = name of its parent folder.
sids = L(valid_files).map(lambda o: o.parent.name)

In [None]:
# Per study: mean image label over its validation files (the fraction of positive images when labels are 0/1).
sid2qi =dict(pd.DataFrame({"sid":sids, "labels": valid_labels}).groupby("sid")['labels'].mean())

In [None]:
# One weight per validation file (in `valid_files` order): the value of `sid2qi` for the file's study.
qis = tensor([sid2qi[o] for o in sids])

In [None]:
# Temperature sweep of the image-level weighted log loss on the validation fold.
# `preds`/`targs` cover training + validation files (in that order) but `qis` holds one weight
# per validation file only, so take the trailing validation rows before weighting; multiplying
# the full-length loss by `qis` would be a shape mismatch.
val_start = len(preds) - len(qis)
val_logits, val_targs = preds[val_start:].float(), targs[val_start:]
for temp in np.linspace(0.1, 2, 20):
    l = F.cross_entropy(val_logits/temp, val_targs, reduction='none')
    avg_logloss = (l*qis).sum()/qis.sum()
    print(temp, avg_logloss.item())

In [None]:
qis.sum()

In [None]:
# Histogram of the class-1 probability after dividing the model outputs by 0.8 and applying softmax.
# NOTE(review): this uses every row of `preds` (training + validation files), not only the validation rows.
plt.hist((preds.float()/.8).softmax(1)[:, 1])

In [None]:
# Image-level weighted log loss on the validation fold at temperature 0.8.
# `preds`/`targs` cover training + validation files (in that order) while `qis` has one entry
# per validation file, so restrict the loss to the trailing validation rows.
val_start = len(preds) - len(qis)
img_losses = F.cross_entropy(preds[val_start:].float()/0.8, targs[val_start:], reduction='none')
# Numerator and denominator are kept separately; the combined metric at the end reuses both.
tot_img_loss = (img_losses*qis).sum()
tot_img_wgts = qis.sum()
avg_logloss = tot_img_loss/tot_img_wgts;avg_logloss

In [None]:
tot_img_loss, tot_img_wgts

### Exam Weighted Log Loss

**Mean baseline**

Fold 1 0.3518


In [None]:
# Exam-level target columns that are scored; the order matches the weights in `exam_wgts`.
exam_targets = L([
#           'positive_exam_for_pe'
            'negative_exam_for_pe',
            'indeterminate',

            'rv_lv_ratio_gte_1',
            'rv_lv_ratio_lt_1',
    # none

            'leftsided_pe',
            'rightsided_pe',
            'central_pe',

            'chronic_pe',
            'acute_and_chronic_pe',           
            # neither chronic or acute_and_chronic
          
    
    
#             'qa_motion',
#             'qa_contrast',
#             'flow_artifact',
#             'true_filling_defect_not_pe',
             ]); exam_targets

In [None]:
# Per-label weights of the exam-level loss, listed in the same order as `exam_targets`.
# The same nine values are hard-coded again in `exam_wgts`.
neg_pe_wgt = 0.0736196319
indeterminate_wgt = 0.09202453988

rv_lv_gte_1_wgt = 0.2346625767
rv_lv_lt_1_wgt = 0.0782208589

left_pe_wgt = 0.06257668712
right_pe_wgt = 0.06257668712
central_pe_wgt = 0.1877300613

chronic_wgt = 0.1042944785
acute_chronic_wgt = 0.1042944785

In [None]:
# Weight vector for the exam-level labels, ordered like `exam_targets`.
exam_wgts = tensor([
    neg_pe_wgt, indeterminate_wgt,
    rv_lv_gte_1_wgt, rv_lv_lt_1_wgt,
    left_pe_wgt, right_pe_wgt, central_pe_wgt,
    chronic_wgt, acute_chronic_wgt,
])

In [None]:
# Exam-level targets per split: keep the study id plus the exam target columns and drop repeated rows
# (train.csv has one row per image, so exam-level values repeat within a study).
train_targsdf = train_df[train_df.StudyInstanceUID.isin(train_pids)][["StudyInstanceUID"]+exam_targets].drop_duplicates()
valid_targsdf = train_df[train_df.StudyInstanceUID.isin(valid_pids)][["StudyInstanceUID"]+exam_targets].drop_duplicates()

In [None]:
# Mean-baseline prediction: the training-split mean of every exam target column.
exam_mean_preds = dict(train_targsdf[exam_targets].mean())

In [None]:
exam_mean_preds

In [None]:
# Mean-baseline exam loss: predict the training-split mean of each label for every validation study.
exam_targs = tensor(valid_targsdf[exam_targets].values).float()
# Size the prediction matrix from the target matrix rather than from len(valid_pids), so the
# two shapes agree even when a validation pid has no row (or several distinct rows) in valid_targsdf.
exam_mean_mat = tensor(list(exam_mean_preds.values()))[None,...].repeat(exam_targs.shape[0],1)
exam_losses = F.binary_cross_entropy(exam_mean_mat, exam_targs, reduction='none')

In [None]:
# Weighted exam-level loss: every row of `exam_losses` is weighted label-wise by `exam_wgts`.
tot_exam_loss = (exam_losses*exam_wgts).sum()
# Total weight = (number of rows actually scored) x (sum of label weights); taking the row
# count from `exam_losses` keeps the denominator consistent with the numerator.
tot_exam_wgts = (exam_losses.shape[0]*exam_wgts.sum())
avg_exam_loss = tot_exam_loss/tot_exam_wgts; avg_exam_loss

### Combine both

The image-level and exam-level parts carry almost equal total weight, so the combined score is close to the mean of the two.

In [None]:
# Weight of the image-level part in the combined score; it scales both the image loss sum and the image weight sum.
img_wgt = 0.07361963

In [None]:
# Combined score: (weighted image loss + weighted exam loss) / (total image weight + total exam weight).
(tot_img_loss*img_wgt + tot_exam_loss) / (tot_img_wgts*img_wgt + tot_exam_wgts)