In [77]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
import logging
import glob
import random
import pydicom
import json
import PIL
import boto3

from tqdm import tqdm

sys.path.append('../')
from rsna import utils

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt

In [8]:
! ls /mirco-kaggle/rsna

all.zip				     stage_1_test_images
GCP Credits Request Link - RSNA.txt  stage_1_test_images.zip
label_map.pbtxt			     stage_1_train_images
stage_1_detailed_class_info.csv      stage_1_train_images.zip
stage_1_sample_submission.csv	     stage_1_train_labels.csv


In [10]:
pred_path = "../ml/v1/pred"

# Submission file

In [52]:
label_map = {0: 'other', 1: 'high_density'}

In [61]:
def _load_from_tfrec(tfrec_path, label_map, score_threshold=0.2):

    detections = {}
    
    for string_record in tf.python_io.tf_record_iterator(path=tfrec_path):
        
        example = tf.train.Example()
        example.ParseFromString(string_record)

        patientId = example.features.feature["image/filename"].bytes_list.value[0].decode("utf-8")
        patientId = os.path.basename(patientId).split(".dcm")[0]

        height = example.features.feature["image/height"].int64_list.value[0]
        width = example.features.feature["image/width"].int64_list.value[0]

        detected_labels = example.features.feature["image/detection/label"].int64_list.value
    
        frame_detections = []
        for i, label in enumerate(detected_labels):
            
            xmin = example.features.feature[f"image/detection/bbox/xmin"].float_list.value[i]
            xmax = example.features.feature[f"image/detection/bbox/xmax"].float_list.value[i]
            ymin = example.features.feature[f"image/detection/bbox/ymin"].float_list.value[i]
            ymax = example.features.feature[f"image/detection/bbox/ymax"].float_list.value[i]

            score = example.features.feature["image/detection/score"].float_list.value[i]

            if score < score_threshold:
                continue

            frame_detections.append(
                dict(
                    name=label_map[label],
                    xmin=round(xmin * width),
                    xmax=round(xmax * width),
                    ymin=round(ymin * height),
                    ymax=round(ymax * height),
                    score=score,
                )
            )

        detections[patientId] = frame_detections

    return detections

In [63]:
detections = _load_from_tfrec(tfrec_path=tfrec_path, label_map=label_map)

In [68]:
detections

{'0851e63b-8a7c-4089-afd2-b63c28795669': [],
 '087d375c-c266-4069-84dd-4d1d9d906539': [],
 '0965f1e9-2905-40ef-8612-4984e2c7e9ea': [],
 '06b1975e-066e-4430-bf60-c0a9d82cfdc3': [],
 '0666201a-fab4-4828-81dd-73292312c766': [],
 '03cd7a5b-d5d7-40a1-81b1-c4264920530a': [],
 '0700bc73-b6e3-412e-9e2f-aa0b83424804': [],
 '0b8e3d9e-0bcd-41a3-b788-0175c43b2624': [],
 '05ff7ddc-3fb7-4378-ac14-a10a08c64aaf': [],
 '00436515-870c-4b36-a041-de91049b9ab4': [],
 '0889b108-8f03-4fef-9ef3-5ddabde15ce7': [],
 '0697baae-e475-4fa3-ad82-51b319a026cc': [],
 '093ce867-9a80-4748-83ed-3d81d26b7741': [],
 '0b2e67fe-09ae-40b0-8089-4dddb7c2c0cf': [],
 '061628a9-8a6d-4e01-8f26-b7a6cca2b13d': [],
 '06e40d99-227c-4fa5-a51a-8691b155b837': [],
 '07c7047e-1be1-43ba-85bd-75f8d9434757': [],
 '06fc2885-127a-4274-b20c-554d352ea21b': [],
 '0bb24183-8b59-48f1-8bbf-4d889976fc82': [],
 '01e800d4-4d8d-4ef5-a40a-f37b923bb7ad': [],
 '01d1f0be-5216-4af5-b2dc-a160c1c56122': [],
 '0a7b0cc8-af04-4d2c-9267-6fdfb05f48f2': [],
 '062374a3

In [93]:
def make_submission_string(b):
    bbox = utils._bbox_convert_to_min_size(**b)
    return f"{round(b['score'], 2)} {bbox['xmin']} {bbox['ymin']} {bbox['width']} {bbox['height']}"
    
def f(v):
    submission_strings = []
    for b in v:
        submission_strings.append(make_submission_string(b))
    return ",".join(submission_strings)

In [118]:
d = [{'patientId':k, 'submission_strings': f(v)} for k, v in detections.items()]

In [119]:
dd = pd.DataFrame(d)

In [122]:
dd.loc[110]

patientId                  09547f4f-c52e-4fd7-8738-0e01e6277034
submission_strings    0.33 269 247 755 744,0.33 269 247 755 744
Name: 110, dtype: object

In [121]:
dd.loc[110, 'submission_strings'] = "0.33 269 247 755 744,0.33 269 247 755 744"

In [130]:
dd.loc[dd['submission_strings'].str.len() > 1, :]

Unnamed: 0,patientId,submission_strings
110,09547f4f-c52e-4fd7-8738-0e01e6277034,"0.33 269 247 755 744,0.33 269 247 755 744"


In [146]:
ee = dd.loc[100:115]
ee = ee.set_index('patientId')

In [158]:
ff = pd.DataFrame(ee.submission_strings.str.split(',').tolist(), index=ee.index).stack().reset_index().drop('level_1', axis=1)
ff.columns = ['id', 'string']
ff

Unnamed: 0,id,string
0,0a9c4c04-5918-48da-9ba1-b2a3add66ae9,
1,0949dfc4-7613-432c-8d16-4cf0aa5b15bb,
2,061d0ff4-a611-42ce-b780-1d1eb55e50ab,
3,08c343b6-9dc4-405b-87b3-b6534abcb990,
4,00a85be6-6eb0-421d-8acf-ff2dc0007e8a,
5,06d5a58d-baf1-4937-bfc3-00db1fb2b1be,
6,0ab1a1e1-3fe3-4baa-876b-f66825931486,
7,0a29f841-b738-499a-bc8e-0589de77ef82,
8,0b53bc2e-60ed-48c4-8754-c094c5b4c92c,
9,0927bc1e-e1af-4c7e-93c4-7127d366f57d,


In [148]:
ff

Unnamed: 0,patientId,level_1,0
0,0a9c4c04-5918-48da-9ba1-b2a3add66ae9,0,
1,0949dfc4-7613-432c-8d16-4cf0aa5b15bb,0,
2,061d0ff4-a611-42ce-b780-1d1eb55e50ab,0,
3,08c343b6-9dc4-405b-87b3-b6534abcb990,0,
4,00a85be6-6eb0-421d-8acf-ff2dc0007e8a,0,
5,06d5a58d-baf1-4937-bfc3-00db1fb2b1be,0,
6,0ab1a1e1-3fe3-4baa-876b-f66825931486,0,
7,0a29f841-b738-499a-bc8e-0589de77ef82,0,
8,0b53bc2e-60ed-48c4-8754-c094c5b4c92c,0,
9,0927bc1e-e1af-4c7e-93c4-7127d366f57d,0,


In [125]:
ee

Unnamed: 0,patientId,submission_strings
100,0a9c4c04-5918-48da-9ba1-b2a3add66ae9,
101,0949dfc4-7613-432c-8d16-4cf0aa5b15bb,
102,061d0ff4-a611-42ce-b780-1d1eb55e50ab,
103,08c343b6-9dc4-405b-87b3-b6534abcb990,
104,00a85be6-6eb0-421d-8acf-ff2dc0007e8a,
105,06d5a58d-baf1-4937-bfc3-00db1fb2b1be,
106,0ab1a1e1-3fe3-4baa-876b-f66825931486,
107,0a29f841-b738-499a-bc8e-0589de77ef82,
108,0b53bc2e-60ed-48c4-8754-c094c5b4c92c,
109,0927bc1e-e1af-4c7e-93c4-7127d366f57d,


In [None]:
# predict only the first 50 entries
sample_submission_fp = 'sample_submission.csv'
predict(test_image_fps[:50], filepath=sample_submission_fp)

In [None]:
output = pd.read_csv(sample_submission_fp, names=['id', 'pred_string'])
output.head(50)