In [1]:
!pip install python-gdcm

import pydicom
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import concurrent.futures
import ast

data_dir = "~/var/aml-xrays"
data_path = Path(data_dir).expanduser()



## CSVs
### Image Level Train CSV

In [66]:
image_level = pd.read_csv(data_path / "train_image_level.csv")
image_level

6334


Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e
...,...,...,...,...
6329,ffcc6edd9445_image,,none 1 0 0 1 1,7e6c68462e06
6330,ffd91a2c4ca0_image,,none 1 0 0 1 1,8332bdaddb6e
6331,ffd9b6cf2961_image,"[{'x': 2197.38566, 'y': 841.07361, 'width': 31...",opacity 1 2197.38566 841.07361 2513.80265 1292...,7eed9af03814
6332,ffdc682f7680_image,"[{'x': 2729.27083, 'y': 332.26044, 'width': 14...",opacity 1 2729.27083 332.26044 4225.52099 2936...,a0cb0b96fb3d


### Study Level Train CSV

In [51]:
study_level = pd.read_csv(data_path / "train_study_level.csv")
study_level

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0
1,000c9c05fd14_study,0,0,0,1
2,00292f8c37bd_study,1,0,0,0
3,005057b3f880_study,1,0,0,0
4,0051d9b12e72_study,0,0,0,1
...,...,...,...,...,...
6049,ffcb4630f46f_study,0,1,0,0
6050,ffe4d6e8fbb0_study,0,1,0,0
6051,ffe94fcb14fa_study,0,1,0,0
6052,ffebf1ef4a9c_study,0,1,0,0


In [8]:
from sklearn.model_selection import train_test_split

def split_dataset(base_path: Path, random_state=42):
    study_level = pd.read_csv(base_path / "train_study_level.csv")
    image_level = pd.read_csv(base_path / "train_image_level.csv")
    image_level["study_id"] = image_level["StudyInstanceUID"] + "_study"
    merged = pd.merge(image_level, study_level, left_on="study_id", right_on="id", suffixes=("", "_o"))
    y = np.squeeze((merged.values[:, [5, 6, 7, 8]]))

    merged = merged.drop(["id_o"], axis=1)

    def to_file_path(row):
        image_id = row["id"][:-6]
        return list(base_path.glob(f'data/train/{row["StudyInstanceUID"]}/**/{image_id}.png'))[0]

    merged["file_path"] = merged.apply(to_file_path, axis=1)
    train_X, test_X, _, _ = train_test_split(merged, y, test_size=0.2, random_state=random_state)
    return train_X, test_X

def build_bce_weights(merged_data):
    pos_weights = np.ones(4)
    labels_only = merged_data.iloc[:, 5:9]
    print(labels_only)
    for index in range(4):
        values = labels_only.iloc[:, index].value_counts().sort_index()
        print(values)
        pos_weights[index] = values[0] / values[1]
    return pos_weights

train_x, _ = split_dataset(base_path=data_path)
print(build_bce_weights(train_x))

      Negative for Pneumonia  Typical Appearance  Indeterminate Appearance  \
1142                       0                   0                         1   
2654                       0                   1                         0   
5395                       1                   0                         0   
1170                       0                   0                         1   
4371                       1                   0                         0   
...                      ...                 ...                       ...   
3772                       0                   1                         0   
5191                       0                   1                         0   
5226                       0                   0                         1   
5390                       1                   0                         0   
860                        0                   1                         0   

      Atypical Appearance  
1142                    0  
2654   

### Sample Submission

In [None]:
sample_sub = pd.read_csv(data_path / "sample_submission.csv")
sample_sub

In [None]:
def resolve_image_paths(study_id, dataset="train"):
    study_path = data_path / dataset / study_id
    image_paths = []
    for series_path in study_path.iterdir():
        if series_path.is_dir():
            [image_paths.append(image_path) for image_path in series_path.iterdir()]
    return image_paths

def study_id_from_csv_id(csv_id):
    return csv_id[:-6]

def csv_id_to_paths(csv_id, dataset="train"):
    return resolve_image_paths(study_id_from_csv_id(csv_id), dataset)

In [None]:
csv_id_to_paths("00086460a852_study")

In [None]:
typical = study_level[study_level["Typical Appearance"] == 1]

In [None]:
sample_csv_id = typical.head(1)["id"].values[0]
dicom_obj = pydicom.dcmread(csv_id_to_paths(sample_csv_id)[0])

In [None]:
dicom_obj

In [None]:
plt.figure(figsize=(10,20))
plt.imshow(dicom_obj.pixel_array, cmap=plt.cm.bone)

In [None]:
study_level.loc[:, study_level.columns != "id"].sum(axis=1).unique()

In [None]:
def show_dcm(dcm, ax):
    img_data = dcm.pixel_array
    img_data = pydicom.pixel_data_handlers.util.apply_voi_lut(img_data, dcm)
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        img_data = np.amax(img_data) - img_data
    ax.imshow(img_data, cmap=plt.cm.bone)
    
def add_bounding_boxes(dcm, ax):
    instance_uid = dcm.SOPInstanceUID
    box_str = image_level[image_level["id"] == instance_uid + "_image"]["boxes"].iloc[0]
    if (not pd.isnull(box_str)):
        boxes = ast.literal_eval(box_str)
        if (isinstance(boxes, list)):
            for box in boxes:
                mp_box = patches.Rectangle((box['x'], box['y']), box['width'], box['height'], edgecolor="r", facecolor='none')
                ax.add_patch(mp_box)

def show_samples_for(prediction, number_of_samples=3):
    samples = study_level[study_level[prediction] == 1].sample(number_of_samples)
    study_paths_iter = map(csv_id_to_paths, samples["id"])
    for i, study_paths in enumerate(study_paths_iter):
        fig, axs = plt.subplots(1, len(study_paths), figsize=(10,8))
        fig.suptitle(f'Study: {samples["id"].iloc[i]}')
        for im_i, image_path in enumerate(study_paths):
            ax = axs[im_i] if isinstance(axs, np.ndarray) else axs
            ax.set_title(f'Instance / Image Nr. {im_i + 1} / {len(study_paths)}')
            dcm = pydicom.dcmread(image_path)
            plt.figure()
            show_dcm(dcm, ax)
            add_bounding_boxes(dcm, ax)

## Samples: Negative

In [None]:
show_samples_for("Negative for Pneumonia")

## Samples: Typical Appearance

In [None]:
show_samples_for("Typical Appearance")

## Samples: Indeterminate Appearance

In [None]:
show_samples_for("Indeterminate Appearance")

## Samples: Atypical Appearance

In [None]:
show_samples_for("Atypical Appearance")

# Tests for image conversion
## DICOM read to Pillow

In [None]:
dcm_obj = pydicom.dcmread(csv_id_to_paths("336b277e175e_study")[0])

In [None]:
from pydicom.pixel_data_handlers import apply_modality_lut
from pydicom.pixel_data_handlers import apply_voi_lut
from PIL import Image

def to_pillow(dcm: pydicom.Dataset) -> Image:
    img_array = apply_modality_lut(dcm.pixel_array, dcm)
    img_array = apply_voi_lut(img_array, dcm)
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        img_array = np.amax(img_array) - img_array

    # every radiologist will kill us for that one:
    # might scale based on target, e.g. tissue, bone etc. results in better predictions
    img_array = img_array - np.min(img_array)
    img_array = img_array / np.max(img_array)

    img_array = (img_array * 255).astype(np.uint8)

    return Image.fromarray(img_array)

In [None]:
path = csv_id_to_paths('000c9c05fd14_study')[0]
to_pillow(pydicom.dcmread(path)).show()

### Figure out existing attributes in data set

In [None]:
from tqdm import tqdm
def find_attribute_values(dicom_attrs: list[str]):
    attr_dict = {}
    for attr in dicom_attrs:
        attr_dict[attr] = set()

    for study in tqdm(study_level['id']):
        paths = csv_id_to_paths(study)
        for single_path in paths:
            dcm = pydicom.dcmread(single_path)
            for attr in attr_dict.keys():
                if attr in dcm:
                    attr_dict[attr].add(dcm[attr].value)
                if attr in dcm.file_meta:
                    attr_dict[attr].add(dcm.file_meta[attr].value)

    return attr_dict

In [None]:
found_values = find_attribute_values(['PhotometricInterpretation', 'BitsAllocated', 'Modality', 'TransferSyntaxUID'])

In [None]:
found_values

Transfer types in Data Set:
* (1.2.840.10008.1.2.1) Explicit VR Little Endian  --> supported by without additional library
* (1.2.840.10008.1.2.4.70) JPEG Lossless, Nonhierarchical, First- Order Prediction  --> requires python-gdcm package installed

## Convert all to .png (only for testing purpose - Do not use)

In [None]:
import math
import concurrent

def split_list(lst, parts):
    n = math.ceil(len(lst) / parts)
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def convert_all_to_png(search_dir: Path, thread_count=32):
    paths = list(search_dir.glob('**/*.dcm'))
    executor_pool = concurrent.futures.ThreadPoolExecutor(thread_count)
    pending_futures = []
    chunks = list(split_list(paths, thread_count))

    for tid, next_chunk in enumerate(chunks):
        pending_futures.append(executor_pool.submit(convert_chunk, next_chunk, search_dir, tid))

    concurrent.futures.wait(pending_futures)

def convert_chunk(chunk: list[Path], search_dir: Path):
    for single_path in chunk:
        dcm = pydicom.dcmread(single_path)
        pillow_img = to_pillow(dcm)
        new_path = search_dir / "converted" / single_path.with_suffix(".png").relative_to(data_path)
        new_path.parent.mkdir(parents=True, exist_ok=True)
        pillow_img.save(str(new_path))

In [None]:
# convert_all_to_png(data_path)

# Prepare Dataset
Only the first cell of the notebook has to be executed before. The package `python-gdcm` needs to be installed
used python env!
1. Convert to .png (no progressbar ..). .Images will be located at: `$(data_dir)/converted/{train|test}/..`

In [None]:
import dataprep

In [None]:
dataprep.convert_all_to_png(data_path)

2. Create COCO annotation file for train set

In [None]:
coco = dataprep.CxrCOCO(data_path)
coco.to_json_file()

In [None]:
import fiftyone as fo

ds = fo.Dataset.from_dir(
    dataset_type=fo.types.COCODetectionDataset,
    labels_path=data_path / "labels.json",
    data_path=data_path
)
session = fo.launch_app(ds)

In [None]:
from dataprep import append_labels_to_coco

append_labels_to_coco(Path("~/var/aml-models/test.json").expanduser(), data_path / "train_study_level.csv", data_path / "train_image_level.csv")
