In [17]:
from google.cloud import storage
import os
import pandas as pd
import subprocess
import meerkat as mk


In [163]:
dataset_dir = "/home/common/datasets/mimic-cxr"

In [97]:

storage_client = storage.Client(project="hai-gcp-fine-grained")
bucket = storage_client.bucket(
    "mimic-cxr-2.0.0.physionet.org", 
    user_project="hai-gcp-fine-grained"
)




In [98]:
blob_name = "mimic-cxr-reports.zip"
filepath = os.path.join(dataset_dir, blob_name)
bucket.blob(blob_name).download_to_filename(filepath)

In [12]:
dataset_dir = "/home/common/datasets/mimic-cxr"
for blob_name in  ["cxr-record-list.csv.gz", "cxr-study-list.csv.gz"]:
    filepath = os.path.join(dataset_dir, blob_name)
    bucket.blob(blob_name).download_to_filename(filepath)
    subprocess.run(["gunzip", filepath])

bucket.blob(blob_name).download_to_filename(filepath)


# subprocess.run(["unzip", filepath])


In [161]:
# prepare jpg version

storage_client = storage.Client(project="hai-gcp-fine-grained")
bucket = storage_client.bucket(
    "mimic-cxr-jpg-2.0.0.physionet.org", 
    user_project="hai-gcp-fine-grained"
)

for blob_name in [
    "mimic-cxr-2.0.0-chexpert.csv.gz",
    "mimic-cxr-2.0.0-metadata.csv.gz",
    "mimic-cxr-2.0.0-negbio.csv.gz",
    "mimic-cxr-2.0.0-split.csv.gz",
]:
    filepath = os.path.join(dataset_dir, blob_name)
    bucket.blob(blob_name).download_to_filename(filepath)
    subprocess.run(["gunzip", filepath])





In [142]:
study_dp = mk.DataPanel.from_csv(
    os.path.join(dataset_dir, "cxr-study-list.csv")
)
def loader(path: str):
    with open(os.path.join(dataset_dir, path), "r") as f:
        return f.read()

col = mk.ImageColumn.from_filepaths(study_dp["path"], loader=loader)
study_dp["report"] = col

In [151]:
study_dp["report"][400]

'                                 FINAL REPORT\n HISTORY:  Seizures, question interval change.\n \n CHEST, SINGLE AP PORTABLE VIEW.\n \n An ET tube is present, tip approximately 3.7 cm above the carina.  A right IJ\n central line is present, tip near SVC/RA junction.  An NG tube is present, tip\n extending beneath the diaphragm, off the film.  \n \n There is probable cardiomegaly, though the cardiomediastinal silhouette is\n unchanged.  There is possible mild vascular plethora, but no overt CHF.  There\n is minimal atelectasis at the left lung base.  Again seen is density seen\n along the right sixth rib laterally, ? callus about a rib fracture.\n \n IMPRESSION:  Essentially unchanged compared with ___ at 9:19 a.m.\n'

In [167]:
dp = mk.DataPanel.from_csv(
    os.path.join(dataset_dir, "cxr-record-list.csv")
)

paths = pd.Series(dp["path"].data)
paths.str.split

In [172]:
dp["jpg_path"] = paths.str.split(".").str[0] + ".jpg"

In [173]:
import io
from pydicom import dcmread
from dosma import DicomReader
from PIL import Image


class GCSLoader:
    def __init__(self, bucket_name: str, project: str, loader: callable = None):
        storage_client = storage.Client(project=project)
        self.bucket = storage_client.bucket(bucket_name, user_project=project)
        self.loader = (lambda x: x) if loader is None else loader

    def __call__(self, blob_name):
        return self.loader(
            io.BytesIO(self.bucket.blob(str(blob_name)).download_as_bytes())
        )

loader = GCSLoader(
    bucket_name="mimic-cxr-jpg-2.0.0.physionet.org",
    project="hai-gcp-fine-grained",
    #loader=dcmread,
    loader=Image.open
)



In [174]:
col = mk.ImageColumn.from_filepaths(dp["jpg_path"], loader=loader)

In [183]:
for x in col[:100].batch(num_workers=8):
    pass

In [114]:
def get_metadata(x):
    metadata = {}
    for k, v in x.items():
        if (k.group in [0x0018, 0x0008, 0x0002, 0x0020] and hasattr(v, "name")):
            metadata[v.name.lower().replace(" ", "_")] = v.value
    return metadata

out = col.lz[:10].map(get_metadata, pbar=True)

100%|██████████| 10/10 [00:05<00:00,  1.83it/s]


In [137]:
metadata = {}
for k, v in col[0].items():
    if k.group in [0x0018, 0x0008, 0x0002, 0x0020]:
        metadata[v.name.lower().replace(" ", "_")] = v.value

AttributeError: 'RawDataElement' object has no attribute 'name'

In [186]:
out = col[0]

In [8]:
import meerkat as mk
from dosma import DicomReader
from meerkat.contrib.siim_cxr import cxr_transform, cxr_transform_pil

loader = DicomReader(group_by=None, default_ornt=("SI", "AP"))
cell = mk.MedicalVolumeCell(
    "/root/test.dcm", loader=loader, transform=cxr_transform_pil
)
blob = bucket.blob("files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.dcm")

In [17]:
out = loader("/home/sabri/test.dcm")

In [15]:
from pydicom import dcmread
dcmread("/home/sabri/test.dcm")

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 206
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: Digital X-Ray Image Storage - For Presentation
(0002, 0003) Media Storage SOP Instance UID      UI: 2.25.30925724177439423411425919179398157560
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 2.25.55362949469033348352269585565668676650
(0002, 0013) Implementation Version Name         SH: 'MIMIC-CXR v2.0.0'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['DERIVED', 'PRIMARY']
(0008, 0016) SOP Class UID                       UI: Digital X-Ray Image Storage - For Presentation
(0008, 0018) SOP Instance UID                    UI: 2.25.309257241774394234114259191793981