https://www.kaggle.com/xhlulu/siim-covid-19-convert-to-jpg-256px

In [1]:
import pandas as pd
import numpy as np
import pydicom
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

import os
from PIL import Image

In [2]:
# Study-level training table; the `dicom_path` column is used below to locate the DICOM files.
df = pd.read_csv(filepath_or_buffer='/workspace/data/df_train_study_level.csv')

In [6]:
# Output directory for the converted PNG files. The trailing slash matters:
# file names are appended to this string by plain concatenation.
save_path = "/workspace/data/train_640_2/"
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(save_path, exist_ok=True)

In [10]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

def save_xray(path, voi_lut = True, fix_monochrome = True, out_dir = None):
    """Convert one DICOM file to an 8-bit grayscale PNG on disk.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Parameters
    ----------
    path : str
        Path of the DICOM file to read ('/'-separated).
    voi_lut : bool
        When True, run the pixel data through ``apply_voi_lut`` (VOI LUT, if
        provided by the DICOM device, maps raw data to a "human-friendly" view).
    fix_monochrome : bool
        When True, invert images whose PhotometricInterpretation is
        "MONOCHROME1" so they do not look inverted.
    out_dir : str or None
        Directory prefix for the PNG. It is concatenated as-is with the file
        name, so it must end with '/'. Defaults to the notebook-level
        ``save_path``.

    Raises
    ------
    IOError
        If ``cv2.imwrite`` reports that the PNG could not be written.
    """
    # dcmread is the supported reader; pydicom.read_file is a deprecated alias.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to 0..255. A constant image has a zero range; emit an
    # all-black image instead of dividing by zero (which would yield NaNs).
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = (data / peak * 255).astype(np.uint8)
    else:
        data = np.zeros(data.shape, dtype=np.uint8)

    if out_dir is None:
        out_dir = save_path
    # NOTE: replace() swaps every "dcm" occurrence in the file name, not only
    # the extension. Kept as-is so names stay in sync with the cell that
    # fills the `npy_path` column using the same expression.
    png_path = out_dir + path.split('/')[-1].replace("dcm", "png")

    # NOTE: the image is written at its native resolution; resize() below is
    # not applied here.
    if not cv2.imwrite(png_path, data):
        # imwrite signals failure through its return value only.
        raise IOError("cv2.imwrite failed to write {}".format(png_path))
    
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn an image array into a PIL image bounded by a ``size`` x ``size`` box.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    With ``keep_ratio=False`` the image is resized to exactly (size, size).
    With ``keep_ratio=True`` ``Image.thumbnail`` is used instead, which
    modifies the image in place. ``resample`` is the filter passed to
    whichever PIL call is used. Returns the resulting PIL image.
    """
    box = (size, size)
    picture = Image.fromarray(array)

    if not keep_ratio:
        return picture.resize(box, resample)

    picture.thumbnail(box, resample)
    return picture

In [11]:
# Despite the name this is the pandas Series of DICOM paths, not a Python list.
dicom_path_list = df["dicom_path"]

In [12]:
from multiprocessing import Pool
import time

# Convert every DICOM in parallel. The context manager shuts the worker
# processes down once all results have been consumed, so no workers are
# leaked when the cell is re-run.
with Pool(processes=20) as pool:
    with tqdm(total=len(dicom_path_list)) as t:
        # Results arrive in completion order; only the count matters here.
        for _ in pool.imap_unordered(save_xray, dicom_path_list):
            t.update(1)

100%|██████████| 6334/6334 [04:19<00:00, 24.38it/s]


In [10]:
# Record where each converted image lives. The column is called "npy_path"
# but the values are PNG paths: save_path + the DICOM file name with every
# "dcm" replaced by "png".
df["npy_path"] = [
    save_path + dcm_path.rsplit('/', 1)[-1].replace("dcm", "png")
    for dcm_path in df["dicom_path"]
]

In [11]:
# Persist the table, now including the converted-image paths (index is written too).
df.to_csv(path_or_buf='/workspace/data/df_train_study_level_npy640_2.csv')

In [4]:
# Display the study-level table (bare last expression -> rich notebook output).
df

Unnamed: 0,id,dicom_path,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,cv
0,eeecfd50b220_study,/workspace/data/train/eeecfd50b220/0fd96597c55...,0,1,0,0,3
1,a89c1f3470e1_study,/workspace/data/train/a89c1f3470e1/c23cc53bdbf...,0,0,1,0,4
2,9cc2f64f0c58_study,/workspace/data/train/9cc2f64f0c58/2eebbe88278...,0,1,0,0,2
3,7b6c49da06db_study,/workspace/data/train/7b6c49da06db/c1fd5829f05...,0,1,0,0,2
4,d14080fd6f2a_study,/workspace/data/train/d14080fd6f2a/9d6098d515e...,0,1,0,0,0
...,...,...,...,...,...,...,...
6329,6bb38a2b98f0_study,/workspace/data/train/6bb38a2b98f0/70960f315ac...,0,1,0,0,4
6330,8501ffeadc53_study,/workspace/data/train/8501ffeadc53/0c640403081...,0,1,0,0,0
6331,c1ba4d912111_study,/workspace/data/train/c1ba4d912111/5d1e7b4f209...,0,1,0,0,4
6332,7e4059d6a0f9_study,/workspace/data/train/7e4059d6a0f9/ac80ee2496b...,1,0,0,0,2


In [9]:
# Sanity check: total of all numeric columns across the `cv` folds.
# numeric_only=True keeps the string columns (id, paths) out of the sum;
# without it, pandas >= 2.0 concatenates them and the final sum raises TypeError.
df.groupby('cv').sum(numeric_only=True).sum().sum()

6334

In [6]:
# Study-level label file, loaded separately from the fold-annotated table above.
df_study = pd.read_csv(filepath_or_buffer='/workspace/data/train_study_level.csv')

In [7]:
# Display the study-level label table (bare last expression -> rich notebook output).
df_study

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0
1,000c9c05fd14_study,0,0,0,1
2,00292f8c37bd_study,1,0,0,0
3,005057b3f880_study,1,0,0,0
4,0051d9b12e72_study,0,0,0,1
...,...,...,...,...,...
6049,ffcb4630f46f_study,0,1,0,0
6050,ffe4d6e8fbb0_study,0,1,0,0
6051,ffe94fcb14fa_study,0,1,0,0
6052,ffebf1ef4a9c_study,0,1,0,0
