https://www.kaggle.com/xhlulu/siim-covid-19-convert-to-jpg-256px

In [5]:
import pandas as pd
import numpy as np
import pydicom
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

import os
from PIL import Image

In [6]:
# Load the training metadata table; later cells read its `id`, `dicom_path`
# and `cv` columns.
df = pd.read_csv('/workspace/data/df_train_study_level.csv')

In [7]:
# Count how many rows share each study id; a count above 1 means that id
# appears on several rows.
df.id.value_counts()

0fd2db233deb_study    9
a7335b2f9815_study    7
8943d1d85097_study    6
a4e94133d95a_study    5
970c96c9fa5d_study    5
                     ..
e48cc454acd6_study    1
17adc5dabfc2_study    1
7a4341a6a2ee_study    1
66028624dad2_study    1
1d155f64f846_study    1
Name: id, Length: 6054, dtype: int64

In [8]:
# Directory under which save_xray builds its output image paths.
save_path = "/workspace/data/train_640_2/"
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between checking for the directory and creating it.
os.makedirs(save_path, exist_ok=True)

In [9]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

def save_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file, rescale its pixels to 8-bit and report its dimensions.

    Note that this function does not write any file itself; it only computes
    the normalised array and the PNG path an image would be stored under.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Filesystem path of the DICOM file to read.
        voi_lut: If True, pass the raw pixel data through ``apply_voi_lut``
            (VOI LUT, when provided by the DICOM device, transforms raw data
            into a "human-friendly" view).
        fix_monochrome: If True, invert the image when the file's
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``.

    Returns:
        Tuple ``(path, height, width)``: the input path followed by the first
        two dimensions of the pixel array.
    """
    # dcmread is the supported reader; the read_file alias is deprecated and
    # no longer exists in pydicom 3.x.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Rescale to [0, 255]. A constant image has a peak of 0 after the shift;
    # skip the division then so it stays all-zero instead of turning into NaN.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    # Swap only the file extension; a plain str.replace("dcm", "png") would
    # also rewrite any "dcm" that happens to occur inside the file name.
    stem = os.path.splitext(os.path.basename(path))[0]
    png_path = os.path.join(save_path, stem + ".png")
    return path, data.shape[0], data.shape[1]
    
#     im = resize(data, size=640)
#     im.save(png_path)
    
#     cv2.imwrite(png_path, data)
    
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Convert an array to a PIL image scaled to fit a ``size`` x ``size`` box.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Image data accepted by ``Image.fromarray``.
        size: Edge length, in pixels, of the square target box.
        keep_ratio: If True, shrink in place with ``thumbnail`` so the aspect
            ratio is preserved; otherwise force the result to exactly
            ``size`` x ``size``.
        resample: Resampling filter handed to PIL.

    Returns:
        The resized ``PIL.Image.Image``.
    """
    target = (size, size)
    image = Image.fromarray(array)

    if not keep_ratio:
        return image.resize(target, resample)

    # thumbnail() modifies the image in place and returns None.
    image.thumbnail(target, resample)
    return image

In [10]:
# Column of DICOM file paths; the worker pool below maps save_xray over it.
dicom_path_list = df.dicom_path

In [11]:
from multiprocessing import Pool
import time

# Flat accumulator: every processed file contributes three consecutive
# entries (path, height, width).
ret_all = []

# Run save_xray over all DICOM paths on 20 worker processes. Managing the pool
# with `with` shuts the workers down once all results have been consumed, so
# no processes are left running in the kernel afterwards.
with Pool(processes=20) as pool, tqdm(total=len(dicom_path_list)) as t:
    # imap_unordered yields results in completion order, not input order.
    for ret in pool.imap_unordered(save_xray, dicom_path_list):
        t.update(1)
        ret_all.extend(ret)

100%|██████████| 6334/6334 [03:37<00:00, 29.12it/s]


In [15]:
# Regroup the flat result list into rows of (path, height, width).
# NOTE: because paths (str) and sizes (int) share one array, NumPy stores
# every field, including height and width, as a string.
all_info = np.array(ret_all).reshape(-1, 3)

In [23]:
# Load a previously exported table; the first CSV column is used as the index.
df_new = pd.read_csv('/workspace/data/df_train_study_level_npy640_2.csv', index_col=0)

In [24]:
# Per-file image dimensions keyed by DICOM path. all_info holds strings only
# (paths and sizes share one array dtype), so cast the size columns back to
# integers to keep height/width numeric in the frame.
df_hw = pd.DataFrame({
    "dicom_path": all_info[:, 0],
    "height": all_info[:, 1].astype(int),
    "width": all_info[:, 2].astype(int),
})

In [25]:
# Display the path/height/width table.
df_hw

Unnamed: 0,dicom_path,height,width
0,/workspace/data/train/2280689e5dd3/6db0483124a...,2414,2827
1,/workspace/data/train/ba07cc459be2/c47117ced66...,2416,2872
2,/workspace/data/train/9cc2f64f0c58/2eebbe88278...,2005,2836
3,/workspace/data/train/f5658ffaf0d3/70e627e95c6...,2320,2832
4,/workspace/data/train/a89c1f3470e1/c23cc53bdbf...,2330,2846
...,...,...,...
6329,/workspace/data/train/58072ae8b0f0/50b397ed409...,3480,4240
6330,/workspace/data/train/2f4e25cb6e47/0e6b15649a3...,3488,4256
6331,/workspace/data/train/281d15ca6f64/7e638d2bd19...,3488,4256
6332,/workspace/data/train/8501ffeadc53/0c640403081...,3480,4240


In [26]:
# Display the table that the dimensions will be merged into.
df_new

Unnamed: 0,id,dicom_path,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,cv,npy_path
0,eeecfd50b220_study,/workspace/data/train/eeecfd50b220/0fd96597c55...,0,1,0,0,3,/workspace/data/train_640_2/13131b0c3db4.png
1,a89c1f3470e1_study,/workspace/data/train/a89c1f3470e1/c23cc53bdbf...,0,0,1,0,4,/workspace/data/train_640_2/238208eb51b4.png
2,9cc2f64f0c58_study,/workspace/data/train/9cc2f64f0c58/2eebbe88278...,0,1,0,0,2,/workspace/data/train_640_2/fa9729ef16b6.png
3,7b6c49da06db_study,/workspace/data/train/7b6c49da06db/c1fd5829f05...,0,1,0,0,2,/workspace/data/train_640_2/b6b631939d4f.png
4,d14080fd6f2a_study,/workspace/data/train/d14080fd6f2a/9d6098d515e...,0,1,0,0,0,/workspace/data/train_640_2/ce289d7a37bb.png
...,...,...,...,...,...,...,...,...
6329,6bb38a2b98f0_study,/workspace/data/train/6bb38a2b98f0/70960f315ac...,0,1,0,0,4,/workspace/data/train_640_2/ef29cf6d3cb4.png
6330,8501ffeadc53_study,/workspace/data/train/8501ffeadc53/0c640403081...,0,1,0,0,0,/workspace/data/train_640_2/193ac79cabfd.png
6331,c1ba4d912111_study,/workspace/data/train/c1ba4d912111/5d1e7b4f209...,0,1,0,0,4,/workspace/data/train_640_2/70e85faa319e.png
6332,7e4059d6a0f9_study,/workspace/data/train/7e4059d6a0f9/ac80ee2496b...,1,0,0,0,2,/workspace/data/train_640_2/5f4b00225aaf.png


In [27]:
# Preview the join: a left merge keeps every row of df_new and attaches
# height/width wherever dicom_path matches a row in df_hw.
df_new.merge(df_hw, on="dicom_path", how="left")

Unnamed: 0,id,dicom_path,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,cv,npy_path,height,width
0,eeecfd50b220_study,/workspace/data/train/eeecfd50b220/0fd96597c55...,0,1,0,0,3,/workspace/data/train_640_2/13131b0c3db4.png,2336,2836
1,a89c1f3470e1_study,/workspace/data/train/a89c1f3470e1/c23cc53bdbf...,0,0,1,0,4,/workspace/data/train_640_2/238208eb51b4.png,2330,2846
2,9cc2f64f0c58_study,/workspace/data/train/9cc2f64f0c58/2eebbe88278...,0,1,0,0,2,/workspace/data/train_640_2/fa9729ef16b6.png,2005,2836
3,7b6c49da06db_study,/workspace/data/train/7b6c49da06db/c1fd5829f05...,0,1,0,0,2,/workspace/data/train_640_2/b6b631939d4f.png,3052,3012
4,d14080fd6f2a_study,/workspace/data/train/d14080fd6f2a/9d6098d515e...,0,1,0,0,0,/workspace/data/train_640_2/ce289d7a37bb.png,3480,4248
...,...,...,...,...,...,...,...,...,...,...
6329,6bb38a2b98f0_study,/workspace/data/train/6bb38a2b98f0/70960f315ac...,0,1,0,0,4,/workspace/data/train_640_2/ef29cf6d3cb4.png,2544,3056
6330,8501ffeadc53_study,/workspace/data/train/8501ffeadc53/0c640403081...,0,1,0,0,0,/workspace/data/train_640_2/193ac79cabfd.png,3480,4240
6331,c1ba4d912111_study,/workspace/data/train/c1ba4d912111/5d1e7b4f209...,0,1,0,0,4,/workspace/data/train_640_2/70e85faa319e.png,2320,2832
6332,7e4059d6a0f9_study,/workspace/data/train/7e4059d6a0f9/ac80ee2496b...,1,0,0,0,2,/workspace/data/train_640_2/5f4b00225aaf.png,2304,2836


In [34]:
# Recompute the same left merge and write it to CSV without the index column.
df_new.merge(df_hw, on="dicom_path", how="left").to_csv('/workspace/data/df_train_study_level_npy640_2_w_hw.csv', index=False)

In [35]:
# Load the table that also carries bounding-box columns (x, y, w, h).
df_new_w_bbox = pd.read_csv('/workspace/data/df_train_study_level_npy640_3_w_bbox.csv')

In [36]:
# Attach height/width to the bounding-box table via a left merge on
# dicom_path and write the result to CSV without the index column.
df_new_w_bbox.merge(df_hw, on="dicom_path", how="left").to_csv('/workspace/data/df_train_study_level_npy640_3_w_bbox_hw.csv', index=False)

In [37]:
# Display the same merge that was just exported (it is recomputed here).
df_new_w_bbox.merge(df_hw, on="dicom_path", how="left")

Unnamed: 0,id,x,y,w,h,study_id,image_id,have_box,dicom_path,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,cv,npy_path,is_none,height,width
0,04f41a8958f7_image,688.06282,966.82563,518.48212,1130.17438,6e4a0581cefe,04f41a8958f7,1,/workspace/data/train/6e4a0581cefe/018ed20fa9c...,0,1,0,0,0,/workspace/data/train_640_2/04f41a8958f7.png,0,2490,3408
1,04f41a8958f7_image,2482.36026,1636.77436,652.47168,652.47168,6e4a0581cefe,04f41a8958f7,1,/workspace/data/train/6e4a0581cefe/018ed20fa9c...,0,1,0,0,0,/workspace/data/train_640_2/04f41a8958f7.png,0,2490,3408
2,04f41a8958f7_image,1235.67308,1628.03597,509.74353,559.26147,6e4a0581cefe,04f41a8958f7,1,/workspace/data/train/6e4a0581cefe/018ed20fa9c...,0,1,0,0,0,/workspace/data/train_640_2/04f41a8958f7.png,0,2490,3408
3,0572ef0d0c1a_image,1818.65264,233.50598,613.04395,839.53784,adbfed2da701,0572ef0d0c1a,1,/workspace/data/train/adbfed2da701/e2fa197720c...,0,1,0,0,3,/workspace/data/train_640_2/0572ef0d0c1a.png,0,2436,3032
4,0572ef0d0c1a_image,598.60492,61.37052,688.54175,881.81674,adbfed2da701,0572ef0d0c1a,1,/workspace/data/train/adbfed2da701/e2fa197720c...,0,1,0,0,3,/workspace/data/train_640_2/0572ef0d0c1a.png,0,2436,3032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9671,ffd9b6cf2961_image,707.25199,722.07926,392.14044,849.18683,7eed9af03814,ffd9b6cf2961,1,/workspace/data/train/7eed9af03814/668a64e3a5f...,0,1,0,0,1,/workspace/data/train_640_2/ffd9b6cf2961.png,0,2388,3050
9672,ffdc682f7680_image,2729.27083,332.26044,1496.25016,2604.58334,a0cb0b96fb3d,ffdc682f7680,1,/workspace/data/train/a0cb0b96fb3d/ccf363aa080...,0,1,0,0,2,/workspace/data/train_640_2/ffdc682f7680.png,0,3488,4256
9673,ffdc682f7680_image,1005.81250,1584.67711,662.22913,775.83337,a0cb0b96fb3d,ffdc682f7680,1,/workspace/data/train/a0cb0b96fb3d/ccf363aa080...,0,1,0,0,2,/workspace/data/train_640_2/ffdc682f7680.png,0,3488,4256
9674,ffe942c8655f_image,208.86463,91.53448,450.96747,628.05473,7d82d53204b8,ffe942c8655f,1,/workspace/data/train/7d82d53204b8/431ee249d16...,0,1,0,0,1,/workspace/data/train_640_2/ffe942c8655f.png,0,1140,1387


In [4]:
# Display the current contents of `df`.
df

Unnamed: 0,id,dicom_path,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,cv
0,eeecfd50b220_study,/workspace/data/train/eeecfd50b220/0fd96597c55...,0,1,0,0,3
1,a89c1f3470e1_study,/workspace/data/train/a89c1f3470e1/c23cc53bdbf...,0,0,1,0,4
2,9cc2f64f0c58_study,/workspace/data/train/9cc2f64f0c58/2eebbe88278...,0,1,0,0,2
3,7b6c49da06db_study,/workspace/data/train/7b6c49da06db/c1fd5829f05...,0,1,0,0,2
4,d14080fd6f2a_study,/workspace/data/train/d14080fd6f2a/9d6098d515e...,0,1,0,0,0
...,...,...,...,...,...,...,...
6329,6bb38a2b98f0_study,/workspace/data/train/6bb38a2b98f0/70960f315ac...,0,1,0,0,4
6330,8501ffeadc53_study,/workspace/data/train/8501ffeadc53/0c640403081...,0,1,0,0,0
6331,c1ba4d912111_study,/workspace/data/train/c1ba4d912111/5d1e7b4f209...,0,1,0,0,4
6332,7e4059d6a0f9_study,/workspace/data/train/7e4059d6a0f9/ac80ee2496b...,1,0,0,0,2


In [9]:
# Sum the aggregated columns of every `cv` group and collapse the result to a
# single scalar.
# NOTE(review): presumably a sanity check that each row carries exactly one
# positive label (the total would then equal the row count) - confirm.
df.groupby('cv').sum().sum().sum()

6334

In [6]:
# Load the study-level label table.
df_study = pd.read_csv('/workspace/data/train_study_level.csv')

In [7]:
# Display the study-level label table.
df_study

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0
1,000c9c05fd14_study,0,0,0,1
2,00292f8c37bd_study,1,0,0,0
3,005057b3f880_study,1,0,0,0
4,0051d9b12e72_study,0,0,0,1
...,...,...,...,...,...
6049,ffcb4630f46f_study,0,1,0,0
6050,ffe4d6e8fbb0_study,0,1,0,0
6051,ffe94fcb14fa_study,0,1,0,0
6052,ffebf1ef4a9c_study,0,1,0,0
