In [None]:
# Install GDCM from conda-forge before importing pydicom.
# NOTE(review): presumably required so pydicom can decode compressed pixel data — confirm.
!conda install gdcm -c conda-forge -y
#!pip install pylibjpeg
#!conda install -c conda-forge pydicom -y

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import shutil
from PIL import Image

import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from tqdm.auto import tqdm

In [None]:
def read_pidicom(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as an 8-bit array.

    Parameters
    ----------
    path
        Location of the DICOM file; passed straight to ``pydicom.dcmread``.
    voi_lut : bool, default True
        When True the raw ``pixel_array`` is passed through ``apply_voi_lut``;
        when False the raw ``pixel_array`` is used as-is.
    fix_monochrome : bool, default True
        When True and the file's ``PhotometricInterpretation`` is
        ``"MONOCHROME1"``, the intensities are inverted (``max - data``).

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the same shape as the pixel data, rescaled so the
        smallest value maps to 0 and the largest to 255. An image whose pixels
        are all identical is returned as all zeros.
    """
    # dcmread is the current pydicom entry point; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)
    data = apply_voi_lut(dicom.pixel_array, dicom) if voi_lut else dicom.pixel_array

    # getattr: a missing PhotometricInterpretation is treated as "not MONOCHROME1".
    if fix_monochrome and getattr(dicom, "PhotometricInterpretation", None) == "MONOCHROME1":
        data = np.amax(data) - data

    # Normalize to [0, 1] and then stretch to the 8-bit range.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the peak would be 0/0 (NaN), so return
        # a well-defined all-black frame instead.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)

In [None]:
# Lanczos resampling is typically used to increase the sampling rate of a digital signal, or to shift it by a fraction of the sampling interval.
def resize(data, shape, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image scaled to ``shape``.

    Parameters
    ----------
    data
        Array accepted by ``Image.fromarray``.
    shape
        Target size, handed unchanged to PIL's ``resize`` / ``thumbnail``.
    keep_ratio : bool, default False
        False: call ``Image.resize`` with ``shape``.
        True: call ``Image.thumbnail``, which modifies the image in place.
    resample
        PIL resampling filter forwarded to either call.

    Returns
    -------
    PIL.Image.Image
        The rescaled image.
    """
    image = Image.fromarray(data)
    if not keep_ratio:
        return image.resize(shape, resample)
    # thumbnail() works in place and returns None, so hand back the image itself.
    image.thumbnail(shape, resample)
    return image

In [None]:

# Convert every DICOM file of each split to a 512x512 JPEG and collect
# per-image metadata (id, original pixel-array shape, split).
img_id=[]
shape=[]
splits=[]
for split in ['train','test']:
    dir_save=f'./{split}/'

    # Start from an empty output directory: drop the folder left by a previous
    # run, then create it. Unrelated OS errors (e.g. permissions) now propagate
    # instead of being swallowed by a bare except.
    if os.path.isdir(dir_save):
        shutil.rmtree(dir_save)
    os.mkdir(dir_save)

    for dir_nm, _, file_nm in tqdm(os.walk(f'../input/siim-covid19-detection/{split}'), desc=split):

        for f in file_nm:
            # Split off the extension so only the suffix is swapped; a plain
            # str.replace('dcm', ...) would also rewrite 'dcm' inside the stem.
            stem, ext = os.path.splitext(f)
            if ext.lower() != '.dcm':
                # Not a DICOM file: nothing to convert.
                continue

            # Build the file path and read the DICOM file
            data=read_pidicom(os.path.join(dir_nm, f))

            # Resize the data
            im=resize(data, shape=(512,512))
            # Save the converted file as .jpeg
            im.save(os.path.join(dir_save, stem + '.jpeg'))

            # Record the image id (file name without extension), the shape of
            # the array before resizing, and the split it belongs to
            img_id.append(stem)
            shape.append(data.shape)
            splits.append(split)
        

In [None]:
# zip the files
!tar -zcvf train.tar.gz  ./
!tar -zcvf test.tar.gz  ./

In [None]:
# The JPEG folders are not needed once they have been compressed; delete both.
for jpeg_dir in ('./train', './test'):
    shutil.rmtree(jpeg_dir)

In [None]:
# Gather the metadata lists filled during conversion into one table
# (one row per image) and write it to the working directory as CSV.
meta_columns = {'image_id': img_id, 'shape': shape, 'splits': splits}
img_meta = pd.DataFrame(meta_columns)
img_meta.to_csv('./img_meta.csv', index=False)

In [None]:
# Show the metadata table as this cell's output.
img_meta