In [None]:
import numpy as np 
import pandas as pd 
import pydicom,os,cv2
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
import warnings
warnings.simplefilter("ignore")
%matplotlib inline
print(os.listdir("../input"))

In [None]:
# Collect the DICOM file paths and load the RLE annotation table.
# (`glob` is already imported in the first cell.)
train_fns = glob('../input/siim-train-test/dicom-images-train/*/*/*.dcm')
train_fns.sort()
test_fns = glob('../input/siim-train-test/dicom-images-test/*/*/*.dcm')
test_fns.sort()

n_train_files, n_test_files = len(train_fns), len(test_fns)
print(f'Number of train dicom files in folder:{n_train_files}')
print(f'Number of test dicom files in folder:{n_test_files}')

df = pd.read_csv('../input/siim-train-test/train-rle.csv')
print(df.shape)

# Rows beyond the first for a given ImageId are additional annotations.
n_unique_images = df["ImageId"].nunique()
n_extra_rows = int(df.duplicated(subset=["ImageId"]).sum())
print(f'Total no of unique images in csv file: {n_unique_images}')
print(f'Images with duplicate Encoded pixels, ie multiple annotations:{n_extra_rows}')

df.head()

In [None]:

# Pair every training DICOM with its annotation rows and collect its metadata.

# Index the annotations once (ImageId -> list of RLE strings, in CSV order)
# instead of filtering the whole frame for every image.
rles_by_image = df.groupby('ImageId')[' EncodedPixels'].apply(list).to_dict()

missing = 0      # DICOM files with no row in the CSV
multiple = 0     # images carrying more than one annotation row
incomplete = 0   # DICOM files lacking one of the metadata tags read below
patients_data = []
for path in train_fns:
    # The CSV ImageId is the DICOM file name without its extension.
    image_id = os.path.splitext(os.path.basename(path))[0]
    rles = rles_by_image.get(image_id)
    if not rles:
        missing += 1
        continue
    if len(rles) > 1:
        multiple += 1

    # Negative studies are marked with -1. The column header has a leading
    # space, so tolerate the same padding in the values. Negatives are stored
    # as the plain string '-1'; positives keep the list of RLE strings.
    pixels = '-1' if str(rles[0]).strip() == '-1' else rles

    # Only header tags are needed here, so skip decoding the pixel data.
    data = pydicom.dcmread(path, stop_before_pixels=True)
    try:
        patient = {
            "UID": data.SOPInstanceUID,
            "EncodedPixels": pixels,
            "Age": data.PatientAge,
            "Sex": data.PatientSex,
            "Modality": data.Modality,
            "BodyPart": data.BodyPartExamined,
            "ViewPosition": data.ViewPosition,
            "filepath": path,
        }
    except AttributeError:
        # A required tag is absent: skip the file (as before), but do not
        # report it as "not present in csv".
        incomplete += 1
        continue
    patients_data.append(patient)

print(f'We have {missing} dicom in folder which is not present in csv')
print(f'There are {multiple} images with more than 1 annotation')
if incomplete:
    print(f'Skipped {incomplete} dicom with missing metadata tags')
df_patients = pd.DataFrame(patients_data, columns=["UID", "EncodedPixels", "Age", 
                            "Sex", "Modality", "BodyPart", "ViewPosition", "filepath"])

# Label: 0 when EncodedPixels is the '-1' marker string, 1 when it is a list
# of RLE strings.
df_patients['Pneumothorax'] = (
    df_patients['EncodedPixels']
    .apply(lambda rle: 0 if isinstance(rle, str) and rle == '-1' else 1)
    .astype('int')
)
print("images with labels: ", df_patients.shape[0])
df_patients.head()  

In [None]:
def rle2mask(rles, width, height):
    """Decode one or more relative run-length encodings into a single mask.

    Each RLE is a whitespace-separated string of ``offset length`` pairs.
    Every offset is counted from the end of the previous run (the first one
    from position 0) over the flattened image. All runs from all RLEs are
    painted into the same mask.

    input:
        rles: a list of RLE strings, or a single RLE string. Entries that are
            empty or equal to ``-1`` (the "no annotation" marker) add nothing.
        width, height: dimensions used to size and reshape the mask.
    returns:
        float64 array of shape (height, width): 255 inside runs, 0 elsewhere.
    raises:
        ValueError: if an RLE holds a non-integer token or an odd number of
            values.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(rles, str):
        rles = [rles]

    mask = np.zeros(width * height)
    for rle in rles:
        values = [int(token) for token in rle.split()]
        if not values or values == [-1]:
            continue
        if len(values) % 2:
            raise ValueError(
                f'RLE must contain offset/length pairs, got {len(values)} values')

        position = 0
        for offset, length in zip(values[0::2], values[1::2]):
            position += offset
            mask[position:position + length] = 255
            position += length

    # The runs index a flattened (width, height) layout; transpose so the
    # result comes out as (height, width).
    return mask.reshape(width, height).T

In [None]:
from PIL import Image
import shutil

train_img_dir = '/kaggle/working/out/train/images/'
test_img_dir = '/kaggle/working/out/test/images/'
train_msk_dir = '/kaggle/working/out/train/masks/'

# Create each output folder (and its parents) unless it is already there.
for out_dir in (train_img_dir, test_img_dir, train_msk_dir):
    if os.path.exists(out_dir):
        continue
    os.makedirs(out_dir)

In [None]:
# Convert every test DICOM into a PNG that keeps the same base file name.
for dicom_path in tqdm(test_fns):
    png_name = os.path.basename(dicom_path).replace('.dcm','.png')
    pixels = pydicom.dcmread(dicom_path).pixel_array
    Image.fromarray(pixels).save(test_img_dir + png_name)

In [None]:
# Export every labelled training image as a PNG, plus a mask PNG for each
# image that carries at least one annotation.
for image_path, encoding in tqdm(
        zip(df_patients['filepath'], df_patients['EncodedPixels']),
        total=df_patients.shape[0]):
    png_name = os.path.basename(image_path).replace('.dcm','.png')
    img = pydicom.dcmread(image_path).pixel_array
    Image.fromarray(img).save(train_img_dir + png_name)

    # EncodedPixels is either a single marker string or a list of RLE strings.
    # Drop the -1 "no annotation" marker (with or without padding). Images
    # left with no RLE get no mask file, which is what the previous
    # try/except produced for them as well.
    rles = [encoding] if isinstance(encoding, str) else list(encoding)
    rles = [rle for rle in rles if rle.strip() != '-1']
    if not rles:
        continue

    try:
        mask = rle2mask(rles, img.shape[0], img.shape[1])
    except (ValueError, IndexError) as err:
        # Malformed annotation: report it rather than silently dropping it.
        print(f'Skipping mask for {png_name}: bad RLE ({err})')
        continue

    # rle2mask returns a float array, which Pillow maps to mode "F"; PNG
    # cannot store that mode, so the save used to fail and was swallowed by
    # a bare except. The mask only holds 0/255, so uint8 is lossless.
    Image.fromarray(mask.astype(np.uint8)).save(train_msk_dir + png_name)

In [None]:
# Change the working directory so the relative paths used when building the
# archive ('out' and 'image_masks.zip') resolve under /kaggle/working.
os.chdir('/kaggle/working')

In [None]:
import os
import zipfile
    
def zipdir(path, ziph):
    """Add every file found under ``path`` to the open zip handle ``ziph``.

    Entries are named relative to the parent of ``path``, so the archive
    keeps the last component of ``path`` as its top-level folder.
    """
    archive_base = os.path.join(path, '..')
    for folder, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            source = os.path.join(folder, filename)
            ziph.write(source, os.path.relpath(source, archive_base))
      
# Pack the exported 'out' tree into image_masks.zip. The context manager
# closes (and thereby finalises) the archive even if zipdir raises part-way.
with zipfile.ZipFile('image_masks.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir('out', zipf)

In [None]:
from IPython.display import FileLink
# Display a link to image_masks.zip; the path is relative, so it is resolved
# against the current working directory.
FileLink(r'image_masks.zip')