<a href="https://colab.research.google.com/github/Niccolo-Rocchi/Medical_Imaging_project/blob/main/data_upload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
%%capture 
!pip install pydicom

In [None]:
# For reading files
from pydicom import dcmread 
import glob
from google.colab import drive
# For dealing with data
import numpy as np
import pandas as pd
import re
import random
random.seed(123)
# For plots
import matplotlib.pyplot as plt
# For nets utils
import tensorflow as tf

In [None]:
# Mount Drive data
# The dataset lives on Google Drive: mount it under /content/drive/ and make the
# dataset folder the working directory, because the cells below use paths
# relative to it ('./encoded_pixels.csv' and './dicom_files/').
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/pneumotorax_data

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/DS Lab in Medicine - projects/Medical Imaging - project/pneumotorax_data


## Data upload

In [None]:
# Read patients IDs and their encoded pixels
encoded_pixels = pd.read_csv('./encoded_pixels.csv')
# Read dicom files IDs. Only "*.dcm" entries are listed and non-matching paths
# are skipped, so a stray file or sub-folder inside ./dicom_files can no longer
# make the ID extraction fail with an IndexError.
dicom_id_pattern = re.compile(r"\./dicom_files/(.+)\.dcm")
dicom_files_IDs = pd.DataFrame({
    "dicom_ID": [
        id_match.group(1)
        for id_match in map(dicom_id_pattern.search, sorted(glob.glob('./dicom_files/*.dcm')))
        if id_match is not None
    ]
})
# Intersect patients IDs with dicom IDs: keep only the annotations whose image
# is actually available on disk
encoded_pixels = pd.merge(encoded_pixels, dicom_files_IDs, how="inner", left_on="ImageId", right_on="dicom_ID")
# Split every run-length string into its list of tokens.
# NOTE: the column name is used exactly as this notebook reads it, i.e. with a
# leading space (" EncodedPixels").
encoded_pixels[" EncodedPixels"] = encoded_pixels[" EncodedPixels"].str.split()
# Mark healthy patients as "0": their encoding consists of a single token,
# every other patient is marked as "1"
encoded_pixels["Health"] = encoded_pixels[" EncodedPixels"].str.len().ne(1).astype(int)

In [None]:
# Find by how many patients the two classes differ (healthy minus unhealthy)
healthy_idx = list(encoded_pixels[encoded_pixels["Health"] == 0].index)
unhealthy_idx = list(encoded_pixels[encoded_pixels["Health"] == 1].index)
diff = len(healthy_idx) - len(unhealthy_idx)
# Pick the over-represented class. Slicing the healthy indices with a negative
# `diff` would drop the wrong rows, so the surplus is always taken from the
# larger class and its size is |diff|.
surplus_idx = healthy_idx if diff >= 0 else unhealthy_idx
# Shuffle with a dedicated, seeded generator: re-running this cell always
# selects the same rows, regardless of how much the global `random` state has
# already been consumed.
random.Random(123).shuffle(surplus_idx)
# Drop the surplus rows (nothing is dropped when the classes are already balanced)
encoded_pixels = encoded_pixels.drop(index=surplus_idx[:abs(diff)])
# Assert that both classes now have the same number of rows
assert (encoded_pixels["Health"] == 0).sum() == (encoded_pixels["Health"] == 1).sum(), \
    "healthy and unhealthy classes are not balanced"

In [None]:
# Give the balanced dataset a fresh 0..n-1 index
encoded_pixels = encoded_pixels.reset_index(drop=True)
# Fraction of the rows that goes to the training set
train_frac = 0.7
# Training set: a reproducible random sample of the rows
train_set = encoded_pixels.sample(frac=train_frac, random_state=123)
# Validation set: every remaining row, shuffled
val_set = (
    encoded_pixels
    .loc[~encoded_pixels.index.isin(train_set.index)]
    .sample(frac=1, random_state=1)
)
# The two sets must not share any row
assert val_set.index.intersection(train_set.index).empty

In [None]:
# RLE to mask conversion provided by competition organizers with the dataset.
def rle2mask(rle, width, height):
    """Decode a relative run-length encoding into a 2-D mask.

    Parameters
    ----------
    rle : sequence of int-like tokens, or str
        Alternating ``offset, length`` tokens. Every offset is counted from the
        end of the previous run (the first one from position 0). A string is
        split on whitespace first. An empty sequence, or the single token
        ``-1``, means that there is no run at all.
    width, height : int
        The flat mask has ``width * height`` elements.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(width, height)`` obtained by reshaping the flat mask
        in column-major order (``order='F'``); it holds 255 inside the runs and
        0 elsewhere.

    Raises
    ------
    ValueError
        If the encoding has an odd number of tokens (other than the lone
        ``-1`` marker), i.e. an offset without its length.
    """
    if isinstance(rle, str):
        rle = rle.split()
    values = [int(x) for x in rle]

    mask = np.zeros(width * height)

    # No annotated region: return the empty mask instead of failing while
    # looking for the length paired with the "-1" marker.
    if not values or values == [-1]:
        return mask.reshape(width, height, order='F')

    if len(values) % 2 != 0:
        raise ValueError(
            f"Malformed RLE: expected (offset, length) pairs, got {len(values)} tokens"
        )

    current_position = 0
    for start, length in zip(values[0::2], values[1::2]):
        # Offsets are relative, so the cursor keeps moving forward
        current_position += start
        mask[current_position:current_position + length] = 255
        current_position += length

    return mask.reshape(width, height, order='F')

In [None]:
# Create input for Keras' fit_generator function
class DataGenerator:
  """Builds batched (image, mask) generators from the train / validation tables.

  `data_generator` yields single samples read from ./dicom_files; the
  `train_generator` and `val_generator` methods wrap it in a `tf.data.Dataset`
  and yield batches built from the notebook-level `train_set` and `val_set`.
  """

  # Side length (pixels) of the square masks decoded from the run-length encodings
  SOURCE_SIZE = 1024
  # Side length (pixels) of the images and masks that are yielded
  TARGET_SIZE = 256

  # Method that yields (image, mask) tuple
  def data_generator(self, data):
    """Yield one (image, mask) pair per readable row of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
      Must hold an "ImageId" column and the encoded pixels, either under
      " EncodedPixels" (the name used by this notebook, leading space
      included) or under "EncodedPixels".

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
      The image as float64 of shape (TARGET_SIZE, TARGET_SIZE, 1) and the mask
      as int64 of shape (TARGET_SIZE, TARGET_SIZE) with values 0 / 255.
      Rows whose DICOM file cannot be read are skipped with a warning.
    """
    rle_column = " EncodedPixels" if " EncodedPixels" in data.columns else "EncodedPixels"
    target_size = (self.TARGET_SIZE, self.TARGET_SIZE)
    # Iterating over the rows directly guarantees progress even when a file is
    # skipped (a manual counter combined with `continue` would loop forever).
    for image_id, rle in zip(data["ImageId"], data[rle_column]):
      # Read the image associated to ImageId
      try:
        dcm_file = dcmread(f"./dicom_files/{image_id}.dcm")
      except Exception as err:
        # Best effort: an unreadable file must not stop the whole generator
        tf.get_logger().warning("Skipping %s: %s", image_id, err)
        continue
      dcm_image = dcm_file.pixel_array
      # Convert encoded pixels to mask. A single token marks a healthy patient
      # (same rule used to build the "Health" column): the mask is empty.
      tokens = rle.split() if isinstance(rle, str) else list(rle)
      if len(tokens) == 1:
        mask = np.zeros((self.SOURCE_SIZE, self.SOURCE_SIZE))
      else:
        mask = rle2mask(tokens, self.SOURCE_SIZE, self.SOURCE_SIZE)
      # Rescale image.
      # NOTE(review): dividing by 255 assumes 8-bit pixel data - TODO confirm.
      image = np.asarray(dcm_image, dtype=np.float64) / 255
      # Expand dimension: resizing needs an explicit channel axis.
      # NOTE(review): assumes a single-channel 2-D pixel array - TODO confirm.
      image = image[..., np.newaxis]
      mask = mask[..., np.newaxis]
      # Resize image and mask. Nearest-neighbour keeps the mask strictly 0 / 255,
      # bilinear interpolation would create intermediate values on the borders.
      image = tf.image.resize(image, target_size, method="bilinear")
      mask = tf.image.resize(mask, target_size, method="nearest")
      yield (
        image.numpy().astype(np.float64),
        mask.numpy()[..., 0].astype(np.int64),
      )

  # Helper shared by the train and validation generators
  def _batch_generator(self, data, batch_size, epochs):
    """Yield (batch_images, batch_masks) tensors built from `data`.

    The samples are repeated `epochs` times and then grouped in batches of
    `batch_size` elements (the last batch can be smaller). The generator ends
    when the data is exhausted.
    """
    size = self.TARGET_SIZE
    # `from_generator` needs a callable that returns a fresh generator, so that
    # the data can be traversed once per epoch.
    dataset = tf.data.Dataset.from_generator(
      lambda: self.data_generator(data),
      output_signature=(
        tf.TensorSpec(shape=(size, size, 1), dtype=tf.float64),
        tf.TensorSpec(shape=(size, size), dtype=tf.int64),
      ),
    )
    # Create epochs, then batches
    dataset = dataset.repeat(epochs).batch(batch_size)
    # A TF2 dataset is directly iterable: no one-shot iterator is required
    for batch_images, batch_masks in dataset:
      yield (batch_images, batch_masks)

  # Method used to train the net, i.e. to generate training set
  def train_generator(self, total_items, batch_size, epochs):
    """Yield training batches taken from the notebook-level `train_set`.

    `total_items` is not used; it is kept so that existing calls keep working.
    """
    yield from self._batch_generator(train_set, batch_size, epochs)

  # Method used to generate validation set
  def val_generator(self, total_items, batch_size, epochs):
    """Yield validation batches taken from the notebook-level `val_set`.

    `total_items` is not used; it is kept so that existing calls keep working.
    """
    yield from self._batch_generator(val_set, batch_size, epochs)

## References

1. https://towardsdatascience.com/medical-image-dataloaders-in-tensorflow-2-x-ee5327a4398f
2. https://stackoverflow.com/questions/55375416/tensorflow-model-fit-using-a-dataset-generator
3. https://faroit.com/keras-docs/1.2.0/models/model/
4. https://www.tensorflow.org/api_docs/python/tf/keras/layers/Rescaling
5. https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=nightly#from_generator