<a href="https://colab.research.google.com/github/Niccolo-Rocchi/Medical_Imaging_project/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Semantic Segmentation on the Pneumothorax Dataset


---


Task: Binary Mask Prediction (Pneumothorax Area)

Pneumothorax Challenge: https://siim.org/page/pneumothorax_challenge

*Authors: Pirola Federico, Rocchi Niccolò*

*Università degli studi di Milano-Bicocca, Milan, Italy*

# Data Preprocessing

## Imports

In [None]:
%%capture 
!pip install pydicom

In [None]:
# For reading files
from pydicom import dcmread 
import glob
from google.colab import drive
# For dealing with data
import numpy as np
import pandas as pd
import re
import random
random.seed(123)
import os
# Sklearn utils
from sklearn.model_selection import train_test_split

In [None]:
# Mount Drive data
# Google Drive is mounted at '/content/drive/' and the working directory is moved to the
# project data folder, so the relative paths used by later cells
# ('./encoded_pixels.csv', 'train_set.csv', 'val_set.csv') resolve inside that folder.
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/pneumotorax_data

Mounted at /content/drive/
/content/drive/.shortcut-targets-by-id/1YbGUVvpEjZVVfuIQhnDIUOGgMhN2UDrk/pneumotorax_data


## Train and Validation Sets Creation

In [None]:
# Read .csv file: one row per annotation, with an 'ImageId' column and, as second column,
# the encoded mask (either a string containing '-1' or a whitespace-separated list of integers).
myDB = pd.read_csv('./encoded_pixels.csv')

# Add patient variables, initialised with placeholders. Rows whose DICOM file is not
# found keep these placeholders and HasFile == False.
myDB['Health'] = np.zeros(len(myDB), dtype=bool)
myDB['Age'] = np.zeros(len(myDB), dtype=np.int32)
myDB['PneumothArea'] = np.zeros(len(myDB), dtype=np.float32)
myDB['Sex'] = 'N/A'
myDB['HasFile'] = False
myDB['Position'] = 'N/A'

# Folder expected to contain one '<ImageId>.dcm' file per row.
# NOTE(review): Drive is mounted at '/content/drive/' in this notebook, whereas this path
# starts with '/content/gdrive/' - confirm that it points to the intended location.
DICOM_DIR = '/content/gdrive/MyDrive/Med Imaging/train_whole/dicom_files'
if not os.path.isdir(DICOM_DIR):
    # Without this check every row would be skipped silently and the dataset would end up empty.
    raise FileNotFoundError(
        "DICOM folder not found: " + DICOM_DIR + " (check the Drive mount point and the path)"
    )

for rowIdx, pat in myDB.iterrows():
    fName = os.path.join(DICOM_DIR, pat['ImageId'] + '.dcm')
    if not os.path.isfile(fName):
        continue

    # Only header attributes are used below, so the pixel data does not need to be read.
    dcmHead = dcmread(fName, stop_before_pixels=True)
    myDB.at[rowIdx, 'Sex'] = dcmHead.PatientSex
    myDB.at[rowIdx, 'Age'] = int(dcmHead.PatientAge)
    myDB.at[rowIdx, 'HasFile'] = True
    myDB.at[rowIdx, 'Position'] = dcmHead.ViewPosition

    # Second column of the table, accessed by position (.iloc) rather than by the
    # deprecated integer-key lookup on a label-indexed Series.
    rle = pat.iloc[1]
    if '-1' in rle:
        # '-1' marks an image without a mask: the patient is flagged as healthy.
        myDB.at[rowIdx, 'Health'] = True
    else:
        rleList = np.array([int(s) for s in rle.split()])
        # Every second value starting from index 1 - presumably the run lengths of the
        # RLE encoding (TODO confirm); their sum is the number of mask pixels.
        nPix = rleList[1::2]
        # Mask area = number of pixels * area of one pixel (product of the PixelSpacing values).
        # Stored in 'PneumothArea', the column declared above; the explicit float32 cast
        # matches that column's dtype.
        myDB.at[rowIdx, 'PneumothArea'] = np.float32(np.sum(nPix) * np.prod(dcmHead.PixelSpacing))

In [None]:
# Keep only rows whose DICOM file was found and whose recorded age is below 100
# (i.e. patients with age >= 100 are cut off).
# The two conditions are combined into a single boolean mask instead of chaining two
# selections, and .copy() makes the result an independent DataFrame so that later cells
# can modify it without touching (or being warned about) myDB.
encoded_pixels = myDB.loc[myDB['HasFile'] & (myDB['Age'] < 100)].copy()

In [None]:
# Find by how many pts healthy vs unhealthy the dataset differ
# (diff > 0: more healthy rows; diff < 0: more unhealthy rows)
health_flag = encoded_pixels["Health"].astype(bool)
diff = int(health_flag.sum()) - int((~health_flag).sum())

# Select the surplus rows of the larger class in a random manner and drop them.
# Handling both signs of `diff` matters: with a negative value, slicing `[:diff]` would
# drop all but the last |diff| rows instead of |diff| rows.
majority_is_healthy = diff >= 0
majority_idx = list(encoded_pixels.index[health_flag == majority_is_healthy])
# Dedicated generator with the notebook's seed, so re-running this cell is reproducible
# regardless of how the global `random` state has been used in the meantime.
random.Random(123).shuffle(majority_idx)
encoded_pixels = (
    encoded_pixels
    .drop(index=majority_idx[:abs(diff)])
    .reset_index(drop=True)
)

# Assert: both classes now have the same number of rows
balanced_flag = encoded_pixels["Health"].astype(bool)
assert int(balanced_flag.sum()) == int((~balanced_flag).sum()), "classes are not balanced"

In [None]:
# 80/20 train/validation split with a fixed random state, stratified jointly on
# Health condition, Sex and Position during RX acquisition.
train_set, val_set = train_test_split(
    encoded_pixels,
    train_size=0.8,
    shuffle=True,
    random_state=123,
    stratify=encoded_pixels[['Health', 'Sex', 'Position']],
)

# Sanity check: no row index may appear in both subsets
overlap = [idx for idx in val_set.index if idx in train_set.index]
assert overlap == []

In [None]:
# Make size compatible with batches: keep the largest number of leading rows that is a
# multiple of max_batch_size.
max_batch_size = 32

# The number of rows to keep is computed explicitly. Slicing with `[:-remainder]` would
# return an EMPTY frame whenever the remainder is 0 (because `[:-0]` is `[:0]`).
n_train_keep = len(train_set) - len(train_set) % max_batch_size
n_val_keep = len(val_set) - len(val_set) % max_batch_size

# Trim and reset indices (new frames are created instead of modifying slices in place)
train_set = train_set.iloc[:n_train_keep].reset_index(drop=True)
val_set = val_set.iloc[:n_val_keep].reset_index(drop=True)

In [None]:
# Remove the auxiliary patient columns, which are not needed during training,
# from both subsets.
DeleteList = ['Health','Age','PneumothArea','Sex','HasFile','Position']
for split_frame in (train_set, val_set):
    split_frame.drop(columns=DeleteList, inplace=True)

In [None]:
# Write both subsets as CSV files; the file names are relative, so the files are created
# in the current working directory.
for csv_name, split_frame in (("train_set.csv", train_set), ("val_set.csv", val_set)):
    split_frame.to_csv(csv_name)