# Install Library

> **dicomsdl is a Python library that we use to convert DICOM images into PNG images**

In [1]:
from IPython.display import clear_output

!pip -q install dicomsdl

clear_output()

**Necessary Libraries**

In [2]:
# Lib for basic operations
import os
import glob
from joblib import Parallel, delayed
from tqdm import tqdm
from pathlib import Path
import shutil

# lib for image processing operations
import cv2
import dicomsdl as dicom

# lib for arithmetic and data processing operation
import pandas as pd
import numpy as np

# lib for tensorflow and keras processing
import tensorflow as tf

# Seed NumPy's global random number generator with a fixed value
seed_number = 31415
np.random.seed(seed_number)

# Create basic configuration class
**This configuration class holds the project's input/output paths and image settings**

In [3]:
# Candidate output sizes for the cropped images, written as (height, width)
resize_dim = [(1024,512), (512,256)]

class ProjectConfiguration:
    """Paths and image settings shared by the cells of this notebook.

    Input paths point into the competition dataset directory, output paths into
    the working directory. Every path is stored as a plain string.
    """

    def __init__(self):
        # input path conf
        self.rootPath = "/kaggle/input/rsna-breast-cancer-detection"
        self.train_image_path = self.rootPath + "/train_images"
        self.test_image_path = self.rootPath + "/test_images"
        self.train_metainfo_path = self.rootPath + "/train.csv"
        # Misspelled original attribute name, kept as an alias so code that reads it keeps working
        self.traing_metainfo_path = self.train_metainfo_path
        self.test_metainfo_path = self.rootPath + "/test.csv"

        # output path conf
        self.outputPath = "/kaggle/working"
        self.train_image_outputpath = self.outputPath + "/train_images"
        self.test_image_outputpath = self.outputPath + "/test_images"

        # image size conf
        self.image_width = 1024
        self.image_height = 1024
        # size of the final cropped image, (height, width)
        self.resize_dimension=resize_dim[0]
        self.image_channel = 1

        # image extensions
        self.input_image_extension = ".dcm"
        self.output_image_extension = ".png"

        # fraction of the images to sample (0.15 = 15%)
        # NOTE(review): this comment used to say 10% while the value is 0.15 - confirm which one is intended
        self.sample_size = 0.15

config = ProjectConfiguration()

**Check the total number of left and right images**
* CHECK LEFT AND RIGHT IMAGE COUNT
* CHECK PATIENT_ID COUNT
* CHECK IMAGE_ID COUNT

In [None]:
# CHECK TOTAL COUNT OF LEFT AND RIGHT IMAGE
def check_lr_image(dataset):
    """Print how many rows of ``dataset`` are left ('L') and right ('R') images.

    Args:
        dataset: DataFrame with a "laterality" column.

    Returns:
        None. Two lines are printed, one per side.
    """
    laterality = dataset["laterality"]

    # Vectorized comparison instead of a per-row lambda; missing values compare as False
    left_count = (laterality == 'L').sum()
    right_count = (laterality == 'R').sum()

    print("left image: ", left_count)
    print("right image: ", right_count)

# CHECK TOTAL COUNT OF PATIENT ID
def check_count_of_patient_id(dataset):
    """Print the number of distinct values in the "patient_id" column of ``dataset``."""
    # dropna=False counts a missing id as one distinct value, exactly as unique() would
    distinct_patients = dataset['patient_id'].nunique(dropna=False)
    print("Total Patient_ID is: ", distinct_patients)

#CHECK TOTAL COUNT OF IMAGE_ID
def check_count_of_image_id(dataset):
    """Print the number of non-missing entries in the "image_id" column of ``dataset``."""
    image_ids = dataset['image_id']
    non_missing = image_ids.notna().sum()
    print("Total Image_ID is: ", non_missing)

# GET SAMPLE SIZE 
def get_sample_size(total_size):
    """Return how many items to sample out of ``total_size``.

    The fraction comes from ``config.sample_size``; ``int`` truncates the product.
    """
    fraction = config.sample_size
    return int(total_size * fraction)

# Load Metainfo file

In [4]:
# Read the TRAIN and TEST metainfo tables (in that order) from the configured CSV paths
TRAIN_DATASET, TEST_DATASET = (
    pd.read_csv(csv_path)
    for csv_path in (config.traing_metainfo_path, config.test_metainfo_path)
)

**Add two new columns to both the train and the test metadata**
1. input_image_path: the path of the source **dcm** image
1. output_image_path: the path where the PNG image produced **after ROI extraction** is saved

In [None]:
# For the train and then the test metadata, derive two path columns per row:
#   input_image_path  -> <input root>/<patient_id>/<image_id><input extension>
#   output_image_path -> <output root>/<patient_id>/<image_id><output extension>
for metadata, input_root, output_root in (
    (TRAIN_DATASET, config.train_image_path, config.train_image_outputpath),
    (TEST_DATASET, config.test_image_path, config.test_image_outputpath),
):
    # "<patient_id>/<image_id>" is shared by the input and the output path
    relative_stem = metadata["patient_id"].astype(str) + "/" + metadata["image_id"].astype(str)

    metadata["input_image_path"] = str(input_root) + "/" + relative_stem + str(config.input_image_extension)
    metadata["output_image_path"] = str(output_root) + "/" + relative_stem + str(config.output_image_extension)

In [5]:

# Write both metadata frames as CSV files (index column included) into the current working directory
for metadata_frame, csv_name in ((TRAIN_DATASET, 'TrainDataset.csv'), (TEST_DATASET, 'TestDataset.csv')):
    metadata_frame.to_csv(csv_name)

**Save Updated TRAIN, TEST CSV into output directory**

In [None]:
# Display the (rows, columns) shape of the train metadata frame
TRAIN_DATASET.shape

In [None]:
#check_lr_image(TRAIN_CANCER_YES)
#check_count_of_patient_id(TRAIN_CANCER_YES)
#check_count_of_image_id(TRAIN_CANCER_YES)

# CONVERT IMAGE [DCM TO PNG]
**The following three functions respectively are responsible for:**
1. Converting from dcm image to png image
2. Extracting roi (region of interest) from the image
3. Saving images inside the output dir

In [None]:
# CONVERT DCM TO PNG
def convert_dcm_to_img(image_path):
    """Read a DICOM file and return its pixels as an 8-bit image.

    The pixel data is min-max scaled to [0, 1], inverted when the file's
    PhotometricInterpretation is MONOCHROME1, resized to
    ``config.image_width`` x ``config.image_height`` and rescaled to 0..255.

    Args:
        image_path: path of the ``.dcm`` file to read.

    Returns:
        ``np.uint8`` array; for a single 2-D frame its shape is
        (config.image_height, config.image_width).
    """
    dicom_image = dicom.open(image_path)
    pixels = dicom_image.pixelData(storedvalue=False)

    # Min-max normalise to [0, 1]. A constant image has zero range; dividing by it
    # would fill the array with NaNs, so such an image becomes all zeros instead.
    pixels = pixels - np.min(pixels)
    peak = np.max(pixels)
    if peak > 0:
        pixels = pixels / peak
    else:
        pixels = np.zeros(pixels.shape, dtype=np.float64)

    # MONOCHROME1 renders low values as bright, so flip it to match the other files
    if dicom_image.PhotometricInterpretation == 'MONOCHROME1':
        pixels = 1.0 - pixels

    # cv2.resize expects the target size as (width, height)
    pixels = cv2.resize(pixels, (config.image_width, config.image_height), interpolation=cv2.INTER_LINEAR)
    return (pixels * 255).astype(np.uint8)

# CONVERT PNG TO ROI
def convert_png_to_roi(image):
    """Crop a grayscale image to its largest bright region and resize the crop.

    Pixels brighter than 20 count as foreground. The bounding box of the
    largest external foreground contour is cut out and resized to
    ``config.resize_dimension``, which is stored as (height, width).

    Args:
        image: single-channel image; expected to be the ``np.uint8`` output of
            ``convert_dcm_to_img``.

    Returns:
        The cropped region resized to ``config.resize_dimension``. When the
        image contains no foreground at all, the whole image is resized instead.
    """
    bin_image = cv2.threshold(image, 20, 255, cv2.THRESH_BINARY)[1]
    contours, _ = cv2.findContours(bin_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    if len(contours) > 0:
        largest = max(contours, key=cv2.contourArea)
        # boundingRect returns (x, y, width, height) with x = column and y = row.
        # The box includes the last row/column, so even a one-pixel-wide contour
        # gives a non-empty crop (a half-open min:max slice would drop that pixel).
        x, y, box_width, box_height = cv2.boundingRect(largest)
        roi = image[y:y + box_height, x:x + box_width]
    else:
        # Nothing above the threshold (e.g. a blank image): fall back to the full frame
        roi = image

    # resize_dimension is (height, width) while cv2.resize wants (width, height)
    return cv2.resize(roi, config.resize_dimension[::-1], interpolation=cv2.INTER_LINEAR)

# SAVE IMAGE IN OUTPUT DIRECTORY
def save_image(input_path, output_path):
    """Convert one DICOM file to a cropped image and write it to ``output_path``.

    Args:
        input_path: path of the source ``.dcm`` file.
        output_path: path of the image file to write; missing parent
            directories are created.

    Raises:
        IOError: if OpenCV reports that the image could not be written.
    """
    PNG_IMAGE = convert_dcm_to_img(input_path)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        # A bare file name has an empty directory part, and os.makedirs("") raises
        os.makedirs(output_dir, exist_ok=True)

    ROI_IMAGE = convert_png_to_roi(PNG_IMAGE)

    # cv2.imwrite reports a failed write through its return value instead of raising
    if not cv2.imwrite(output_path, ROI_IMAGE):
        raise IOError("Could not write image to " + str(output_path))

**Remove previously created output**

In [None]:
#shutil.rmtree(config.train_image_outputpath)
#shutil.rmtree(config.test_image_outputpath)

**Save TEST Images**

In [None]:
%%time
# Convert every TEST image on 4 worker threads.
# zip() has no length, so tqdm gets the total explicitly and can show real progress.
test_path_pairs = zip(TEST_DATASET["input_image_path"], TEST_DATASET["output_image_path"])
test_output = Parallel(n_jobs=4, backend='threading')(
    delayed(save_image)(input_path, output_path)
    for input_path, output_path in tqdm(test_path_pairs, total=len(TEST_DATASET), leave=True, position=0)
)

**Save TRAIN Images**

In [None]:
%%time
# Convert every TRAIN image on 4 worker threads.
# zip() has no length, so tqdm gets the total explicitly and can show real progress.
train_path_pairs = zip(TRAIN_DATASET["input_image_path"], TRAIN_DATASET["output_image_path"])
train_output = Parallel(n_jobs=4, backend='threading')(
    delayed(save_image)(input_path, output_path)
    for input_path, output_path in tqdm(train_path_pairs, total=len(TRAIN_DATASET), leave=True, position=0)
)

In [None]:

# Copy the competition's sample submission file into the working directory
!cp /kaggle/input/rsna-breast-cancer-detection/sample_submission.csv /kaggle/working/