
# <span style="color:teal"> RSNA Screening Mammography Breast Cancer Detection<a class="anchor"  id="projectTopic"></a></span>
### <span style="color:teal"> Detect breast cancers in screening mammograms <a class="anchor"  id="detect"></a></span>

# <span style="color:teal">Developed by : <a class="anchor"  id="detect"></a></span>

* [Gebreyowhans Hailekiros](https://www.kaggle.com/gebreyowhansbahre/)
<!-- * [Mahbub Hasan](https://www.kaggle.com/mahbubulhasan/) -->
* [Muhammad Danish Sadiq](https://www.kaggle.com/muhammaddanishsadiq/)

* This notebook runs inference with the models trained in [RSNA_BCD_Train[TPU_VM]_EfficientNet](https://www.kaggle.com/code/gebreyowhansbahre/rsna-bcd-train-tpu-vm-bc86d1) on unseen test data and writes the competition submission. Since two models were trained, the final prediction is the average of the predictions of the two models.

# <span style="color:teal"> Notebooks <a class="anchor"  id="notebooks"></a></span>
* Image preprocessing Notebook: [RSNA_BCD_DICOM_PNG_ROI](https://www.kaggle.com/code/gebreyowhansh/rsna-bcd-dicom-png-roi)

* Training Notebook: [RSNA_BCD_Train[TPU_VM]_EfficientNet](https://www.kaggle.com/code/gebreyowhansbahre/rsna-bcd-train-tpu-vm-bc86d1)

* Test Notebook: [RSNA-BCD-GPU-TEST_EfficientNet](https://www.kaggle.com/code/gebreyowhansbahre/rsna-bcd-gpu-test/edit/run/128324368)

 

# <span style="color:teal">1. Importing and installing libraries <a class="anchor"  id="libraries"></a></span>

In [None]:
from IPython.display import clear_output
# Upgrade pip quietly, then wipe whatever output the command printed.
!pip install -qU --upgrade pip
clear_output()
# Install the DICOM-related packages from wheel files stored in the attached
# `whl-files` dataset (local paths instead of package names).
# NOTE(review): presumably done so the notebook can run without internet access - confirm.
# NOTE(review): the wheel file names carry the `cp37` tag, i.e. they target CPython 3.7.
!pip install -qU /kaggle/input/whl-files/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
!pip install -qU /kaggle/input/whl-files/pylibjpeg_libjpeg-1.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl/pylibjpeg_libjpeg-1.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -qU /kaggle/input/whl-files/python_gdcm-3.0.20-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl/python_gdcm-3.0.20-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
import os, random, cv2, dicomsdl
import numpy as np
import pandas as pd
from IPython import display as ipd

from tqdm import tqdm
from joblib import Parallel, delayed
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid

import tensorflow as tf
from tensorflow import keras
from tensorflow.python.client import device_lib
from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient


# Quieten TensorFlow: native-side log level via the environment variable,
# Python-side logger raised to ERROR.
# NOTE(review): the environment variable is assigned after `import tensorflow`
# has already run; it may need to be set before the import to take effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')


In [None]:
# Report the versions of the core numerical libraries in use.
for _tag, _module in (('np:', np), ('pd:', pd), ('tf:', tf)):
    print(_tag, _module.__version__)

# <span style="color:teal"> 2. Create basic configuration class <a class="anchor"  id="configuration"></a></span>
 * Configuration class consisting of information about project

In [None]:
class Config:
    """Central collection of paths and hyper-parameters for this notebook.

    All values are plain instance attributes set in ``__init__``; the
    module-level ``config`` object created below is what the other cells read.
    """

    def __init__(self):

        self.debug = False

        # Execution target. `num_devices` is overwritten by the device
        # configuration cell when more than one GPU is visible.
        self.device = 'GPU'
        self.num_devices = 1
        # self.model_name = 'EfficientNetB3'
        self.seed = 150

        # Directory holding the trained checkpoints (a `models/` sub-folder
        # is appended where the weights are loaded).
        self.models_path = '/kaggle/input/rsna-bcd-train-tpu-vm-bc86d1/'

        # Backbone weight file handed to EfficientNetB3 when the model is built.
        self.weights = "/kaggle/input/whl-files/efficientnetb3_notop.h5/efficientnetb3_notop.h5"

        self.input_data_path = '/kaggle/input/rsna-breast-cancer-detection/'

        # Everything this notebook writes lives under `output_path`.
        self.output_path = '/kaggle/working/'
        self.test_images_path = self.output_path + 'test_images/'

        # Competition-provided test DICOM folder and metadata file.
        self.test_path = self.input_data_path + 'test_images/'
        self.test_csv = self.input_data_path + 'test.csv'

        # Derived from `output_path` so both stay in sync
        # (resolves to '/kaggle/working/submission.csv').
        self.sub_csv = self.output_path + 'submission.csv'
        self.sample_sub_csv = self.input_data_path + 'sample_submission.csv'
        # Checkpoint file names whose predictions are averaged.
        self.models = ['model_3.h5', 'model_4.h5']

        self.threshold = 0.6

        self.batch_size = 32
        self.epochs = 10
        self.dropout = 0.4
        self.optimizer = 'adam'
        self.loss = 'binary_crossentropy'

        # `img_size` is passed to tf.image.resize and used as the leading
        # dimensions of the model input shape; `resize_dim` is the side of the
        # square intermediate image produced by dicom_to_png.
        self.img_size = (512, 256)
        self.resize_dim = 512
        self.img_ext = 'png'

        # Augmentation magnitudes read by data_augment(): hue delta,
        # (lower, upper) saturation and contrast factors, brightness delta.
        # These attributes were missing, so data_augment() raised
        # AttributeError as soon as it was used.
        # NOTE(review): the magnitudes below are placeholders - align them
        # with the values used by the training notebook.
        self.hue = 0.02
        self.sat = (0.8, 1.2)
        self.cont = (0.8, 1.2)
        self.bri = 0.10


config = Config()

# <span style="color:teal"> 3. Device Configurations <a class="anchor"  id="configuration"></a></span>

In [None]:
# Count the GPUs TensorFlow can see and pick a distribution strategy.
num_devices = len(tf.config.list_physical_devices('GPU'))

if num_devices > 1:
    # Several GPUs: record the count and mirror the model across them.
    config.num_devices = num_devices
    strategy = tf.distribute.MirroredStrategy()
    print(f'Running on {num_devices} GPU devices')
else:
    # Zero or one GPU: the default strategy is used in both cases.
    strategy = tf.distribute.get_strategy()
    if num_devices == 1:
        print(f'Running on {num_devices} GPU device')
    else:
        config.device = 'CPU'
        print(f'Running on CPU')

# Enable XLA JIT, keep computations in float32, and scale the global batch
# size by the number of devices recorded above.
tf.config.optimizer.set_jit(True)
tf.keras.mixed_precision.set_global_policy(policy="float32")
config.batch_size *= config.num_devices

# <span style="color:teal">4. Functions to convert DICOM images, extract the ROI and save the result <a class="anchor"  id="utilityfunctions"></a></span>
 

### <span style="color:teal">4.1 Dicom to png <a class="anchor"  id="dicomtopng"></a></span>

In [None]:
def dicom_to_png(dicom_path):
    """Read a DICOM file and return it as a square 8-bit grayscale array.

    The pixel data is min-max scaled to [0, 1], inverted when the file's
    photometric interpretation is MONOCHROME1, resized to
    ``(config.resize_dim, config.resize_dim)`` and converted to ``uint8``
    in the 0-255 range.

    Args:
        dicom_path: Path of the ``.dcm`` file to read.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` produced by the steps above.
    """
    dicom = dicomsdl.open(dicom_path)
    # NOTE(review): `storedvalue=False` presumably asks dicomsdl for rescaled
    # values rather than raw stored values - confirm against its documentation.
    image = dicom.pixelData(storedvalue=False)

    # Min-max scale to [0, 1].
    image = image - np.min(image)
    peak = np.max(image)
    if peak > 0:
        image = image / peak
    else:
        # Constant-valued image: dividing would be 0/0 and turn every pixel
        # into NaN, which then gives an undefined uint8 cast below.
        image = np.zeros_like(image, dtype=np.float32)

    # MONOCHROME1 files are flipped so they share the polarity of the others.
    if dicom.PhotometricInterpretation == 'MONOCHROME1':
        image = 1.0 - image

    image = cv2.resize(image, (config.resize_dim, config.resize_dim), interpolation=cv2.INTER_LINEAR)
    image = (image * 255).astype(np.uint8)
    return image


### <span style="color:teal">4.2 Extract region of interest <a class="anchor"  id="regionofInterest"></a></span>

In [None]:
def png_to_roi(image, image_path):
    """Crop an image to the bounding box of its largest bright region.

    The image is binarised (pixels above 20 become foreground), the external
    contour with the largest area is selected, and the image is cropped to
    that contour's extent before being resized to ``config.img_size``.

    If no contour is found, or the extent has zero height or width, the whole
    image is used instead of raising.

    Args:
        image: 2-D ``uint8`` grayscale image.
        image_path: Unused; kept so existing callers keep working.

    Returns:
        The cropped image resized with ``cv2.resize`` to
        ``config.img_size[::-1]``.
    """
    # cv2.resize takes its target as (width, height), hence the reversal.
    target_size = config.img_size[::-1]

    bin_image = cv2.threshold(image, 20, 255, cv2.THRESH_BINARY)[1]
    contours, _ = cv2.findContours(bin_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    roi = image
    if len(contours) > 0:
        contour = max(contours, key=cv2.contourArea)
        # reshape (not squeeze) keeps a 2-D (N, 2) array even when the
        # contour consists of a single point.
        points = contour.reshape(-1, 2)
        cols = points[:, 0]  # x coordinates
        rows = points[:, 1]  # y coordinates
        # Upper bounds stay exclusive, matching the original crop.
        candidate = image[np.min(rows):np.max(rows), np.min(cols):np.max(cols)]
        if candidate.size > 0:
            roi = candidate

    return cv2.resize(roi, target_size, interpolation=cv2.INTER_LINEAR)

In [None]:
def process(dicom_path, image_path):
    """Convert one DICOM file to a cropped image file at ``image_path``.

    Args:
        dicom_path: Source ``.dcm`` file.
        image_path: Destination file; its parent directory is created when
            missing.
    """
    full_frame = dicom_to_png(dicom_path)
    target_dir = os.path.dirname(image_path)
    os.makedirs(target_dir, exist_ok=True)
    cv2.imwrite(image_path, png_to_roi(full_frame, image_path))

# <span style="color:teal"> 5. Dataframe information <a class="anchor"  id="dataframeinfo"></a></span>

In [None]:
# Training metadata is loaded only to preview it here.
print('\n  Train:')
train_df = pd.read_csv(config.input_data_path + 'train.csv')
display(train_df.head())

print('\n  Test :')
test_df = pd.read_csv(config.test_csv)
# "<patient_id>/<image_id>" is shared by the source DICOM path and the
# path of the image file that will be written for it.
_relative_stem = test_df['patient_id'].astype(str) + '/' + test_df['image_id'].astype(str)
test_df['dicom_path'] = config.test_path + _relative_stem + '.dcm'
test_df['image_path'] = config.test_images_path + _relative_stem + '.png'
display(test_df.head())

In [None]:
# Print a summary of the test metadata frame (including the two path columns added above).
test_df.info()

In [None]:
# One conversion task per test image, run on four worker threads.
_conversion_tasks = (
    delayed(process)(src, dst)
    for src, dst in tqdm(zip(test_df['dicom_path'], test_df['image_path']))
)
Parallel(n_jobs=4, backend='threading')(_conversion_tasks)
# Drop the progress output once every file has been processed.
clear_output()


#### <span style="color:teal"> Check If Data Exists ? <a class="anchor"  id="existence"></a></span>

In [None]:
# Sanity check: does the first DICOM path listed in the test frame exist?
tf.io.gfile.exists(test_df.dicom_path.iloc[0])

# <span style="color:teal">6. Defining input features <a class="anchor"  id="inputefeatures"></a></span>

In [None]:
train_df_processed = pd.read_csv('/kaggle/input/rsna-bcd-processed-file/train_df_Processed.csv')

# Columns of the processed training table, plus the id needed for the submission.
processed_train_df_columns = np.append(train_df_processed.columns, 'prediction_id')
print(processed_train_df_columns)

# Give the test frame exactly that column set; anything it lacks becomes 0.0.
# NOTE(review): columns that exist only in the processed training table are
# not recomputed for the test rows here, they are simply zero-filled -
# confirm this matches how those columns were produced.
test_df = pd.DataFrame(test_df, columns=processed_train_df_columns)
test_df = test_df.fillna(0.0)
test_df.head()

In [None]:
# Print a summary of the test frame after it was re-columned and zero-filled above.
test_df.info()

In [None]:
# Identifier, target and bookkeeping columns that are not fed to the model
# as tabular features.
exclude_cols = [
    'patient_id',
    'image_id',
    'site_id',
    'machine_id',
    'cancer',
    'age',
    'stratify',
    'image_path',
    'fold',
    'prediction_id',
]
# Every remaining column is treated as a tabular input feature.
input_features = test_df.columns.difference(exclude_cols)

input_features

# <span style="color:teal">7. Normalize the input features <a class="anchor"  id="nomalization"></a></span>

In [None]:
# Standardise the test features with the statistics of the processed
# training table, then store them as float32.
_train_feature_mean = train_df_processed[input_features].mean()
_train_feature_std = train_df_processed[input_features].std()

test_df[input_features] = (test_df[input_features] - _train_feature_mean) / _train_feature_std
test_df[input_features] = test_df[input_features].astype('float32')

test_df[input_features]

# <span style="color:teal">8. Data Pipeline <a class="anchor"  id="pipline"></a></span>

### <span style="color:teal">8.1 Decode images <a class="anchor"  id="decode"></a></span>
 * **tf.image.decode_png()** and **tf.image.decode_jpeg()** are TensorFlow functions that decode a PNG- or JPEG-encoded image into a tensor (of type uint8 by default).
 
 * These functions take the following arguments: 
     * **image**: A string tensor containing a PNG- or JPEG-encoded image.
     * **channels**: An optional integer specifying the number of color channels in the decoded image. The TensorFlow default is 0 (use the number of channels stored in the file); this notebook passes 3.
     

* Each function returns a **uint8** tensor representing the decoded image with the shape of (height, width, channels) 
 

In [None]:
def decode_image(label=True, img_size=config.img_size, ext=config.img_ext):
    """Create the element-wise image loader used with ``Dataset.map``.

    Args:
        label: When truthy the returned function accepts ``(inputs, label)``
            and returns both; otherwise it accepts and returns ``inputs`` only.
        img_size: Size passed to ``tf.image.resize``.
        ext: Image file extension: ``'png'``, ``'jpg'`` or ``'jpeg'``.

    Returns:
        A callable that replaces the file path stored under
        ``inputs['input_image']`` with the decoded, resized image scaled
        to [0, 1].

    Raises:
        ValueError: Raised by the returned callable when ``ext`` is not one
            of the supported extensions.
    """

    def _decode_image(features, label=None):
        encoded = tf.io.read_file(features['input_image'])

        # Decode to a 3-channel tensor according to the file type.
        if ext in ['jpg', 'jpeg']:
            pixels = tf.image.decode_jpeg(encoded, channels=3)
        elif ext == 'png':
            pixels = tf.image.decode_png(encoded, channels=3)
        else:
            raise ValueError("Image extension not supported")

        # Resize to an explicit, fixed size.
        pixels = tf.image.resize(pixels, img_size)
        # Scale intensities into the [0, 1] range.
        features['input_image'] = tf.cast(pixels, tf.float32) / 255.0

        return features if label is None else (features, label)

    def _decode_unlabelled(features):
        return _decode_image(features, None)

    return _decode_image if label else _decode_unlabelled

### <span style="color:teal">8.2 Data augumentation <a class="anchor"  id="augumentation"></a></span>
* Augmentations are used to reduce overfitting and make the model more robust:
 * 1. `random_flip_left_right` applies a position transformation
 * 2. `random_hue`, `random_saturation`, `random_contrast` and `random_brightness` apply pixel transformations

In [None]:
def data_augment(label=True):
    """Create the element-wise augmentation function used with ``Dataset.map``.

    The returned callable applies a random horizontal flip followed by random
    hue, saturation, contrast and brightness changes whose magnitudes are read
    from ``config.hue``, ``config.sat``, ``config.cont`` and ``config.bri``.

    Args:
        label: When truthy the returned function accepts ``(inputs, label)``
            and returns both; otherwise it accepts and returns ``inputs`` only.

    Returns:
        A callable that augments ``inputs['input_image']`` in place.
    """

    def _augment(features, label=None):
        img = features['input_image']

        # Geometric change.
        img = tf.image.random_flip_left_right(img)

        # Photometric changes.
        img = tf.image.random_hue(img, config.hue)
        img = tf.image.random_saturation(img, config.sat[0], config.sat[1])
        img = tf.image.random_contrast(img, config.cont[0], config.cont[1])
        img = tf.image.random_brightness(img, config.bri)

        features['input_image'] = img
        return features if label is None else (features, label)

    def _augment_unlabelled(features):
        return _augment(features, None)

    return _augment if label else _augment_unlabelled

### <span style="color:teal">8.3 Build tf.data.dataset <a class="anchor"  id="dataset"></a></span>

In [None]:
def build_dataset(df,input_features,image_size=config.img_size,batch_size=config.batch_size, 
                  label=True,cache=False,ext=config.img_ext):
    """Turn a dataframe into a batched, prefetched ``tf.data.Dataset``.

    Args:
        df: Frame with an ``image_path`` column, the ``input_features``
            columns and, when ``label`` is truthy, a ``cancer`` column.
        input_features: Column labels used as the tabular model input.
        image_size: Size the decoded images are resized to.
        batch_size: Number of elements per batch.
        label: Whether the dataset yields ``(inputs, label)`` pairs or
            ``inputs`` only.
        cache: Whether to cache the decoded elements.
        ext: Image file extension forwarded to ``decode_image``.

    Returns:
        Dataset of dicts with ``input_image`` and ``input_features`` entries,
        paired with integer labels when ``label`` is truthy.
    """
    model_inputs = {
        'input_image': df['image_path'].values,
        'input_features': df[input_features].values,
    }

    if label:
        targets = df['cancer'].apply(lambda value: int(value)).values
        slices = (model_inputs, targets)
    else:
        slices = model_inputs

    loader = decode_image(label, img_size=image_size, ext=ext)
    pipeline = tf.data.Dataset.from_tensor_slices(slices)
    pipeline = pipeline.map(loader, num_parallel_calls=tf.data.AUTOTUNE)

    if cache:
        pipeline = pipeline.cache()

    return pipeline.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [None]:
# Unlabelled, uncached input pipeline over the test images and features.
test_dataset = build_dataset(
    test_df,
    input_features,
    batch_size=config.batch_size,
    label=False,
    cache=False,
)

In [None]:
# for item in test_dataset.take(1):
#     print(item)

# <span style="color:teal">9. Evaluation metric (probabilistic F1 score)<a class="anchor"  id="dataset"></a></span>

In [None]:
def p_f1(y_true, y_pred):
    """Return an F1 score computed from soft (probability-weighted) counts.

    True positives, false positives and false negatives are accumulated by
    multiplying labels with predictions instead of thresholding, so the
    predictions can be probabilities.

    Args:
        y_true: Ground-truth labels (cast to float32).
        y_pred: Predicted scores (cast to float32).

    Returns:
        Scalar float32 tensor; NaN results are replaced by 0.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    # Added to every denominator so empty batches do not divide by zero.
    eps = tf.keras.backend.epsilon()

    # True negatives are not part of F1, so they are no longer computed.
    tp = tf.reduce_sum(y_true * y_pred)
    fp = tf.reduce_sum((1 - y_true) * y_pred)
    fn = tf.reduce_sum(y_true * (1 - y_pred))

    p = tp / (tp + fp + eps)
    r = tp / (tp + fn + eps)

    f1 = 2 * p * r / (p + r + eps)
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)

    return tf.reduce_mean(f1)

# <span style="color:teal">10. Defining the model <a class="anchor"  id="model"></a></span>

In [None]:
def build_model(input_features, 
                loss=config.loss, 
                dropout=config.dropout, 
                optimizer=config.optimizer, 
                img_size=config.img_size):
    """Build and compile the two-input (image + tabular features) classifier.

    Args:
        input_features: Collection of feature names; only its length is used,
            to size the tabular input.
        loss: Loss passed to ``model.compile``.
        dropout: Rate used by the Dropout layers.
        optimizer: Optimizer passed to ``model.compile``.
        img_size: Leading two dimensions of the image input; a channel
            dimension of 3 is appended.

    Returns:
        Compiled ``tf.keras.Model`` with inputs ``input_image`` and
        ``input_features`` and a single sigmoid output, tracking accuracy
        and ``p_f1``.
    """
    # Layers are created inside the scope of the module-level `strategy`.
    with strategy.scope():
        input_image = tf.keras.layers.Input(shape=(*img_size,3), name='input_image')
        # From here on the parameter name refers to the Keras input tensor,
        # not to the list of feature names.
        input_features = tf.keras.layers.Input(shape=[len(input_features)], name='input_features')
        
        # Despite its name this is the backbone's output tensor (the model is
        # called on `input_image` right away). Weights come from `config.weights`.
        efficientNetBase_model = tf.keras.applications.EfficientNetB3(input_shape=(*img_size,3),
                                                 include_top=False, 
                                                 drop_connect_rate=0.3,
                                                 weights=config.weights)(input_image)
        
        x = tf.keras.layers.GlobalAveragePooling2D()(efficientNetBase_model)
        x = tf.keras.layers.Dense(512,activation="relu")(x)
        _output = tf.keras.layers.Dropout(dropout)(x)
        _output = tf.keras.layers.BatchNormalization()(_output)
        _output = tf.keras.layers.Dense(256,activation="relu", kernel_regularizer=
                                        tf.keras.regularizers.l2(0.01))(_output)
        # NOTE(review): this Dropout is applied to `x`, not `_output`, so the
        # Dropout/BatchNormalization/Dense(256) chain built just above is not
        # connected to the model output. This looks unintended, but rewiring
        # it changes the weight layout: the checkpoints loaded later would only
        # remain loadable if they were trained with the corrected graph.
        # Confirm against the training notebook before changing it.
        _output = tf.keras.layers.Dropout(dropout)(x)
        _output = tf.keras.layers.BatchNormalization()(_output)
        # Join the image representation with the tabular features.
        _output = tf.keras.layers.Concatenate()([_output, input_features])
        _output = tf.keras.layers.Dense(1, activation='sigmoid')(_output)
        
        model = tf.keras.Model(inputs=[input_image, input_features], outputs=_output)
        
        model.compile(optimizer=optimizer,
                      loss=loss,
                      metrics=['accuracy', 
                               p_f1])

        return model
    
# Instantiate the network once; checkpoint weights are loaded into it later.
model = build_model(input_features)
# model.run_eagerly = True
model.summary()

In [None]:
# Render a diagram of the model graph, annotated with tensor shapes.
tf.keras.utils.plot_model(model, show_shapes=True, dpi=64)

# <span style="color:teal">11. Predictions <a class="anchor"  id="prediction"></a></span>

In [None]:
test_dataset = build_dataset(
    test_df, input_features,
    batch_size=config.batch_size, label=False, cache=False,
)

# Run every checkpoint through the same network and collect its output.
_per_model_outputs = []
for _mdl in config.models:
    print(f'Predicting with model {_mdl}...')
    model.load_weights(f'{config.models_path}/models/'+_mdl)
    _per_model_outputs.append(model.predict(test_dataset))

# Element-wise mean across the checkpoints.
predictions = np.mean(_per_model_outputs, axis=0)
predictions

# <span style="color:teal">12. Prepare submission data <a class="anchor"  id="submission"></a></span>

In [None]:
# Flatten the averaged predictions and fetch the ids they belong to.
prediction_reshaped = predictions.reshape(-1)
prediction_ids = test_df['prediction_id']

print("Predictions after reshaped :",prediction_reshaped)
print("Prediction Id's : ",prediction_ids)

In [None]:
# One row per test image: its prediction id and the averaged score.
predicted_df = pd.DataFrame(
    {
        'prediction_id': prediction_ids,
        'cancer': prediction_reshaped,
    }
)
predicted_df

In [None]:
# Start from the ids listed in the sample submission, without its scores.
submission_df = pd.read_csv(config.sample_sub_csv).drop(columns=['cancer'])

# Attach the per-image scores, then keep the highest score per prediction id.
_per_image_scores = submission_df.merge(predicted_df, on='prediction_id', how='left')
_max_per_id = _per_image_scores.groupby('prediction_id')['cancer'].max()
submission_df = _max_per_id.reset_index()

submission_df.to_csv(config.sub_csv, index=False)

In [None]:
# Print a summary of the submission frame that was just written.
submission_df.info()

In [None]:
# Preview the first rows of the submission.
submission_df.head()