# Import necessary packages

In [29]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # boxplot, histplot
import cv2

# The latest keras-preprocessing
!pip install git+https://github.com/keras-team/keras-preprocessing.git

from keras.models import Sequential
#Import from keras_preprocessing not from keras.preprocessing
import tensorflow as tf
#from keras import backend as K
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Rescaling, Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix


Collecting git+https://github.com/keras-team/keras-preprocessing.git
  Cloning https://github.com/keras-team/keras-preprocessing.git to /tmp/pip-req-build-38icwqh_
  Running command git clone --filter=blob:none --quiet https://github.com/keras-team/keras-preprocessing.git /tmp/pip-req-build-38icwqh_
  Resolved https://github.com/keras-team/keras-preprocessing.git to commit 3e380065d4afc7347aaee8d89325a16b22158438
  Preparing metadata (setup.py) ... [?25l[?25hdone


# Loading Data from Google Drive
Images in the original dataset have been converted into 256*256 .png format

In [30]:
from google.colab import drive
drive.mount('/content/gdrive')
#!ls "/content/gdrive/My Drive"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [31]:
df_train = pd.read_csv('/content/gdrive/My Drive/train.csv')
df_test = pd.read_csv('/content/gdrive/My Drive/test.csv')
#df_train.info()
#df_test.info()
df_train['cancer'].unique()
df_train['cancer'].value_counts()

0    53548
1     1158
Name: cancer, dtype: int64

# Data Preprocessing
## Balance the data

In [33]:
df_benign = df_train[df_train['cancer'] == 0]
df_malign = df_train[df_train['cancer'] == 1]

df_benign = df_benign.sample(n=1202, random_state=1) # Make sure each time random the same samples.
df_sample = pd.concat([df_benign, df_malign])
# sort the value and mix the entries
df_sample.sort_values(by = ['patient_id', 'image_id'], ascending = [True, True], na_position = 'first')

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
582,2,106,76321767,L,CC,65.0,1,1,1,,0,,21,False
583,2,106,2018825992,L,MLO,65.0,1,1,1,,0,,21,False
1441,2,115,503361456,R,CC,55.0,0,0,0,,0,,21,False
7222,2,177,1110981332,R,MLO,71.0,0,0,0,,0,,29,False
9736,2,204,890985677,L,CC,60.0,0,0,0,,0,,48,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51431,1,65456,694039741,R,MLO,54.0,0,0,0,1.0,1,B,49,False
51474,2,65492,1186509485,L,MLO,59.0,1,1,1,,0,,48,False
51475,2,65492,1306638994,L,CC,59.0,1,1,1,,0,,48,False
51480,2,65493,442293322,R,MLO,50.0,0,0,0,,0,,21,False


Observation: Now we have a almost perfect balanced sample dataset!

In [34]:
# Create image_name matching the name in the image name directory by concatenating the patient_id and image_id
df_sample['image_name'] = df_sample["patient_id"].astype(str) + '_' + df_sample["image_id"].astype(str)
df_test['image_name'] = df_test["patient_id"].astype(str) + '_' + df_test["image_id"].astype(str)

In [35]:
# The append_ext function is defined to append extension to the image name
# Note
def append_ext(fname):
  if fname.endswith('.png'):
    print('Silly code: Do not add suffix twice.')
  else: return fname+'.png'

df_sample['image_name']=df_sample['image_name'].apply(append_ext)
print(df_sample.head())

df_test['image_name']=df_test['image_name'].apply(append_ext)
print(df_test.head())

       site_id  patient_id    image_id laterality view   age  cancer  biopsy  \
38182        2       50994   799654229          R   CC  55.0       0       0   
11895        2       22706  1838448164          L   CC  63.0       0       0   
38810        2       51875  1763503482          R   CC  57.0       0       0   
12636        1       23467  1009349413          L  MLO  45.0       0       0   
39664        1       52873   457185009          R  MLO  59.0       0       0   

       invasive  BIRADS  implant density  machine_id  difficult_negative_case  \
38182         0     NaN        0     NaN          21                    False   
11895         0     NaN        0     NaN          21                    False   
38810         0     NaN        0     NaN          29                    False   
12636         0     1.0        0       A          49                    False   
39664         0     1.0        0       A          49                    False   

                 image_name  
38

# Split the sample dataset into training set and validation set:
   

*   training set contains 80% records, and validation set has the rest 20% instances.
*   Keras ImageDataGenerator to map the images in the RSNA_256 directory to the class cancer in the corresponing df_sample.
*   The batch_size of the training set and the validation set has set as the number that divides the total number of images in the train set and valid respectively.






In [36]:
# Source Code: https://vijayabhaskar96.medium.com/tutorial-on-keras-flow-from-dataframe-1fd4493d237c
# Modified by Jia Lin
datagen=ImageDataGenerator(validation_split =0.2)
train_generator=datagen.flow_from_dataframe(dataframe=df_sample,
                                            directory="/content/gdrive/My Drive/RSNA_256/",
                                            x_col="image_name",
                                            y_col="cancer",
                                            subset="training",
                                            batch_size=32, # 1888 = 8*4*59
                                            seed=0,
                                            shuffle=True,
                                            color_mode="grayscale", # the original dataset only gray
                                            class_mode="raw", # 'cancer' is numerical datatype
                                            target_size=(256,256))
valid_generator=datagen.flow_from_dataframe(dataframe=df_sample,
                                            directory="/content/gdrive/My Drive/RSNA_256/",
                                            x_col="image_name",
                                            y_col="cancer",
                                            subset="validation",
                                            batch_size=8, # 472 = 8*59
                                            seed=0,
                                            shuffle=True,
                                            color_mode="grayscale",
                                            class_mode="raw", # 'cancer' is numerical datatype
                                            target_size=(256,256))

Found 1888 validated image filenames.
Found 472 validated image filenames.


In [37]:
test_datagen=ImageDataGenerator()
test_generator=test_datagen.flow_from_dataframe(dataframe=df_test,
                                                directory="/content/gdrive/My Drive/testing_images/",
                                                x_col="image_name",
                                                y_col=None,
                                                batch_size=1,
                                                seed=0,
                                                shuffle=False,
                                                color_mode="grayscale",
                                                class_mode=None,
                                                target_size=(256,256))

Found 4 validated image filenames.


# Build the model:

In [38]:
# Define the CNN model
model = Sequential()

model.add(Rescaling(1./255, offset=0.0, input_shape=(256,256,1)))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(128, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(128, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(256, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))

model.add(Dense(2, activation='softmax')) # Two classes output

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling (Rescaling)       (None, 256, 256, 1)       0         
                                                                 
 conv2d (Conv2D)             (None, 256, 256, 64)      640       
                                                                 
 activation (Activation)     (None, 256, 256, 64)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, 254, 254, 128)     73856     
                                                                 
 activation_1 (Activation)   (None, 254, 254, 128)     0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 127, 127, 128)    0         
 )                                                               
                                                        

In [39]:
# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [40]:
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
print(STEP_SIZE_TRAIN)
STEP_SIZE_VALID=valid_generator.n//valid_generator.batch_size
print(STEP_SIZE_VALID)
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size
print(STEP_SIZE_TEST)

59
59
4


# Fitting/Training the model

In [41]:
model.fit(train_generator,  validation_data=valid_generator, epochs=32)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x7a2205172bf0>

# Evaluate the model


In [42]:
model.evaluate(valid_generator, steps = STEP_SIZE_VALID)



[10.121707916259766, 0.2817796468734741]

In [43]:
from sklearn.metrics import classification_report, confusion_matrix

#Confution Matrix and Classification Report
pred = model.predict(valid_generator)
pred = np.argmax(pred, axis=1)
print('Confusion Matrix')
confusion_matrix = confusion_matrix(valid_generator.labels, pred)
print(confusion_matrix)

target_names = ['Benign','Malign']
print(classification_report(valid_generator.labels, pred, target_names=target_names, zero_division=0))

Confusion Matrix
[[133 339]
 [  0   0]]
              precision    recall  f1-score   support

      Benign       1.00      0.28      0.44       472
      Malign       0.00      0.00      0.00         0

    accuracy                           0.28       472
   macro avg       0.50      0.14      0.22       472
weighted avg       1.00      0.28      0.44       472



# Predict the output

In [44]:
filenames = test_generator.filenames
number_test_samples = len(filenames)
test_generator.reset()
pred = model.predict(test_generator, number_test_samples)
pred = np.argmax(pred, axis = 1)
print(pred)

[1 1 1 1]
