# Loading the images

In [13]:
import numpy as np
import pandas as pd

## Labels Data Overview

In [28]:
# Would need to use this file labels to move files into sub-folders 0 and 1 under train and test.

In [29]:
labels = pd.read_csv('../raw_data/train_labels.csv')

In [30]:
labels.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [31]:
labels.count()

id       220025
label    220025
dtype: int64

In [32]:
labels.groupby(by='label').count()

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,130908
1,89117


## Moving files into sub-folders

In [47]:
import os
import shutil
from os.path import exists

In [48]:
# Path where images need to be moved - Change the root and create 0 and 1 folders in it.
full_path = '/Users/chloeguillaume/code/GuillaumeRib/project-cancer-detection/raw_data/train'

In [50]:
# Code to move files into 0 and 1 sub-folders
for index,row in labels.reset_index().iterrows():
    file = row['id']
    folder = str(row['label'])
    source_path = os.path.join(full_path,file+'.tif')
    if exists(source_path) is True:
        destination_path = os.path.join(full_path,folder,file+'.tif')
        shutil.move(source_path,destination_path)

## Check image name / id class

In [102]:
# Check if image is in 0 or 1:
image_to_test = 'fc06025318f0a1fc213851294b1952d8a8b082bf'
print(f"image is Cat 0: {image_to_test in list(label_0['id'])}")
print(f"image is Cat 1: {image_to_test in list(label_1['id'])}")

image is Cat 0: True
image is Cat 1: False


## Image Data Generator

### Train / Test set paths

In [110]:
# Change path accordingly / train_small 8000 images 50/50
local_train_path = '../raw_data/train_small'
local_test_path = '../raw_data/test_small'

### TF Image Data Generator

In [111]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [118]:
# ImageGenerator preprocess images / minimum params / to read tif images
train_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

In [119]:
# Load from directory to flow passsed into ImageGenerator
train_generator = train_datagen.flow_from_directory(local_train_path,
                                                    target_size=(96,96), # 256x256 by default. can be 'upsampled'?
                                                    class_mode='binary',
                                                    batch_size=5)

test_generator = test_datagen.flow_from_directory(local_test_path,
                                                    target_size=(96,96), # 256x256 by default. can be 'upsampled'?
                                                    class_mode='binary',
                                                    batch_size=5)

Found 8000 images belonging to 2 classes.
Found 4000 images belonging to 2 classes.


## CNN Model Initialization

In [None]:
# Rescaling layer
    model.add(Rescaling(1./255, input_shape=(96,96,3)))

In [98]:
# INIT MODEL and COMPILE
from tensorflow.keras import layers, models
from tensorflow.keras.layers.experimental.preprocessing import Rescaling

def init_model():
    model = models.Sequential()

    # Lets add convolution layers, 
    model.add(layers.Conv2D(4, kernel_size=2, activation='relu',input_shape=(96,96,3)))
 
  
    model.add(layers.Conv2D(8, kernel_size=2, activation="relu"))
    model.add(layers.MaxPooling2D(2))

    model.add(layers.Conv2D(8, kernel_size=2, activation="relu"))
    model.add(layers.MaxPooling2D(2))
    
    model.add(layers.Conv2D(8, kernel_size=2, activation="relu"))


    model.add(layers.Flatten())
    
    model.add(layers.Dense(10, activation='relu'))
  
    model.add(layers.Dense(1, activation='sigmoid'))

     ### Model compilation
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [99]:
model = init_model()

In [100]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_20 (Conv2D)          (None, 95, 95, 4)         52        
                                                                 
 conv2d_21 (Conv2D)          (None, 94, 94, 8)         136       
                                                                 
 max_pooling2d_20 (MaxPoolin  (None, 47, 47, 8)        0         
 g2D)                                                            
                                                                 
 conv2d_22 (Conv2D)          (None, 46, 46, 8)         264       
                                                                 
 max_pooling2d_21 (MaxPoolin  (None, 23, 23, 8)        0         
 g2D)                                                            
                                                                 
 conv2d_23 (Conv2D)          (None, 22, 22, 8)        

## Model Training

In [120]:
# Fitting the model
from tensorflow.keras.callbacks import EarlyStopping


epochs = 30
batch_size = 64
es = EarlyStopping(patience=5, restore_best_weights=True,verbose=1)

history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs = epochs,
                    batch_size = batch_size, 
                    verbose = 1, 
                    callbacks = [es])

ValueError: `validation_split` is only supported for Tensors or NumPy arrays, found following types in the input: [<class 'keras.preprocessing.image.DirectoryIterator'>]

### Evaluate the model on test set

In [116]:
results = model.evaluate(test_generator, verbose = 1 )
print(f'The accuracy on the test set is of {results[1]*100:.2f} %')

The accuracy on the test set is of 70.20 %
