# Loading the images

In [13]:
import numpy as np
import pandas as pd

## Train set path

In [21]:
local_train_path = '../raw_data/train_small'

## Image Data Generator as preproc accepting tif 

In [22]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [23]:
ImageDataGenerator()

<keras.preprocessing.image.ImageDataGenerator at 0x1274a11c0>

In [24]:
# Full params

```python
ImageDataGenerator(
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
    zca_epsilon=1e-06,
    rotation_range=0,
    width_shift_range=0.0,
    height_shift_range=0.0,
    brightness_range=None,
    shear_range=0.0,
    zoom_range=0.0,
    channel_shift_range=0.0,
    fill_mode='nearest',
    cval=0.0,
    horizontal_flip=False,
    vertical_flip=False,
    rescale=None,
    preprocessing_function=None,
    data_format=None,
    validation_split=0.0,
    interpolation_order=1,
    dtype=None
)
```

In [25]:
# Image Generator accept tiff images / used for Data Augmentation as well

In [26]:
# ImageGenerator preprocess images / params to be checked
train_datagen = ImageDataGenerator(rescale = 1./255,
                                   rotation_range=40,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True,
                                   fill_mode='nearest',
                                   validation_split=0.3
                                  )

In [27]:
# Load from directory to flow passsed into ImageGenerator
train_generator = train_datagen.flow_from_directory(local_train_path,
                                                    target_size=(96,96), # 256x256 by default. can be 'upsampled'?
                                                    class_mode='binary',
                                                    batch_size=5)

Found 469 images belonging to 2 classes.


* **train generator** is object passed on fit method
* example:
```python
model.fit(
        train_generator,
        steps_per_epoch=2000,
        epochs=50,
        validation_data=validation_generator,
        validation_steps=800)
```

## Labels Data Overview

In [28]:
# Would need to use this file labels to move files into sub-folders 0 and 1 under train and test.

In [29]:
labels = pd.read_csv('../raw_data/train_labels.csv')

In [30]:
labels.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [31]:
labels.count()

id       220025
label    220025
dtype: int64

In [32]:
labels.groupby(by='label').count()

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,130908
1,89117


In [33]:
# Label = 0
label_0 = labels[labels['label']==0]
label_0.head(10)

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0
5,acfe80838488fae3c89bd21ade75be5c34e66be7,0
8,559e55a64c9ba828f700e948f6886f4cea919261,0
9,8eaaa7a400aa79d36c2440a4aa101cc14256cda4,0
10,a106469bbfda4cdc5a9da7ac0152927bf1b4a92d,0
12,a1991e73a9b676faddd2bd47c39754b14d1eb923,0
13,08566ce82d4406f464c9c2a3cd014704735db7a9,0


In [34]:
# Label = 1
label_1=labels[labels['label']==1]
label_1.head(10)

Unnamed: 0,id,label
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
6,a24ce148f6ffa7ef8eefb4efb12ebffe8dd700da,1
7,7f6ccae485af121e0b6ee733022e226ee6b0c65f,1
11,c3d660212bf2a11c994e0eadff13770a9927b731,1
14,94fa32b29cc1c00403176c0795fffa3cfaa0f20e,1
17,0b820b71670c039dd0a51333d1c919f471a9e940,1
19,d34af1e7500f2f3de41b0e6fdeb2ed245d814590,1
23,464327050ef07bb927f8bfb5c4e4dd5ebd4d3c09,1
24,6961bdcc16f6c1d7db88fc6a7823178288c2a29e,1
28,233bf46a575c1731821073e318c029e5df8b12ff,1


## Moving files into sub-folders

In [37]:
import os
import shutil
from os.path import exists

In [39]:
# Path where images need to be moved - Change the root and create 0 and 1 folders in it.
full_path = '/Users/chloeguillaume/code/GuillaumeRib/project-cancer-detection/raw_data/train_small'

In [40]:
# Code to move files into 0 and 1 sub-folders
for index,row in labels.reset_index().iterrows():
    file = row['id']
    folder = str(row['label'])
    source_path = os.path.join(full_path,file+'.tif')
    if exists(source_path) is True:
        print(source_path)
        destination_path = os.path.join(full_path,folder,file+'.tif')
        shutil.move(source_path,destination_path)

## Check image name / id class

In [41]:
# Check if image is in 0 or 1:
image_to_test = '0a1b615eacd80e7a9a8ba9991c04f1453bd4556b'
print(f"image is Cat 0: {image_to_test in list(label_0['id'])}")
print(f"image is Cat 1: {image_to_test in list(label_1['id'])}")

image is Cat 0: False
image is Cat 1: True
