# Data Preparation

Preparing data for Keras Image Data Generators.

Image files were splitted into below folders structure:


- train_dir
 - no_tumor
 - has_tumor
- val_dir
 - no_tumor
 - has_tumor
- test_dir
 - no_tumor
 - has_tumor


In [0]:
!pip install kaggle

from zipfile import ZipFile
import pandas as pd
import numpy as np
import os
import shutil 

from sklearn.model_selection import train_test_split

In [0]:
!mkdir /root/.kaggle
!echo '{"username":"luke01","key":"492a1168da411683abcf66f7003a2c66"}' > /root/.kaggle/kaggle.json

In [0]:
!kaggle config set -n path -v{/content}

- path is now set to: {/content}


In [0]:
!chmod 600 /root/.kaggle/kaggle.json

In [0]:
!kaggle competitions download -c histopathologic-cancer-detection -p /content

Downloading sample_submission.csv.zip to /content
  0% 0.00/1.33M [00:00<?, ?B/s]
100% 1.33M/1.33M [00:00<00:00, 96.9MB/s]
Downloading train_labels.csv.zip to /content
 98% 5.00M/5.10M [00:00<00:00, 25.2MB/s]
100% 5.10M/5.10M [00:00<00:00, 24.4MB/s]
Downloading test.zip to /content
100% 1.30G/1.30G [00:16<00:00, 95.5MB/s]
100% 1.30G/1.30G [00:16<00:00, 87.1MB/s]
Downloading train.zip to /content
100% 4.97G/4.98G [01:16<00:00, 70.0MB/s]
100% 4.98G/4.98G [01:16<00:00, 70.2MB/s]


In [0]:
zf = ZipFile('/content/train.zip', 'r')
zf.extractall('/content/cancer/train/')
zf.close()


zf = ZipFile('/content/train_labels.csv.zip', 'r')
zf.extractall('/content/cancer/labels/')
zf.close()

In [0]:
labels_path='/content/cancer/labels/'
train_labels = pd.read_csv(f'{labels_path}train_labels.csv')

print("Rows number in file with labels: " , train_labels.shape)
print("Number of downloaded images: ", len(os.listdir('/content/cancer/train/')))

Rows number in file with labels:  (220025, 2)
Number of downloaded images:  220025


### Spliting downloaded images into train, validation and test datasets
Creating balanced sets based on label.

In [0]:
y = train_labels['label']

df_train, df_test = train_test_split(train_labels, test_size=0.1, random_state=42, stratify=y)

df_train, df_val = train_test_split(train_labels, test_size=0.2, random_state=42, stratify=y)


print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(176020, 2)
(44005, 2)
(22003, 2)


### Creating a new directory

In [0]:
# Create a new directory
data_dir = '/content/data/'
os.mkdir(data_dir)



# create a path to 'data_dir' to which we will join the names of the new folders

# train_dir
train_dir = os.path.join(data_dir, 'train')
os.mkdir(train_dir)

# val_dir
val_dir = os.path.join(data_dir, 'validation')
os.mkdir(val_dir)

# test_dir
test_dir = os.path.join(data_dir, 'test')
os.mkdir(test_dir)


# Inside each folder we create seperate folders for each class

# create new subfolders inside train folder
no_tumor = os.path.join(train_dir, 'no_tumor')
os.mkdir(no_tumor)
has_tumor = os.path.join(train_dir, 'has_tumor')
os.mkdir(has_tumor)


# create new subfolders inside validation folder
no_tumor = os.path.join(val_dir, 'no_tumor')
os.mkdir(no_tumor)
has_tumor = os.path.join(val_dir, 'has_tumor')
os.mkdir(has_tumor)

# create new subfolders inside test folder
no_tumor = os.path.join(test_dir, 'no_tumor')
os.mkdir(no_tumor)
has_tumor = os.path.join(test_dir, 'has_tumor')
os.mkdir(has_tumor)

### Check if the folders have been created

In [0]:
print('train: ', os.listdir(train_dir))
print('validation: ', os.listdir(val_dir))
print('test: ', os.listdir(test_dir))

train:  ['no_tumor', 'has_tumor']
validation:  ['no_tumor', 'has_tumor']
test:  ['no_tumor', 'has_tumor']


### Getting a list of images for train, validation and test and assigning an appropriate label to the file name

In [0]:
train_ids = list(df_train['id'])
val_ids = list(df_val['id'])
test_ids = list(df_test['id'])

# Set the id as the index in dataframe with labels
train_labels.set_index('id', inplace=True)

image='730b42edf8f14f730fcd9ac98fdc43e45e47a5b9'
target = train_labels.loc[image,'label']
target

1

### Coping images to prepared folders based on datasets split

In [0]:
# train images transfer

for image in train_ids:
    
    # adding .tif extension to the image id
    file_name = image + '.tif'
    # getting image label
    target = train_labels.loc[image,'label']
    
    # matching label with folder name
    if target == 0:
        label = 'no_tumor'
    if target == 1:
        label = 'has_tumor'
    
    # source path
    src = os.path.join('/content/cancer/train/', file_name)
    # destination path
    dst = os.path.join(train_dir, label, file_name)
    # coping from the source to the destination
    shutil.copyfile(src, dst)


# validation images transfer

for image in val_ids:
    
    # adding .tif extension to the image id
    file_name = image + '.tif'
    # getting image label
    target = train_labels.loc[image,'label']
    
    # matching label with folder name
    if target == 0:
        label = 'no_tumor'
    if target == 1:
        label = 'has_tumor'
    
    # source path
    src = os.path.join('/content/cancer/train/', file_name)
    # destination path
    dst = os.path.join(val_dir, label, file_name)
    # coping from the source to the destination
    shutil.copyfile(src, dst)

for image in test_ids:
    
    # adding .tif extension to the image id
    file_name = image + '.tif'
    # getting image label
    target = train_labels.loc[image,'label']
    
    # matching label with folder name
    if target == 0:
        label = 'no_tumor'
    if target == 1:
        label = 'has_tumor'
    
    # source path
    src = os.path.join('/content/cancer/train/', file_name)
    # destination path
    dst = os.path.join(test_dir, label, file_name)
    # coping from the source to the destination
    shutil.copyfile(src, dst)    

### Checking file number in all created folders

In [0]:
print('Count of tumor train images: ', len(os.listdir(os.path.join(train_dir, 'has_tumor'))))
print('Count of no tumor train images: ', len(os.listdir(os.path.join(train_dir, 'no_tumor'))))
print('Count of tumor validation images: ', len(os.listdir(os.path.join(val_dir, 'has_tumor'))))
print('Count of no tumor validation images: ', len(os.listdir(os.path.join(val_dir, 'no_tumor'))))
print('Count of tumor test images: ', len(os.listdir(os.path.join(test_dir, 'has_tumor'))))
print('Count of no tumor test images: ', len(os.listdir(os.path.join(test_dir, 'no_tumor'))))

Count of tumor train images:  71294
Count of no tumor train images:  104726
Count of tumor validation images:  17823
Count of no tumor validation images:  26182
Count of tumor test images:  8912
Count of no tumor test images:  13091


### Packing prepared files for later use

In [0]:
shutil.make_archive('/content/drive/My Drive/HCD_data', 'zip','/content/data')

'/content/drive/My Drive/HCD_data.zip'

In [0]:
os.path.getsize('/content/drive/My Drive/HCD_data.zip')

5889000525