# 1. Load the necessary libraries

In [5]:
import pandas as pd
import numpy as np
import os
import shutil
from os.path import exists

In [6]:
# Define full path and load dataframe with the original data
full_path = '/home/jari/code/GuillaumeRib/project-cancer-detection/raw_data/train/'
df_labels = pd.read_csv('../raw_data/train_labels.csv')

# 2. Analyze the datasets

In [7]:
# Original dataset sizes as downloaded from Kaggle
test_org = 57500 # original size of test dataset
train_org = df_labels.shape[0] # original size of train dataset
total_org = int(test_org + train_org) # original total size of dataset
ratio_org = test_org/total_org # original ratio test/total

# New dataset sizes (without original test dataset since we do not have its labels)
total_new = train_org # new total size of dataset
test_new = int(np.round(ratio_org * total_new,0)) # new size of test dataset
train_new = total_new - test_new # new size of train dataset

# Original dataset labels
values = df_labels.value_counts('label') # how many images do we have for each label
values_0 = values[0] # images with label=0
values_1 = values[1] # images with label=1
values_total = values_0 + values_1 # total number of images
values_0_ratio = values_0/values_total # percentage of label=0 images over total in original train dataset
values_1_ratio = values_1/values_total # percentage of label=1 images over total in original train dataset
original_ratio = f'{int(np.round(values_0_ratio*100,0))}/{int(np.round(values_1_ratio*100,0))}' # original ratio of labels (simplified)

# New dataset labels
test_new_0 = int(test_new*values_0_ratio) # number of images with label=0 in new test dataset
test_new_1 = int(test_new*values_1_ratio) # number of images with label=1 in new test dataset
train_new_0 = int(train_new/2) # number of images with label=0 in new train dataset
train_new_1 = int(train_new/2) # number of images with label=1 in new train dataset

# Print relevant results
print('New train set size:',train_new) # must fulfill 50/50 label ratio
print('New test size:',test_new) # must fulfill original label ratio
print(f'For the new test dataset we need {test_new_0} images with label=0 and {test_new_1} images with label=1 (ratio {original_ratio})')
print(f'For the new train dataset we need {train_new_0} images with label=0 and {train_new_1} images with label=1 (ratio 50/50)')
print('Images with label=0 in original train dataset:',values_0)
print('Images with label=1 in original train dataset:',values_1)
print(f'Ratio of label=0 vs. label=1 in original train dataset: {int(np.round(values_0_ratio*100,0))}/{int(np.round(values_1_ratio*100,0))}')

New train set size: 174438
New test size: 45587
For the new test dataset we need 27122 images with label=0 and 18464 images with label=1 (ratio 59/41)
For the new train dataset we need 87219 images with label=0 and 87219 images with label=1 (ratio 50/50)
Images with label=0 in original train dataset: 130908
Images with label=1 in original train dataset: 89117
Ratio of label=0 vs. label=1 in original train dataset: 59/41


# 3. Create the new datasets

## 3.1 Test dataset

In [38]:
# Create list with image ids for test dataset with 59/41 ratio of label=0 vs. label=1
test_label_0 = df_labels[df_labels['label']==0].sample(n=test_new_0) # create df for new test dataset with label=0
test_label_0_ids = [x for x in test_label_0['id']] # list with image ids for label=0 for test dataset
test_label_1 = df_labels[df_labels['label']==1].sample(n=test_new_1) # create df for new test dataset with label=1
test_label_1_ids = [x for x in test_label_1['id']] # list with image ids for label=1 for test dataset

# Delete used rows from original dataframe so there are no data leakages between test vs. train datasets
for image in df_labels['id']:
    if image in test_label_0_ids:
        df_labels.drop(df_labels[df_labels['id'] == image].index, inplace = True)
    elif image in test_label_1_ids:
        df_labels.drop(df_labels[df_labels['id'] == image].index, inplace = True)

# Create directory to store test data
input_dir = '../raw_data/train/'
output_test = os.mkdir('../raw_data/test_new/') # run only the first time to create the directory
#output_test = '../raw_data/test_new/'

# Split test data in corresponding directories
for item in test_label_0_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_test)
for item in test_label_1_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_test)

KeyboardInterrupt: 

## 3.2 Train datasets

### 3.2.1 Select train data

In [57]:
# Create new train dataset with 50/50 ratio of labels
train_label_0 = df_labels[df_labels['label']==0].sample(n=min(df_labels.value_counts('label'))) # create df for new train dataset with label=0
## min(df_labels.value_counts('label')) to choose the smallest between label=0 and label=1 to ensure that we have enought from each
train_label_0_ids = [x for x in train_label_0['id']] # list with image ids for label=0 for train dataset
train_label_1 = df_labels[df_labels['label']==1].sample(n=min(df_labels.value_counts('label'))) # create df for new train dataset with label=0
train_label_1_ids = [x for x in train_label_1['id']] # list with image ids for label=1 for train dataset

# Define directory to store train data
input_dir = '../raw_data/train/'
#output_train = os.mkdir('../raw_data/train_new/')
output_train = '../raw_data/train_new/'

# Split train data in corresponding directories
for item in train_label_0_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_train)
for item in train_label_1_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_train)

### 3.2.2 Split into 0 and 1 directores

In [62]:
# Manually create 0 and 1 directories within our train folder
# To be automated within the function later
# Uncomment and run once

#os.mkdir('../raw_data/train_new/0')
#os.mkdir('../raw_data/train_new/1')

In [64]:
# Code to move files into 0 and 1 sub-folders
for index,row in df_labels.reset_index().iterrows():
    file = row['id']
    folder = str(row['label']) # assign the appropiate folder according to label
    source_path = os.path.join('../raw_data/train_new/',file+'.tif')
    if exists(source_path) is True:
        destination_path = os.path.join('../raw_data/train_new/',folder,file+'.tif')
        shutil.move(source_path,destination_path) # move image to corresponding folder according to label
    else:
        print('File not found')

File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not found
File not f

# 4. Additional

#### For simplicity we shall rename our train and test directories:

In [None]:
os.rename('../raw_data/test_new/', '../raw_data/test/')
os.rename('../raw_data/train_new/', '../raw_data/train/')

#### We can check if we have the right amount of files in each directory by entering the directory through the terminal and typping:

In [68]:
ls | wc -l

3


#### In directories '0' and '1' within 'train_new' we should have:

In [69]:
min(df_labels.value_counts('label'))

77227

#### In 'test_new' we should have:

In [70]:
test_new

45587