# 1. Load the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os
import shutil
from os.path import exists

In [2]:
# Define full path and load dataframe with the original data
full_path = '/home/jari/code/GuillaumeRib/project-cancer-detection/raw_data/train/'
df_labels = pd.read_csv('../raw_data/train_labels.csv')

# 2. Analyze the datasets

In [14]:
# Original dataset sizes as downloaded from Kaggle
test_org = 57500 # original size of test dataset
train_org = df_labels.shape[0] # original size of train dataset
total_org = int(test_org + train_org) # original total size of dataset
ratio_org = test_org/total_org # original ratio test/total

# New dataset sizes (without original test dataset since we do not have its labels)
total_new = train_org # new total size of dataset
test_new = int(np.round(ratio_org * total_new,0)) # new size of test dataset
train_new = total_new - test_new # new size of train dataset

# Original dataset labels
values = df_labels.value_counts('label') # how many images do we have for each label
values_0 = values[0] # images with label=0
values_1 = values[1] # images with label=1
values_total = values_0 + values_1 # total number of images
values_0_ratio = values_0/values_total # percentage of label=0 images over total in original train dataset
values_1_ratio = values_1/values_total # percentage of label=1 images over total in original train dataset
original_ratio = f'{int(np.round(values_0_ratio*100,0))}/{int(np.round(values_1_ratio*100,0))}' # original ratio of labels (simplified)

# New dataset labels
test_new_0 = int(test_new*values_0_ratio) # number of images with label=0 in new test dataset
test_new_1 = int(test_new*values_1_ratio) # number of images with label=1 in new test dataset
train_new_0 = int(train_new/2) # number of images with label=0 in new train dataset
train_new_1 = int(train_new/2) # number of images with label=1 in new train dataset

# Print relevant results
print('New train set size:',train_new) # must fulfill 50/50 label ratio
print('New test size:',test_new) # must fulfill original label ratio
print(f'For the new test dataset we need {test_new_0} images with label=0 and {test_new_1} images with label=1 (ratio {original_ratio})')
print(f'For the new train dataset we need {train_new_0} images with label=0 and {train_new_1} images with label=1 (ratio 50/50)')
print('Images with label=0 in original train dataset:',values_0)
print('Images with label=1 in original train dataset:',values_1)
print(f'Ratio of label=0 vs. label=1 in original train dataset: {int(np.round(values_0_ratio*100,0))}/{int(np.round(values_1_ratio*100,0))}')

New train set size: 131194
New test size: 43245
For the new test dataset we need 25729 images with label=0 and 17515 images with label=1 (ratio 59/41)
For the new train dataset we need 65597 images with label=0 and 65597 images with label=1 (ratio 50/50)
Images with label=0 in original train dataset: 103786
Images with label=1 in original train dataset: 70653
Ratio of label=0 vs. label=1 in original train dataset: 59/41


# 3. Create the new datasets

## 3.1 Test dataset

### 3.2.1 Select test data

In [4]:
# Create list with image ids for test dataset with 59/41 ratio of label=0 vs. label=1
test_label_0 = df_labels[df_labels['label']==0].sample(n=test_new_0) # create df for new test dataset with label=0
test_label_0_ids = test_label_0['id'] # list with image ids for label=0 for test dataset
test_label_1 = df_labels[df_labels['label']==1].sample(n=test_new_1) # create df for new test dataset with label=1
test_label_1_ids = test_label_1['id'] # list with image ids for label=1 for test dataset

# Delete used rows from original dataframe so there are no data leakages between test vs. train datasets
df_labels.drop(test_label_0.index, inplace = True)
df_labels.drop(test_label_1.index, inplace = True)

### 3.2.2 Split into 0 and 1 directores

In [9]:
# Define input directory where data is stored
input_dir = '../raw_data/train/'

# Manually create 0 and 1 directories within our test_new folder
output_test = os.mkdir('../raw_data/test_new/') # run only the first time to create the directory
output_test_0 = os.mkdir('../raw_data/test_new/0') # run only the first time to create the directory
output_test_1 = os.mkdir('../raw_data/test_new/1') # run only the first time to create the directory
output_test_0 = '../raw_data/test_new/0'
output_test_1 = '../raw_data/test_new/1'

# Split test data in corresponding directories
for item in test_label_0_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_test_0)
for item in test_label_1_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_test_1)

## 3.2 Train datasets

### 3.2.1 Select train data

In [10]:
min(df_labels.value_counts('label'))

70653

In [15]:
# Create list with image ids for test dataset with 59/41 ratio of label=0 vs. label=1
train_label_0 = df_labels[df_labels['label']==0].sample(n=min(df_labels.value_counts('label'))) # create df for new train dataset with label=0
## min(df_labels.value_counts('label')) = choose the smallest between label=0 and label=1 to ensure that we have enought from each
train_label_0_ids = train_label_0['id'] # list with image ids for label=0 for test dataset
train_label_1 = df_labels[df_labels['label']==1].sample(n=min(df_labels.value_counts('label'))) # create df for new test dataset with label=1
train_label_1_ids = train_label_1['id'] # list with image ids for label=1 for test dataset

### 3.2.2 Split into 0 and 1 directores

In [16]:
# Define input directory where data is stored
input_dir = '../raw_data/train/'

# Manually create 0 and 1 directories within our train_new folder
output_train = os.mkdir('../raw_data/train_new/') # run only the first time to create the directory
output_train_0 = os.mkdir('../raw_data/train_new/0') # run only the first time to create the directory
output_train_1 = os.mkdir('../raw_data/train_new/1') # run only the first time to create the directory
output_train_0 = '../raw_data/train_new/0'
output_train_1 = '../raw_data/train_new/1'

# Split train data in corresponding directories
for item in train_label_0_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_train_0)
for item in train_label_1_ids:
    shutil.move(os.path.join(input_dir, item+'.tif'), output_train_1)

# 4. Additional

#### We can delete out original 'test' and 'train' directories:

In [19]:
shutil.rmtree('../raw_data/test/')
shutil.rmtree('../raw_data/train/')

#### For simplicity we shall rename our train and test directories:

In [20]:
shutil.move('../raw_data/test_new/', '../raw_data/test/')
shutil.move('../raw_data/train_new/', '../raw_data/train/')
shutil.move('../raw_data/train_labels.csv', '../raw_data/labels.csv') # rename the original .csv file for labels

'../raw_data/train/'

#### For convenience we shall also create a '.csv' file in each directory with the image ids for future use:

In [21]:
# Test for label=0
path_test_0 = '../raw_data/test/0'
labels_test_0 = os.listdir(path_test_0)
test_0_df = pd.DataFrame(labels_test_0)
test_0_df.to_csv(os.path.join(path_test_0,'labels_test_0.csv'), index = False)

# Test for label=1
path_test_1 = '../raw_data/test/1'
labels_test_1 = os.listdir(path_test_1)
test_1_df = pd.DataFrame(labels_test_1) 
test_1_df.to_csv(os.path.join(path_test_1,'labels_test_1.csv'), index = False)

# Test all
test_df = pd.concat([test_0_df,test_1_df], ignore_index=True)
test_df.to_csv(os.path.join('../raw_data/test/labels_test.csv'), index = False)

# Train for label=0
path_train_0 = '../raw_data/train/0'
labels_train_0 = os.listdir(path_train_0)
train_0_df = pd.DataFrame(labels_train_0) 
train_0_df.to_csv(os.path.join(path_train_0,'labels_train_0.csv'), index = False)

# Train for label=1
path_train_1 = '../raw_data/train/1'
labels_train_1 = os.listdir(path_train_1)
train_1_df = pd.DataFrame(labels_train_1) 
train_1_df.to_csv(os.path.join(path_train_1,'labels_train_1.csv'), index = False)

# Train all
train_df = pd.concat([train_0_df,train_1_df], ignore_index=True)
train_df.to_csv(os.path.join('../raw_data/train/labels_train.csv'), index = False)

#### We can check if we have the right amount of files in each directory by entering the directory through the terminal and typping:

In [None]:
ls | wc -l

#### In directories '0' and '1' within 'train' we should have:

In [None]:
min(df_labels.value_counts('label'))

#### In 'test' we should have:

In [None]:
test_new