# 1. Load the necessary libraries and dataframes

In [6]:
import pandas as pd
import numpy as np
import os
import shutil
from os.path import exists

In [7]:
# Root directory of the raw data; every label CSV below is resolved against it.
full_path = '/home/jari/code/GuillaumeRib/project-cancer-detection/raw_data/'


def _read_ids(relative_csv):
    """Read a header-less label CSV under `full_path` into a frame with one 'id' column."""
    csv_location = os.path.join(full_path, relative_csv)
    return pd.read_csv(csv_location, names=['id'])


# One frame for the whole test set, plus one per (split, label) directory.
labels_test = _read_ids('test/labels_test.csv')
labels_test_0 = _read_ids('test/0/labels_test_0.csv')
labels_test_1 = _read_ids('test/1/labels_test_1.csv')
labels_train_0 = _read_ids('train/0/labels_train_0.csv')
labels_train_1 = _read_ids('train/1/labels_train_1.csv')

# 2. Analyze the datasets

In [8]:
# Row count of each label frame (one row per image id).
test_size = len(labels_test)
test_0_size = len(labels_test_0)
test_1_size = len(labels_test_1)
train_0_size = len(labels_train_0)
train_1_size = len(labels_train_1)

# Report every size with its caption, one line per dataset.
for description, n_rows in (
    ('Test dataset size:', test_size),
    ('Test dataset for label=0 size:', test_0_size),
    ('Test dataset for label=1 size:', test_1_size),
    ('Train dataset for label=0 size:', train_0_size),
    ('Train dataset for label=1 size:', train_1_size),
):
    print(description, n_rows)

Test dataset size: 45587
Test dataset for label=0 size: 27123
Test dataset for label=1 size: 18465
Train dataset for label=0 size: 70654
Train dataset for label=1 size: 70654


In [9]:
# Split configuration for the reduced ("small") dataset.
ratio_test_train = 0.21 # fraction of the small dataset assigned to the test set
ratio_labels_train = 0.5 # fraction of label=0 images in the train set (balanced classes)
ratio_labels_test_0 = 0.59 # fraction of label=0 images in the test set (27123 of 45588 in the full test set)

small_dataset_size = 10000 # size of our new small dataset

# Test split: round the label=0 share once and give label=1 the remainder,
# so the two class counts always add up to test_small_size.
test_small_size = int(np.round(small_dataset_size * ratio_test_train,0)) # size of new small test set
test_0_small_size = int(np.round(test_small_size * ratio_labels_test_0,0)) # test images with label=0 in new small test set
test_1_small_size = test_small_size - test_0_small_size # test images with label=1 in new small test set

# Train split: same approach. Rounding both halves independently would lose or
# gain one image whenever train_small_size is odd.
train_small_size = small_dataset_size - test_small_size # size of new small train set
train_0_small_size = int(np.round(train_small_size * ratio_labels_train)) # images with label=0 in new small train set
train_1_small_size = train_small_size - train_0_small_size # images with label=1 in new small train set

print(f'We need {test_0_small_size} images from the test/0 directory for our new test_small dataset')
print(f'We need {test_1_small_size} images from the test/1 directory for our new test_small dataset')
print(f'We need {train_0_small_size} images from the train/0 directory for our new train_small dataset')
print(f'We need {train_1_small_size} images from the train/1 directory for our new train_small dataset')

We need 1239 images from the test/0 directory for our new test_small dataset
We need 861 images from the test/1 directory for our new test_small dataset
We need 3950 images from the train/0 directory for our new train_small dataset
We need 3950 images from the train/1 directory for our new train_small dataset


# 3. Select the data

## 3.1 Create small dataframes

In [10]:
# Draw a fixed-seed random subset of ids from each (split, label) frame.
# The same seed (42) is used for every draw so the selection is repeatable.
test_0_small, test_1_small, train_0_small, train_1_small = (
    source_frame.sample(n=wanted, random_state=42)
    for source_frame, wanted in (
        (labels_test_0, test_0_small_size),
        (labels_test_1, test_1_small_size),
        (labels_train_0, train_0_small_size),
        (labels_train_1, train_1_small_size),
    )
)

## 3.2 Copy selected images to new (small) directories

### 3.2.1 Test small:

In [11]:
# The 'id' column holds the file name of each sampled test image.
test_0_small_ids, test_1_small_ids = test_0_small['id'], test_1_small['id']

In [13]:
# Source directories holding the full test images, one per label.
input_test_0 = os.path.join(full_path,'test/0')
input_test_1 = os.path.join(full_path,'test/1')

# Destination directories for the sampled test subset.
output_test_0 = os.path.join(full_path,'test/test_small/0_small')
output_test_1 = os.path.join(full_path,'test/test_small/1_small')

# Create exactly the directories we copy into. makedirs also creates the
# 'test_small' parent, and exist_ok=True makes the cell safe to re-run.
os.makedirs(output_test_0, exist_ok=True)
os.makedirs(output_test_1, exist_ok=True)

# Copy each sampled image (looked up by id in its label directory) to the small set.
for item in test_0_small_ids:
    shutil.copy(os.path.join(input_test_0,item),output_test_0)
for item in test_1_small_ids:
    shutil.copy(os.path.join(input_test_1,item),output_test_1)

### 3.2.2 Train small:

In [14]:
# The 'id' column holds the file name of each sampled train image.
train_0_small_ids, train_1_small_ids = train_0_small['id'], train_1_small['id']

In [15]:
# Source directories holding the full train images, one per label.
input_train_0 = os.path.join(full_path,'train/0')
input_train_1 = os.path.join(full_path,'train/1')

# Destination directories for the sampled train subset.
output_train_0 = os.path.join(full_path,'train/train_small/0_small')
output_train_1 = os.path.join(full_path,'train/train_small/1_small')

# Create exactly the directories we copy into. makedirs also creates the
# 'train_small' parent, and exist_ok=True makes the cell safe to re-run.
os.makedirs(output_train_0, exist_ok=True)
os.makedirs(output_train_1, exist_ok=True)

# Copy each sampled image (looked up by id in its label directory) to the small set.
for item in train_0_small_ids:
    shutil.copy(os.path.join(input_train_0,item),output_train_0)
for item in train_1_small_ids:
    shutil.copy(os.path.join(input_train_1,item),output_train_1)

# 4. Additional

#### For convenience we shall also create a '.csv' file in each directory with the image ids for future use:

In [16]:
def _write_label_csv(directory, csv_name):
    """List the image files in `directory` and save their names as a CSV there.

    Entries ending in '.csv' are skipped so that a label file written by a
    previous run of this cell is not recorded as if it were an image.
    Names are sorted because os.listdir returns them in arbitrary order.

    Parameters
    ----------
    directory : str
        Directory whose entries are listed and where the CSV is written.
    csv_name : str
        File name of the CSV to create inside `directory`.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label 0) with one row per listed file.
        The CSV is written with the default header row and no index.
    """
    image_ids = sorted(
        entry for entry in os.listdir(directory) if not entry.endswith('.csv')
    )
    ids_df = pd.DataFrame(image_ids)
    ids_df.to_csv(os.path.join(directory, csv_name), index = False)
    return ids_df


# The directories are built from full_path, i.e. the same locations the images
# were copied to. The label DataFrames loaded at the top of the notebook
# (labels_test_0, ...) are deliberately not reassigned here, so earlier cells
# can still be re-run.

# Test for label=0
path_test_0 = os.path.join(full_path,'test/test_small/0_small')
test_0_df = _write_label_csv(path_test_0, 'labels_test_0_small.csv')

# Test for label=1
path_test_1 = os.path.join(full_path,'test/test_small/1_small')
test_1_df = _write_label_csv(path_test_1, 'labels_test_1_small.csv')

# Test all
test_df = pd.concat([test_0_df,test_1_df], ignore_index=True)
test_df.to_csv(os.path.join(full_path,'test/test_small/labels_test_small.csv'), index = False)

# Train for label=0
path_train_0 = os.path.join(full_path,'train/train_small/0_small')
train_0_df = _write_label_csv(path_train_0, 'labels_train_0_small.csv')

# Train for label=1
path_train_1 = os.path.join(full_path,'train/train_small/1_small')
train_1_df = _write_label_csv(path_train_1, 'labels_train_1_small.csv')

# Train all
train_df = pd.concat([train_0_df,train_1_df], ignore_index=True)
train_df.to_csv(os.path.join(full_path,'train/train_small/labels_train_small.csv'), index = False)

#### We can check that each directory holds the right number of files by entering the directory in a terminal and typing:

ls | wc -l

#### In directory 'test_small/0_small' we should have this many files:

In [17]:
test_0_small_size
# `ls | wc -l` will report one more entry than this value, because the directory also holds the labels .csv file

1239

#### In directory 'test_small/1_small' we should have this many files:

In [18]:
test_1_small_size
# `ls | wc -l` will report one more entry than this value, because the directory also holds the labels .csv file

861

#### In directory 'train_small/0_small' we should have this many files:

In [19]:
train_0_small_size
# `ls | wc -l` will report one more entry than this value, because the directory also holds the labels .csv file

3950

#### In directory 'train_small/1_small' we should have this many files:

In [20]:
train_1_small_size
# `ls | wc -l` will report one more entry than this value, because the directory also holds the labels .csv file

3950

In [31]:
# Leakage check for label=1: flag every small-test file name that also appears
# in the small-train listing, then count the True/False flags.
test_1_in_train = test_1_df[0].isin(train_1_df[0])
pd.DataFrame(test_1_in_train).value_counts()

False    861
dtype: int64