# Preprocessing the original Dataset

# Importing required packages

In [1]:
import os
from PIL import Image
import pandas as pd
from shutil import copyfile

## Creating a validation set from the training set using the Holdout Method

The vast majority of the dataset is made up of patient instances with a benign melanoma (98.24% of the dataset).
- 32542 patient instances with a benign melanoma
- 584 patient instances with a malignant melanoma

So that our model is not biased towards classifying melanomas as benign, we will use only 584 of the 32542 benign instances (giving a 50/50 split between benign and malignant melanomas).

Since we have a relatively small dataset, we will use a 60/20/20 split for the training, validation, and testing sets, respectively.

In [2]:
# Load the metadata table (one row per image, including the binary "target"
# label) from the CSV file in the current working directory.
TRAIN_CSV_PATH = "train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Per-class split sizes for the 60/20/20 train/validation/test holdout.
# They are derived from the 584 malignant instances so that each class
# contributes the same number of rows to every split (350 + 117 + 117 = 584).
numTrainInstances = round(584 * 0.6)          # 350.4 -> 350
numValAndTestingInstances = round(584 * 0.2)  # 116.8 -> 117

In [4]:
# Keep only the benign rows (target == 0), in the order they appear in train_df.
benign_rows = train_df.loc[train_df["target"] == 0]

# Positional end offsets of the validation and test slices.
benign_val_stop = numTrainInstances + numValAndTestingInstances
benign_test_stop = benign_val_stop + numValAndTestingInstances

# Three consecutive, non-overlapping positional slices of the benign rows.
# NOTE(review): rows are taken in file order without shuffling - confirm this
# is intended.
benign_train = benign_rows.iloc[:numTrainInstances]
benign_val = benign_rows.iloc[numTrainInstances:benign_val_stop]
benign_test = benign_rows.iloc[benign_val_stop:benign_test_stop]

In [5]:
# Preview the benign training split (rows of train_df with target == 0).
benign_train

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000
...,...,...,...,...,...,...,...,...,...,...,...
348,ISIC_0218747,IP_3200260,female,30.0,lower extremity,unknown,benign,0,8,6000,4000
349,ISIC_0219450,IP_8298925,female,50.0,torso,unknown,benign,0,11,6000,4000
350,ISIC_0219493,IP_7770500,male,60.0,upper extremity,unknown,benign,0,14,6000,4000
351,ISIC_0219948,IP_0634841,male,55.0,upper extremity,unknown,benign,0,9,6000,4000


In [6]:
# Preview the benign validation split (the slice that follows benign_train).
benign_val

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
353,ISIC_0220649,IP_6331820,female,55.0,torso,unknown,benign,0,0,6000,4000
354,ISIC_0220926,IP_8848835,female,55.0,torso,unknown,benign,0,10,6000,4000
355,ISIC_0221351,IP_7466144,female,35.0,upper extremity,unknown,benign,0,10,640,480
356,ISIC_0221590,IP_4023055,male,60.0,torso,nevus,benign,0,12,1872,1053
357,ISIC_0222117,IP_0989858,female,30.0,torso,unknown,benign,0,1,2592,1936
...,...,...,...,...,...,...,...,...,...,...,...
467,ISIC_0248936,IP_1214216,female,60.0,torso,unknown,benign,0,0,6000,4000
468,ISIC_0249212,IP_3057277,male,45.0,,unknown,benign,0,7,6000,4000
469,ISIC_0249269,IP_2436697,female,45.0,lower extremity,unknown,benign,0,14,6000,4000
470,ISIC_0249560,IP_6174545,male,25.0,torso,unknown,benign,0,12,4288,2848


In [7]:
# Preview the benign test split (the slice that follows benign_val).
benign_test

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
472,ISIC_0250064,IP_4921034,female,75.0,lower extremity,nevus,benign,0,0,1872,1053
473,ISIC_0250069,IP_3549978,female,40.0,head/neck,unknown,benign,0,5,6000,4000
474,ISIC_0250455,IP_7748972,male,45.0,torso,unknown,benign,0,13,4288,2848
476,ISIC_0250842,IP_8011614,female,35.0,torso,unknown,benign,0,11,4288,2848
477,ISIC_0251149,IP_8442277,female,50.0,torso,unknown,benign,0,11,4288,2848
...,...,...,...,...,...,...,...,...,...,...,...
590,ISIC_0282166,IP_9583707,male,20.0,torso,unknown,benign,0,5,3264,2448
591,ISIC_0282281,IP_3690477,male,60.0,torso,unknown,benign,0,3,6000,4000
592,ISIC_0282321,IP_1389115,female,60.0,torso,unknown,benign,0,14,4288,2848
593,ISIC_0282422,IP_9416054,male,55.0,torso,unknown,benign,0,6,6000,4000


In [8]:
# Keep only the malignant rows (target == 1), in the order they appear in train_df.
malignant_rows = train_df.loc[train_df["target"] == 1]

# Positional end offsets of the validation and test slices.
malignant_val_stop = numTrainInstances + numValAndTestingInstances
malignant_test_stop = malignant_val_stop + numValAndTestingInstances

# Three consecutive, non-overlapping positional slices of the malignant rows.
# NOTE(review): rows are taken in file order without shuffling - confirm this
# is intended.
malignant_train = malignant_rows.iloc[:numTrainInstances]
malignant_val = malignant_rows.iloc[numTrainInstances:malignant_val_stop]
malignant_test = malignant_rows.iloc[malignant_val_stop:malignant_test_stop]

In [9]:
# Preview the malignant training split (rows of train_df with target == 1).
malignant_train

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
91,ISIC_0149568,IP_0962375,female,55.0,upper extremity,melanoma,malignant,1,0,6000,4000
235,ISIC_0188432,IP_0135517,female,50.0,upper extremity,melanoma,malignant,1,5,3264,2448
314,ISIC_0207268,IP_7735373,male,55.0,torso,melanoma,malignant,1,3,6000,4000
399,ISIC_0232101,IP_8349964,male,65.0,torso,melanoma,malignant,1,9,4288,2848
459,ISIC_0247330,IP_3232631,female,65.0,lower extremity,melanoma,malignant,1,10,6000,4000
...,...,...,...,...,...,...,...,...,...,...,...
20547,ISIC_6251753,IP_1273286,male,50.0,torso,melanoma,malignant,1,2,1872,1053
20560,ISIC_6255113,IP_1039004,female,75.0,upper extremity,melanoma,malignant,1,9,4288,2848
20612,ISIC_6269166,IP_0059113,female,75.0,upper extremity,melanoma,malignant,1,9,1872,1053
20643,ISIC_6278273,IP_4617831,male,65.0,torso,melanoma,malignant,1,12,3888,2592


In [10]:
# Preview the malignant validation split (the slice that follows malignant_train).
malignant_val

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
20736,ISIC_6303107,IP_6890425,female,45.0,torso,melanoma,malignant,1,0,1872,1053
20775,ISIC_6313487,IP_6387073,male,70.0,torso,melanoma,malignant,1,14,5184,3456
20904,ISIC_6348638,IP_3623410,male,60.0,torso,melanoma,malignant,1,3,4288,2848
20915,ISIC_6352565,IP_4130585,female,65.0,lower extremity,melanoma,malignant,1,1,6000,4000
20930,ISIC_6355353,IP_8101933,male,60.0,lower extremity,melanoma,malignant,1,7,640,480
...,...,...,...,...,...,...,...,...,...,...,...
26226,ISIC_7960869,IP_9455054,male,45.0,torso,melanoma,malignant,1,7,1872,1053
26230,ISIC_7961365,IP_7785592,male,45.0,lower extremity,melanoma,malignant,1,8,1872,1053
26269,ISIC_7973598,IP_8124898,male,35.0,torso,melanoma,malignant,1,7,4288,2848
26374,ISIC_8002875,IP_6833889,male,55.0,head/neck,melanoma,malignant,1,12,3264,2448


In [11]:
# Preview the malignant test split (the slice that follows malignant_val).
malignant_test

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
26398,ISIC_8009536,IP_0957064,male,65.0,torso,melanoma,malignant,1,8,4032,3024
26406,ISIC_8011058,IP_3662918,female,30.0,lower extremity,melanoma,malignant,1,0,5184,3456
26474,ISIC_8027899,IP_7785592,male,30.0,lower extremity,melanoma,malignant,1,8,640,480
26522,ISIC_8040024,IP_8171851,female,55.0,torso,melanoma,malignant,1,5,1872,1053
26567,ISIC_8052769,IP_6630831,male,50.0,upper extremity,melanoma,malignant,1,14,3264,2448
...,...,...,...,...,...,...,...,...,...,...,...
32969,ISIC_9955163,IP_7507212,male,55.0,upper extremity,melanoma,malignant,1,5,640,480
33000,ISIC_9963177,IP_1165806,male,70.0,torso,melanoma,malignant,1,13,1872,1053
33014,ISIC_9967383,IP_7887363,male,60.0,upper extremity,melanoma,malignant,1,-1,6000,4000
33050,ISIC_9978107,IP_2860540,male,65.0,lower extremity,melanoma,malignant,1,6,4288,2848


In [12]:
# Preview the image_name column of the benign training split; these values are
# the file stems used to locate each .jpg when the images are resized.
benign_train["image_name"]

0      ISIC_2637011
1      ISIC_0015719
2      ISIC_0052212
3      ISIC_0068279
4      ISIC_0074268
           ...     
348    ISIC_0218747
349    ISIC_0219450
350    ISIC_0219493
351    ISIC_0219948
352    ISIC_0220461
Name: image_name, Length: 350, dtype: object

## Downscaling the images from 256x256 pixels to 224x224 pixels
### Separating the benign and malignant melanomas into different directories

In [15]:
# Commented-out copy-only variant of the export loop (copies each image without resizing):
#     for image_name in df["image_name"]:
#         current_path = f"./original-dataset/train/{image_name}.jpg"
#         new_path = f"./{dir1}/{dir2}/{image_name}.jpg"
#         copyfile(current_path, new_path)

In [18]:
def changeImgDirectoryAndResize(df, isTrain, isBenign, split=None, size=(224, 224)):
    """Resize every image listed in ``df`` and save it in a split/class directory.

    Each image is read from ``./original-dataset/train/<image_name>.jpg``,
    resized to ``size`` and written to ``./<split>/<class>/<image_name>.jpg``.
    The destination directory is created if it does not exist yet.

    Parameters
    ----------
    df : DataFrame or mapping
        Must provide an ``"image_name"`` column/key holding the file stems.
    isTrain : bool
        ``True`` writes to ``train``, ``False`` writes to ``validation``.
        Ignored when ``split`` is given.
    isBenign : bool
        ``True`` writes to the ``benign`` sub-directory, ``False`` to
        ``malignant``.
    split : str, optional
        Explicit name of the top-level destination directory (for example
        ``"test"``). Defaults to ``None``, which keeps the ``isTrain``-based
        choice.
    size : tuple of int, optional
        Target ``(width, height)`` in pixels. Defaults to ``(224, 224)``.

    Returns
    -------
    None
    """
    if split is not None:
        dir1 = split
    else:
        dir1 = "train" if isTrain else "validation"
    dir2 = "benign" if isBenign else "malignant"

    # Saving fails if the destination directory is missing, so create it up front.
    out_dir = f"./{dir1}/{dir2}"
    os.makedirs(out_dir, exist_ok=True)

    for image_name in df["image_name"]:
        # The context manager closes the source file once the pixels are resized.
        with Image.open(f"./original-dataset/train/{image_name}.jpg") as img:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            resized = img.resize(size, Image.LANCZOS)
        resized.save(f"{out_dir}/{image_name}.jpg")

In [19]:
# changeImgDirectoryAndResize(benign_train, True, True)
# changeImgDirectoryAndResize(malignant_train, True, False)
# changeImgDirectoryAndResize(benign_val, False, True)
# changeImgDirectoryAndResize(malignant_val, False, False)