# Prepare the dataframe to download images into dataset folders

## This notebook contains the code to convert 'HAM10000_metadata.csv' and 'metadata_additional.csv' into one dataframe which contains all imformation to prepare the dataset for modelling.

In [None]:
#Import dependancies
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense, Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.optimizers import Adam

In [3]:
#Display maximum rows of the dataframe - optional. 
pd.set_option('display.max_rows', 150)

### Prepare main dataset

In [5]:
#Load the metadata for the main dataset
csv_path = "C:/Users/kravc/Final_Project/Data/Skin_classifier/HAM10000_metadata.csv"
skin_df = pd.read_csv(csv_path)
skin_df.sort_values(by="image_id").head(10)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
4349,HAM_0000550,ISIC_0024306,nv,follow_up,45.0,male,trunk
4263,HAM_0003577,ISIC_0024307,nv,follow_up,50.0,male,lower extremity
4217,HAM_0001477,ISIC_0024308,nv,follow_up,55.0,female,trunk
3587,HAM_0000484,ISIC_0024309,nv,follow_up,40.0,male,trunk
1451,HAM_0003350,ISIC_0024310,mel,histo,60.0,male,chest
4497,HAM_0000981,ISIC_0024311,nv,follow_up,75.0,female,back
700,HAM_0001359,ISIC_0024312,bkl,histo,75.0,male,lower extremity
2130,HAM_0002869,ISIC_0024313,mel,histo,50.0,female,back
6803,HAM_0002198,ISIC_0024314,nv,histo,75.0,male,lower extremity
1327,HAM_0007538,ISIC_0024315,mel,histo,55.0,male,trunk


In [6]:
#Add column to indicate leasions which have multiple corresponding images.
#This will help to make sure that all duplicated images go to the one set.

def add_duplicate_column(df):
    """
    To the existing dataframe add the column with value '0' for skin lesions that have 
    only one corresponding image, or '1' for skin lesions that have two or more corresponding images
    
    Input: DataFrame
    Output: DataFrame with new column "duplicate"
    
    """
    duplicates=df[df.duplicated(subset=['lesion_id'], keep=False)==True]
    duplicates_images_list=duplicates['image_id'].values.tolist()
    duplicate_column=[]
    for i in range(len(df['image_id'])):
        if df['image_id'][i] in duplicates_images_list:
            dup=1
        else:
            dup=0
        duplicate_column.append(dup)
    df['duplicate']=duplicate_column
    return df

In [9]:
#Implement the add_duplicate_column function to the initial dataframe
skin_df=add_duplicate_column(skin_df)

In [11]:
#Add column to indicate a class of lesion in a numeric form.

#Classes are present in the column 'dx' and represent the following lesion variants:

#    "akiec" : "actinic keratosis", very early form of skin cancer 
#    "bcc" : "basal cell carcinoma" ,  basal-cell cancer or white skin cancer
#    "bkl" : "benign keratosis-like lesions",  non-cancerous skin tumour
#    "df" : "dermatofibroma",  non-cancerous rounded bumps 
#    "mel" : "melanoma",  black skin cancer
#    "nv" : "melanocytic nevi",  mole non-cancerous
#    "vasc" : "vascular lesions",  non-cancerous skin condition


def add_class_column(df):
    """
    To the exesting dataframe add the column with numeric labels for the class.
    
    Input: DataFrame
    Output: DataFrame with new column "class"
    
    """
    target_dict={k: v for v, k in enumerate(np.unique(df.dx))}
    df['class'] = df['dx'].map(target_dict)
    return df

In [12]:
#Implement the add_duplicate_column function to the initial dataframe
skin_df=add_class_column(skin_df)

### Prepare additional dataset

In [13]:
#Load the metadata file for the additional dataset
add_data=pd.read_csv('C:/Users/kravc/Final_Project/Data/Skin_classifier/metadata_additional.csv')

In [15]:
#See what columns the new data has and what should be cleaned before concatenating dataframes
add_data.columns

Index(['patient_id', 'lesion_id', 'smoke', 'drink', 'background_father',
       'background_mother', 'age', 'pesticide', 'gender',
       'skin_cancer_history', 'cancer_history', 'has_piped_water',
       'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
       'diameter_2', 'diagnostic', 'itch', 'grew', 'hurt', 'changed', 'bleed',
       'elevation', 'img_id', 'biopsed'],
      dtype='object')

In [16]:
#Clean the dataframe with additional data to prepare it for concatnating. As we need to enrich 
#the existing dataset with images of cancerous lesions we will leave only cancerous classes.

def clean_additional_data(df):
    """
    In the dataframe leave only images of cancerous classes, remove unwanted columns, rename the 
    column to match the initial dataframee, add 'class' column, drop all lesions with two or more
    corresponding images, add 'duplicate' columns.
    
    Input: DataFrame
    Output: DataFrame prepared to be concatenated with main dataframe ('skin_df')
    
    """
    
    #leave only cancerous lesions as we want to enlarge them
    df=df[(df.diagnostic=='ACK')|(df.diagnostic=='BCC')|(df.diagnostic=='MEL')]
    #Remove unwanted columns
    df=df.loc[:, ['lesion_id', 'age', 'gender', 'region', 'diagnostic', 'img_id']]
    #Rename the columns to match initial dataframe
    df.columns=['lesion_id', 'age', 'sex', 'localization', 'class', 'image_id']
    #Add 'class' column
    target_dict_add={'ACK': 0, 'BCC': 1, 'MEL': 4}
    df['class'] = df['class'].map(target_dict_add)
    #Drop duplicated lesions
    df=df.drop_duplicates(subset=['lesion_id'], keep='first')
    #Add 'duplicates' column and set all values to 0
    df['duplicate']=0
    return df

In [18]:
#Clean the dataframe for additional dataset
add_data=clean_additional_data(add_data)

### Create the dataframe for the whole dataset

In [19]:
#Concatenate two dataframes into one to represent whole image set
all_images=pd.concat([skin_df, add_data], ignore_index=True)

In [17]:
#Check class imbalance
all_images.groupby(['duplicate','class'], as_index=False).count()

Unnamed: 0,duplicate,class,lesion_id,image_id,dx,dx_type,age,sex,localization
0,0,0,720,720,151,151,720,370,720
1,0,1,781,781,175,175,781,781,781
2,0,2,440,440,440,440,432,440,440
3,0,3,39,39,39,39,39,39,39
4,0,4,264,264,230,230,264,264,264
5,0,5,4415,4415,4415,4415,4376,4415,4415
6,0,6,64,64,64,64,64,64,64
7,1,0,176,176,176,176,176,176,176
8,1,1,339,339,339,339,339,339,339
9,1,2,659,659,659,659,657,659,659


In [21]:
#Add column to indicate whether the class represents cancerous or non-cancerous lesion
#    "akiec" : cancerous
#    "bcc" : cancerous
#    "bkl" : non-cancerous
#    "df" : non-cancerous
#    "mel" : cancerous
#    "nv" : non-cancerous
#    "vasc" : non-cancerous
#}

def add_cancerous_column(df):
    """
    To the dataframe add column with value '0' for non-cancerous class and '1' for cancerous class
    
    Input: DataFrame
    Output: DataFrame with new column "cancerous"
    
    """
    canc_dict={0: 1, 1: 1, 2: 0, 3: 0, 4: 1, 5: 0, 6: 0}
    df['cancerous'] = df['class'].map(canc_dict)
    return df

In [22]:
#Add 'cancerous' column to the working dataframe
all_images=add_cancerous_column(all_images)

In [23]:
#Check class imbalance 
all_images.groupby(['cancerous'], as_index=False).count()

Unnamed: 0,cancerous,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicate,class
0,0,8061,8061,8061,8061,8006,8061,8061,8061,8061
1,1,3163,3163,1954,1954,3161,2813,3163,3163,3163


In [24]:
#To balance data remove all lesions with augmented images from the largest class by cancerous
all_images.drop(all_images[(all_images['duplicate']==1) & (all_images['cancerous']==0)].index, inplace=True)

In [25]:
all_images.groupby(['cancerous'], as_index=False).count()

Unnamed: 0,cancerous,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicate,class
0,0,4958,4958,4958,4958,4911,4958,4958,4958,4958
1,1,3163,3163,1954,1954,3161,2813,3163,3163,3163


In [26]:
all_images.reset_index(drop=True, inplace=True)

In [27]:
#Check if all image_ids are unique
all_images['image_id'].nunique()==all_images.shape[0]

True

### Divide the dataset into training, validation and test set

Now let us divide each class into training (75%), validation (15%) and test (10%) set. We will create a columns which will specify the set for the image.
First, put 75 % of every class to train set making sure that all augmented lesions are in training set.

In [28]:
#Duplicated images that will go to train set
classes=[0,1]
images_to_train=[]
for cl in classes:
    for image in all_images[all_images['cancerous']==cl]['image_id'].tolist():
        if (all_images[all_images['image_id']==image]['duplicate']==1).values==True:
            images_to_train.append(image)

In [31]:
all_images.groupby(['cancerous'], as_index=False).count()

Unnamed: 0,cancerous,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicate,class
0,0,4958,4958,4958,4958,4911,4958,4958,4958,4958
1,1,3163,3163,1954,1954,3161,2813,3163,3163,3163


In [32]:
#Add column to indicate to which set (train, val or test) the image should go
all_images['set']=0

In [34]:
#For all lesions with mulpiple images update their 'set' value in dataframe
for i in range(len(all_images['image_id'])):
    if (all_images['image_id'][i] in images_to_train)==True:
        all_images['set'][i]='train'      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_images['set'][i]='train'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [35]:
all_images.groupby(['cancerous', 'set'], as_index=False).count()

Unnamed: 0,cancerous,set,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicate,class
0,0,0,4958,4958,4958,4958,4911,4958,4958,4958,4958
1,1,0,1765,1765,556,556,1765,1415,1765,1765,1765
2,1,train,1398,1398,1398,1398,1396,1398,1398,1398,1398


In [36]:
#Count how many random images from original (without augmented duplicates) set we need to add in order to put 75% of each class to train set. 

In [37]:
import random

In [38]:
#Get the list of images of the class that would supplement train set so that train set includes 75% of images of the class 

def get_train_supplement(cl):
    """
    Creates the list of image_ids that or the class that would supplement training set for this class
    
    Input: class label from 0 to 5
    Output: the list if image_ids from the class that will be added to the train set
    
    """
   
    train_supplement=[]
    #define how many images we need in order to complete the training set for the class to 75% of all class values
    number=0.75*(all_images[all_images['cancerous']==cl].nunique()['image_id']) - all_images[(all_images['cancerous']==cl) & (all_images['set']=='train')].count()['image_id']
    number=round(number)
    if number>0:
        #randomly choose the desired number of images from non duplicated subset od this class
        a = all_images[(all_images['cancerous']==cl) & (all_images['set']==0)]['image_id'].tolist()
        random.shuffle(a)
        train_supplement=a[: number]
        return  train_supplement, number
    else:
        return "The trainig set for this class is already complete", number
    

In [39]:
#Get the lists of train set supplements for each class: (0=non_cancerous and 1=cancerous)
train_supplement_class_0=get_train_supplement(0)
train_supplement_class_1=get_train_supplement(1)


In [40]:
print(train_supplement_class_0[1])
print(train_supplement_class_1[1])


3718
974


In [41]:
#Assign 'train' value to all image_id from the supplement lists
for i in range(len(all_images['image_id'])):
    if  (all_images['image_id'][i] in train_supplement_class_0[0])==True:
            all_images['set'][i]='train' 
    if  (all_images['image_id'][i] in train_supplement_class_1[0])==True:
            all_images['set'][i]='train'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_images['set'][i]='train'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_images['set'][i]='train'


In [42]:
#Check the length of overal training set
len(all_images[all_images['set']=='train'])

6090

In [43]:
#Create validation set (15% of all values) for each class
def get_validation_list(cl):
    
    """
    Creates the list of image_ids that or the class that would supplement validation set for this class
    
    Input: class label from 0 to 5
    Output: the list if image_ids from the class that will be added to the validation set
    
    """
    
    validation_list=[]
    #define how many images we need in order to complete the training set for the class to 75% of all class values
    number=0.6*(all_images[(all_images['cancerous']==cl) & (all_images['set']==0)].count()['image_id'])
    number=round(number)
    if number>0:
        #randomly choose the desired number of images from non duplicated subset od this class
        a = all_images[(all_images['cancerous']==cl) & (all_images['set']==0)]['image_id'].tolist()
        random.shuffle(a)
        validation_list=a[: number]
        return  validation_list, number
    else:
        return  "Please check set=0 for this class", number

In [44]:
validation_class_0=get_validation_list(0)
validation_class_1=get_validation_list(1)

In [45]:
print(validation_class_0[1])
print(validation_class_1[1])

744
475


In [46]:
#Assign 'val' value to all image_id from the validation lists
for i in range(len(all_images['image_id'])):
    if  (all_images['image_id'][i] in validation_class_0[0])==True:
            all_images['set'][i]='val' 
    if  (all_images['image_id'][i] in validation_class_1[0])==True:
            all_images['set'][i]='val'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_images['set'][i]='val'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_images['set'][i]='val'


In [47]:
#Assign set='test' to all remaining images 
all_images['set']=all_images['set'].replace(0, 'test')

In [48]:
#Now we have all images put into train (75%), val(15%), test (10%) sets with stratification by class.
all_images.groupby(['set'], as_index=False).count()

Unnamed: 0,set,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicate,class,cancerous
0,test,812,812,587,587,807,743,812,812,812,812
1,train,6090,6090,5436,5436,6054,5907,6090,6090,6090,6090
2,val,1219,1219,889,889,1211,1121,1219,1219,1219,1219


In [45]:
#Save the dataframe to csv
all_images.to_csv('2classes_df.csv', index=False)