In [9]:
import pandas as pd
import os
import multiprocessing 

import sklearn
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

## Data preparation

In [18]:
# Ground-truth table: one row per image with the melanoma and
# seborrheic_keratosis indicator columns.
labels = pd.read_csv('data/ISIC-2017_Training_Part3_GroundTruth.csv')
# Metadata table: image_id, age_approximate and sex for every image.
meta = pd.read_csv('data/ISIC-2017_Training_Data_metadata.csv')

In [19]:
meta.columns

Index([u'image_id', u'age_approximate', u'sex'], dtype='object')

In [20]:
meta.sex.unique()

array(['female', 'male', 'unknown'], dtype=object)

Fill missing values (recorded as 'unknown' in the metadata) with the mean for age and the mode for sex.

In [32]:
# Turn the 'unknown' placeholder into a real missing value.
# np.nan is used instead of None because `Series.replace(x, None)` is
# version-dependent: older pandas forward-fills instead of inserting NaN.
sex_raw = meta.sex.replace('unknown', np.nan)
# The age column holds numbers mixed with the string 'unknown'; coerce it to
# numeric first so that the mean is computed on numbers, not on strings.
age_raw = pd.to_numeric(meta.age_approximate, errors='coerce')

# `mode()` returns a Series; passing it straight to `fillna` aligns on the
# index and would fill row 0 only, so take the scalar most-frequent value.
sex_filled = sex_raw.fillna(sex_raw.mode().iloc[0])
age_filled = age_raw.fillna(age_raw.mean())

# Encode sex as female -> 0, male -> 1 before the float cast (casting the raw
# strings raises ValueError). `replace` leaves already-numeric values alone,
# so re-running this cell is harmless.
meta.sex = sex_filled.replace({'female': 0, 'male': 1}).astype('float')
meta.age_approximate = age_filled.astype('float')

In [33]:
# Select the two target columns by name: a list of integers is looked up as
# column *labels* by current pandas and raises KeyError on this frame.
# NOTE(review): concat aligns rows on the index, not on image_id -- this
# assumes both CSVs list the images in the same order; confirm.
dataset = pd.concat((meta, labels[['melanoma', 'seborrheic_keratosis']]), axis=1)

In [34]:
labels.head()

Unnamed: 0,image_id,melanoma,seborrheic_keratosis
0,ISIC_0000000,0.0,0.0
1,ISIC_0000001,0.0,0.0
2,ISIC_0000002,1.0,0.0
3,ISIC_0000003,0.0,0.0
4,ISIC_0000004,1.0,0.0


In [35]:
def labeler(row):
    """Collapse the two ground-truth indicators of a row into one class id.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'melanoma'`` and ``'seborrheic_keratosis'``.

    Returns
    -------
    int
        ``2`` when both indicators are 0.0, ``0`` when only ``melanoma`` is
        1.0, and ``1`` for every other combination.
    """
    melanoma_flag = row['melanoma']

    # Neither diagnosis is set.
    if melanoma_flag == 0.0 and row['seborrheic_keratosis'] == 0.0:
        return 2

    # Melanoma only.
    if melanoma_flag == 1.0 and row['seborrheic_keratosis'] == 0.0:
        return 0

    # Everything else falls into the remaining class.
    return 1

# Derive the integer class id for every image by running `labeler` row by row.
labels['class'] = labels.apply(labeler, axis='columns')

In [36]:
labels.head()

Unnamed: 0,image_id,melanoma,seborrheic_keratosis,class
0,ISIC_0000000,0.0,0.0,2
1,ISIC_0000001,0.0,0.0,2
2,ISIC_0000002,1.0,0.0,0
3,ISIC_0000003,0.0,0.0,2
4,ISIC_0000004,1.0,0.0,0


In [37]:
# Select the class column by name (an integer list is treated as column
# labels by current pandas and raises KeyError). The double brackets keep the
# result two-dimensional, i.e. one single-element row per image.
y = np.array(labels[['class']].values)

In [38]:
# One indicator column per class id found in `y`.
mlb = MultiLabelBinarizer()
mlb.fit(y)
y_binarized = mlb.transform(y)

In [39]:
y_binarized

array([[0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       ..., 
       [0, 1, 0],
       [1, 0, 0],
       [0, 1, 0]])

In [40]:
# Select the two feature columns by name (an integer list is treated as
# column labels by current pandas and raises KeyError).
age_sex_labels = np.array(meta[['age_approximate', 'sex']])

In [41]:
age_sex_labels

array([[ 55.,   0.],
       [ 30.,   0.],
       [ 60.,   0.],
       ..., 
       [ 55.,   0.],
       [ 75.,   1.],
       [ 70.,   1.]])

## Image preparation

In [45]:
def images_preprocess(directory='data/ISIC-2017_Training_Data/', savedir='data/melanoma_preprocessed/'):
    """Sort the training images into one sub-folder per class.

    For every row of the module-level ``labels`` DataFrame (columns
    ``image_id`` and ``class``) the file ``<image_id>.jpg`` is read from
    ``directory`` and re-saved under ``savedir`` in the folder of its class.
    A summary with the number of images saved per class is written to
    ``<savedir>/class_weight.txt``.

    Parameters
    ----------
    directory : str
        Folder that contains the source ``.jpg`` images. It is searched
        recursively.
    savedir : str
        Output root. The three class folders are created below it when they
        do not exist yet.

    Raises
    ------
    IOError / OSError
        Propagated from ``plt.imread`` when the image of a labelled row
        cannot be found or read.
    """
    # Class id -> output sub-folder. Any class id other than 0 or 1 is stored
    # in the last folder.
    class_dirs = {
        0: '0_Melanoma/',
        1: '1_Seborreic_ceratosis/',
        2: '2_Nevus/',
    }
    counts = {0: 0, 1: 0, 2: 0}

    # Create each class folder on its own so that a pre-existing but
    # incomplete `savedir` is completed instead of failing on the first save.
    for class_dir in class_dirs.values():
        target_dir = os.path.join(savedir, class_dir)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

    # Walk the source tree once and remember where each .jpg lives, rather
    # than re-walking it for every image. Only .jpg files are indexed, so
    # files with other extensions are never picked up.
    jpg_paths = {}
    for subdir, _, files in os.walk(directory):
        for file_name in files:
            stem, ext = os.path.splitext(file_name)
            if ext.lower() == '.jpg' and stem not in jpg_paths:
                jpg_paths[stem] = os.path.join(subdir, file_name)

    for img, im_class in labels[['image_id', 'class']].values:
        class_id = int(im_class) if im_class in (0, 1) else 2
        target_dir = os.path.join(savedir, class_dirs[class_id])
        image_name = str(img) + '.jpg'

        # Fall back to the expected location so that a missing file still
        # surfaces as the reader's own "file not found" error.
        image_path = jpg_paths.get(str(img), os.path.join(directory, image_name))
        image = plt.imread(image_path)
        plt.imsave(target_dir + image_name, image)

        counts[class_id] += 1
        if counts[class_id] % 50 == 0:
            print(' # '*20)
            print('{} images are saved in {}'.format(counts[class_id], target_dir))

    with open(os.path.join(savedir, "class_weight.txt"), "w") as text_file:
        text_file.write("0_Melanoma: {}, 1_Seborreic_ceratosis:{}, 2_Nevus:{}".format(counts[0], counts[1], counts[2]))

In [46]:
images_preprocess()

 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
50 images are saved in data/melanoma_preprocessed/2_Nevus/
 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
100 images are saved in data/melanoma_preprocessed/2_Nevus/
 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
50 images are saved in data/melanoma_preprocessed/0_Melanoma/
 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
150 images are saved in data/melanoma_preprocessed/2_Nevus/
 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
200 images are saved in data/melanoma_preprocessed/2_Nevus/
 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
250 images are saved in data/melanoma_preprocessed/2_Nevus/
 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
300 images are saved in data/melanoma_preprocessed/2_Nevus/
 #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  # 
100 images are saved in data/melanoma_preprocessed/0_Melanoma/
 #  #  #  #  #  #  #  #  #  