In [1]:
from fastai.vision import *
from sklearn.model_selection import train_test_split
np.random.seed(11)

In [2]:
#Assign path to directory you have your data
path = Path('data')

In [3]:
train_df = pd.read_csv(path/'ISIC_2019_Training_Metadata.csv')
train_labels = pd.read_csv(path/'ISIC_2019_Training_GroundTruth.csv')

In [4]:
train_df.head()

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,55.0,anterior torso,,female
1,ISIC_0000001,30.0,anterior torso,,female
2,ISIC_0000002,60.0,upper extremity,,female
3,ISIC_0000003,30.0,upper extremity,,male
4,ISIC_0000004,80.0,posterior torso,,male


In [5]:
train_labels.head()

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#Convert One-hot encoded diagnosis to single column
def diag(row):
    return row[row==1].index[0]
train_labels['diagnosis'] = train_labels.apply(diag,axis=1)

#Drop one-hot columns 
train_labels.drop(columns=['MEL','NV','BCC','AK','BKL','DF','VASC','SCC','UNK'], inplace=True)

#Add metadata to train_labels onto train_df
train_labels = train_labels.merge(train_df,how='left',on='image')

In [7]:
train_labels.head()

Unnamed: 0,image,diagnosis,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,NV,55.0,anterior torso,,female
1,ISIC_0000001,NV,30.0,anterior torso,,female
2,ISIC_0000002,MEL,60.0,upper extremity,,female
3,ISIC_0000003,NV,30.0,upper extremity,,male
4,ISIC_0000004,MEL,80.0,posterior torso,,male


In [8]:
#filter out lesion_id's that have duplicates
df = train_labels.groupby('lesion_id').count()
df = df[df['image'] > 1]
df.reset_index(inplace=True)

In [9]:
#Split the lensions by lesions_id to train/val set
dup_train, dup_val = train_test_split(df, random_state=11)
dup_train_ls = list(dup_train.lesion_id)
dup_val_ls = list(dup_val.lesion_id)

#Create sep train/val dfs
train_df = train_labels[train_labels['lesion_id'].isin(dup_train_ls)]
val_df = train_labels[train_labels['lesion_id'].isin(dup_val_ls)]

In [10]:
#filter lesions without duplicate lesion_ids
df = train_labels.groupby('lesion_id').count()
df = df[df['image'] == 1]
df.reset_index(inplace=True)

In [11]:
#Get the lesion ids of all lesions that do not contain duplicates
indv_train, indv_val = train_test_split(df, random_state=11)
indv_train_ls = list(indv_train.lesion_id)
indv_val_ls = list(indv_val.lesion_id)

train_indv = train_labels[train_labels['lesion_id'].isin(indv_train_ls)]
val_indv = train_labels[train_labels['lesion_id'].isin(indv_val_ls)]

In [12]:
#Add those lesions to existing train/val dfs
train_df = pd.concat([train_df, train_indv],ignore_index=True,sort=False)
val_df = pd.concat([val_df, val_indv],ignore_index=True,sort=False)

In [13]:
#Lesions will null ID
df = train_labels[train_labels['lesion_id'].isnull()]

#Repeat filter process for lesions with null id
nul_train, nul_val = train_test_split(df, random_state=11)
train_df = pd.concat([train_df,nul_train],ignore_index=True,sort=False)
val_df = pd.concat([val_df,nul_val],ignore_index=True,sort=False)

In [14]:
#Check everything adds up
train_df.shape, val_df.shape

((18964, 6), (6367, 6))

In [15]:
len(train_df) + len(val_df) == len(train_labels)

True

In [16]:
#Quick view at lesions distributions in train/val
train_df.diagnosis.value_counts()

NV      9670
MEL     3343
BCC     2481
BKL     1983
AK       661
SCC      488
VASC     174
DF       164
Name: diagnosis, dtype: int64

In [17]:
val_df.diagnosis.value_counts()

NV      3205
MEL     1179
BCC      842
BKL      641
AK       206
SCC      140
VASC      79
DF        75
Name: diagnosis, dtype: int64

In [18]:
#Double check no lesions of same id are in train and val
tl = list(train_df['lesion_id'].unique())
vl = list(val_df['lesion_id'].unique())

for lesion in vl:
    if lesion in tl:
        print(lesion)

nan


In [19]:
#Save
train_df.to_csv(path/'train.csv',index=False)
val_df.to_csv(path/'val.csv',index=False)

In [20]:
#Save a full df for later convenience
train_df['is_val'] = 0
val_df['is_val'] = 1
full_df = pd.concat([train_df,val_df],ignore_index=True, sort=False)
full_df.to_csv(path/'full.csv',index=False)