In [1]:
%matplotlib inline

In [2]:
from fastai.vision import *
np.random.seed(11)

In [3]:
train_df = pd.read_csv('data/ISIC_2019_Training_Metadata.csv')

In [4]:
train_labels = pd.read_csv('data/ISIC_2019_Training_GroundTruth.csv')

In [5]:
def diag(row):
    return row[row==1].index[0]

In [6]:
train_labels['diagnosis'] = train_labels.apply(diag,axis=1)

In [7]:
train_labels.drop(columns=['MEL','NV','BCC','AK','BKL','DF','VASC','SCC','UNK'], inplace=True)

In [8]:
train_labels = train_labels.merge(train_df,how='left',on='image')

In [9]:
# this will tell us how many images are associated with each lesion_id
df = train_labels.groupby('lesion_id').count()

# now we filter out lesion_id's that have duplicates
df = df[df['image'] > 1]

df.reset_index(inplace=True)

df.head()

Unnamed: 0,lesion_id,image,diagnosis,age_approx,anatom_site_general,sex
0,BCN_0000001,3,3,3,3,3
1,BCN_0000002,3,3,3,3,3
2,BCN_0000003,2,2,2,2,2
3,BCN_0000004,6,6,6,6,6
4,BCN_0000008,3,3,3,3,3


In [10]:
from sklearn.model_selection import train_test_split
dup_train, dup_val = train_test_split(df, random_state=11)

In [11]:
dup_train_ls = list(dup_train.lesion_id)
dup_val_ls = list(dup_val.lesion_id)

In [12]:
train = train_labels[train_labels['lesion_id'].isin(dup_train_ls)]
val = train_labels[train_labels['lesion_id'].isin(dup_val_ls)]

In [13]:
# this will tell us how many images are associated with each lesion_id
df = train_labels.groupby('lesion_id').count()

# now we filter out lesion_id's that have only one image associated with it
df = df[df['image'] == 1]

df.reset_index(inplace=True)

df.head()

Unnamed: 0,lesion_id,image,diagnosis,age_approx,anatom_site_general,sex
0,BCN_0000013,1,1,1,1,1
1,BCN_0000020,1,1,1,1,1
2,BCN_0000039,1,1,1,1,1
3,BCN_0000053,1,1,1,1,1
4,BCN_0000059,1,1,1,1,1


In [14]:
un_train, un_val = train_test_split(df, random_state=11)

In [15]:
un_train_ls = list(un_train.lesion_id)
un_val_ls = list(un_val.lesion_id)

In [16]:
train_un = train_labels[train_labels['lesion_id'].isin(un_train_ls)]
val_un = train_labels[train_labels['lesion_id'].isin(un_val_ls)]

In [17]:
train = pd.concat([train, train_un],ignore_index=True,sort=False)
val = pd.concat([val, val_un],ignore_index=True,sort=False)

In [18]:
df = train_labels[train_labels['lesion_id'].isnull()]

df.head()

Unnamed: 0,image,diagnosis,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,NV,55.0,anterior torso,,female
1,ISIC_0000001,NV,30.0,anterior torso,,female
2,ISIC_0000002,MEL,60.0,upper extremity,,female
3,ISIC_0000003,NV,30.0,upper extremity,,male
4,ISIC_0000004,MEL,80.0,posterior torso,,male


In [19]:
nul_train, nul_val = train_test_split(df, random_state=11)

In [20]:
train = pd.concat([train,nul_train],ignore_index=True,sort=False)
val = pd.concat([val,nul_val],ignore_index=True,sort=False)

In [21]:
train.shape, val.shape

((18964, 6), (6367, 6))

In [22]:
train.diagnosis.value_counts()

NV      9670
MEL     3343
BCC     2481
BKL     1983
AK       661
SCC      488
VASC     174
DF       164
Name: diagnosis, dtype: int64

In [23]:
val.diagnosis.value_counts()

NV      3205
MEL     1179
BCC      842
BKL      641
AK       206
SCC      140
VASC      79
DF        75
Name: diagnosis, dtype: int64

In [24]:
train.to_csv('data/train.csv',index=False)
val.to_csv('data/val.csv',index=False)

In [34]:
t = pd.read_csv('data/train.csv')

In [35]:
t.tail()

Unnamed: 0,image,diagnosis,age_approx,anatom_site_general,lesion_id,sex
18959,ISIC_0014911_downsampled,NV,25.0,anterior torso,,female
18960,ISIC_0000370,NV,65.0,lower extremity,,male
18961,ISIC_0012212_downsampled,NV,60.0,,,female
18962,ISIC_0011123,NV,65.0,posterior torso,,female
18963,ISIC_0015563_downsampled,NV,45.0,upper extremity,,female


In [36]:
v = pd.read_csv('data/val.csv')

In [37]:
vl = list(v['lesion_id'].unique())

In [38]:
tl = list(t['lesion_id'].unique())

In [39]:
for l in vl:
    if l in tl:
        print(l)

nan


In [40]:
t['is_val'] = 0
v['is_val'] = 1

In [41]:
f = pd.concat([t,v],ignore_index=True, sort=False)

In [42]:
f.to_csv('data/full.csv', index = False)