In [1]:
import pandas as pd
import numpy as np

In [135]:
# Seed NumPy's global generator so the random imputation performed in the later
# cells (np.random.random / np.random.randint) fills the same values on every
# Restart & Run All. The seed value itself is arbitrary.
np.random.seed(0)

# Load the training metadata and preview the first rows.
train_data = pd.read_csv('./metadataTrain.csv')
train_data.head()

Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0000000,2,female,55.0,anterior torso
1,ISIC_0000001,2,female,30.0,anterior torso
2,ISIC_0000002,1,female,60.0,upper extremity
3,ISIC_0000003,2,male,30.0,upper extremity
4,ISIC_0000004,1,male,80.0,posterior torso


In [136]:
# Rows of the training metadata whose SEX entry is missing.
missing_sex = train_data.loc[train_data['SEX'].isnull()]
# Per-column tally of non-null values in those rows, then a preview.
print(missing_sex.count())
missing_sex.head(5)


ID          284
CLASS       284
SEX           0
AGE           0
POSITION    124
dtype: int64


Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
55,ISIC_0000080,2,,,
56,ISIC_0000081,2,,,
57,ISIC_0000086,2,,,
58,ISIC_0000087,2,,,
59,ISIC_0000088,2,,,


In [137]:
# Rows whose SEX entry is one of the two known labels.
sex = train_data.loc[(train_data['SEX'] == 'male') | (train_data['SEX'] == 'female')]
# Per-column tally of non-null values in those rows, then a preview.
print(sex.count())
sex.head(5)

ID          18714
CLASS       18714
SEX         18714
AGE         18674
POSITION    16904
dtype: int64


Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0000000,2,female,55.0,anterior torso
1,ISIC_0000001,2,female,30.0,anterior torso
2,ISIC_0000002,1,female,60.0,upper extremity
3,ISIC_0000003,2,male,30.0,upper extremity
4,ISIC_0000004,1,male,80.0,posterior torso


In [138]:
# Subset of the known-sex rows labelled 'female'.
fem = sex.loc[sex['SEX'].eq('female')]
print(fem.count())
fem.head(5)

ID          8736
CLASS       8736
SEX         8736
AGE         8716
POSITION    7887
dtype: int64


Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0000000,2,female,55.0,anterior torso
1,ISIC_0000001,2,female,30.0,anterior torso
2,ISIC_0000002,1,female,60.0,upper extremity
5,ISIC_0000007,2,female,25.0,posterior torso
6,ISIC_0000008,2,female,30.0,anterior torso


In [139]:
# Subset of the known-sex rows labelled 'male'.
mal = sex.loc[sex['SEX'].eq('male')]
print(mal.count())
mal.head(5)

ID          9978
CLASS       9978
SEX         9978
AGE         9958
POSITION    9017
dtype: int64


Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
3,ISIC_0000003,2,male,30.0,upper extremity
4,ISIC_0000004,1,male,80.0,posterior torso
9,ISIC_0000012,2,male,30.0,posterior torso
11,ISIC_0000015,2,male,35.0,posterior torso
21,ISIC_0000028,2,male,60.0,anterior torso


In [140]:
# Share of 'male' among the rows with a known SEX, measured on non-null IDs.
male_prob = mal.count()['ID'] / sex.count()['ID']
print(male_prob)

0.5331837127284386


In [141]:
def f(x):
    """Return ``x`` when it is a known SEX label, otherwise a randomly drawn one.

    'male' and 'female' pass through unchanged. Any other value (such as a
    missing entry) is replaced using one draw from ``np.random.random()``:
    'male' when the draw is at most the global ``male_prob``, else 'female'.
    """
    if x not in ('male', 'female'):
        draw = np.random.random()
        return 'male' if draw <= male_prob else 'female'
    return x
    
# Run every SEX entry through `f` (unknown entries get a random label),
# write the result back into the frame and preview it.
train_data['SEX'] = train_data.loc[:, 'SEX'].map(f)
train_data.head(5)

Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0000000,2,female,55.0,anterior torso
1,ISIC_0000001,2,female,30.0,anterior torso
2,ISIC_0000002,1,female,60.0,upper extremity
3,ISIC_0000003,2,male,30.0,upper extremity
4,ISIC_0000004,1,male,80.0,posterior torso


In [142]:
# Rows that carry an AGE value, followed by their per-column non-null tally.
age = train_data.loc[~train_data['AGE'].isna()]
age.count()

ID          18674
CLASS       18674
SEX         18674
AGE         18674
POSITION    16866
dtype: int64

In [143]:
# Known ages as a NumPy array, plus the range they span.
agenp = np.asarray(age['AGE'])
print(agenp)
min_age, max_age = agenp.min(), agenp.max()
print(min_age, max_age)
# Sample draw from the inclusive range [min_age, max_age].
print(np.random.randint(min_age, max_age+1))

[55. 30. 60. ... 45. 65. 55.]
0.0 85.0
14


In [144]:
def g(x):
    """Return ``x`` unless it is NaN, in which case return a random age.

    The replacement comes from ``np.random.randint`` over the inclusive
    range spanned by the globals ``min_age`` and ``max_age``.
    """
    if not np.isnan(x):
        return x
    return np.random.randint(min_age, max_age+1)
    
# Run every AGE entry through `g` (NaN entries get a random age),
# write the result back into the frame and preview it.
train_data['AGE'] = train_data.loc[:, 'AGE'].map(g)
train_data.head(5)

Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0000000,2,female,55.0,anterior torso
1,ISIC_0000001,2,female,30.0,anterior torso
2,ISIC_0000002,1,female,60.0,upper extremity
3,ISIC_0000003,2,male,30.0,upper extremity
4,ISIC_0000004,1,male,80.0,posterior torso


In [145]:
# Number of rows that have a POSITION value.
print(train_data['POSITION'].notna().sum())

17028


In [146]:
# Sorted distinct POSITION labels, taken from the rows where POSITION is present.
poss = np.unique(train_data.loc[train_data['POSITION'].notna(), 'POSITION'].to_numpy())
print(poss)

['anterior torso' 'head/neck' 'lateral torso' 'lower extremity'
 'oral/genital' 'palms/soles' 'posterior torso' 'upper extremity']


In [147]:
# Non-missing POSITION values of the training metadata.
good_pos = train_data['POSITION'].dropna()

# Occurrences of each of the first eight labels in `poss`, in `poss` order.
ant, head, lat, low, ora, pal, pos, upp = (
    good_pos[good_pos == poss[slot]].count() for slot in range(8)
)
tot = sum((ant, head, lat, low, ora, pal, pos, upp))
print(*(ant, head, lat, low, ora, pal, pos, upp))
print(tot)

5194 3447 42 3738 46 292 2082 2187
17028


In [148]:
# Label counts -> relative frequencies -> running totals of those frequencies.
cnts = [ant, head, lat, low, ora, pal, pos, upp]
probs = np.true_divide(cnts, tot)
ints = probs.cumsum()
print(cnts, probs, ints, sep='\n')


[5194, 3447, 42, 3738, 46, 292, 2082, 2187]
[0.30502701 0.20243129 0.00246653 0.21952079 0.00270143 0.01714823
 0.1222692  0.12843552]
[0.30502701 0.5074583  0.50992483 0.72944562 0.73214705 0.74929528
 0.87156448 1.        ]


In [149]:
def h(x, categories=None, cum_probs=None):
    """Return ``x`` if it is a known POSITION label, otherwise a random label.

    A value not found in ``categories`` (such as a missing entry) is replaced
    using one draw ``p`` from ``np.random.random()``: the label chosen is the
    first one whose cumulative probability exceeds ``p``; if none does, the
    last label is returned.

    Parameters
    ----------
    x : object
        The value to check.
    categories : sequence, optional
        Known labels. Defaults to the module-level ``poss``.
    cum_probs : sequence of float, optional
        Non-decreasing cumulative probabilities, one per label and in the same
        order as ``categories``. Defaults to the module-level ``ints``.

    Returns
    -------
    object
        ``x`` itself, or one element of ``categories``.
    """
    # Resolve the globals at call time so the function follows whatever
    # `poss` / `ints` currently hold, exactly as the hard-coded version did.
    if categories is None:
        categories = poss
    if cum_probs is None:
        cum_probs = ints

    if x in categories:
        return x

    p = np.random.random()
    # side='right' counts the entries <= p, i.e. the index of the first
    # cumulative probability strictly greater than p. This is the same
    # interval test as a chain of `p < cum_probs[i]` checks, without fixing
    # the number of labels.
    idx = int(np.searchsorted(cum_probs, p, side='right'))
    # Floating-point cumulative sums can end slightly below 1.0; fall back to
    # the last label instead of indexing past the end.
    return categories[min(idx, len(categories) - 1)]
    
# Run every POSITION entry through `h` (unknown entries get a random label),
# write the result back into the frame and preview it.
train_data['POSITION'] = train_data.loc[:, 'POSITION'].map(h)
train_data.head(5)

Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0000000,2,female,55.0,anterior torso
1,ISIC_0000001,2,female,30.0,anterior torso
2,ISIC_0000002,1,female,60.0,upper extremity
3,ISIC_0000003,2,male,30.0,upper extremity
4,ISIC_0000004,1,male,80.0,posterior torso


In [150]:
# Check the imputation: number of non-missing values in each column.
# `notna()` returns a boolean frame that itself has no missing cells, so
# `.count()` on it always equals the row count even when gaps remain;
# `.sum()` counts the True entries, i.e. the cells that really hold a value.
print(train_data.notna().sum())

ID          18998
CLASS       18998
SEX         18998
AGE         18998
POSITION    18998
dtype: int64


In [151]:
#train_data.to_csv('./metadataTrainFilled.csv',index=False)

In [3]:
# Seed NumPy's global generator so the random imputation below
# (np.random.random / np.random.randint) writes the same filled CSV on every
# run. The seed value itself is arbitrary.
np.random.seed(0)

# Load the test metadata and measure how often 'male' occurs among the rows
# whose SEX is known.
test_data = pd.read_csv('./metadataTest.csv')
missing_sex = test_data[test_data['SEX'].isna()]
sex = test_data[test_data['SEX'].isin(['male','female'])]
mal = sex[sex['SEX']=='male']
male_prob = mal['ID'].count()/sex['ID'].count()

def f(x):
    """Return ``x`` when it is a known SEX label, otherwise a randomly drawn one.

    'male' and 'female' pass through unchanged. Any other value (such as a
    missing entry) is replaced using one draw from ``np.random.random()``:
    'male' when the draw is at most the global ``male_prob``, else 'female'.
    """
    if x not in ('male', 'female'):
        draw = np.random.random()
        return 'male' if draw <= male_prob else 'female'
    return x
    
# Run every SEX entry through `f`; unknown entries get a random label.
test_data['SEX'] = test_data.loc[:, 'SEX'].map(f)



# Rows with a known AGE, those ages as an array, and the range they span.
age = test_data.loc[~test_data['AGE'].isna()]
agenp = np.asarray(age['AGE'])
min_age, max_age = agenp.min(), agenp.max()

def g(x):
    """Return ``x`` unless it is NaN, in which case return a random age.

    The replacement comes from ``np.random.randint`` over the inclusive
    range spanned by the globals ``min_age`` and ``max_age``.
    """
    if not np.isnan(x):
        return x
    return np.random.randint(min_age, max_age+1)
    
# Run every AGE entry through `g`; NaN entries get a random age.
test_data['AGE'] = test_data.loc[:, 'AGE'].map(g)



# Non-missing POSITION values and the sorted distinct labels among them.
good_pos = test_data['POSITION'].dropna()
poss = np.unique(good_pos.to_numpy())

# Occurrences of each of the first eight labels in `poss`, in `poss` order.
ant, head, lat, low, ora, pal, pos, upp = (
    good_pos[good_pos == poss[slot]].count() for slot in range(8)
)

# Counts -> relative frequencies -> running totals of those frequencies.
cnts = [ant, head, lat, low, ora, pal, pos, upp]
tot = sum(cnts)
probs = np.true_divide(cnts, tot)
ints = probs.cumsum()

def h(x, categories=None, cum_probs=None):
    """Return ``x`` if it is a known POSITION label, otherwise a random label.

    A value not found in ``categories`` (such as a missing entry) is replaced
    using one draw ``p`` from ``np.random.random()``: the label chosen is the
    first one whose cumulative probability exceeds ``p``; if none does, the
    last label is returned.

    Parameters
    ----------
    x : object
        The value to check.
    categories : sequence, optional
        Known labels. Defaults to the module-level ``poss``.
    cum_probs : sequence of float, optional
        Non-decreasing cumulative probabilities, one per label and in the same
        order as ``categories``. Defaults to the module-level ``ints``.

    Returns
    -------
    object
        ``x`` itself, or one element of ``categories``.
    """
    # Resolve the globals at call time so the function follows whatever
    # `poss` / `ints` currently hold, exactly as the hard-coded version did.
    if categories is None:
        categories = poss
    if cum_probs is None:
        cum_probs = ints

    if x in categories:
        return x

    p = np.random.random()
    # side='right' counts the entries <= p, i.e. the index of the first
    # cumulative probability strictly greater than p. This is the same
    # interval test as a chain of `p < cum_probs[i]` checks, without fixing
    # the number of labels.
    idx = int(np.searchsorted(cum_probs, p, side='right'))
    # Floating-point cumulative sums can end slightly below 1.0; fall back to
    # the last label instead of indexing past the end.
    return categories[min(idx, len(categories) - 1)]
    
# Run every POSITION entry through `h`; unknown entries get a random label.
test_data['POSITION'] = test_data['POSITION'].map(h)

# Check the imputation: number of non-missing values in each column.
# `notna()` returns a boolean frame that itself has no missing cells, so
# `.count()` on it always equals the row count even when gaps remain;
# `.sum()` counts the True entries, i.e. the cells that really hold a value.
print(test_data.notna().sum())

ID          6333
SEX         6333
AGE         6333
POSITION    6333
dtype: int64


In [4]:
# Write the filled test metadata to disk, leaving the row index out of the file.
test_data.to_csv(path_or_buf='./metadataTestFilled.csv', index=False)