In [1]:
import pandas as pd

In [2]:
# Input: the raw label CSV read below.
# Output: the path the reshaped label table is written to later in the notebook.
in_file_path = '/home/jupyter/rsna-intracranial-hemorrhage-detection/stage_2_train.csv'
out_file_path = '/home/jupyter/rsna-intracranial-hemorrhage-detection/train_labels.csv'
df = pd.read_csv(in_file_path)
# Preview the raw rows: each 'ID' value has the form ID_<image>_<hemorrhage type>.
df.head(10)

Unnamed: 0,ID,Label
0,ID_12cadc6af_epidural,0
1,ID_12cadc6af_intraparenchymal,0
2,ID_12cadc6af_intraventricular,0
3,ID_12cadc6af_subarachnoid,0
4,ID_12cadc6af_subdural,0
5,ID_12cadc6af_any,0
6,ID_38fd7baa0_epidural,0
7,ID_38fd7baa0_intraparenchymal,0
8,ID_38fd7baa0_intraventricular,0
9,ID_38fd7baa0_subarachnoid,0


## Extract the type of hemorrhage from the ID column

In [3]:
# Each raw ID looks like 'ID_<image>_<type>'. Split once, on the last
# underscore, so the image ID and the hemorrhage type always come from the
# same split (instead of splitting the whole column twice and mixing a fixed
# position with "all but the last piece").
id_parts = df['ID'].str.rsplit('_', n=1, expand=True)
df['type'] = id_parts[1]
# Also modify the 'ID' column to remove the type of hemorrhage from it.
# NOTE: 'ID' is overwritten in place, so re-running this cell without
# reloading df would split the already-shortened IDs.
df['ID'] = id_parts[0]
df.head(10)

Unnamed: 0,ID,Label,type
0,ID_12cadc6af,0,epidural
1,ID_12cadc6af,0,intraparenchymal
2,ID_12cadc6af,0,intraventricular
3,ID_12cadc6af,0,subarachnoid
4,ID_12cadc6af,0,subdural
5,ID_12cadc6af,0,any
6,ID_38fd7baa0,0,epidural
7,ID_38fd7baa0,0,intraparenchymal
8,ID_38fd7baa0,0,intraventricular
9,ID_38fd7baa0,0,subarachnoid


## Add boolean columns for each type of hemorrhage (including 'any')

In [4]:
# One boolean column per hemorrhage label. A cell is True only on the row whose
# 'type' equals the column name and whose 'Label' is 1. The comparison is done
# on whole columns at once rather than with a Python lambda per row, and the
# positive-label mask is computed a single time and reused.
is_positive = df['Label'] == 1
for hemorrhage_type in ['any', 'epidural', 'intraparenchymal',
                        'intraventricular', 'subarachnoid', 'subdural']:
    df[hemorrhage_type] = is_positive & (df['type'] == hemorrhage_type)

In [5]:
# Preview the frame with the six per-type boolean columns appended.
df.head(10)

Unnamed: 0,ID,Label,type,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_12cadc6af,0,epidural,False,False,False,False,False,False
1,ID_12cadc6af,0,intraparenchymal,False,False,False,False,False,False
2,ID_12cadc6af,0,intraventricular,False,False,False,False,False,False
3,ID_12cadc6af,0,subarachnoid,False,False,False,False,False,False
4,ID_12cadc6af,0,subdural,False,False,False,False,False,False
5,ID_12cadc6af,0,any,False,False,False,False,False,False
6,ID_38fd7baa0,0,epidural,False,False,False,False,False,False
7,ID_38fd7baa0,0,intraparenchymal,False,False,False,False,False,False
8,ID_38fd7baa0,0,intraventricular,False,False,False,False,False,False
9,ID_38fd7baa0,0,subarachnoid,False,False,False,False,False,False


## Group by image ID to collapse the six rows for each image into a single row

In [6]:
# Group the long-format rows by image ID so each group holds all rows of one image.
groups = df.groupby('ID')

In [7]:
labels = ['ID', 'Label', 'type', 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']


def collapse(group):
    """Collapse all rows of one image into a single labelled row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one image. The first three columns are treated as
        metadata and every remaining column as a boolean flag; the column
        count must match ``labels``.

    Returns
    -------
    pandas.Series
        Indexed by ``labels``: the first row's three metadata values, followed
        by one bool per flag column that is True if any row of the group has
        that flag set.
    """
    n_meta = 3  # number of leading non-flag columns ('ID', 'Label', 'type')
    meta = group.iloc[0].to_numpy()[:n_meta].tolist()
    # OR the flag columns down the rows in one vectorized step instead of
    # pulling each row out with iloc and summing them in Python.
    flags = group.iloc[:, n_meta:].to_numpy().astype(bool).any(axis=0).tolist()
    return pd.Series(meta + flags, index=labels)

# Build one row per image by collapsing every ID group; the result is indexed by 'ID'.
df = groups.apply(collapse)
df.head(10)

Unnamed: 0_level_0,ID,Label,type,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ID_000012eaf,ID_000012eaf,0,epidural,False,False,False,False,False,False
ID_000039fa0,ID_000039fa0,0,epidural,False,False,False,False,False,False
ID_00005679d,ID_00005679d,0,epidural,False,False,False,False,False,False
ID_00008ce3c,ID_00008ce3c,0,epidural,False,False,False,False,False,False
ID_0000950d7,ID_0000950d7,0,epidural,False,False,False,False,False,False
ID_0000aee4b,ID_0000aee4b,0,epidural,False,False,False,False,False,False
ID_0000ca2f6,ID_0000ca2f6,0,epidural,False,False,False,False,False,False
ID_0000f1657,ID_0000f1657,0,epidural,False,False,False,False,False,False
ID_000178e76,ID_000178e76,0,epidural,False,False,False,False,False,False
ID_00019828f,ID_00019828f,0,epidural,False,False,False,False,False,False


## Drop redundant columns

In [8]:
# After collapsing, 'Label' and 'type' only reflect the first row of each group
# and 'ID' duplicates the index, so all three columns are removed.
redundant_columns = ['Label', 'type', 'ID']
df = df.drop(columns=redundant_columns)
df.head(10)

Unnamed: 0_level_0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID_000012eaf,False,False,False,False,False,False
ID_000039fa0,False,False,False,False,False,False
ID_00005679d,False,False,False,False,False,False
ID_00008ce3c,False,False,False,False,False,False
ID_0000950d7,False,False,False,False,False,False
ID_0000aee4b,False,False,False,False,False,False
ID_0000ca2f6,False,False,False,False,False,False
ID_0000f1657,False,False,False,False,False,False
ID_000178e76,False,False,False,False,False,False
ID_00019828f,False,False,False,False,False,False


## Save the formatted data for later use

In [9]:
# Write the one-row-per-image table; the 'ID' index is written as the first CSV column.
df.to_csv(out_file_path)

## Some basic exploration

In [11]:
# Count images with (True) and without (False) any hemorrhage.
df['any'].value_counts()

False    644870
True     107933
Name: any, dtype: int64

We see a heavy imbalance in favor of 'no hemorrhage': only 107,933 of the 752,803 images (about 14%) contain any hemorrhage. Thus, we will likely need to up-sample the positive images or down-sample the negative ones to rectify this during training.

In [14]:
# Print the True/False image counts for each specific hemorrhage type in turn.
types = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']
for hemorrhage_type in types:
    type_counts = df[hemorrhage_type].value_counts()
    print(type_counts)

False    749658
True       3145
Name: epidural, dtype: int64
False    716685
True      36118
Name: intraparenchymal, dtype: int64
False    726598
True      26205
Name: intraventricular, dtype: int64
False    717128
True      35675
Name: subarachnoid, dtype: int64
False    705637
True      47166
Name: subdural, dtype: int64


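We can also see an imbalance among the 5 positive classes. The most extreme is 'epidural', which appears in only 3,145 images (about 0.4% of all images), whereas each of the other four types appears in between 26,205 and 47,166 images.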