In [1]:
import os
import random
import shutil
from shutil import rmtree

import numpy as np
import pandas as pd

random.seed(42)

# Create annotations file

In [2]:
# Root folder of the dataset; every other input path is derived from it.
DATA_DIR = './rsna-intracranial-hemorrhage-detection/'
# Label file with one row per ID string and its 0/1 Label.
PATH_TO_CSV = os.path.join(DATA_DIR, 'stage_2_train.csv')
df = pd.read_csv(PATH_TO_CSV)
# Plain-text dump of the frame (truncated by pandas for large tables).
print(df)

                                    ID  Label
0                ID_12cadc6af_epidural      0
1        ID_12cadc6af_intraparenchymal      0
2        ID_12cadc6af_intraventricular      0
3            ID_12cadc6af_subarachnoid      0
4                ID_12cadc6af_subdural      0
...                                ...    ...
4516837  ID_4a85a3a3f_intraparenchymal      0
4516838  ID_4a85a3a3f_intraventricular      0
4516839      ID_4a85a3a3f_subarachnoid      0
4516840          ID_4a85a3a3f_subdural      0
4516841               ID_4a85a3a3f_any      0

[4516842 rows x 2 columns]


In [3]:
# Keep only the rows whose ID ends in '_any', i.e. the per-image label that
# says whether any hemorrhage subtype is present.
# The vectorised .str.endswith replaces an element-wise Python lambda over
# every row, and .copy() makes the result an independent frame so that later
# cells can modify it without pandas warning about writing into a slice.
df = df[df['ID'].str.endswith('_any')].copy()
df

Unnamed: 0,ID,Label
5,ID_12cadc6af_any,0
11,ID_38fd7baa0_any,0
17,ID_6c5d82413_any,0
23,ID_aec8e68b3_any,1
29,ID_4d9209c7c_any,0
...,...,...
4516817,ID_41029e8f0_any,0
4516823,ID_4b74ffd24_any,0
4516829,ID_e5c02a4ca_any,0
4516835,ID_8079930a8_any,0


In [4]:
# Turn 'ID_xxx_any' into the bare image identifier 'ID_xxx' and rename the
# label column to 'IH'.
# A new frame is built instead of assigning into `df` (which was produced by
# boolean filtering and can trigger SettingWithCopyWarning on column writes).
# The anchored pattern only removes a trailing '_any', so re-running this cell
# is a no-op instead of chopping four more characters off every ID.
df = (
    df
    .assign(ID=df['ID'].str.replace(r'_any$', '', regex=True))
    .rename(columns={'Label': 'IH'})
)
df

Unnamed: 0,ID,IH
5,ID_12cadc6af,0
11,ID_38fd7baa0,0
17,ID_6c5d82413,0
23,ID_aec8e68b3,1
29,ID_4d9209c7c,0
...,...,...
4516817,ID_41029e8f0,0
4516823,ID_4b74ffd24,0
4516829,ID_e5c02a4ca,0
4516835,ID_8079930a8,0


In [5]:
# Remove rows that appear more than once (all columns compared).
# keep=False discards every copy of a duplicated row, not just the extras.
# NOTE(review): confirm this is intended rather than keep='first'.
df = df.drop_duplicates(keep=False)
df

Unnamed: 0,ID,IH
5,ID_12cadc6af,0
11,ID_38fd7baa0,0
17,ID_6c5d82413,0
23,ID_aec8e68b3,1
29,ID_4d9209c7c,0
...,...,...
4516817,ID_41029e8f0,0
4516823,ID_4b74ffd24,0
4516829,ID_e5c02a4ca,0
4516835,ID_8079930a8,0


# Explore DICOM file

In [6]:
import pydicom

In [7]:
# Folder with the training DICOM files; files are looked up here as '<ID>.dcm'.
DICOM_DATA_DIR = os.path.join(DATA_DIR, 'stage_2_train')

In [8]:
# File name of the DICOM that belongs to the first row of the annotation table.
dicom_file = f"{df['ID'].iloc[0]}.dcm"
dicom_file

'ID_12cadc6af.dcm'

In [9]:
# Parse the sample DICOM file from the training folder.
dicom_data = pydicom.dcmread(os.path.join(DICOM_DATA_DIR, dicom_file))

In [10]:
# Show the parsed dataset (file meta information and header elements).
dicom_data

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 188
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.2.840.4267.32.273105137610979365450109402280489797677
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.2.40.0.13.1.1.1
(0002, 0013) Implementation Version Name         SH: 'dcm4che-1.4.35'
-------------------------------------------------
(0008, 0018) SOP Instance UID                    UI: ID_12cadc6af
(0008, 0060) Modality                            CS: 'CT'
(0010, 0020) Patient ID                          LO: 'ID_2eb3925f'
(0020, 000d) Study Instance UID                  UI: ID_6dec708c74
(0020, 000e) Series Instance UID                 UI: ID_1b17a4a944
(0020, 0010) Study ID                            SH: '

In [11]:
%%time
dicom_per_patient = {}
for ID in df['ID']:
    dicom_file = f"{ID}.dcm"
    dicom_data = pydicom.dcmread(os.path.join(DICOM_DATA_DIR, dicom_file))
    patient_id = dicom_data.PatientID
    if patient_id in dicom_per_patient.keys():
        dicom_per_patient[patient_id].append(dicom_file)
    else:
        dicom_per_patient[patient_id] = [dicom_file]

CPU times: user 3min 47s, sys: 1min 34s, total: 5min 21s
Wall time: 10min 5s


# Train, valid and test (patient split)

In [12]:
# Shuffle the patient IDs (seeded global RNG) and work out how many go to each
# split: 90% train, the remainder halved between valid and test, with test
# receiving the extra patient when the remainder is odd.
patients = list(dicom_per_patient)
random.shuffle(patients)

num_patients = len(patients)
num_train_patients = int(0.9 * num_patients)
num_holdout_patients = num_patients - num_train_patients
num_valid_patients = num_holdout_patients // 2
num_test_patients = num_holdout_patients - num_valid_patients

print(f'{num_train_patients} patients for train, {num_valid_patients} for valid and {num_test_patients} for test')

17044 patients for train, 947 for valid and 947 for test


In [13]:
# Cut the shuffled patient list into three consecutive, non-overlapping slices.
valid_end = num_train_patients + num_valid_patients
patients_train = patients[:num_train_patients]
patients_valid = patients[num_train_patients:valid_end]
patients_test = patients[valid_end:]
print(len(patients_train), len(patients_valid), len(patients_test))

17044 947 947


In [14]:
# Flatten the per-patient file lists into one flat list of file names per split,
# e.g. [[a], [b, c]] -> [a, b, c].
# A nested comprehension is linear in the number of files, whereas
# sum(list_of_lists, []) copies the accumulated list on every addition.
dicom_files_train = [
    dicom_name
    for patient in patients_train
    for dicom_name in dicom_per_patient[patient]
]
dicom_files_valid = [
    dicom_name
    for patient in patients_valid
    for dicom_name in dicom_per_patient[patient]
]
dicom_files_test = [
    dicom_name
    for patient in patients_test
    for dicom_name in dicom_per_patient[patient]
]

In [15]:
# Drop the last four characters (the '.dcm' extension) to recover image IDs.
dicom_IDs_train = [file_name[:-4] for file_name in dicom_files_train]
dicom_IDs_valid = [file_name[:-4] for file_name in dicom_files_valid]
dicom_IDs_test = [file_name[:-4] for file_name in dicom_files_test]

In [16]:
# Select the annotation rows whose image ID belongs to each patient split.
df_train = df.loc[df['ID'].isin(dicom_IDs_train)]
df_valid = df.loc[df['ID'].isin(dicom_IDs_valid)]
df_test = df.loc[df['ID'].isin(dicom_IDs_test)]

In [17]:
# Preview the training-split annotations.
df_train

Unnamed: 0,ID,IH
5,ID_12cadc6af,0
11,ID_38fd7baa0,0
17,ID_6c5d82413,0
23,ID_aec8e68b3,1
29,ID_4d9209c7c,0
...,...,...
4516811,ID_70ecab234,0
4516817,ID_41029e8f0,0
4516823,ID_4b74ffd24,0
4516835,ID_8079930a8,0


In [18]:
# Preview the validation-split annotations.
df_valid

Unnamed: 0,ID,IH
317,ID_a837bb1fc,1
563,ID_7c71b48a3,1
1043,ID_8261293e1,0
1211,ID_316ba46e1,0
1523,ID_154bf7e8e,0
...,...,...
4516025,ID_b284aa45d,1
4516181,ID_186a81461,1
4516445,ID_a9338c341,0
4516481,ID_a72effde8,0


In [19]:
# Preview the test-split annotations.
df_test

Unnamed: 0,ID,IH
83,ID_d7c4441ee,0
113,ID_38922f8ce,0
227,ID_93f4c7426,0
341,ID_17490a6a3,0
431,ID_d479518a7,0
...,...,...
4516361,ID_2155ef1b4,1
4516535,ID_ab119e3db,0
4516559,ID_af06b825e,1
4516619,ID_21ec17a17,0


## Sanity check: all the DICOM files exist

In [20]:
# Commented out because it is very time-consuming.
#%%time
#dcm_ids = list(map(lambda x: x[:-4], os.listdir(DICOM_DATA_DIR)))
#print(all(df_train['ID'].isin(dcm_ids)), all(df_valid['ID'].isin(dcm_ids)), all(df_test['ID'].isin(dcm_ids)))
#IDs = df_train['ID'].values.tolist() + df_valid['ID'].values.tolist() + df_test['ID'].values.tolist()
#print(sum(not ID in dcm_ids for ID in IDs))

In [21]:
# Count IH-positive (IH == 1) and IH-negative images in each split.
# Each summary is printed under its own split name; previously all three were
# printed as "Train", which made the output ambiguous.
n_IH_train = df_train['IH'].sum()
n_noIH_train = df_train.shape[0] - n_IH_train
print(f'Train: \n\tn_IH: {n_IH_train}\n\tn_noIH: {n_noIH_train}')

n_IH_valid = df_valid['IH'].sum()
n_noIH_valid = df_valid.shape[0] - n_IH_valid
print(f'Valid: \n\tn_IH: {n_IH_valid}\n\tn_noIH: {n_noIH_valid}')

n_IH_test = df_test['IH'].sum()
n_noIH_test = df_test.shape[0] - n_IH_test
print(f'Test: \n\tn_IH: {n_IH_test}\n\tn_noIH: {n_noIH_test}')

Train: 
	n_IH: 97525
	n_noIH: 580934
Train: 
	n_IH: 5401
	n_noIH: 31174
Train: 
	n_IH: 5007
	n_noIH: 32758


In [22]:
# Undersample the IH-negative class: collect the row labels of all negatives
# in each split, shuffle them, and keep as many as there are positives.
noIH_idxs_train = df_train.index[df_train['IH'] == 0].tolist()
noIH_idxs_valid = df_valid.index[df_valid['IH'] == 0].tolist()
noIH_idxs_test = df_test.index[df_test['IH'] == 0].tolist()

# Shuffle in place, in train -> valid -> test order, using the seeded global RNG.
for noIH_idxs in (noIH_idxs_train, noIH_idxs_valid, noIH_idxs_test):
    random.shuffle(noIH_idxs)

selected_noIH_idxs_train = noIH_idxs_train[:n_IH_train]
selected_noIH_idxs_valid = noIH_idxs_valid[:n_IH_valid]
selected_noIH_idxs_test = noIH_idxs_test[:n_IH_test]

In [23]:
# Final row selection per split: the sampled negatives followed by every positive.
selected_idxs_train = selected_noIH_idxs_train + df_train.index[df_train['IH'] == 1].tolist()
selected_idxs_valid = selected_noIH_idxs_valid + df_valid.index[df_valid['IH'] == 1].tolist()
selected_idxs_test = selected_noIH_idxs_test + df_test.index[df_test['IH'] == 1].tolist()

In [24]:
# Materialise the class-balanced splits from the selected row labels.
balanced_df_train = df_train.loc[selected_idxs_train]
balanced_df_valid = df_valid.loc[selected_idxs_valid]
balanced_df_test = df_test.loc[selected_idxs_test]

# Print the fraction of IH-positive rows in each balanced split.
for balanced_split in (balanced_df_train, balanced_df_valid, balanced_df_test):
    print(balanced_split['IH'].sum() / len(balanced_split))

0.5
0.5
0.5


In [25]:
# Preview the balanced training split (sampled negatives first, then positives).
balanced_df_train

Unnamed: 0,ID,IH
2094275,ID_de4586002,0
4432883,ID_1c7e0b803,0
3676313,ID_7c5d2d53a,0
4364033,ID_febb055fc,0
3428903,ID_fe6d8d454,0
...,...,...
4516493,ID_f648820ce,1
4516505,ID_012b4b6d8,1
4516541,ID_cade771b5,1
4516649,ID_21ea5fffb,1


In [26]:
# Preview the balanced validation split (sampled negatives first, then positives).
balanced_df_valid

Unnamed: 0,ID,IH
980261,ID_8ab8a477b,0
4476569,ID_7adeb12c8,0
271589,ID_dfd7132dd,0
549275,ID_d3c000ccf,0
3261005,ID_d9b550a8a,0
...,...,...
4512977,ID_b063156d1,1
4514735,ID_f677f9035,1
4515779,ID_642dac43c,1
4516025,ID_b284aa45d,1


In [27]:
# Preview the balanced test split (sampled negatives first, then positives).
balanced_df_test

Unnamed: 0,ID,IH
3933251,ID_f29d0b2bb,0
2060783,ID_9bd655fc8,0
2339795,ID_d137956a1,0
4482965,ID_5a30a75c3,0
3215417,ID_00c210221,0
...,...,...
4510601,ID_df5454dd2,1
4513013,ID_803b83de0,1
4514933,ID_e3bbb9906,1
4516361,ID_2155ef1b4,1


# Save annots and dataset

In [28]:
# Recreate the output tree from scratch: ./data/set/{train,valid,test} for the
# DICOM copies and ./data/annots for the CSV annotation files.
# os.makedirs is used instead of shelling out to `mkdir` so that the cell is
# portable and a failure raises an exception instead of returning a status
# code that nobody checks.
if os.path.exists('./data/'):
    rmtree('./data/')
for output_subdir in ('set/train', 'set/valid', 'set/test', 'annots'):
    os.makedirs(os.path.join('./data', output_subdir), exist_ok=True)

0

In [29]:
# Write one annotation CSV per balanced split, without the index column.
annotation_targets = {
    './data/annots/train.csv': balanced_df_train,
    './data/annots/valid.csv': balanced_df_valid,
    './data/annots/test.csv': balanced_df_test,
}
for csv_path, balanced_split in annotation_targets.items():
    balanced_split.to_csv(csv_path, index=False)

In [30]:
# Copy the DICOM file of every image in the balanced training split into
# ./data/set/train/, printing progress every 10000 files.
# shutil.copy replaces os.system('cp ...'): no shell process per file, no
# problems with unquoted paths, and a failed copy raises instead of being
# silently counted as a success.
count = 0
for count, img_id in enumerate(balanced_df_train['ID'].tolist(), start=1):
    path = os.path.join(DICOM_DATA_DIR, f'{img_id}.dcm')
    path2 = f'./data/set/train/{img_id}.dcm'
    shutil.copy(path, path2)
    if count % 10000 == 0:
        print('Count:', count)
print(f'{count} files moved. Expected: {balanced_df_train.shape[0]}')

Count: 10000
Count: 20000
Count: 30000
Count: 40000
Count: 50000
Count: 60000
Count: 70000
Count: 80000
Count: 90000
Count: 100000
Count: 110000
Count: 120000
Count: 130000
Count: 140000
Count: 150000
Count: 160000
Count: 170000
Count: 180000
Count: 190000
195050 files moved. Expected: 195050


In [31]:
# Copy the DICOM file of every image in the balanced validation split into
# ./data/set/valid/, printing progress every 1000 files.
# shutil.copy replaces os.system('cp ...'): no shell process per file, no
# problems with unquoted paths, and a failed copy raises instead of being
# silently counted as a success.
count = 0
for count, img_id in enumerate(balanced_df_valid['ID'].tolist(), start=1):
    path = os.path.join(DICOM_DATA_DIR, f'{img_id}.dcm')
    path2 = f'./data/set/valid/{img_id}.dcm'
    shutil.copy(path, path2)
    if count % 1000 == 0:
        print('Count:', count)
print(f'{count} files moved. Expected: {balanced_df_valid.shape[0]}')

Count: 1000
Count: 2000
Count: 3000
Count: 4000
Count: 5000
Count: 6000
Count: 7000
Count: 8000
Count: 9000
Count: 10000
10802 files moved. Expected: 10802


In [32]:
# Copy the DICOM file of every image in the balanced test split into
# ./data/set/test/, printing progress every 1000 files.
# shutil.copy replaces os.system('cp ...'): no shell process per file, no
# problems with unquoted paths, and a failed copy raises instead of being
# silently counted as a success.
count = 0
for count, img_id in enumerate(balanced_df_test['ID'].tolist(), start=1):
    path = os.path.join(DICOM_DATA_DIR, f'{img_id}.dcm')
    path2 = f'./data/set/test/{img_id}.dcm'
    shutil.copy(path, path2)
    if count % 1000 == 0:
        print('Count:', count)
print(f'{count} files moved. Expected: {balanced_df_test.shape[0]}')

Count: 1000
Count: 2000
Count: 3000
Count: 4000
Count: 5000
Count: 6000
Count: 7000
Count: 8000
Count: 9000
Count: 10000
10014 files moved. Expected: 10014
