In [1]:
import os
import pandas as pd
import numpy as np
import random
import cv2
import pydicom
from ydata_profiling import ProfileReport
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import warnings
import random
from joblib import Parallel, delayed
warnings.filterwarnings('ignore')

# Configure pandas display options.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)
# Thousands separator with four decimals. The previous spec '{:, 4f}' is not a valid
# format specifier and raised ValueError as soon as pandas rendered a float.
pd.set_option('display.float_format', '{:,.4f}'.format)

##### Configs

In [2]:
class CFG:
    """Notebook-wide constants."""

    # Seed handed to seeding().
    seed = 123

    # Side length (pixels) of the square images produced by cv2.resize.
    resize_dimension = 224

##### Seeding

In [3]:
def seeding(SEED):
    """Seed NumPy's global RNG and Python's ``random`` module, and set PYTHONHASHSEED.

    Args:
        SEED: Seed value; it is passed to both RNGs and stored as a string in
            ``os.environ['PYTHONHASHSEED']``.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(SEED)
    print('seeding done!!!')

# Seed the global RNGs up front; np.random.choice is used for sampling further down.
seeding(CFG.seed)

seeding done!!!


### Make train and test data

In [4]:
# Join the rows of train.csv onto the series metadata by patient_id.
# how='right' keeps every row of series_meta_df.
series_meta_df = pd.read_csv('./rsna2023atd_files/train_series_meta.csv')
df = pd.merge(
    pd.read_csv('./rsna2023atd_files/train.csv'),
    series_meta_df,
    on=['patient_id'],
    how='right',
)

# (patient_id, series_id) pairs that drive the image-directory scan.
patient_series_df = df[['patient_id', 'series_id']].copy()

# Empty accumulator with the final column layout of the per-image table.
instance_df = pd.DataFrame(columns=['patient_id', 'series_id', 'instance_number', 'image_path'])


def apply_instance_number(row):
    """Add the integer ``instance_number`` parsed from ``row['image_path']``.

    The instance number is the file name without directories and without extension
    (``.../<patient>/<series>/123.dcm`` -> ``123``). It is stored as ``int`` rather than
    ``str`` so that sorting by it is numeric (2 before 10) instead of lexicographic.

    Args:
        row: Mapping-like row (for example a DataFrame row under ``apply(axis=1)``)
            holding an ``image_path`` entry. It is modified in place.

    Returns:
        The same ``row`` with ``instance_number`` set.

    Raises:
        ValueError: If the file stem is not an integer.
    """
    file_name = row['image_path'].split('/')[-1]
    row['instance_number'] = int(file_name.split('.')[0])
    return row


# Collect one record per DICOM file. Records go into a plain list and the DataFrame is
# built once at the end: concatenating onto instance_df inside the loop copies every
# previously collected row again on each iteration.
instance_records = []
for pid, sid in tqdm(zip(patient_series_df['patient_id'], patient_series_df['series_id']),
                     total=len(patient_series_df)):
    for image_path in glob(f'./rsna2023atd_files/train_images/{pid}/{sid}/*.dcm'):
        # The file stem is the instance number; keep it as int so later sorts are numeric.
        instance_number = int(image_path.split('/')[-1].split('.')[0])
        instance_records.append((pid, sid, instance_number, image_path))

# Explicit columns keep the expected layout even when no file was found.
instance_df = pd.DataFrame(
    instance_records,
    columns=['patient_id', 'series_id', 'instance_number', 'image_path'],
)


# Attach the per-series columns to every image row (how='right' keeps every row of
# instance_df), then order the rows by patient, series and instance.
df = (
    df.merge(instance_df, on=['patient_id', 'series_id'], how='right')
    .sort_values(['patient_id', 'series_id', 'instance_number'])
    .reset_index(drop=True)
)

# Derive an integer <organ>_injury flag from <organ>_healthy: healthy 1 -> 0, healthy 0 -> 1.
for organ in ('kidney', 'liver', 'spleen'):
    healthy_col, injury_col = f'{organ}_healthy', f'{organ}_injury'
    df.loc[df[healthy_col] == 1, injury_col] = 0
    df.loc[df[healthy_col] == 0, injury_col] = 1
    df[injury_col] = df[injury_col].astype('int')

# The low/high columns are dropped; only healthy/injury columns remain per organ.
df = df.drop(
    columns=[f'{organ}_{grade}' for organ in ('kidney', 'liver', 'spleen') for grade in ('low', 'high')]
)


0it [00:00, ?it/s]

4711it [06:06, 12.87it/s]


In [17]:
# test_pre = df[df['any_injury'] == 1]
# train_pre_0 = df[df['any_injury'] == 0]

# train_pre_1_id, test_id = train_test_split(test_pre['patient_id'], test_size=30, shuffle=True, stratify=test_pre[['bowel_injury', 'extravasation_injury', 'kidney_injury', 'liver_injury', 'spleen_injury']], random_state=123)
# test_id_list =  test_id.to_list()

# test_df = test_pre.query('patient_id in @test_id_list')
# train_pre_1 = test_pre.query('patient_id not in @test_id_list')
# train_df = pd.concat([train_pre_0, train_pre_1], axis=0).sort_values('patient_id')

In [5]:
# One row per patient, so the hold-out is drawn at patient level rather than image level.
df_unique_patient = df.drop_duplicates(subset='patient_id')

# Hold out 100 patient ids; `train` receives the remaining ids.
train, test = train_test_split(
    df_unique_patient['patient_id'],
    test_size=100,
    shuffle=True,
    random_state=123,
)
test_list = test.tolist()

# Every image row of a held-out patient goes to test_df, all other rows to train_df.
is_test_patient = df['patient_id'].isin(test_list)
train_df = df[~is_test_patient]
test_df = df[is_test_patient]

### Make metadata dataframes for 5 different organs

In [6]:
# Split the training rows by the any_injury flag. The mask is built from train_df itself:
# a mask taken from the full `df` carries a different index, so pandas has to reindex it
# (emitting a UserWarning) before it can be applied to train_df.
# .copy() yields independent frames, so the casts below are not assignments on a slice.
df_positive = train_df[train_df['any_injury'] == 1].copy()
df_negative = train_df[train_df['any_injury'] == 0].copy()

# Keep the derived injury flags as integers in both frames.
injury_dtypes = {'kidney_injury': 'int', 'liver_injury': 'int', 'spleen_injury': 'int'}
df_positive = df_positive.astype(injury_dtypes)
df_negative = df_negative.astype(injury_dtypes)

In [7]:
def _balanced_label_frames(injury_column):
    """Build the image table for one injury label with matched negative patients.

    Takes every df_positive row where ``injury_column == 1``, draws (without replacement)
    as many patient ids from df_negative as there are distinct positive patients, and
    stacks both groups sorted by patient, series and instance.

    Args:
        injury_column: Name of the 0/1 injury column to select positives on.

    Returns:
        Tuple ``(positive_rows, n_positive_patients, negative_ids, negative_rows, combined)``.
    """
    positive_rows = df_positive[df_positive[injury_column] == 1]
    n_positive_patients = len(positive_rows['patient_id'].unique())
    negative_ids = np.random.choice(df_negative['patient_id'].unique(), n_positive_patients, replace=False)
    negative_rows = df_negative[df_negative['patient_id'].isin(negative_ids.tolist())]
    combined = (
        pd.concat([positive_rows, negative_rows])
        .sort_values(['patient_id', 'series_id', 'instance_number'])
        .reset_index(drop=True)
    )
    return positive_rows, n_positive_patients, negative_ids, negative_rows, combined


# The call order fixes the order in which np.random.choice consumes the global RNG.
df_bowel_positive, len_bowel, bowel_negative_list, df_bowel_negative, df_bowel = _balanced_label_frames('bowel_injury')
df_extra_positive, len_extra, extra_negative_list, df_extra_negative, df_extra = _balanced_label_frames('extravasation_injury')
df_kidney_positive, len_kidney, kidney_negative_list, df_kidney_negative, df_kidney = _balanced_label_frames('kidney_injury')
df_liver_positive, len_liver, liver_negative_list, df_liver_negative, df_liver = _balanced_label_frames('liver_injury')
df_spleen_positive, len_spleen, spleen_negative_list, df_spleen_negative, df_spleen = _balanced_label_frames('spleen_injury')

### Make test data

##### Convert .dcm image files to .png

In [8]:
def standardize_pixel_array(dcm: pydicom.dataset.FileDataset) -> np.ndarray:
    """Return the pixel data of ``dcm``, corrected when ``PixelRepresentation`` is 1.

    When ``PixelRepresentation == 1`` the values are shifted left and back right by
    ``BitsAllocated - BitsStored`` bits (in the array's own dtype) and then passed through
    ``apply_modality_lut``. For any other value ``dcm.pixel_array`` is returned unchanged.

    Args:
        dcm: Dataset providing ``pixel_array``, ``PixelRepresentation``,
            ``BitsAllocated`` and ``BitsStored``.

    Returns:
        The (possibly corrected) pixel array.
    """
    raw_pixels = dcm.pixel_array

    if dcm.PixelRepresentation != 1:
        return raw_pixels

    unused_bits = dcm.BitsAllocated - dcm.BitsStored
    shifted = (raw_pixels << unused_bits).astype(raw_pixels.dtype) >> unused_bits

    # NOTE(review): the modality LUT is applied on this branch only - confirm that is intended.
    return pydicom.pixel_data_handlers.util.apply_modality_lut(shifted, dcm)


def read_xray(file_path, fix_monochrome = True):
    """Read a DICOM file and return its pixels min-max scaled to floats.

    The array from ``standardize_pixel_array`` is shifted so its minimum is 0 and divided
    by ``max + 1e-5`` (the epsilon avoids division by zero on constant images), giving
    values in [0, 1).

    Args:
        file_path: Path passed to ``pydicom.dcmread``.
        fix_monochrome: When true and the file's PhotometricInterpretation is
            "MONOCHROME1", the scaled image is inverted (``1.0 - data``).

    Returns:
        The scaled (and possibly inverted) pixel array.
    """
    dicom = pydicom.dcmread(file_path)
    pixels = standardize_pixel_array(dicom)
    shifted = pixels - np.min(pixels)
    scaled = shifted / (np.max(shifted) + 1e-5)

    invert = fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1"
    return 1.0 - scaled if invert else scaled


def resize_and_save(dir, file_path):
    """Convert one DICOM file to a resized 8-bit PNG and report its ids and size.

    The PNG is written to ``<dir>/<patient_id>/<series_id>/<instance_id>.png``, where the
    three ids are the last three '/'-separated components of ``file_path`` (the file name
    is cut at its first dot).

    Args:
        dir: Root output directory. The name shadows the builtin but is kept so existing
            calls keep working.
        file_path: Path of the form ``.../<patient_id>/<series_id>/<instance_id>.dcm``.

    Returns:
        Tuple ``(patient_id, series_id, instance_id, width, height)``; the ids are
        strings, width and height are taken from the saved image.

    Raises:
        IOError: If OpenCV reports that the PNG could not be written.
    """
    image = read_xray(file_path)
    # interpolation has to be passed by keyword: the third positional parameter of
    # cv2.resize is `dst`, so a positional cv2.INTER_LINEAR never reached `interpolation`.
    image = cv2.resize(
        image,
        (CFG.resize_dimension, CFG.resize_dimension),
        interpolation=cv2.INTER_LINEAR,  # bilinear interpolation
    )
    image = (image * 255).astype(np.uint8)
    height, width = image.shape[:2]

    patient_id, series_id, file_name = file_path.split('/')[-3:]
    instance_id = file_name.split('.')[0]

    out_dir = os.path.join(dir, patient_id, series_id)
    os.makedirs(out_dir, exist_ok=True)
    new_path = os.path.join(out_dir, instance_id + '.png')

    # cv2.imwrite signals failure through its return value; without this check a missing
    # PNG would still be listed in the metadata built from the returned tuple.
    if not cv2.imwrite(new_path, image, [cv2.IMWRITE_PNG_COMPRESSION, 0]):
        raise IOError(f'Failed to write {new_path}')

    return patient_id, series_id, instance_id, width, height

In [110]:
# Output root for the converted hold-out images and their CSV files.
dir_test = './Dataset/test/'

# NOTE(review): image_size, index and parts are not read by the cells shown in this
# notebook - presumably leftovers; verify before removing.
image_size = [CFG.resize_dimension] * 2
index, parts = 0, 1

In [111]:
%%time
# Convert every hold-out DICOM to PNG on a thread pool; each call returns
# (patient_id, series_id, instance_id, width, height).
file_paths = test_df['image_path'].tolist()
imagesize_test = Parallel(n_jobs=-1, backend='threading')(
    delayed(resize_and_save)(dir_test, dcm_path)
    for dcm_path in tqdm(file_paths, leave=True, position=0)
)

100%|██████████| 50243/50243 [03:49<00:00, 219.01it/s]


CPU times: user 4min 10s, sys: 46.5 s, total: 4min 56s
Wall time: 3min 49s


In [112]:
# Unpack the per-image tuples returned by resize_and_save into five parallel sequences.
pid, sid, iid, width, height = zip(*imagesize_test)

test_meta_df = pd.DataFrame({'patient_id'      : pid,
                             'series_id'       : sid,
                             'instance_number' : iid,
                             'width'           : width,
                             'height'          : height})

# resize_and_save returns the ids as strings; cast the join keys to int on both sides.
merge_keys = ['patient_id', 'series_id', 'instance_number']
test_meta_df[merge_keys] = test_meta_df[merge_keys].astype(int)
test_df[merge_keys] = test_df[merge_keys].astype('int')

# how='right': one output row per converted image, following test_meta_df.
test_df = test_df.merge(test_meta_df, on=merge_keys, how='right')

# Point image_path at the PNG written under dir_test.
test_df['image_path'] = dir_test + test_df.patient_id.astype(str) + '/' + test_df.series_id.astype(str) + '/' + test_df.instance_number.astype(str) + '.png'

# dir_test already ends with '/', so f'{dir_test}/test.csv' produced a double slash;
# os.path.join yields './Dataset/test/test.csv', matching the other dataset cells.
test_df.to_csv(os.path.join(dir_test, 'test.csv'), index=False)

In [113]:
# One label row per hold-out patient: keep the patient id and label columns, collapse the
# per-image duplicates and order by patient.
answer_columns = ['patient_id', 'bowel_healthy', 'bowel_injury', 'extravasation_healthy', 'extravasation_injury',
                  'kidney_healthy', 'kidney_injury', 'liver_healthy', 'liver_injury', 'spleen_healthy', 'spleen_injury']
ans_df = (
    test_df.reindex(columns=answer_columns)
    .drop_duplicates()
    .sort_values('patient_id')
    .reset_index(drop=True)
)

# index must be the boolean False: the string 'false' is truthy, so the row index was
# written as an extra unnamed first column. os.path.join also avoids the double slash
# that f'{dir_test}/answer.csv' produced (dir_test already ends with '/').
ans_df.to_csv(os.path.join(dir_test, 'answer.csv'), index=False)

### Make data for bowel

In [114]:
# Output root for the converted bowel images and bowel.csv.
dir_bowel = './Dataset/bowel/'

# NOTE(review): image_size, index and parts are not read by the cells shown in this
# notebook - presumably leftovers; verify before removing.
image_size = [CFG.resize_dimension] * 2
index, parts = 0, 1

In [115]:
%%time
# Convert every bowel-set DICOM to PNG on a thread pool; each call returns
# (patient_id, series_id, instance_id, width, height).
file_paths = df_bowel['image_path'].tolist()
imagesize_bowel = Parallel(n_jobs=-1, backend='threading')(
    delayed(resize_and_save)(dir_bowel, dcm_path)
    for dcm_path in tqdm(file_paths, leave=True, position=0)
)

100%|██████████| 59616/59616 [04:29<00:00, 221.35it/s]

CPU times: user 4min 48s, sys: 56.7 s, total: 5min 45s
Wall time: 4min 29s





In [116]:
# Unpack the per-image tuples returned by resize_and_save into five parallel sequences.
pid, sid, iid, width, height = zip(*imagesize_bowel)
bowel_meta_df = pd.DataFrame(
    dict(patient_id=pid, series_id=sid, instance_number=iid, width=width, height=height)
)

# resize_and_save returns the ids as strings; cast the join keys to int on both sides.
merge_keys = ['patient_id', 'series_id', 'instance_number']
bowel_meta_df[merge_keys] = bowel_meta_df[merge_keys].astype(int)
df_bowel[merge_keys] = df_bowel[merge_keys].astype('int')

# how='right': one output row per converted image, following bowel_meta_df.
df_bowel = pd.merge(df_bowel, bowel_meta_df, on=merge_keys, how='right')

# Point image_path at the PNG written under dir_bowel.
png_stem = (
    df_bowel['patient_id'].astype(str) + '/'
    + df_bowel['series_id'].astype(str) + '/'
    + df_bowel['instance_number'].astype(str)
)
df_bowel['image_path'] = dir_bowel + png_stem + '.png'

df_bowel.to_csv(f'{dir_bowel}bowel.csv', index=False)

### Make data for extravasation

In [9]:
# Output root for the converted extravasation images and extravasation.csv.
dir_extra = './Dataset/extravasation/'

# NOTE(review): image_size, index and parts are not read by the cells shown in this
# notebook - presumably leftovers; verify before removing.
image_size = [CFG.resize_dimension] * 2
index, parts = 0, 1

In [11]:
%%time
# Convert every extravasation-set DICOM to PNG on a thread pool; each call returns
# (patient_id, series_id, instance_id, width, height).
file_paths = df_extra['image_path'].tolist()
imagesize_extra = Parallel(n_jobs=-1, backend='threading')(
    delayed(resize_and_save)(dir_extra, dcm_path)
    for dcm_path in tqdm(file_paths, leave=True, position=0)
)

100%|██████████| 103487/103487 [08:38<00:00, 199.73it/s]

CPU times: user 11min 5s, sys: 3min 20s, total: 14min 25s
Wall time: 8min 38s





In [12]:
# Unpack the per-image tuples returned by resize_and_save into five parallel sequences.
pid, sid, iid, width, height = zip(*imagesize_extra)
extra_meta_df = pd.DataFrame(
    dict(patient_id=pid, series_id=sid, instance_number=iid, width=width, height=height)
)

# resize_and_save returns the ids as strings; cast the join keys to int on both sides.
merge_keys = ['patient_id', 'series_id', 'instance_number']
extra_meta_df[merge_keys] = extra_meta_df[merge_keys].astype(int)
df_extra[merge_keys] = df_extra[merge_keys].astype('int')

# how='right': one output row per converted image, following extra_meta_df.
df_extra = pd.merge(df_extra, extra_meta_df, on=merge_keys, how='right')

# Point image_path at the PNG written under dir_extra.
png_stem = (
    df_extra['patient_id'].astype(str) + '/'
    + df_extra['series_id'].astype(str) + '/'
    + df_extra['instance_number'].astype(str)
)
df_extra['image_path'] = dir_extra + png_stem + '.png'

df_extra.to_csv(f'{dir_extra}extravasation.csv', index=False)

### Make data for kidney

In [120]:
# Output root for the converted kidney images and kidney.csv.
dir_kidney = './Dataset/kidney/'

# NOTE(review): image_size, index and parts are not read by the cells shown in this
# notebook - presumably leftovers; verify before removing.
image_size = [CFG.resize_dimension] * 2
index, parts = 0, 1

In [121]:
%%time
# Convert every kidney-set DICOM to PNG on a thread pool; each call returns
# (patient_id, series_id, instance_id, width, height).
file_paths = df_kidney['image_path'].tolist()
imagesize_kidney = Parallel(n_jobs=-1, backend='threading')(
    delayed(resize_and_save)(dir_kidney, dcm_path)
    for dcm_path in tqdm(file_paths, leave=True, position=0)
)

100%|██████████| 177609/177609 [13:34<00:00, 218.13it/s]


CPU times: user 14min 33s, sys: 2min 45s, total: 17min 18s
Wall time: 13min 34s


In [122]:
# Unpack the per-image tuples returned by resize_and_save into five parallel sequences.
pid, sid, iid, width, height = zip(*imagesize_kidney)
kidney_meta_df = pd.DataFrame(
    dict(patient_id=pid, series_id=sid, instance_number=iid, width=width, height=height)
)

# resize_and_save returns the ids as strings; cast the join keys to int on both sides.
merge_keys = ['patient_id', 'series_id', 'instance_number']
kidney_meta_df[merge_keys] = kidney_meta_df[merge_keys].astype(int)
df_kidney[merge_keys] = df_kidney[merge_keys].astype('int')

# how='right': one output row per converted image, following kidney_meta_df.
df_kidney = pd.merge(df_kidney, kidney_meta_df, on=merge_keys, how='right')

# Point image_path at the PNG written under dir_kidney.
png_stem = (
    df_kidney['patient_id'].astype(str) + '/'
    + df_kidney['series_id'].astype(str) + '/'
    + df_kidney['instance_number'].astype(str)
)
df_kidney['image_path'] = dir_kidney + png_stem + '.png'

df_kidney.to_csv(f'{dir_kidney}kidney.csv', index=False)

### Make data for liver

In [123]:
# One row per patient, used only to drive the patient-level sampling.
df_liver_unique_patient = df_liver.drop_duplicates(subset='patient_id')

# Draw 400 patient ids, stratified on the liver labels; the remaining ids are returned
# in df_liver_left_patient.
df_liver_patient, df_liver_left_patient = train_test_split(
    df_liver_unique_patient['patient_id'],
    train_size=400,
    shuffle=True,
    stratify=df_liver_unique_patient[['liver_healthy', 'liver_injury']],
    random_state=123,
)
df_liver_patient_list = df_liver_patient.tolist()

# Keep only the image rows of the sampled patients.
keep_liver_rows = df_liver['patient_id'].isin(df_liver_patient_list)
df_liver = df_liver[keep_liver_rows]

In [126]:
# Output root for the converted liver images and liver.csv.
dir_liver = './Dataset/liver/'

# NOTE(review): image_size, index and parts are not read by the cells shown in this
# notebook - presumably leftovers; verify before removing.
image_size = [CFG.resize_dimension] * 2
index, parts = 0, 1

In [127]:
%%time
# Convert every liver-set DICOM to PNG on a thread pool; each call returns
# (patient_id, series_id, instance_id, width, height).
file_paths = df_liver['image_path'].tolist()
imagesize_liver = Parallel(n_jobs=-1, backend='threading')(
    delayed(resize_and_save)(dir_liver, dcm_path)
    for dcm_path in tqdm(file_paths, leave=True, position=0)
)

100%|██████████| 199206/199206 [13:53<00:00, 239.01it/s]

CPU times: user 15min 38s, sys: 2min 23s, total: 18min 2s
Wall time: 13min 53s





In [128]:
# Unpack the per-image tuples returned by resize_and_save into five parallel sequences.
pid, sid, iid, width, height = zip(*imagesize_liver)
liver_meta_df = pd.DataFrame(
    dict(patient_id=pid, series_id=sid, instance_number=iid, width=width, height=height)
)

# resize_and_save returns the ids as strings; cast the join keys to int on both sides.
merge_keys = ['patient_id', 'series_id', 'instance_number']
liver_meta_df[merge_keys] = liver_meta_df[merge_keys].astype(int)
df_liver[merge_keys] = df_liver[merge_keys].astype('int')

# how='right': one output row per converted image, following liver_meta_df.
df_liver = pd.merge(df_liver, liver_meta_df, on=merge_keys, how='right')

# Point image_path at the PNG written under dir_liver.
png_stem = (
    df_liver['patient_id'].astype(str) + '/'
    + df_liver['series_id'].astype(str) + '/'
    + df_liver['instance_number'].astype(str)
)
df_liver['image_path'] = dir_liver + png_stem + '.png'

df_liver.to_csv(f'{dir_liver}liver.csv', index=False)

### Make data for spleen

In [129]:
# One row per patient, used only to drive the patient-level sampling.
df_spleen_unique_patient = df_spleen.drop_duplicates(subset='patient_id')

# Draw 400 patient ids, stratified on the spleen labels; the remaining ids are returned
# in df_spleen_left_patient.
df_spleen_patient, df_spleen_left_patient = train_test_split(
    df_spleen_unique_patient['patient_id'],
    train_size=400,
    shuffle=True,
    stratify=df_spleen_unique_patient[['spleen_healthy', 'spleen_injury']],
    random_state=123,
)
df_spleen_patient_list = df_spleen_patient.tolist()

# Keep only the image rows of the sampled patients.
keep_spleen_rows = df_spleen['patient_id'].isin(df_spleen_patient_list)
df_spleen = df_spleen[keep_spleen_rows]

In [130]:
# Output root for the converted spleen images and spleen.csv.
dir_spleen = './Dataset/spleen/'

# NOTE(review): image_size, index and parts are not read by the cells shown in this
# notebook - presumably leftovers; verify before removing.
image_size = [CFG.resize_dimension] * 2
index, parts = 0, 1

In [131]:
%%time
# Convert every spleen-set DICOM to PNG on a thread pool; each call returns
# (patient_id, series_id, instance_id, width, height).
file_paths = df_spleen['image_path'].tolist()
imagesize_spleen = Parallel(n_jobs=-1, backend='threading')(
    delayed(resize_and_save)(dir_spleen, dcm_path)
    for dcm_path in tqdm(file_paths, leave=True, position=0)
)

  0%|          | 0/192524 [00:00<?, ?it/s]

100%|██████████| 192524/192524 [35:37<00:00, 90.08it/s] 


CPU times: user 34min 43s, sys: 4min 51s, total: 39min 35s
Wall time: 35min 37s


In [132]:
# Unpack the per-image tuples returned by resize_and_save into five parallel sequences.
pid, sid, iid, width, height = zip(*imagesize_spleen)

spleen_meta_df = pd.DataFrame({'patient_id'      : pid,
                               'series_id'       : sid,
                               'instance_number' : iid,
                               'width'           : width,
                               'height'          : height})

# resize_and_save returns the ids as strings; cast the join keys to int on both sides.
merge_keys = ['patient_id', 'series_id', 'instance_number']
spleen_meta_df[merge_keys] = spleen_meta_df[merge_keys].astype(int)
df_spleen[merge_keys] = df_spleen[merge_keys].astype('int')

# how='right': one output row per converted image, following spleen_meta_df.
df_spleen = df_spleen.merge(spleen_meta_df, on=merge_keys, how='right')

# Point image_path at the PNG written under dir_spleen.
df_spleen['image_path'] = dir_spleen + df_spleen.patient_id.astype(str) + '/' + df_spleen.series_id.astype(str) + '/' + df_spleen.instance_number.astype(str) + '.png'

# dir_spleen already ends with '/', so f'{dir_spleen}/spleen.csv' produced a double slash;
# os.path.join yields './Dataset/spleen/spleen.csv', matching the other dataset cells.
df_spleen.to_csv(os.path.join(dir_spleen, 'spleen.csv'), index=False)

In [41]:
# profile = ProfileReport(df_spleen, title="Profiling Report:Train", minimal=True)
# profile.to_notebook_iframe()