In [11]:
import os
from collections import Counter

import numpy as np
import pandas as pd

In [12]:
# Dataset root; the image and label folders are its `images` / `labels` sub-folders.
base_dir = '/workspace/BoneMeta'
img_dir, label_dir = (
    os.path.join(base_dir, sub_dir) for sub_dir in ('images', 'labels')
)

# Location of the case-information CSV.
data_info_path = '/workspace/BoneMeta_raw/data_info.csv'

#### data_info.csv와 가지고 있는 파일(케이스)들의 개수가 일치하는지 확인

In [13]:
data_info = pd.read_csv(data_info_path)

# Distinct case names listed in the CSV, and the entries of the raw data folder.
csv_cases = data_info.Case.unique()
folder_cases = set(os.listdir('/workspace/BoneMeta_raw/data'))


def _n_cases_with(lesion_type):
    """Count the distinct cases having at least one row of the given lesion type."""
    rows = data_info.loc[data_info['Lesion Type'] == lesion_type]
    return rows.Case.unique().size


# Summary: table shape, case counts, and name mismatches between folder and CSV.
print(
f'''
data info shape = {data_info.shape}, 
Unique case number = {csv_cases.size}
Cases per lesion type 
 S : {_n_cases_with('S')}
 L : {_n_cases_with('L')}
 M : {_n_cases_with('M')}
 
Different case names: 
  Folder - CSV = {folder_cases - set(csv_cases)}
  CSV - Folder = {set(csv_cases) - folder_cases}
'''
)



data info shape = (3176, 13), 
Unique case number = 310
Cases per lesion type 
 S : 228
 L : 170
 M : 119
 
Different case names: 
  Folder - CSV = set()
  CSV - Folder = set()



In [14]:
# Directory listings (entry names only) of the image and label folders.
img_files = os.listdir(img_dir)
label_files = os.listdir(label_dir)

# Number of names found on one side only; 0 and 0 means the folders match.
img_names, label_names = set(img_files), set(label_files)
print(len(img_names - label_names), len(label_names - img_names))

0 0


In [15]:
# Number of entries in the S, L and M folders under base_dir.
n_s, n_l, n_m = (len(os.listdir(base_dir + '/' + lesion)) for lesion in 'SLM')
print(f'''
Folder cases
 S: {n_s}
 L: {n_l}
 M: {n_m}
''')


Folder cases
 S: 228
 L: 170
 M: 119



=> Lytic 병변을 가진 케이스가 data_info에서 하나 적음.
SN040번 케이스임
(단, 위 출력에서는 data_info와 폴더 모두 L: 170으로 동일함 — 이전 실행 기준의 메모로 보이므로 확인 필요)


#### 한 케이스가 여러번 쓰인 경우는 몇개인지 확인

In [16]:
# Peek at a few entries to see the file-naming pattern.
img_files[:5]

['000383_20200121_chest.npy',
 'SN012_20150801_abdomen.npy',
 '000021_20181227_chest.npy',
 '000002_20180829_chest.npy',
 '000296_20190726_chest.npy']

In [17]:
# The text before the first underscore of each file name is taken as the personal ID.
personal_ids = [name.partition('_')[0] for name in img_files]
print(len(personal_ids))
personal_ids[:5]

310


['000383', 'SN012', '000021', '000002', '000296']

In [18]:
# Number of distinct personal IDs among the image files.
len(set(personal_ids))

201

In [19]:
# Distinct personal IDs in ascending order.
unique_ids = sorted(set(personal_ids))

In [20]:
# Tally every ID in one pass instead of calling list.count() once per unique ID
# (quadratic in the number of files); this also avoids using the builtin name
# `id` as a loop variable.
id_counts = Counter(personal_ids)
# One flag per entry of unique_ids: True when that ID occurs in more than one file.
duplicated_bool = [id_counts[uid] > 1 for uid in unique_ids]

In [21]:
from itertools import compress

# Keep the IDs whose matching flag in duplicated_bool is true.
dup_cases = [*compress(unique_ids, duplicated_bool)]

In [22]:
# Show the first few IDs that occur in more than one file.
dup_cases[:5]

['000012', '000019', '000025', '000232', '000234']

In [23]:
img_file_series = pd.Series(img_files)
# Match on the exact ID token (text before the first underscore), the same way
# personal_ids is derived. A plain prefix test would also select files whose ID
# merely starts with a duplicated ID.
dup_idx = img_file_series.str.split('_').str[0].isin(dup_cases)
# File names belonging to duplicated IDs, sorted in place on the extracted array.
dup_files = img_file_series.loc[dup_idx].values
dup_files.sort()

In [24]:
# Drop everything from the first '.' onward so the names print without extension.
dup_case_names = [name.split('.')[0] for name in dup_files]
print(f'''
Duplicated cases number: {len(dup_files)}

case names = {dup_case_names}

''')


Duplicated cases number: 209

case names = ['000012_20181212_abdomen', '000012_20181214_chest', '000019_20181018_chest', '000019_20190601_abdomen', '000019_20190613_chest', '000025_20180808_abdomen', '000025_20180808_chest', '000232_20190401_abdomen', '000232_20190423_chest', '000234_20190401_abdomen', '000234_20190419_chest', '000251_20190429_abdomen', '000251_20190429_chest', '000251_20190701_abdomen', '000251_20190703_chest', '000260_20190501_CT Liver (contrast)', '000260_20190522_chest', '000262_20190318_chest', '000262_20190319_abdomen', '000262_20190501_abdomen', '000262_20190529_chest', '000269_20181016_abdomen', '000269_20181214_abdomen', '000269_20190325_abdomen', '000269_20190601_abdomen', '000269_20190604_chest', '000270_20190601_abdomen', '000270_20190608_chest', '000272_20190601_abdomen', '000272_20190614_chest', '000279_20190601_abdomen', '000279_20190614_chest', '000282_20190410_abdomen', '000282_20190701_abdomen', '000285_20190801_abdomen', '000285_20190807_chest', '00

==> 전체 310개 중 약 2/3에 해당하는 209개 케이스가 같은 환자 ID를 공유하는(중복) 케이스들임.

같은 환자의 케이스가 train/valid/test에 서로 겹치지 않게 분류해야 함

#### 특이한 protocol들 확인 

In [25]:
# Case-insensitive pattern: file names mentioning "chest" or "abd" anywhere.
protocol_pattern = 'chest|abd'
img_file_series = pd.Series(img_files)
abd_chest_idx = img_file_series.str.contains(protocol_pattern, case=False)

In [26]:
# File names that match neither "chest" nor "abd".
other_protocol_mask = ~abd_chest_idx
img_file_series.loc[other_protocol_mask].values

array(['SN056_20170601_Thorax^01_Lung_Cancer_3D (Adult).npy',
       '000391_20200216_Thoracic Aorta CT Angio+3D (contrast).npy',
       'BH051_20190301_GU Kidney & bladder CT (3D).npy',
       'SN032_20190401_T spine CT pre contr.npy',
       '000400_20200201_CT Biliary (contrast).npy',
       'SN044_20160601_CT Liver+Pelvis(contrast).npy',
       '000079_20180911_Pulmonary artery CT Angio+3D (contrast).npy',
       'BH027_20191001_L-spine CT (3D).npy',
       '000401_20200201_T-Spine+3D CT (noncontrast).npy',
       '000301_20190801_Pulmonary artery CT Angio+3D (contrast).npy',
       'SN040_20170201_CT Neck (contrast).npy',
       'BH041_20190201_C-T-L Spine (3D).npy',
       'BH035_20191101_GU Kidney & bladder CT (3D).npy',
       'BH042_20180801_C-T-L Spine (3D).npy',
       '000348_20191201_CT Liver (contrast).npy',
       'BH040_20200101_CT Angio + 3D Pulmonary artery (Embolism) (2).npy',
       '000304_20190124_CT Liver (contrast).npy',
       '000316_20190715_Spine^L_SPINE (Ad

"""
'SN032_20190401_T spine CT pre contr.npy',
'BH027_20191001_L-spine CT (3D).npy',
'000401_20200201_T-Spine+3D CT (noncontrast).npy',
'SN040_20170201_CT Neck (contrast).npy',
'BH041_20190201_C-T-L Spine (3D).npy',
'BH042_20180801_C-T-L Spine (3D).npy',
'000316_20190715_Spine^L_SPINE (Adult).npy',
'BH030_20160101_T-L spine (3D).npy',
'BH011_20190301_Spine^00_C_Spine_Pre_OP (Adult).npy',
"""

이 CT들은 빼든지 special case로 추론해 보든지 해야겠음.

#### 현재 나눠진 케이스들이 문제 없는지 보자. 

In [27]:
# Case names (without the '.npy' extension) of the training split, one per line
# of the literal; split('\n') turns the text into a list of names.
train_cases = '''000019_20181018_chest
000019_20190601_abdomen
000019_20190613_chest
000021_20181227_chest
000022_20180830_chest
000025_20180808_abdomen
000025_20180808_chest
000080_20180911_chest
000085_20180829_chest
000091_20180504_chest
000162_20180131_chest
000193_20190114_chest
000214_20190325_chest
000223_20190319_chest
000224_20190228_chest
000234_20190401_abdomen
000234_20190419_chest
000236_20190401_abdomen
000242_20190409_chest
000244_20190501_abdomen
000246_20190629_chest
000251_20190429_abdomen
000251_20190429_chest
000251_20190701_abdomen
000251_20190703_chest
000255_20190418_chest
000262_20190318_chest
000262_20190319_abdomen
000262_20190501_abdomen
000262_20190529_chest
000269_20181016_abdomen
000269_20181214_abdomen
000269_20190325_abdomen
000269_20190601_abdomen
000269_20190604_chest
000270_20190601_abdomen
000270_20190608_chest
000276_20190604_chest
000279_20190601_abdomen
000279_20190614_chest
000280_20190625_CT Liver (contrast)
000281_20190701_chest
000282_20190410_abdomen
000282_20190701_abdomen
000286_20190701_abdomen
000286_20190710_chest
000288_20190701_abdomen
000291_20190701_abdomen
000291_20190718_chest
000296_20190726_chest
000301_20190801_abdomen
000301_20190801_Pulmonary artery CT Angio+3D (contrast)
000301_20190827_chest
000302_20190726_chest
000304_20190124_chest
000304_20190124_CT Liver (contrast)
000308_20190801_abdomen
000308_20190826_chest
000309_20190801_abdomen
000309_20190823_chest
000311_20190902_chest
000314_20190827_chest
000314_20190901_abdomen
000315_20190820_chest
000315_20190901_abdomen
000322_20190901_abdomen
000322_20190913_chest
000324_20190910_chest
000325_20190919_chest
000330_20190926_chest
000331_20190901_abdomen
000331_20190916_chest
000332_20191001_abdomen
000332_20191025_chest
000344_20191125_chest
000352_20191001_abdomen
000352_20191017_chest
000354_20191001_abdomen
000354_20191022_chest
000355_20190901_abdomen
000355_20191023_chest
000356_20190901_abdomen
000356_20191024_chest
000362_20191201_abdomen
000362_20191214_chest
000363_20191201_abdomen
000363_20191222_chest
000364_20191201_abdomen
000364_20191217_chest
000368_20200101_abdomen
000372_20200101_abdomen
000372_20200107_chest
000376_20200101_abdomen
000376_20200114_chest
000382_20200101_abdomen
000382_20200129_chest
000386_20200204_chest
000396_20200201_abdomen
000396_20200218_chest
000404_20200201_abdomen
000404_20200305_chest
BH004_20191101_chest
BH008_20190701_abdomen
BH008_20190701_chest
BH009_20180301_abdomen
BH009_20180301_chest
BH010_20161101_abdomen
BH010_20161101_chest
BH011_20190301_Spine^00_C_Spine_Pre_OP (Adult)
BH012_20180401_abdomen
BH012_20180401_chest
BH013_20191001_chest
BH014_20181201_abdomen
BH014_20181201_chest
BH015_20190101_abdomen
BH015_20190101_chest
BH016_20151001_abdomen
BH016_20151001_chest
BH017_20191201_abdomen
BH017_20191201_chest
BH018_20190801_abdomen
BH018_20190801_chest
BH019_20191101_chest
BH020_20191201_abdomen
BH020_20191201_chest
BH021_20181001_abdomen
BH021_20181001_chest
BH022_20190101_chest
BH023_20191101_abdomen
BH023_20191101_chest
BH024_20190501_abdomen
BH024_20190501_chest
BH025_20191101_abdomen
BH026_20190601_abdomen
BH028_20190801_chest
BH029_20200101_chest
BH030_20160101_T-L spine (3D)
BH031_20160301_CT Angio + 3D Pulmonary artery (Embolism)
BH032_20180701_abdomen
BH032_20180701_chest
BH034_20190901_abdomen
BH034_20190901_chest
BH035_20191101_GU Kidney & bladder CT (3D)
BH036_20180301_GU Kidney & bladder CT (3D)
BH037_20171101_abdomen
BH037_20171101_chest
BH038_20160901_CT angio + 3D C-spine(vertebral artery, C1-2)
BH042_20180801_C-T-L Spine (3D)
BH043_20191001_abdomen
BH043_20191001_chest
BH045_20181001_abdomen
BH045_20181001_chest
BH047_20180901_abdomen
BH048_20190501_chest
BH051_20190301_GU Kidney & bladder CT (3D)
BH052_20190901_abdomen
BH052_20190901_chest
BH054_20191122_abdomen
BH055_20200328_chest
BH056_20200721_chest
BH057_20191227_abdomen
BH057_20191227_chest
BH058_20180918_chest
BH059_20201013_abdomen
BH059_20201019_chest
BH064_20200605_abdomen
BH064_20200605_chest
BH066_20190525_chest
BH067_20191017_chest
BH069_20200505_chest
BH081_20190322_abdomen
BH112_20190201_chest
SN001_20190901_abdomen
SN002_20190801_abdomen
SN002_20190801_chest
SN005_20191001_abdomen
SN005_20191101_chest
SN007_20190701_chest
SN007_20190801_abdomen
SN008_20190901_chest
SN009_20190801_abdomen
SN009_20190801_chest
SN010_20190801_CT Liver+Pelvis(contrast)
SN011_20180401_abdomen
SN011_20180401_chest
SN012_20150801_abdomen
SN013_20190401_abdomen
SN013_20190401_chest
SN014_20190901_abdomen
SN014_20190901_chest
SN015_20190901_abdomen
SN015_20190901_chest
SN016_20190901_abdomen
SN016_20190901_chest
SN018_20190801_abdomen
SN018_20190801_chest
SN020_20190801_abdomen
SN020_20190801_chest
SN025_20200401_abdomen
SN025_20200401_chest
SN028_20160801_abdomen
SN028_20160801_chest
SN032_20190401_T spine CT pre contr
SN032_20190501_abdomen
SN036_20190601_chest
SN039_20181001_abdomen
SN039_20181001_chest
SN040_20170201_abdomen
SN040_20170201_CT Neck (contrast)
SN042_20170901_abdomen
SN042_20170901_chest
SN043_20170701_chest
SN044_20160601_CT Liver+Pelvis(contrast)
SN045_20170201_abdomen
SN045_20170201_chest
SN051_20170401_abdomen
SN051_20170401_chest
SN054_20170201_chest
SN055_20170301_abdomen
SN055_20170301_chest'''.split('\n')

# Case names (without the '.npy' extension) of the validation split, one per
# line of the literal.
valid_cases = '''000316_20190627_chest
000316_20190715_Spine^L_SPINE (Adult)
SN034_20200101_abdomen
SN034_20200101_chest
SN059_20170501_chest
BH027_20191001_L-spine CT (3D)
000397_20200214_chest
BH027_20191001_chest
BH039_20190601_chest
BH041_20190201_C-T-L Spine (3D)
BH060_20200413_chest
BH062_20201104_chest
SN035_20180101_chest
SN048_20170801_chest
000232_20190401_abdomen
000232_20190423_chest
000285_20190801_abdomen
000285_20190807_chest
000348_20191201_CT Liver (contrast)
000401_20200201_T-Spine+3D CT (noncontrast)
BH006_20170801_chest
BH040_20200101_CT Angio + 3D Pulmonary artery (Embolism)
BH040_20200101_CT Angio + 3D Pulmonary artery (Embolism) (2)
BH065_20201013_chest
BH099_20200806_abdomen
BH099_20200806_chest
BH110_20200616_chest
SN004_20190801_abdomen
SN004_20190901_chest
SN017_20190701_chest
SN017_20190801_abdomen
SN029_20200301_abdomen
SN029_20200301_chest
SN030_20191101_abdomen
SN049_20170901_abdomen
SN049_20170901_chest
SN050_20170401_chest
SN056_20170601_Thorax^01_Lung_Cancer_3D (Adult)'''.split('\n')

# Case names (without the '.npy' extension) of the test split, one per line of
# the literal.
test_cases = '''000009_20180417_chest
000079_20180911_Pulmonary artery CT Angio+3D (contrast)
000212_20190324_chest
BH001_20190501_abdomen
000048_20190501_abdomen
000069_20180319_chest
000278_20190620_chest
SN046_20170601_abdomen
BH005_20170701_chest
000002_20180829_chest
000005_20181202_abdomen
000010_20181214_abdomen
000011_20181207_abdomen
000011_20181207_chest
000012_20181212_abdomen
000012_20181214_chest
000260_20190501_CT Liver (contrast)
000260_20190522_chest
000272_20190601_abdomen
000272_20190614_chest
000273_20190531_chest
000298_20190726_chest
000300_20190801_abdomen
000305_20190801_abdomen
000310_20190801_abdomen
000310_20190812_chest
000317_20190720_CT Liver (contrast)
000350_20190927_chest
000350_20191001_abdomen
000391_20200216_Thoracic Aorta CT Angio+3D (contrast)
000400_20200201_CT Biliary (contrast)
000400_20200210_chest
BH001_20190401_chest
BH002_20190701_chest
BH005_20170701_abdomen
BH007_20190801_abdomen
BH007_20190801_chest
BH061_20190315_abdomen
BH061_20190315_chest
BH063_20200519_chest
BH070_20160823_chest
BH072_20200219_abdomen
BH072_20200219_chest
BH091_20200104_chest
BH091_20200104_GU Kidney & bladder CT (3D)
SN019_20190801_abdomen
SN019_20190801_chest
SN031_20160501_abdomen
SN031_20160501_chest'''.split('\n')

In [28]:
# Sizes of the three split lists; the total is the sum of the three lengths.
n_train, n_valid, n_test = map(len, (train_cases, valid_cases, test_cases))
print(f'''
Case numbers:
 train: {n_train}
 validation: {n_valid}
 test: {n_test}
 total: {n_train + n_valid + n_test}
''')


Case numbers:
 train: 220
 validation: 38
 test: 49
 total: 307



In [5]:
# Personal ID of every case in each split: the text before the first underscore.
train_personal_ids, valid_personal_ids, test_personal_ids = (
    [case_name.partition('_')[0] for case_name in split_cases]
    for split_cases in (train_cases, valid_cases, test_cases)
)

In [6]:
# Number of distinct personal IDs in the train, validation and test splits.
print(*(len(set(ids)) for ids in (train_personal_ids, valid_personal_ids, test_personal_ids)))

137 27 34


In [7]:
# Personal IDs present in both train and validation (empty set = no overlap).
set(train_personal_ids) & set(valid_personal_ids)

set()

In [8]:
# Personal IDs present in both test and validation (empty set = no overlap).
set(test_personal_ids) & set(valid_personal_ids)

set()

In [9]:
# Personal IDs present in both train and test (empty set = no overlap).
set(train_personal_ids) & set(test_personal_ids)

set()

#### 표시가 안된 케이스는 뭘까? 

In [31]:
def case_to_file(case):
    """Return the file name for a case name by appending the '.npy' extension."""
    extension = '.npy'
    return case + extension

# File names of every case listed in the train / validation / test splits.
classified_files = [case_to_file(case) for case in (train_cases + valid_cases + test_cases)]

# Reverse check. The counts shown in this notebook (307 classified names, 310
# image files, 5 unclassified files) do not add up, so some classified names
# either have no image file or appear more than once in the split lists.
classified_without_file = sorted(set(classified_files) - set(img_files))
n_repeated_entries = len(classified_files) - len(set(classified_files))
print(f'Classified but missing from img_files: {classified_without_file}')
print(f'Repeated entries in the split lists: {n_repeated_entries}')

# Image files that are not assigned to any split (displayed as the cell output).
set(img_files) - set(classified_files)

{'000383_20200121_chest.npy',
 '000450_20200501_abdomen.npy',
 '000450_20200512_chest.npy',
 '000452_20200513_chest.npy',
 'BH047_20180901_chest.npy'}

얘네들은 왜 빠졌지? 

여기랑 지금 사용하는 케이스랑 다른듯. 정리 필요

#### 케이스 표시하자

In [34]:
# Files that are in none of the three split lists; they are appended to the
# training files.
extra_train_files = [
    '000383_20200121_chest.npy',
    '000450_20200501_abdomen.npy',
    '000450_20200512_chest.npy',
    '000452_20200513_chest.npy',
    'BH047_20180901_chest.npy',
]
train_files = list(map(case_to_file, train_cases)) + extra_train_files
val_files = list(map(case_to_file, valid_cases))
test_files = list(map(case_to_file, test_cases))

# Print the three lists as copy-pasteable assignments.
print(f'''
TRAIN_FILES = {train_files}
VAL_FILES = {val_files}
TEST_FILES = {test_files}
''')


TRAIN_FILES = ['000019_20181018_chest.npy', '000019_20190601_abdomen.npy', '000019_20190613_chest.npy', '000021_20181227_chest.npy', '000022_20180830_chest.npy', '000025_20180808_abdomen.npy', '000025_20180808_chest.npy', '000080_20180911_chest.npy', '000085_20180829_chest.npy', '000091_20180504_chest.npy', '000162_20180131_chest.npy', '000193_20190114_chest.npy', '000214_20190325_chest.npy', '000223_20190319_chest.npy', '000224_20190228_chest.npy', '000234_20190401_abdomen.npy', '000234_20190419_chest.npy', '000236_20190401_abdomen.npy', '000242_20190409_chest.npy', '000244_20190501_abdomen.npy', '000246_20190629_chest.npy', '000251_20190429_abdomen.npy', '000251_20190429_chest.npy', '000251_20190701_abdomen.npy', '000251_20190703_chest.npy', '000255_20190418_chest.npy', '000262_20190318_chest.npy', '000262_20190319_abdomen.npy', '000262_20190501_abdomen.npy', '000262_20190529_chest.npy', '000269_20181016_abdomen.npy', '000269_20181214_abdomen.npy', '000269_20190325_abdomen.npy', '00