In [2]:
import torch
import pandas as pd

# Data Preprocessing and Split

In [126]:
# Load the MIMIC-CXR metadata table (gzip-compressed CSV) from the parent directory.
metadata = pd.read_csv('../mimic-cxr-2.0.0-metadata.csv.gz')

In [127]:
# Restrict the metadata to AP-view images and keep only the identifier columns,
# then order the rows by subject, study and image id.
metadata_ = metadata.loc[
    metadata['ViewPosition'] == 'AP',
    ['subject_id', 'study_id', 'dicom_id'],
]
meta_data_sort = metadata_.sort_values(['subject_id', 'study_id', 'dicom_id'])

In [128]:
# Keep one image per subject: the first row of each subject_id group. Because meta_data_sort is
# ordered by (subject_id, study_id, dicom_id), that is the subject's lowest study_id and, within
# that study, the first dicom_id in sort order.
meta_data_select = meta_data_sort.groupby('subject_id', as_index=False).nth(0)

In [129]:
# Deterministic train/test split keyed on the last digit of study_id:
# digits 8-9 go to the test set, digits 0-7 to the training set
# (about 80/20 if the last digits are evenly spread).
last_digit = meta_data_select['study_id'] % 10
test_data = meta_data_select[last_digit >= 8]
train_data = meta_data_select[last_digit < 8]
# The frame index (row label inherited from the metadata table) is written as the first CSV column.
train_data.to_csv('./train.csv')
# Fixed file name: the original wrote to './test_csv' (underscore typo, no .csv extension).
test_data.to_csv('./test.csv')

In [130]:
# Preview the training split (rich DataFrame display).
train_data

Unnamed: 0,subject_id,study_id,dicom_id
4,10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714
7,10000764,57375967,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4
27,10000980,51967283,943486a3-b3fa9ff7-50f5a769-7a62fcbb-f39b6da4
46,10001122,53957785,07b9ddda-9a4a1e1a-4495463d-4c77d947-ed368713
56,10001217,58913004,5e54fc9c-37c49834-9ac3b915-55811712-9d959d26
...,...,...,...
377072,19998843,54376373,0f1912dc-1b4be22c-910071cc-c0ed8b22-b91989d3
377092,19999287,50000173,c8bbb9ff-ecb81ef7-a1a6cecf-f535bd20-bd512ba0
377101,19999376,57540554,53e9b6d0-5d5317f5-f1a4c031-01d40558-fd14a425
377102,19999442,58497551,ee9155f3-944c056b-c76c73d0-3f792f2c-92ae461e


In [131]:
# Load the NegBio label table for MIMIC-CXR (gzip-compressed CSV) from the parent directory.
negbio = pd.read_csv('../mimic-cxr-2.0.0-negbio.csv.gz')

In [132]:
# Binarise the labels: missing values become 0, then every -1 is also mapped to 0
# (-1 is presumably the "uncertain" marker of the labeller -- TODO confirm).
# Both steps apply to every column of the frame, not only the label columns.
negbio = negbio.fillna(value=0).replace(to_replace=-1, value=0)

In [133]:
# Sort the label table once, then build each boolean mask from the *sorted* frame so that the
# mask index matches the frame being filtered. The original built the mask from the unsorted
# frame, which relies on pandas reindexing the mask (and warns when the indexes differ).
negbio_sorted = negbio.sort_values(by=['subject_id', 'study_id'])
# reset_index() (without drop=True) keeps the old row labels as a leading 'index' column;
# the positional column slicing in the following cell depends on that column being present.
negbio_train = negbio_sorted[negbio_sorted['study_id'].isin(train_data['study_id'])].reset_index()
negbio_test = negbio_sorted[negbio_sorted['study_id'].isin(test_data['study_id'])].reset_index()
# Give the image tables a fresh 0..n-1 index as well (old labels are kept in an 'index' column).
train_data_reset_index = train_data.reset_index()
test_data_reset_index = test_data.reset_index()

In [134]:
# Preview the label rows selected for the training split.
negbio_train

Unnamed: 0,index,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,2,10000032,53911762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,4,10000764,57375967,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14,10000980,51967283,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,24,10001122,53957785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,28,10001217,58913004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26802,227799,19998843,54376373,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
26803,227813,19999287,50000173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26804,227820,19999376,57540554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26805,227821,19999442,58497551,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
# Attach the labels to each selected image by joining on study_id rather than gluing the two
# frames together by row position: a positional concat silently mislabels images if the two
# frames ever differ in length or row order.
#   left  = subject_id, study_id, dicom_id      (leading 'index' column dropped)
#   right = study_id + the label columns        ('index' and subject_id columns dropped)
# The result keeps the left row order and the same column layout as the original concat.
# validate='one_to_one' raises MergeError if study_id is duplicated on either side.
train_data_concat = train_data_reset_index.iloc[:, 1:].merge(
    negbio_train.iloc[:, 2:], on='study_id', how='left', validate='one_to_one')
test_data_concat = test_data_reset_index.iloc[:, 1:].merge(
    negbio_test.iloc[:, 2:], on='study_id', how='left', validate='one_to_one')

In [136]:
# Sanity check: (rows, columns) of the labelled training frame.
train_data_concat.shape

(26807, 17)

In [137]:
# Sanity check: (rows, columns) of the labelled test frame.
test_data_concat.shape

(6694, 17)

In [142]:
# Write both labelled splits to CSV. The default index column is kept: when the files are
# read back it shows up as a leading 'Unnamed: 0' column, and the later cells that pick
# columns by position depend on that layout.
for labelled_frame, csv_path in (
    (train_data_concat, './train_with_label.csv'),
    (test_data_concat, './test_with_label.csv'),
):
    labelled_frame.to_csv(csv_path)

In [145]:
# Column-wise totals of the test frame. For the label columns this is the number of positive
# studies per finding; the identifier columns are summed as well (dicom_id strings are
# concatenated), so those rows of the output carry no meaning.
test_data_concat.sum(axis=0)

subject_id                                                         100379443583
study_id                                                           357651913894
dicom_id                      d0b71acc-b5a62046-bbb5f6b8-7b173b85-65cdf73821...
Atelectasis                                                              1426.0
Cardiomegaly                                                             1187.0
Consolidation                                                             291.0
Edema                                                                     777.0
Enlarged Cardiomediastinum                                                239.0
Fracture                                                                  141.0
Lung Lesion                                                               209.0
Lung Opacity                                                             1530.0
No Finding                                                               2268.0
Pleural Effusion                        

# Load image files

In [1]:
import numpy as np
from PIL import Image
from tqdm import tqdm
import torchvision
import torchvision.transforms as T

In [3]:
# Reload the labelled splits from disk. The files were written together with their index, so
# each frame comes back with a leading 'Unnamed: 0' column ahead of
# subject_id / study_id / dicom_id and the label columns.
train, test = map(pd.read_csv, ('./train_with_label.csv', './test_with_label.csv'))

In [16]:
def _study_id_with_int_labels(frame):
    """Return column 2 (study_id) next to columns 4+ (the finding labels) cast to int.

    Columns are picked by position, which assumes the layout
    ['Unnamed: 0', subject_id, study_id, dicom_id, <labels...>] of the reloaded CSVs.
    """
    study_ids = frame.iloc[:, 2:3]
    findings = frame.iloc[:, 4:].astype(int)
    return pd.concat([study_ids, findings], axis=1)


train_label = _study_id_with_int_labels(train)
test_label = _study_id_with_int_labels(test)

In [22]:
# Export the label tables as comma-separated text with neither a header row nor an index
# column: each line is the study_id followed by the integer finding labels.
for label_table, txt_path in ((train_label, 'train_label.txt'), (test_label, 'test_label.txt')):
    label_table.to_csv(txt_path, sep=',', index=False, header=None)

In [18]:
# Preview the training label table (study_id plus integer finding labels).
train_label

Unnamed: 0,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,53911762,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,57375967,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,51967283,0,0,0,0,0,0,1,0,0,0,0,1,0,0
3,53957785,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,58913004,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26802,54376373,1,0,0,0,0,0,0,0,0,1,0,0,0,1
26803,50000173,0,0,0,0,0,0,0,0,1,0,0,0,0,0
26804,57540554,0,0,0,0,0,0,0,0,1,0,0,0,0,0
26805,58497551,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [37]:

# Inspect the first three rows of the reloaded test frame to check the column layout.
test.head(3)
    

Unnamed: 0.1,Unnamed: 0,subject_id,study_id,dicom_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,0,10000935,50578979,d0b71acc-b5a62046-bbb5f6b8-7b173b85-65cdf738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1,10003052,58630288,21560cb5-ffe886be-2a234166-47975293-e3a97d3f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,10003637,51371378,4c028244-47499ecc-3fab489b-15ec1e76-47055a4d,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train images -> NumPy

In [39]:
# Load every training image, resize it to 256x256 and stack the results into a single
# (n_images, 256, 256) array named train_image.
path = '../../../ssd1/mimic-cxr-jpg-2.0.0.physionet.org/files/'
transform = T.Resize((256,256))
train_images = []
# Read the identifiers by column name instead of by position so the loop does not depend on
# the exact column order of the CSV.
train_ids = zip(train['subject_id'], train['study_id'], train['dicom_id'])
for subject_id, study_id, dicom_id in tqdm(train_ids, total=len(train)):
    # Directory layout: files/p<subject_id // 1000000>/p<subject_id>/s<study_id>/<dicom_id>.jpg
    # (e.g. subject 10000032 lives under p10/).
    path_num = subject_id // 1000000
    # The context manager closes the file handle as soon as the pixels have been copied out.
    with Image.open(f'{path}p{path_num}/p{subject_id}/s{study_id}/{dicom_id}.jpg') as image_pil:
        image = np.array(transform(image_pil))
    # reshape fails right away if an image is not single-channel 256x256.
    train_images.append(image.reshape(256, 256))
# Stack once at the end: calling np.concatenate inside the loop re-copies the whole
# accumulated array on every iteration (quadratic time overall).
if train_images:
    train_image = np.stack(train_images, axis=0)
else:
    # No rows to load: keep the expected rank so downstream code still sees a 3-D array.
    train_image = np.empty((0, 256, 256), dtype=np.uint8)


100%|██████████| 26807/26807 [1:22:03<00:00,  5.45it/s]


In [40]:
# Persist the stacked training images; np.save appends '.npy', so this writes ./train_image.npy.
np.save('./train_image', train_image)

In [43]:
# Sanity check: expect (number of training rows, 256, 256).
train_image.shape

(26807, 256, 256)

# Test images -> NumPy

In [24]:
# Load every test image, resize it to 256x256 and stack the results into a single
# (n_images, 256, 256) array named test_image.
path = '../../../ssd1/mimic-cxr-jpg-2.0.0.physionet.org/files/'
transform = T.Resize((256,256))
test_images = []
# Read the identifiers by column name instead of by position so the loop does not depend on
# the exact column order of the CSV.
test_ids = zip(test['subject_id'], test['study_id'], test['dicom_id'])
for subject_id, study_id, dicom_id in tqdm(test_ids, total=len(test)):
    # Directory layout: files/p<subject_id // 1000000>/p<subject_id>/s<study_id>/<dicom_id>.jpg
    path_num = subject_id // 1000000
    # The context manager closes the file handle as soon as the pixels have been copied out.
    with Image.open(f'{path}p{path_num}/p{subject_id}/s{study_id}/{dicom_id}.jpg') as image_pil:
        image = np.array(transform(image_pil))
    # reshape fails right away if an image is not single-channel 256x256.
    test_images.append(image.reshape(256, 256))
# Stack once at the end: calling np.concatenate inside the loop re-copies the whole
# accumulated array on every iteration (quadratic time overall).
if test_images:
    test_image = np.stack(test_images, axis=0)
else:
    # No rows to load: keep the expected rank so downstream code still sees a 3-D array.
    test_image = np.empty((0, 256, 256), dtype=np.uint8)


100%|██████████| 6694/6694 [09:05<00:00, 12.26it/s]


In [25]:
# Persist the stacked test images; np.save appends '.npy', so this writes ./test_image.npy.
np.save('./test_image', test_image)