In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nibabel as nib
import os, glob, skimage, dataset, cv2
from skimage import io, transform
from tqdm import tqdm
from copy import copy
from dataset import download
from tools import *

%matplotlib inline

In [2]:
# Download the dataset; the path of the CSV file is handed to `download`.
# NOTE(review): the recorded run below took about four hours for 5067 items,
# so avoid re-running this cell unless the data really has to be fetched again.
download("./S1_Data.csv")

[Download PINES Dataset]: 100%|██████████| 5067/5067 [4:02:31<00:00,  3.77s/it]  


<dataset.download at 0x7fc22c2cb278>

In [3]:
img_path = explore_dir('./dataset')

# Read the CSV and split its row positions by the Holdout column:
# rows marked 'Train' go to train_indices, every other row to test_indices.
csv_file = pd.read_csv("./S1_Data.csv")

holdout_is_train = (csv_file.Holdout == 'Train').values
train_indices = np.flatnonzero(holdout_is_train).tolist()
test_indices = np.flatnonzero(~holdout_is_train).tolist()

In [4]:
# Build the train and test CSV matrices from the row positions found above.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely and does not rely on the removed DataFrame.as_matrix().
csv_file = np.asarray(csv_file)

# Select all rows of each split in one indexing step (rows keep the order of
# the index lists) instead of concatenating them one at a time.
train_csv = csv_file[train_indices]
test_csv = csv_file[test_indices]

In [5]:
# Images come in different sizes; the resize target is the per-axis mean
# (floor division) over all images.
# The shape is read from the image object itself, so every file is opened
# only once and its voxel data does not have to be loaded here.
shape_list = [nib.load(path).shape for path in img_path[1]]
mean_shape = [sum(shape[axis] for shape in shape_list) // len(shape_list) for axis in range(3)]

# Split the shapes by whether the file name appears in column 7 of train_csv;
# every file that is not listed there is counted as a test image.
train_filenames = set(train_csv[:, 7])
train_image_shape = list()
test_image_shape = list()

for nibFilename, shape in zip(img_path[0], shape_list):
    if nibFilename in train_filenames:
        train_image_shape.append(shape)
    else:
        test_image_shape.append(shape)

# Per-axis totals of the image sizes belonging to the train and test splits.
train_num_shape = [sum(shape[axis] for shape in train_image_shape) for axis in range(3)]
test_num_shape = [sum(shape[axis] for shape in test_image_shape) for axis in range(3)]

In [6]:
# Initialize the counters and containers that the image-loading cell fills.
train_count = 0
test_count = 0

# Image arrays collected per split and per axis (x, y, z).
train_x_batches = list()
train_y_batches = list()
train_z_batches = list()
test_x_batches = list()
test_y_batches = list()
test_z_batches = list()

# Labels collected per split and per axis (x, y, z).
train_x_labels = list()
train_y_labels = list()
train_z_labels = list()
test_x_labels = list()
test_y_labels = list()
test_z_labels = list()

In [7]:
# Column positions in train_csv / test_csv: column 7 is matched against the
# image file names, column 10 is the Rating (between 1 and 5) used as label.
FILENAME_COL = 7
RATING_COL = 10


def _first_row_by_filename(csv_rows):
    """Return a dict mapping each file name in ``csv_rows`` to its first row index.

    Keeping the first occurrence gives the same row as ``list(...).index(name)``
    while making every later lookup a constant-time dict access.
    """
    row_of = {}
    for row, name in enumerate(csv_rows[:, FILENAME_COL]):
        row_of.setdefault(name, row)
    return row_of


train_row_of = _first_row_by_filename(train_csv)
test_row_of = _first_row_by_filename(test_csv)

for i, nibFilename in enumerate(tqdm(img_path[0], desc='image_load')):
    # Pick the destination lists and the label of this image from its own CSV row.
    if nibFilename in train_row_of:
        rating = train_csv[train_row_of[nibFilename]][RATING_COL]
        batch_lists = (train_x_batches, train_y_batches, train_z_batches)
        label_lists = (train_x_labels, train_y_labels, train_z_labels)
    elif nibFilename in test_row_of:
        rating = test_csv[test_row_of[nibFilename]][RATING_COL]
        batch_lists = (test_x_batches, test_y_batches, test_z_batches)
        label_lists = (test_x_labels, test_y_labels, test_z_labels)
    else:
        # Not listed in either split: nothing would be stored, so skip the
        # load and the resize entirely.
        continue

    # np.asanyarray(img.dataobj) is the replacement for the deprecated get_data().
    img = np.array(np.asanyarray(nib.load(img_path[1][i]).dataobj), dtype=np.float64)

    # Move each axis to the front in turn and resize the remaining two
    # dimensions (the single-channel 2-D slices) to the mean shape.
    x_img = skimage.transform.resize(img, (img.shape[0], mean_shape[1], mean_shape[2]))
    y_img = skimage.transform.resize(img.transpose(1, 0, 2), (img.shape[1], mean_shape[0], mean_shape[2]))
    z_img = skimage.transform.resize(img.transpose(2, 0, 1), (img.shape[2], mean_shape[0], mean_shape[1]))

    # Store the per-image stacks in plain lists (they are concatenated once,
    # afterwards) and add one label per 2-D slice so labels line up with slices.
    for batch_list, label_list, slices in zip(batch_lists, label_lists, (x_img, y_img, z_img)):
        batch_list.append(slices)
        label_list.extend([rating] * len(slices))

  warn("The default mode, 'constant', will be changed to 'reflect' in "
image_load: 100%|██████████| 5067/5067 [41:52<00:00,  2.50it/s]  


In [8]:
# Merge the separately collected image arrays of every split/axis into a
# single dataset each, stacking along the first axis.

train_x_dataset, train_y_dataset, train_z_dataset = (
    np.concatenate(collected, axis=0)
    for collected in (train_x_batches, train_y_batches, train_z_batches)
)

test_x_dataset, test_y_dataset, test_z_dataset = (
    np.concatenate(collected, axis=0)
    for collected in (test_x_batches, test_y_batches, test_z_batches)
)

In [9]:
# Save every processed 2-D slice as ./<split>_dataset/<axis>/img_<n>.jpg,
# with n starting at 1 within each split/axis.
# NOTE(review): the slices are float arrays written straight to JPEG; confirm
# that their value range is what the saved images are expected to contain.
save_targets = (
    ('train', 'x', train_x_dataset),
    ('train', 'y', train_y_dataset),
    ('train', 'z', train_z_dataset),
    ('test', 'x', test_x_dataset),
    ('test', 'y', test_y_dataset),
    ('test', 'z', test_z_dataset),
)

with tqdm(total=sum(len(target[2]) for target in save_targets)) as pbar:
    pbar.set_description('Saving Processed Images')
    for split, axis, slices in save_targets:
        out_dir = './' + split + '_dataset/' + axis
        # cv2.imwrite does not create missing directories, so make sure the
        # target folder exists before writing into it.
        os.makedirs(out_dir, exist_ok=True)
        for i, img in enumerate(slices):
            out_path = out_dir + '/img_' + str(i + 1) + '.jpg'
            # imwrite reports failure through its return value; surface it
            # instead of silently losing images.
            if not cv2.imwrite(out_path, img):
                raise IOError('cv2.imwrite failed for ' + out_path)
            pbar.update(1)

Saving Processed Images: 100%|██████████| 1278056/1278056 [01:43<00:00, 12364.14it/s]
