# Data Exploration & Data Separation of Brain MRI
## 0. Import packages and set config

In [1]:
# import package
import os
import h5py
import numpy as np
import pandas as pd


In [2]:
class config:
    """Static paths for the raw BrainMRI training set and the split export."""

    # Directory holding the raw training files.
    data_root = '/Users/gofinge/Documents/Data/BrainMRI_raw/train'
    # HDF5 file (inside data_root) containing the MRI volumes.
    data_name = 'train_pre_data.h5'
    # CSV file (inside data_root) containing the labels.
    label_name = 'train_pre_label.csv'

    # Directory the train/val/test files are written to.
    export_dir = '/Users/gofinge/Documents/Data/BrainMRI/'


# Single shared instance used by the cells below.
conf = config()

## 1. load data & reshape

In [3]:
# load MRI
# Open the raw HDF5 file in a context manager so the handle is released as
# soon as the volumes have been copied into memory (it was never closed before).
with h5py.File(os.path.join(conf.data_root, conf.data_name), 'r') as f_raw:
    # List every top-level entry together with its shape.
    for key, entry in f_raw.items():
        print(key)
        print(entry.shape)

    # `[...]` reads the whole 'data' dataset into a NumPy array, so `mri`
    # stays valid after the file is closed.
    mri = f_raw['data'][...]

# Drop length-1 axes: the file stores the scans as (300, 1, 79, 95, 79).
mri = np.squeeze(mri)
print(mri.shape)

data
(300, 1, 79, 95, 79)
(300, 79, 95, 79)


In [4]:
# load label
# Read the label CSV and keep only its 'label' column as a NumPy array.
label_path = os.path.join(conf.data_root, conf.label_name)
label_frame = pd.read_csv(label_path)
labels = np.array(label_frame['label'])

print(labels)
print(type(labels))

[1 0 0 1 0 1 2 1 1 2 2 0 1 2 1 0 2 1 2 1 0 1 2 2 1 0 1 0 1 1 0 2 1 1 1 1 0
 1 1 1 2 1 2 0 1 2 1 2 0 1 2 2 1 2 1 1 2 1 2 1 1 2 1 0 2 1 0 0 0 1 2 0 1 0
 1 1 1 1 2 0 1 0 1 1 1 2 1 2 0 1 2 1 1 1 1 2 1 0 1 0 0 2 1 1 1 1 1 1 0 1 1
 2 2 1 2 2 0 1 0 2 1 0 2 2 0 1 2 2 0 1 1 1 1 1 1 1 1 2 0 0 0 0 0 2 0 1 1 1
 0 2 1 0 1 2 0 1 2 2 1 1 0 2 2 1 0 2 1 1 2 1 2 1 0 0 1 1 1 1 1 1 0 2 1 1 1
 2 1 1 1 1 2 0 0 2 1 2 1 2 1 1 2 1 0 2 1 1 1 0 1 2 2 0 2 1 0 1 0 1 0 2 1 2
 1 1 0 1 1 1 1 1 2 1 1 1 1 1 0 1 2 1 2 2 0 0 1 0 0 1 0 2 1 1 0 1 2 0 2 2 2
 0 1 1 0 1 0 2 1 2 1 2 1 1 1 2 2 1 0 1 1 1 2 1 1 1 0 2 1 2 1 2 1 1 0 2 1 2
 1 1 0 2]
<class 'numpy.ndarray'>


In [6]:
# Per-scan maximum intensity (only the maximum is computed here).
temp = list(mri)  # one array per scan, split along the first axis
max_list = [int(volume.max()) for volume in temp]
print(max_list)

[1045, 1982, 1122, 2847, 2987, 1376, 1891, 1614, 1446, 1450, 3081, 660, 2860, 1255, 2259, 1006, 1123, 2076, 1879, 1601, 1547, 1934, 1962, 2497, 1001, 1624, 2037, 6128, 877, 1029, 940, 163, 161, 1620, 1991, 2281, 722, 853, 1985, 2109, 868, 706, 165, 1184, 2428, 2107, 2043, 956, 716, 2597, 155, 2247, 1179, 926, 1171, 135, 1268, 1344, 1004, 1219, 182, 1328, 2199, 1635, 178, 2185, 2348, 718, 814, 2758, 1624, 1337, 2248, 2416, 119, 1171, 122, 1585, 1383, 3214, 2863, 1347, 1481, 836, 2596, 891, 131, 859, 1341, 2233, 652, 1036, 3016, 2226, 3374, 1087, 1900, 1592, 1231, 1115, 815, 2498, 1157, 1540, 1046, 2010, 1171, 166, 2091, 648, 1425, 848, 862, 1159, 1677, 2401, 2217, 1305, 1233, 822, 1424, 929, 1112, 1283, 1655, 160, 1451, 159, 2168, 1171, 919, 1317, 859, 1281, 1097, 1584, 1382, 2631, 787, 1932, 156, 2077, 1247, 857, 2159, 1398, 2220, 1481, 861, 2013, 1011, 2281, 2249, 2286, 725, 2038, 1124, 598, 2659, 1393, 1118, 1539, 1732, 1721, 141, 2594, 1119, 948, 1028, 174, 2186, 1214, 1205, 1345, 2

## 2. Separate MRI & labels into train / val / test

In [16]:
# labels count
# np.where returns a tuple of index arrays per class; np.size of that tuple
# is the number of samples carrying the class (68 / 151 / 81 in the recorded run).
index0, index1, index2 = (np.where(labels == cls) for cls in (0, 1, 2))

for class_index in (index0, index1, index2):
    print(np.size(class_index))

68
151
81


In [43]:
# separation
# Build a label-stratified split: walk the samples in label order, take 45
# evenly spaced positions for test, the position right after each of them for
# validation, and everything else for training.
#
# Positions in the sorted order are mapped back to ORIGINAL sample indices via
# `order`. Sorting `labels` in place instead would reorder the labels but not
# `mri`, so every split would pair scans with labels of other scans.
n_samples = len(labels)
if mri.shape[0] != n_samples:
    raise ValueError('mri and labels must contain the same number of samples')

# order[k] is the original index of the sample with the k-th smallest label.
order = np.argsort(labels, kind='stable')

test_pos = np.linspace(0, n_samples - 1, num=45, endpoint=False, dtype=int)
val_pos = test_pos + 1

# Mark every held-out position; the unmarked ones form the training set.
held_out = np.zeros(n_samples, dtype=bool)
held_out[test_pos] = True
held_out[val_pos] = True

test_idx = list(order[test_pos])
val_idx = list(order[val_pos])
train_idx = list(order[~held_out])

# The same original indices select both the scan and its label.
train_mri = mri[train_idx]
val_mri = mri[val_idx]
test_mri = mri[test_idx]

train_labels = labels[train_idx]
val_labels = labels[val_idx]
test_labels = labels[test_idx]

print(train_mri.shape)
print(val_mri.shape)
print(test_mri.shape)

(210, 79, 95, 79)
(45, 79, 95, 79)
(45, 79, 95, 79)


In [51]:
# save data
def save_hdf5(data, labels, filename):
    """Write one split to disk as an HDF5 file plus a label text file.

    Args:
        data: array stored as the 'data' dataset of ``filename + '.h5'``.
        labels: iterable of labels; each one is converted with ``str()`` and
            written on its own line of ``filename + '_labels.csv'``.
        filename: output path without extension.
    """
    # The context manager closes the file on exit, releasing the handle
    # (it was left open before).
    with h5py.File(filename + '.h5', 'w') as h5_file:
        h5_file.create_dataset('data', data=data)

    # NOTE: no header row is written, unlike the source CSV, which is read
    # through its 'label' column.
    labels_str = [str(x) for x in labels]
    np.savetxt(filename + '_labels.csv', labels_str, fmt='%s')
    
    
# Write each split (volumes + labels) into the export directory,
# in the order train, val, test.
for split_name, split_mri, split_labels in (
    ('train', train_mri, train_labels),
    ('val', val_mri, val_labels),
    ('test', test_mri, test_labels),
):
    save_hdf5(split_mri, split_labels, os.path.join(conf.export_dir, split_name))