# 1. Import lib

In [2]:
import pandas as pd
import numpy as np
import h5py
import os
import time

import gc

# scipy sparse
from scipy import sparse
from scipy.sparse import save_npz
# multiprocessing
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from glob import glob

from concurrent.futures import ThreadPoolExecutor

# 2. Setting

In [3]:
class config:
    input_base_path = "/media/hiroki/share/kaggle_data/trends-assessment-prediction"
    out_base_path = "/media/hiroki/working/kaggle_data/trends-neuroimaging/split_IC"
    train_list = os.listdir(path=input_base_path+'/fMRI_train')
    test_list = os.listdir(path=input_base_path+'/fMRI_test')
    train_num_records = len(train_list)
    test_num_records = len(test_list)
    xyz = 52*63*53

In [None]:
from scipy import sparse

# 3. functions

def save_ic(num_ic, listdir):　　　　#listdir=train or test
    start_time = time.time()
    
    train_num_ic_matrix[row,:] = num_ic_vec
    train_num_ic_matrix = sparse.csr_matrix(train_num_ic_matrix)
    sparse.save_npz(out_base_path+"/train/ic{}_matrix.npz".format(num_ic+1), train_num_ic_matrix)

In [6]:
from scipy.sparse import lil_matrix
from scipy.sparse import coo_matrix

In [10]:
def load_ic(file_name, num_ic):
    # load .mat
    f = h5py.File(config.input_base_path+'/fMRI_train/'+file_name,'r')
    data = f['SM_feature']
    np_array4D = data[:,:,:,num_ic]
    # vectorize
    num_ic_vec = lil_matrix(np_array4D.reshape(config.xyz))
    gc.collect()
    return num_ic_vec

In [11]:
from tqdm import tqdm

# 4. RUN

In [13]:
for num_ic in range(12,30):
    
    start_time = time.time()
    # init group_num_ic matrix(train)
    train_num_ic_matrix = coo_matrix((config.train_num_records, config.xyz), dtype=np.float64).tolil()
    
    for (row,file_name) in enumerate(tqdm(config.train_list)):
        train_num_ic_matrix[row,:] = load_ic(file_name, num_ic)
        gc.collect()
    sparse.save_npz(config.out_base_path+"/train/ic{}_matrix.npz".format(num_ic+1), train_num_ic_matrix.tocsr())
    
    print("success : IC{}".format(num_ic+1))
    end_time = time.time()
    elapsed_time = end_time-start_time
    print("IC/sec:{}".format(elapsed_time))
    
    gc.collect()

  0%|          | 29/5877 [00:17<59:09,  1.65it/s]  


KeyboardInterrupt: 

In [11]:
for num_ic in range(0,53):
    start_time = time.time()
    for row, file_name in enumerate(test_listdir): 
        # init group_num_ic matrix(test)
        test_num_ic_matrix = np.zeros((test_num_records, xyz))
        # load .mat  
        f = h5py.File(input_base_path+'/fMRI_test/'+file_name,'r')
        data = f['SM_feature']
        np_array4D = data[()]
        # vectorize
        num_ic_vec = np_array4D[:,:,:,num_ic].reshape(xyz)
        test_num_ic_matrix[row,:] = num_ic_vec
    test_num_ic_matrix = sparse.csr_matrix(test_num_ic_matrix)
    sparse.save_npz(out_base_path+"/test/ic{}_matrix.npz".format(num_ic+1), test_num_ic_matrix)
    
    print("success : IC{}".format(num_ic+1))
    end_time = time.time()
    elapsed_time = end_time-start_time
    print("IC/sec:{}".format(elapsed_time))
    gc.collect()

success : IC1
IC/sec:1660.5606489181519
success : IC2
IC/sec:1661.2089116573334
success : IC3
IC/sec:1659.4940679073334


KeyboardInterrupt: 

In [4]:
def save_ic_matrix(num_ic):
    start_time = time.time()
    for row, file_name in tqdm(enumerate(config.train_list)): 
        # init group_num_ic matrix(test)
        #train_num_ic_matrix = np.zeros((config.train_num_records, config.xyz))
        # load .mat  
        f = h5py.File(input_base_path+'/fMRI_train/'+file_name,'r')
        data = f['SM_feature']
        np_array4D = data[()]
        # vectorize
        num_ic_vec = np_array4D[:,:,:,num_ic].reshape(xyz)
        train_num_ic_matrix[row,:] = num_ic_vec
    train_num_ic_matrix = sparse.csr_matrix(train_num_ic_matrix)
    sparse.save_npz(out_base_path+"/train/ic{}_matrix.npz".format(num_ic+1), train_num_ic_matrix)
    
    print("success : IC{}".format(num_ic+1))
    end_time = time.time()
    elapsed_time = end_time-start_time
    print("IC/sec:{}".format(elapsed_time))
    gc.collect()

In [None]:
#for num_ic in range(0,53):
start_time = time.time()
for row, file_name in tqdm(enumerate(config.train_list)): 
    # init group_num_ic matrix(test)
    #train_num_ic_matrix = np.zeros((config.train_num_records, config.xyz))
    # load .mat  
    f = h5py.File(input_base_path+'/fMRI_train/'+file_name,'r')
    data = f['SM_feature']
    np_array4D = data[()]
    # vectorize
    num_ic_vec = np_array4D[:,:,:,num_ic].reshape(xyz)
    train_num_ic_matrix[row,:] = num_ic_vec
    train_num_ic_matrix = sparse.csr_matrix(train_num_ic_matrix)
    sparse.save_npz(out_base_path+"/train/ic{}_matrix.npz".format(num_ic+1), train_num_ic_matrix)
    
    print("success : IC{}".format(num_ic+1))
    end_time = time.time()
    elapsed_time = end_time-start_time
    print("IC/sec:{}".format(elapsed_time))
    gc.collect()

In [5]:
job_args = [(num_ic) for num_ic in range(0,53)]
list(tqdm(p.imap(save_ic_matrix, job_args), total=54))
p.close()
p.join()

  0%|          | 0/54 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A
0it [00:00, ?it/s][A
0it [00:00, ?it/s]


KeyboardInterrupt: 

In [17]:
#train_num_ic_matrix = np.zeros((config.train_num_records, config.xyz))
# load .mat  
f = h5py.File(config.input_base_path+'/fMRI_train/'+config.train_list[0],'r')
data = f['SM_feature']
np_array4D = data[()]
# vectorize
num_ic_vec = np_array4D[:,:,:,0].reshape(config.xyz)
#train_num_ic_matrix[row,:] = num_ic_vec

In [18]:
from scipy.sparse import csr_matrix

In [16]:
num_ic_vec.tolist()

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [4]:
from scipy import sparse

In [6]:
import sys

In [29]:
%%time
a = sparse.lil_matrix((config.train_num_records, config.xyz))

CPU times: user 6.09 ms, sys: 176 µs, total: 6.26 ms
Wall time: 6.01 ms


In [19]:
%%time
a = _matrix(num_ic_vec).tolil()

CPU times: user 8.7 ms, sys: 32 µs, total: 8.73 ms
Wall time: 7.28 ms


In [23]:
from scipy import sparse

In [24]:
%%time
a = sparse.lil_matrix(a)

CPU times: user 456 µs, sys: 0 ns, total: 456 µs
Wall time: 464 µs


In [31]:
a[0,:] = num_ic_vec

In [35]:
import sys
sys.getsizeof(a[0,:])

64

In [19]:
a[0,:] = num_ic_vec

In [26]:
a[6,:] = num_ic_vec

In [14]:
num_ic_vec.shape

(173628,)

In [12]:
def io_num_component(num_ic):
    for row, file_name in enumerate(tqdm(config.train_list)):
        f = h5py.File(config.input_base_path+'/fMRI_train/'+file_name,'r')
        data = f['SM_feature']
        coo_vec = coo_matrix(data[:,:,:,num_ic].reshape(config.xyz))
        if row == 0:
            coo_matrix = coo_vec
        else:
            coo_matrix = sparse.vstack([coo_matrix, coo_vec])
    save_npz(config.out_base_path+"/train/ic{}_matrix.npz".format(num_ic+1), num_ic_coo.to_csr())

In [13]:
def wrap_io_num_component(args):
    return io_num_component(*args)

In [19]:
job_args = [(num_ic) for num_ic in range(0,54)]

In [20]:
list(tqdm(p.imap(io_num_component, job_args), total=54))
p.close()
p.join()

  0%|          | 0/54 [00:00<?, ?it/s]