In [1]:
from __future__ import print_function
from os.path import join
import time

import numpy as np
import pandas as pd
import pyxis as px
from astropy.io import fits

In [2]:
def normalize(samples, ranges=None):
    """Linearly rescale the known parameter columns of ``samples``, in place.

    Every column whose name has an entry in the range table is replaced by
    ``(value - low) / (high - low)``. Columns without an entry are left
    untouched.

    Parameters
    ----------
    samples : mapping of str -> array-like
        Table of parameter samples (e.g. a ``pandas.DataFrame``). It is
        modified in place.
    ranges : mapping of str -> (low, high), optional
        Bounds used for the rescaling. Defaults to the notebook-level
        ``par_ranges``.

    Returns
    -------
    The same ``samples`` object, so the call can also be used in an
    expression. Callers that ignore the return value are unaffected.
    """
    if ranges is None:
        # Resolved at call time, so the table may live in a later cell.
        ranges = par_ranges

    for par in list(samples.keys()):
        if par not in ranges:
            continue
        low, high = ranges[par]
        # Write the result back explicitly. Applying ``-=`` / ``/=`` to the
        # objects yielded by ``samples.items()`` only changes the table when
        # the operation really happens in place, which is not the case when
        # the dtype changes (integer column -> float) or when pandas
        # Copy-on-Write is active.
        samples[par] = (samples[par] - low) / (high - low)
    return samples

In [3]:
# [low, high] pair for each parameter name; normalize() rescales the matching
# columns of the samples table with (value - low) / (high - low).
par_ranges = dict(
    g1=[-0.5, 0.5],
    g2=[-0.5, 0.5],
    theta_int=[-np.pi, np.pi],
    sini=[0, 1],
    v0=[-30, 30],
    vcirc=[60, 540],
    rscale=[0.1, 10],
    hlr=[0.1, 5],
)

In [7]:
# Build the pyxis training database from the per-sample FITS files.
# Each folder temp_<k> holds n consecutive samples and is written as one batch.
n = 4000
data_dir = '/xdisk/timeifler/wxs0703/kl_nn/train_data_massive/'
samp_dir = '/xdisk/timeifler/wxs0703/kl_nn/samples/samples_massive.csv'
save_dir = '/xdisk/timeifler/wxs0703/kl_nn/train_data_massive/train_database'
samples = pd.read_csv(samp_dir)
normalize(samples)
# Convert the parameter table to an array once (first CSV column dropped, as
# before) instead of building a pandas row object for every single sample.
fid_table = samples.iloc[:, 1:].to_numpy()

with px.Writer(dirpath=save_dir, map_size_limit=200000, ram_gb_limit=2) as db:

    for index in range(250):
        start = time.time()
        folder = index + 1  # folders on disk are numbered from 1
        img_stack = np.zeros((n, 1, 48, 48))
        spec_stack = np.zeros((n, 1, 3, 64))
        fids = np.zeros((n, 8))
        start_id = index * n
        ids = np.arange(start_id, start_id + n, dtype=np.uint64)

        for i in range(n):

            ID = start_id + i

            with fits.open(join(data_dir, f'temp_{folder}/training_{ID}.fits')) as hdul:

                # Image: HDU 7, scaled so its maximum is 1. The division makes
                # a new array, so the HDU's own data is not modified.
                img = hdul[7].data
                img_stack[i, 0] = img / np.max(img)

                # Spectra: HDUs 1, 3 and 5, zero-padded on the right to 64
                # columns and scaled jointly by their common maximum.
                specs = np.zeros((3, 64))
                for j in range(3):
                    spec = hdul[2*j + 1].data
                    specs[j, :spec.shape[0]] = spec
                specs /= np.max(specs)
                spec_stack[i, 0] = specs

                fids[i] = fid_table[ID]

        db.put_samples({'img': img_stack,
                        'spec': spec_stack,
                        'fid_pars': fids,
                        'id': ids})
        t = round(time.time() - start, 2)

        print(f'folder {folder} complete, {t} seconds')

folder 1 complete, 22.75 seconds
folder 2 complete, 35.58 seconds
folder 3 complete, 35.83 seconds
folder 4 complete, 34.83 seconds
folder 5 complete, 34.97 seconds
folder 6 complete, 35.12 seconds
folder 7 complete, 35.58 seconds
folder 8 complete, 44.65 seconds
folder 9 complete, 35.34 seconds
folder 10 complete, 35.59 seconds
folder 11 complete, 35.55 seconds
folder 12 complete, 35.6 seconds
folder 13 complete, 45.74 seconds
folder 14 complete, 37.17 seconds
folder 15 complete, 38.93 seconds
folder 16 complete, 127.03 seconds
folder 17 complete, 80.57 seconds
folder 18 complete, 49.06 seconds
folder 19 complete, 48.38 seconds
folder 20 complete, 48.58 seconds
folder 21 complete, 52.07 seconds
folder 22 complete, 39.77 seconds
folder 23 complete, 41.12 seconds
folder 24 complete, 49.38 seconds
folder 25 complete, 73.29 seconds
folder 26 complete, 64.64 seconds
folder 27 complete, 57.58 seconds
folder 28 complete, 44.2 seconds
folder 29 complete, 47.51 seconds
folder 30 complete, 41.4

folder 239 complete, 44.83 seconds
folder 240 complete, 44.83 seconds
folder 241 complete, 41.0 seconds
folder 242 complete, 38.71 seconds
folder 243 complete, 35.8 seconds
folder 244 complete, 38.04 seconds
folder 245 complete, 45.96 seconds
folder 246 complete, 39.56 seconds
folder 247 complete, 38.31 seconds
folder 248 complete, 36.08 seconds
folder 249 complete, 39.79 seconds
folder 250 complete, 40.2 seconds


In [5]:
# Build the pyxis test database from the per-sample FITS files.
# Each folder temp_<k> holds n consecutive samples and is written as one batch.
n = 4000
data_dir = '/xdisk/timeifler/wxs0703/kl_nn/test_data/'
samp_dir = '/xdisk/timeifler/wxs0703/kl_nn/samples/samples_test.csv'
# Fixed: this used to point at '.../train_data_massive/train_database', which
# would have written the test samples into the training database directory.
# The path below is the location reported by the px.Reader output further down.
save_dir = '/xdisk/timeifler/wxs0703/kl_nn/test_data/test_database'
samples = pd.read_csv(samp_dir)
normalize(samples)
# Convert the parameter table to an array once (first CSV column dropped, as
# before) instead of building a pandas row object for every single sample.
fid_table = samples.iloc[:, 1:].to_numpy()

with px.Writer(dirpath=save_dir, map_size_limit=200000, ram_gb_limit=2) as db:

    for index in range(25):
        start = time.time()
        folder = index + 1  # folders on disk are numbered from 1
        img_stack = np.zeros((n, 1, 48, 48))
        spec_stack = np.zeros((n, 1, 3, 64))
        fids = np.zeros((n, 8))
        start_id = index * n
        ids = np.arange(start_id, start_id + n, dtype=np.uint64)

        for i in range(n):

            ID = start_id + i

            with fits.open(join(data_dir, f'temp_{folder}/testing_{ID}.fits')) as hdul:

                # Image: HDU 7, scaled so its maximum is 1. The division makes
                # a new array, so the HDU's own data is not modified.
                img = hdul[7].data
                img_stack[i, 0] = img / np.max(img)

                # Spectra: HDUs 1, 3 and 5, zero-padded on the right to 64
                # columns and scaled jointly by their common maximum.
                specs = np.zeros((3, 64))
                for j in range(3):
                    spec = hdul[2*j + 1].data
                    specs[j, :spec.shape[0]] = spec
                specs /= np.max(specs)
                spec_stack[i, 0] = specs

                fids[i] = fid_table[ID]

        db.put_samples({'img': img_stack,
                        'spec': spec_stack,
                        'fid_pars': fids,
                        'id': ids})
        t = round(time.time() - start, 2)

        print(f'folder {folder} complete, {t} seconds')

folder 1 complete, 40.06 seconds
folder 2 complete, 40.08 seconds
folder 3 complete, 39.2 seconds
folder 4 complete, 39.64 seconds
folder 5 complete, 40.3 seconds
folder 6 complete, 38.24 seconds
folder 7 complete, 39.45 seconds
folder 8 complete, 39.03 seconds
folder 9 complete, 39.35 seconds
folder 10 complete, 40.01 seconds
folder 11 complete, 40.43 seconds
folder 12 complete, 39.39 seconds
folder 13 complete, 39.74 seconds
folder 14 complete, 39.15 seconds
folder 15 complete, 38.76 seconds
folder 16 complete, 39.15 seconds
folder 17 complete, 39.23 seconds
folder 18 complete, 38.59 seconds
folder 19 complete, 38.55 seconds
folder 20 complete, 39.04 seconds
folder 21 complete, 39.2 seconds
folder 22 complete, 39.11 seconds
folder 23 complete, 38.92 seconds
folder 24 complete, 38.91 seconds
folder 25 complete, 38.78 seconds


FileNotFoundError: [Errno 2] No such file or directory: '/xdisk/timeifler/wxs0703/kl_nn/test_data/temp_26/testing_100000.fits'

In [12]:
# Open the database and print pyxis' summary of it (location, number of
# samples, and the dtype/shape of each data key).
# NOTE(review): `database_dir` is not assigned in any cell shown here;
# presumably it is the test database path that appears in the printed
# output. Define it explicitly so the notebook survives Restart & Run All.
with px.Reader(database_dir) as db:
    print(db)

pyxis.Reader
Location:		'/xdisk/timeifler/wxs0703/kl_nn/test_data/test_database'
Number of samples:	100000
Data keys (0th sample):
	'img' <- dtype: float64, shape: (1, 48, 48)
	'spec' <- dtype: float64, shape: (1, 3, 64)
	'fid_pars' <- dtype: float64, shape: (8,)
	'id' <- dtype: uint64, shape: ()
