In [1]:
#################################################################################
#   Notebook setup: tab / shift+tab completion workaround and module autoreload
#################################################################################

# Turn off the jedi-based completer; per the original author's note this is the
# trick that makes the tab and shift+tab commands work in this environment.
%config Completer.use_jedi = False

# Load the autoreload extension and select mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Create 1D Dataset 

We create 1D training, validation, and test datasets from the 2D images. To do so, we extract a random percentage of the pixels from each image.

In [2]:
import os 
import sys
import gc
from typing import Union
import tqdm

import h5py
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import torch
import torchvision

print(f"PyTorch v.\t{torch.__version__}")
print(f"TorchVision v.\t{torchvision.__version__}\n")

# In PyTorch, data and models must be moved explicitly to the processing unit that
# runs the computation. Pick "cuda" when a GPU is available and fall back to "cpu"
# otherwise; when a GPU is present, also report how many there are and the name of
# the first one.
if torch.cuda.is_available():
    device = 'cuda'
    print('number of devices: ', torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
else:
    device = 'cpu'

print(f"Computation device: {device}\n")

import torch.nn as nn
import torch.nn.functional as F

from pretreatment import rebin_xrf

PyTorch v.	1.10.1+cu102
TorchVision v.	0.11.2+cu102

number of devices:  1
Tesla T4
Computation device: cuda



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_random_indeces(
    _x: Union[np.ndarray, torch.Tensor],
    _N_shown: int,
    _rng: Union[np.random.Generator, None] = None,
) -> tuple:
    """Draw `_N_shown` distinct rows at random from a 2D array or tensor.

    Parameters
    ----------
    _x : np.ndarray or torch.Tensor
        Two-dimensional container; rows are sampled along its first axis.
    _N_shown : int
        Number of rows to draw. Must not exceed `_x.shape[0]` because the
        sampling is done without replacement.
    _rng : np.random.Generator, optional
        Random generator used for the draw. When omitted, the global NumPy
        random state (`np.random.choice`) is used, exactly as before; pass a
        seeded generator to make the selection reproducible.

    Returns
    -------
    tuple
        `(indices, rows)`: `indices` is a 1D `torch.Tensor` holding the
        selected row positions (in the order they were drawn) and `rows` is
        `_x[indices, :]`, of the same container type as `_x`.

    Raises
    ------
    ValueError
        Raised by NumPy when `_N_shown` is larger than the number of rows.
    """
    _n_rows = _x.shape[0]
    if _rng is None:
        _random_indeces = np.random.choice(_n_rows, _N_shown, replace=False)
    else:
        _random_indeces = _rng.choice(_n_rows, _N_shown, replace=False)
    # Index with the NumPy array (valid for both ndarray and Tensor inputs) and
    # hand the indices back as a tensor, as callers expect.
    return (torch.tensor(_random_indeces), _x[_random_indeces, :])

## CH dataset

In [6]:
# Build the 1D CH dataset: for every 2D XRF image of every split, flatten the
# image into one row per pixel, draw a random subset of the pixels and append
# the result to `<PATH_TO_STORE_DATA>/<split>.h5` as
#   <image name>/pixel : row indices of the selected pixels in the flattened image
#   <image name>/hist  : the corresponding rows (last axis of the image)
PATH_TO_XRF_DATA   = '/jupyter/notebooks/Data/Synthetic_XRF/XRF/'
PATH_TO_STORE_DATA = '/jupyter/notebooks/Article/CHData/Synth/1D/'

TRAIN_DIR_NAME = 'train'
VAL_DIR_NAME   = 'val'
TEST_DIR_NAME  = 'test'

compression_factor = 9

dataset_name = 'img'
# Percentage of the pixels of each image that is kept in the 1D dataset.
SAMPLE_PERCENTAGE = 20
# Sample size for a 512x512 image. Not used by the loop below (the size is
# derived from each image's real pixel count); kept because it was defined here.
_N_shown = SAMPLE_PERCENTAGE*512*512 // 100

dir_names = [TRAIN_DIR_NAME, VAL_DIR_NAME, TEST_DIR_NAME]

for dir_name in dir_names:
    full_path = os.path.join(PATH_TO_XRF_DATA, dir_name)
    if not os.path.isdir( full_path ):
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError(f"{full_path} does not exists.")

    new_item_path = os.path.join(PATH_TO_STORE_DATA, f'{dir_name}.h5')

    # Label the progress bar with the split name. The previous label used the
    # loop variable `f` before it was assigned, which fails on a fresh kernel.
    for f in tqdm.tqdm(os.listdir(full_path), desc=dir_name):
        # Skip anything that is not an HDF5 image (e.g. `.ipynb_checkpoints`).
        if not f.endswith('.h5'):
            continue
        item_path = os.path.join(full_path, f)
        filename = f.split('.')[0]

        # Open 2D XRF
        with h5py.File(item_path, 'r') as _h5:
            _xrf = np.array( _h5[dataset_name][()] , dtype=int)

        # Create 1D XRF: one row per pixel, then keep a random fraction of the rows
        _xrf = torch.tensor(_xrf).float()
        _xrf = _xrf.reshape(-1, _xrf.shape[-1])
        _random_indeces, _x = extract_random_indeces(_xrf, _xrf.shape[0]*SAMPLE_PERCENTAGE//100)

        # Store. makedirs also creates missing parent directories and does not
        # complain when the directory already exists.
        os.makedirs(PATH_TO_STORE_DATA, exist_ok=True)
        with h5py.File(new_item_path, 'a') as new_h5:
            # The file is opened in append mode: drop a group left by a previous
            # run so that re-executing the cell does not fail on existing names.
            if filename in new_h5:
                del new_h5[filename]
            new_h5.create_dataset(f"{filename}/pixel",  data=_random_indeces.numpy(), compression="gzip", compression_opts=compression_factor)
            new_h5.create_dataset(f"{filename}/hist" ,  data=_x.numpy(),              compression="gzip", compression_opts=compression_factor)
        

1637.h5: 100%|██████████| 11/11 [07:00<00:00, 38.23s/it]
1640.h5: 100%|██████████| 5/5 [02:29<00:00, 29.92s/it]
.ipynb_checkpoints: 100%|██████████| 3/3 [01:17<00:00, 25.79s/it]


## AstroDataset

In [4]:
# Build the 1D Astro dataset: for every 2D datacube of every split
#   1. clamp negative values to zero, keep the first MAX_BIN channels, flatten to
#      one row per pixel and rebin the rows with `rebin_xrf`;
#   2. split the rows into "signal", "noise" and all-zero rows using the summed
#      log(1 + counts) of each row;
#   3. sample a percentage of each class and append the result to
#      `<PATH_TO_STORE_DATA>/<split>.h5` as
#        <cube name>/pixel : row indices of the sampled pixels in the flattened cube
#        <cube name>/hist  : the corresponding rebinned rows
PATH_TO_XRF_DATA   = '/jupyter/notebooks/AstroDeepClustering/AstroData/2D/'
PATH_TO_STORE_DATA = '/jupyter/notebooks/AstroDeepClustering/AstroData/1D/'

TRAIN_DIR_NAME = 'train'
VAL_DIR_NAME   = 'val'
TEST_DIR_NAME  = 'test'

compression_factor = 9

MAX_BIN = 3000                    # channels kept from each spectrum before rebinning
REBIN_SIZE = 1024                 # value passed to rebin_xrf as n_bins
SIGNAL_TO_NOISE_THRESHOLD = 25.0  # lower bound for the signal/noise cut
STD_FACTOR = + 1.5                # cut = mean + STD_FACTOR * std of the summed log counts

dataset_name = 'img'
_signal_percentage = 100
_noise_percentage = 5
_zeros_percentage = 5

dir_names = [TRAIN_DIR_NAME, VAL_DIR_NAME, TEST_DIR_NAME]

for dir_name in dir_names:
    full_path = os.path.join(PATH_TO_XRF_DATA, dir_name)
    if not os.path.isdir( full_path ):
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError(f"{full_path} does not exists.")

    new_item_path = os.path.join(PATH_TO_STORE_DATA, f'{dir_name}.h5')

    for f in tqdm.tqdm(os.listdir(full_path), desc=dir_name):
        # Skip anything that is not an HDF5 datacube.
        if not f.endswith('.h5'):
            continue
        item_path = os.path.join(full_path, f)
        filename = f.split('.')[0]

        # Open 2D XRF; only the read needs the file handle.
        with h5py.File(item_path, 'r') as _h5:
            _ex_datacube = np.array( _h5[dataset_name][()] , dtype=np.float32)
        _ex_datacube = torch.tensor(_ex_datacube).float()
        _ex_datacube[_ex_datacube < 0] = 0.0 # replace everything unphysically under zero

        # Reduce datacube. The row width is taken from the slice itself so that
        # cubes with fewer than MAX_BIN channels are still flattened per pixel.
        _red_datacube = _ex_datacube[:, :, :MAX_BIN]
        _red_datacube = _red_datacube.reshape(-1, _red_datacube.shape[-1])
        # Rebin datacube
        # NOTE(review): assumes rebin_xrf returns a tensor with one row per input
        # row (pixel order unchanged) - confirm against pretreatment.rebin_xrf.
        _red_datacube = rebin_xrf(_red_datacube, n_bins=REBIN_SIZE)
        # Energy smoothing
        # TBD

        # Per-pixel statistics, computed once and reused for every mask below.
        _log_counts = torch.log(1 + _red_datacube).sum(dim=-1)
        _row_counts = _red_datacube.sum(dim=-1)
        _mean_max = max( _log_counts.mean() + STD_FACTOR*_log_counts.std(), SIGNAL_TO_NOISE_THRESHOLD)

        # Classify every pixel: signal (above the cut), noise (below the cut and
        # not empty) or zeros (below the cut and completely empty).
        _is_signal = _log_counts >= _mean_max
        _is_below  = _log_counts <  _mean_max
        _is_noise  = _is_below & (_row_counts >  0)
        _is_zeros  = _is_below & (_row_counts == 0)

        # Remember where each row of the three subsets sits in the flattened
        # cube, so the stored indices identify pixels unambiguously.
        _pixel_ids = torch.arange(_red_datacube.shape[0], device=_red_datacube.device)
        _signal_datacube, _signal_pixels = _red_datacube[_is_signal], _pixel_ids[_is_signal]
        _noise_datacube,  _noise_pixels  = _red_datacube[_is_noise],  _pixel_ids[_is_noise]
        _zeros_datacube,  _zeros_pixels  = _red_datacube[_is_zeros],  _pixel_ids[_is_zeros]

        # Create 1D Dataset
        _N_signal = _signal_percentage * _signal_datacube.shape[0] // 100
        _signal_local, _x_signal = extract_random_indeces(_signal_datacube, _N_signal)
        # Add noise
        _N_noise = _noise_percentage * _noise_datacube.shape[0] // 100
        _noise_local, _x_noise = extract_random_indeces(_noise_datacube, _N_noise)
        # Add zeros
        _N_zeros = _zeros_percentage * _zeros_datacube.shape[0] // 100
        _zeros_local, _x_zeros = extract_random_indeces(_zeros_datacube, _N_zeros)

        # Translate positions inside each subset into flattened-cube pixel indices.
        # Previously the subset-local positions were concatenated as they were,
        # so the same number could refer to three different pixels.
        _signal_indeces = _signal_pixels[_signal_local]
        _noise_indeces  = _noise_pixels[_noise_local]
        _zeros_indeces  = _zeros_pixels[_zeros_local]

        # merge
        _random_indeces = torch.cat([_signal_indeces, _noise_indeces, _zeros_indeces], dim=-1)
        _x = torch.cat([_x_signal, _x_noise, _x_zeros], dim=0)

        # Store. makedirs also creates missing parent directories and does not
        # complain when the directory already exists.
        os.makedirs(PATH_TO_STORE_DATA, exist_ok=True)
        with h5py.File(new_item_path, 'a') as new_h5:
            # The file is opened in append mode: drop a group left by a previous
            # run so that re-executing the cell does not fail on existing names.
            if filename in new_h5:
                del new_h5[filename]
            new_h5.create_dataset(f"{filename}/pixel",  data=_random_indeces.cpu().numpy(), compression="gzip", compression_opts=compression_factor)
            new_h5.create_dataset(f"{filename}/hist",   data=_x.cpu().numpy(),              compression="gzip", compression_opts=compression_factor)

100%|██████████| 3/3 [00:49<00:00, 16.37s/it]
100%|██████████| 1/1 [00:15<00:00, 15.77s/it]
100%|██████████| 1/1 [00:17<00:00, 17.19s/it]


In [5]:
# Quick check: `_random_indeces` still holds the indices from the last file
# processed by the most recently executed dataset loop; show how many there are.
_random_indeces.shape

torch.Size([8665])