/
dc2_photoz_parquet.py
193 lines (157 loc) · 6.8 KB
/
dc2_photoz_parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
DC2 Parquet Photo-z Catalog Reader
"""
import os
import re
import warnings
import numpy as np
from .dc2_dm_catalog import DC2DMCatalog, DC2DMTractCatalog
from GCR import BaseGenericCatalog
from .parquet import ParquetFileWrapper
# Names exported when this module is star-imported; each one is a class
# defined further down in this file.
__all__ = ['DC2PhotozMixin', 'CosmoDC2Parquet', 'DC2PhotozGalaxyCatalog',
           'DC2PhotozCatalog', 'PZSKRFCatalog']
class DC2PhotozMixin:
    """Mixin providing photo-z column-name modifiers and PDF bin bookkeeping.

    Classes using this mixin call ``_process_pdf_bins`` to populate the
    values exposed by ``photoz_pdf_bin_centers`` and ``n_pdf_bins``.
    """

    # Default PDF grid. The entries are fed to ``np.linspace`` as
    # (start, stop, nbins) and the result is rounded to ``decimals_to_round``
    # decimal places. Subclasses may override this attribute.
    _PDF_BIN_INFO = dict(
        start=0.005,
        stop=3.005,
        nbins=301,
        decimals_to_round=3,
    )

    @staticmethod
    def _generate_modifiers(**kwargs):
        """Creates a dictionary relating native and homogenized column names

        Any keyword arguments are accepted and ignored.

        Returns:
            A dictionary of the form {<homogenized name>: <native name>, ...}
        """
        name_pairs = (
            ('photoz_odds', 'ODDS'),
            ('photoz_mode', 'z_mode'),
            ('photoz_median', 'z_median'),
            ('photoz_mean', 'z_mean'),
            ('photoz_pdf', 'pdf'),
            ('ID', 'galaxy_id'),
            ('photoz_mode_ml', 'z_mode_ml'),
            ('photoz_mode_ml_red_chi2', 'z_mode_ml_red_chi2'),
        )
        return dict(name_pairs)

    def _process_pdf_bins(self, pdf_bin_info=None):
        """Compute and cache the PDF bin centers.

        Args:
            pdf_bin_info: optional dict with the keys ``start``, ``stop``,
                ``nbins`` and ``decimals_to_round``. A falsy value (``None``
                or an empty dict) selects the class-level ``_PDF_BIN_INFO``.
        """
        info = pdf_bin_info if pdf_bin_info else self._PDF_BIN_INFO
        self._pdf_bin_info = info

        # Evenly spaced grid first, then rounding to tidy the bin values.
        grid = np.linspace(info['start'], info['stop'], info['nbins'])
        self._pdf_bin_centers = np.round(grid, info['decimals_to_round'])
        self._n_pdf_bins = len(self._pdf_bin_centers)

    @property
    def photoz_pdf_bin_centers(self):
        """Array of bin centers computed by ``_process_pdf_bins``."""
        return self._pdf_bin_centers

    @property
    def n_pdf_bins(self):
        """Number of entries in ``photoz_pdf_bin_centers``."""
        return self._n_pdf_bins
class CosmoDC2Parquet(DC2DMCatalog):
    """Parquet reader whose files are keyed by redshift block and healpix pixel.

    The regex used to parse file names is read from ``self.FILE_PATTERN``,
    which is not defined in this class; it is expected to come from a
    subclass or the parent reader.
    """

    _native_filter_quantities = {'healpix_pixel', 'redshift_block_lower'}

    def _subclass_init(self, **kwargs):
        """Record the requested healpix pixels, then run the parent init.

        Parameters
        ----------
        healpix_pixels (iterable, optional): requested pixels; each entry is
            converted with ``int``. When absent or ``None`` no request is
            recorded.
        **kwargs: forwarded unchanged to the parent ``_subclass_init``.
        """
        self._healpix_pixels = None
        if kwargs.get('healpix_pixels') is not None:
            self._healpix_pixels = [int(t) for t in kwargs['healpix_pixels']]
        super()._subclass_init(**kwargs)

    def _extract_dataset_info(self, filename):
        """Parse the redshift block and healpix pixel out of a file name.

        Parameters
        ----------
        filename (str): file name matched against ``self.FILE_PATTERN``.

        Returns
        -------
        dict with keys ``redshift_block_lower`` and ``healpix_pixel`` on
        success; ``False`` (after emitting a warning) when the name does not
        match or its captured groups cannot be unpacked as three integers.
        """
        match = re.match(self.FILE_PATTERN, filename)
        try:
            # Three groups are expected; the middle one is not used.
            zlo, _, hpx = tuple(map(int, match.groups()))
        except (ValueError, TypeError, AttributeError):
            # AttributeError: no match at all (match is None).
            # ValueError/TypeError: wrong number of groups or non-integer group.
            warnings.warn('Filename {} does not contain correct z/healpix info or not in correct format. Skipped'.format(filename))
            return False
        return {'redshift_block_lower': zlo, 'healpix_pixel': hpx}

    def _sort_datasets(self, datasets):
        """Order datasets by (redshift block, healpix pixel).

        Emits a warning when specific pixels were requested at init time and
        at least one of them is missing from ``datasets``.

        Returns
        -------
        A new sorted list of the given datasets.
        """
        current_healpix_pixels = {dataset.info['healpix_pixel'] for dataset in datasets}
        if self._healpix_pixels and not all(t in current_healpix_pixels for t in self._healpix_pixels):
            warnings.warn('Not all healpix pixels that were requested are loaded. Use `available_healpix_pixels` to see what pixels have been loaded.')
        return sorted(datasets, key=lambda d: (d.info['redshift_block_lower'], d.info['healpix_pixel']))

    @property
    def available_healpix_pixels(self):
        """Returns the healpix pixels of the loaded datasets

        Returns:
            A list with one healpix pixel (integer) per entry of
            ``self._datasets``, in that same order. The list is not
            de-duplicated, so a pixel is repeated if several datasets
            share it.
        """
        return [dataset.info['healpix_pixel'] for dataset in self._datasets]
class DC2PhotozGalaxyCatalog(DC2PhotozMixin, CosmoDC2Parquet):
    """Parquet Photoz Catalog reader (for cosmoDC2 galaxy catalog)

    Parameters
    ----------
    base_dir (str): The directory of data files being served
    file_pattern (str): The optional regex pattern of served data files
    meta_path (str): path to yaml entries for quantities
    healpix_pixels (list): List of healpix pixels (integer)
    pdf_bin_info (dict): Optional override of ``_PDF_BIN_INFO``
    """
    FILE_DIR = os.path.dirname(os.path.abspath(__file__))
    # Literal dots are escaped and the end is anchored so that only names
    # ending exactly in ".parquet" match. The three groups are the two
    # single-digit redshift block bounds and the healpix pixel.
    FILE_PATTERN = r'fzboost_photoz_pdf_z_(\d)_(\d)\.step_all\.healpix_(\d+)\.parquet$'
    META_PATH = os.path.join(FILE_DIR, 'catalog_configs', '_dc2_photoz_parquet.yaml')

    # FlexZBoost has slightly different binning than BPZ
    _PDF_BIN_INFO = {
        'start': 0.0,
        'stop': 3.0,
        'nbins': 301,
        'decimals_to_round': 3,
    }

    def _subclass_init(self, **kwargs):
        """Run the parent initialisation, then set up the PDF bins.

        ``pdf_bin_info`` is read from ``kwargs``; when missing, the class
        default ``_PDF_BIN_INFO`` is used by ``_process_pdf_bins``.
        """
        super()._subclass_init(**kwargs)
        self._process_pdf_bins(kwargs.get("pdf_bin_info"))
class DC2PhotozCatalog(DC2PhotozMixin, DC2DMTractCatalog):
    """Reader for the tract-based DC2 photo-z parquet catalog.

    Parameters
    ----------
    base_dir (str): Directory holding the served data files
    file_pattern (str): Optional regex pattern selecting the data files
    meta_path (str): Path to the yaml entries describing the quantities
    tracts (list): Tracts to load, given as integers
    """
    FILE_DIR = os.path.dirname(os.path.abspath(__file__))
    FILE_PATTERN = r'photoz_pdf_Run\d\.[0-9a-z]+_tract_\d+\.parquet$'
    META_PATH = os.path.join(FILE_DIR, 'catalog_configs', '_dc2_photoz_parquet.yaml')

    def _subclass_init(self, **kwargs):
        """Initialise the parent reader first, then build the PDF bins."""
        super()._subclass_init(**kwargs)
        # None here means "use the class-level _PDF_BIN_INFO".
        requested_bins = kwargs.get("pdf_bin_info")
        self._process_pdf_bins(requested_bins)
class PZSKRFCatalog(DC2PhotozMixin, BaseGenericCatalog):
    """
    SK-learn Random Forest-based Photo-z catalog class. Borrowed some structure
    from Scott Daniel's AGN catalog, as it also uses one single file to load
    everything. Columns available are different than other PZ catalogs, so
    we will need this custom subclass.
    """
    # Olivia's data has different binning and numbins than BPZ
    _PDF_BIN_INFO = {
        'start': 0.015,
        'stop': 2.985,
        'nbins': 100,
        'decimals_to_round': 3,
    }

    def _subclass_init(self, base_dir, filename, **kwargs):
        """Open the single parquet file backing this catalog.

        Parameters
        ----------
        base_dir (str): directory containing the catalog file
        filename (str): name of the parquet file inside ``base_dir``
        pdf_bin_info (dict): optional override of ``_PDF_BIN_INFO``

        Raises
        ------
        RuntimeError: if ``base_dir`` is not an existing directory.
        """
        if not os.path.isdir(base_dir):
            raise RuntimeError("Catalog directory %s does not exist." % (base_dir))
        self._path = os.path.join(base_dir, filename)
        self._dataset = ParquetFileWrapper(self._path, None)
        self._columns = self._dataset.columns
        self._quantity_modifiers = self._generate_quantity_modifiers()
        self._process_pdf_bins(kwargs.get("pdf_bin_info"))

    def __del__(self):
        # Drop the reference to the file wrapper when the catalog goes away.
        self._dataset = None

    def _generate_quantity_modifiers(self):
        """Return the quantity-name mapping assigned to ``_quantity_modifiers``."""
        quantity_modifiers = {
            'galaxy_id': 'galid',
            'mag_i_photoz': 'mag_i',
            'rz_real': 'rz_real',
            'photoz_mode': 'photoz_mode',
            'photoz_pdf': 'photoz_pdf',
        }
        return quantity_modifiers

    def _iter_native_dataset(self, native_filters=None):
        """Yield the single underlying dataset.

        Raises
        ------
        RuntimeError: if ``native_filters`` is given; they are not supported.
        """
        if native_filters is not None:
            raise RuntimeError("*native_filters* not supported")
        yield self._dataset

    def _generate_native_quantity_list(self):
        """Return the columns reported by the parquet file wrapper."""
        return self._columns

    @staticmethod
    def _obtain_native_data_dict(native_quantities_needed, native_quantity_getter):
        """Read the requested columns from the dataset in one call.

        ``native_quantity_getter`` is the object yielded by
        ``_iter_native_dataset``; its ``read_columns`` is called with
        ``as_dict=True``.
        """
        return native_quantity_getter.read_columns(list(native_quantities_needed), as_dict=True)