Skip to content

Commit

Permalink
Base classes for creation stages (#96)
Browse files Browse the repository at this point in the history
* draft the scaffolding of noisifier

* draft the scaffolding of selector

* add run() method to noisifier and selector

* Finish up base classes for Noisifier and Selector, add unit tests

* added addRandom.py

* fix up error message

---------

Co-authored-by: Eric Charles <badass@stanford.edu>
  • Loading branch information
ztq1996 and eacharles committed Apr 9, 2024
1 parent 2eb1849 commit 7c310cf
Show file tree
Hide file tree
Showing 6 changed files with 225 additions and 14 deletions.
33 changes: 33 additions & 0 deletions src/rail/creation/degradation/addRandom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Add a column of random numbers to a dataframe."""

import numpy as np

from ceci.config import StageParameter as Param
from rail.creation.noisifier import Noisifier

class AddColumnOfRandom(Noisifier):
    """Add a column of uniform random numbers to a dataframe.

    The new column is inserted at position 0 of a copy of the input table
    and is named by the ``col_name`` configuration option. Values are drawn
    with ``uniform`` using its default bounds, seeded from the ``seed``
    configuration option inherited from ``Noisifier``.
    """

    name = "AddColumnOfRandom"
    config_options = Noisifier.config_options.copy()
    config_options.update(
        col_name=Param(str, "chaos_bunny", msg="Name of the column with random numbers"),
    )

    def __init__(self, args, comm=None):
        """Run the standard Noisifier initialization.

        Parameters
        ----------
        args :
            Stage arguments, forwarded unchanged to ``Noisifier.__init__``.
        comm : optional
            Forwarded unchanged to ``Noisifier.__init__``.
        """
        Noisifier.__init__(self, args, comm=comm)
        # Private random stream; created by _initNoiseModel().
        self._rng = None

    def _initNoiseModel(self):  # pragma: no cover
        """Create the random stream from the configured seed.

        A private ``RandomState`` is used instead of ``np.random.seed`` so
        that running this stage does not reseed NumPy's process-wide
        generator. ``RandomState(seed)`` produces the same sequence that
        ``np.random.seed(seed)`` followed by ``np.random.uniform`` would.
        """
        self._rng = np.random.RandomState(self.config.seed)

    def _addNoise(self):  # pragma: no cover
        """Copy the input table, prepend the random column, store as output."""
        if self._rng is None:
            # Tolerate being called without a prior _initNoiseModel().
            self._initNoiseModel()
        data = self.get_data('input')
        noisy = data.copy()
        noisy.insert(0, self.config.col_name, self._rng.uniform(size=len(noisy)))
        self.add_data("output", noisy)
19 changes: 10 additions & 9 deletions src/rail/creation/degradation/quantityCut.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@
from numbers import Number

import numpy as np
from rail.creation.degrader import Degrader
from rail.creation.selector import Selector


class QuantityCut(Degrader):
class QuantityCut(Selector):
"""Degrader that applies a cut to the given columns.
Note that if a galaxy fails any of the cuts on any one of its columns, that
galaxy is removed from the sample.
"""

name = "QuantityCut"
config_options = Degrader.config_options.copy()
config_options = Selector.config_options.copy()
config_options.update(cuts=dict)

def __init__(self, args, comm=None):
Expand All @@ -23,7 +23,7 @@ def __init__(self, args, comm=None):
Performs standard Degrader initialization as well as defining the cuts
to be applied.
"""
Degrader.__init__(self, args, comm=comm)
Selector.__init__(self, args, comm=comm)
self.cuts = None
self.set_cuts(self.config["cuts"])

Expand Down Expand Up @@ -82,7 +82,7 @@ def set_cuts(self, cuts: dict):
else:
raise TypeError(bad_cut_msg)

def run(self):
def _select(self):
"""Applies cuts.
Notes
Expand All @@ -97,17 +97,18 @@ def run(self):
columns = set(self.cuts.keys()).intersection(data.columns)

if len(columns) == 0: # pragma: no cover
self.add_data("output", data)
return np.ones(len(data), dtype=int)
else:
# generate a pandas query from the cuts
query = [
f"{col} > {self.cuts[col][0]} & {col} < {self.cuts[col][1]}"
for col in columns
]
query = " & ".join(query)

out_data = data.query(query)
self.add_data("output", out_data)
out_indices = data.query(query).index.values
out_mask = np.zeros(len(data), dtype=int)
out_mask[out_indices] = 1
return out_mask

def __repr__(self): # pragma: no cover
"""Pretty print this object."""
Expand Down
2 changes: 1 addition & 1 deletion src/rail/creation/degrader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from rail.core.data import PqHandle


class Degrader(RailStage):
class Degrader(RailStage): # pragma: no cover
"""Base class Degraders, which apply various degradations to synthetic
photometric data.
Expand Down
83 changes: 83 additions & 0 deletions src/rail/creation/noisifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Abstract base class defining a noisifier.
The key feature here is that run() adds noise to the catalog.
Intended subclasses are noisifiers that add LSST noise or other telescope noise.
"""

from rail.core.stage import RailStage
from rail.core.data import PqHandle


class Noisifier(RailStage):
    """Base stage for adding noise to an input catalog.

    A Noisifier reads its ``input`` handle and writes its ``output`` handle,
    both declared as ``PqHandle`` (pandas dataframes stored as Parquet
    files). Concrete noise models are supplied by subclasses, which must
    override ``_initNoiseModel()`` and ``_addNoise()``.
    """

    name = 'Noisifier'
    config_options = RailStage.config_options.copy()
    config_options.update(seed=1337)
    inputs = [('input', PqHandle)]
    outputs = [('output', PqHandle)]

    def __init__(self, args, comm=None):
        """Set up the stage by delegating to ``RailStage.__init__``."""
        RailStage.__init__(self, args, comm=comm)

    def _initNoiseModel(self):  # pragma: no cover
        """Prepare the noise model; must be overridden by subclasses."""
        raise NotImplementedError("Noisifier._initNoiseModel()")

    def _addNoise(self):  # pragma: no cover
        """Apply the noise and store the output; must be overridden."""
        raise NotImplementedError("Noisifier._addNoise()")

    def __call__(self, sample, seed: int = None):
        """Attach ``sample`` as input, run the stage and return its output.

        The steps are, in order: optionally override the configured seed,
        attach ``sample`` to the ``input`` tag, call ``run()`` (which calls
        the subclass hooks ``_initNoiseModel()`` and ``_addNoise()``), call
        ``finalize()``, and return the handle stored under ``output``.

        Parameters
        ----------
        sample : table-like
            The sample to add noise to.
        seed : int, default=None
            If not None, replaces ``self.config.seed`` before running.

        Returns
        -------
        output_data : PqHandle
            A handle giving access to the table with noise added.
        """
        # Only overwrite the configured seed when the caller supplied one.
        if seed is not None:
            self.config.seed = seed

        self.set_data('input', sample)
        self.run()
        self.finalize()

        output_handle = self.get_handle('output')
        return output_handle

    def run(self):
        """Initialize the noise model, then add the noise."""
        self._initNoiseModel()
        self._addNoise()


75 changes: 75 additions & 0 deletions src/rail/creation/selector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Abstract base class defining a selector.
The key feature here is making a selection on either the photometric or the spectroscopic catalog.
Intended subclasses are spectroscopic selection, probability selection on a grid for the photometry,
or pure photometric selection.
"""

from ceci.config import StageParameter as Param
from rail.core.stage import RailStage
from rail.core.data import PqHandle


class Selector(RailStage):
    """Base stage for applying a selection to a catalog.

    A Selector reads its ``input`` handle and writes its ``output`` handle,
    both declared as ``PqHandle`` (pandas dataframes stored as Parquet
    files). Subclasses implement ``_select()``, which returns a per-row
    mask where a truthy value means the row is selected.
    """

    name = 'Selector'
    config_options = RailStage.config_options.copy()
    config_options.update(
        # run() keeps the selected rows, so the rows removed are the
        # ones that are NOT selected.
        drop_rows=Param(bool, True, msg="Drop rows that are not selected from the output table"),
    )
    inputs = [('input', PqHandle)]
    outputs = [('output', PqHandle)]

    def __init__(self, args, comm=None):
        """Initialize the Selector by delegating to ``RailStage.__init__``."""
        RailStage.__init__(self, args, comm=comm)

    def __call__(self, sample):
        """Attach ``sample`` as input, run the selection and return the output.

        ``run()`` obtains a selection mask from ``_select()`` (1 means
        selected, 0 means not selected). If the ``drop_rows`` option is
        True, only the selected rows appear in the output; otherwise all
        rows are kept and the mask is added as a ``flag`` column.
        ``finalize()`` is then called and the ``output`` handle returned.

        Parameters
        ----------
        sample : table-like
            The sample to be selected from.

        Returns
        -------
        output_data : PqHandle
            A handle giving access to the table with the selection applied.
        """
        self.set_data('input', sample)
        self.run()
        self.finalize()
        return self.get_handle('output')

    def run(self):
        """Apply the mask from ``_select()`` and store the result as output."""
        data = self.get_data('input')
        selection_mask = self._select()
        if self.config['drop_rows']:
            # Keep only the rows whose mask entry is truthy.
            out_data = data[selection_mask.astype(bool)]
        else:
            # Keep every row; expose the mask as the first column instead.
            out_data = data.copy()
            out_data.insert(0, 'flag', selection_mask)
        self.add_data("output", out_data)

    def _select(self):  # pragma: no cover
        """Return the per-row selection mask; must be overridden."""
        raise NotImplementedError("Selector._select()")
27 changes: 23 additions & 4 deletions tests/creation/test_degraders.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from rail.core.data import DATA_STORE, TableHandle
from rail.core.util_stages import ColumnMapper
from rail.creation.degradation.quantityCut import QuantityCut
# from rail.creation.degradation.spectroscopic_selections import *
from rail.creation.degradation.addRandom import AddColumnOfRandom


@pytest.fixture
Expand Down Expand Up @@ -71,14 +71,33 @@ def test_QuantityCut_returns_correct_shape(data):
"""Make sure QuantityCut is returning the correct shape"""

cuts = {
"u": 0,
"y": (1, 2),
"u": 30,
"redshift": (1, 2),
}
degrader = QuantityCut.make_stage(cuts=cuts)

degraded_data = degrader(data).data

assert degraded_data.shape == data.data.query("u < 0 & y > 1 & y < 2").shape
assert degraded_data.shape == data.data.query("u < 30 & redshift > 1 & redshift < 2").shape
os.remove(degrader.get_output(degrader.get_aliased_tag("output"), final_name=True))


degrader_w_flag = QuantityCut.make_stage(name="degrader_w_flag", cuts=cuts, drop_rows=False)

degraded_data_w_flag = degrader_w_flag(data).data

test_mask = np.zeros(len(data.data), dtype=int)
out_indices = data.data.query("u < 30 & redshift > 1 & redshift < 2").index.values
test_mask[out_indices] = 1

assert (degraded_data_w_flag['flag'] == test_mask).all()
os.remove(degrader_w_flag.get_output(degrader_w_flag.get_aliased_tag("output"), final_name=True))



def test_add_random(data):
    """Make sure AddColumnOfRandom adds one random value per input row."""

    add_random = AddColumnOfRandom.make_stage()

    test_data = add_random(data, seed=1234).data
    random_col = test_data[add_random.config.col_name]

    # The seed passed to __call__ must replace the configured one.
    assert add_random.config.seed == 1234
    assert len(random_col) == len(data.data)
    # uniform() with default bounds draws from [0, 1).
    assert ((random_col >= 0) & (random_col < 1)).all()

    # Clean up the output file, as the QuantityCut test in this module does.
    os.remove(add_random.get_output(add_random.get_aliased_tag("output"), final_name=True))

0 comments on commit 7c310cf

Please sign in to comment.