Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to be able to ignore N-sample while LST-binning #932

Merged
merged 6 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 49 additions & 38 deletions hera_cal/lstbin.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def profile(fnc):

logger = logging.getLogger(__name__)


def baselines_same_across_nights(data_list):
"""
Check whether the sets of baselines in the datacontainers are consistent.
Expand Down Expand Up @@ -59,9 +60,10 @@ def baselines_same_across_nights(data_list):
same_across_nights = np.all([baseline_counts[k] == baseline_counts[bl] for bl in baseline_counts])
return same_across_nights


@profile
def lst_bin(data_list, lst_list, flags_list=None, nsamples_list=None, dlst=None, begin_lst=None, lst_low=None,
lst_hi=None, flag_thresh=0.7, atol=1e-10, median=False, truncate_empty=True,
lst_hi=None, flag_thresh=0.7, atol=1e-10, median=False, truncate_empty=True, weight_by_nsamples=True,
sig_clip=False, sigma=4.0, min_N=4, flag_below_min_N=False, return_no_avg=False, antpos=None,
rephase=False, freq_array=None, lat=-30.72152, verbose=True, bl_list=None):
"""
Expand Down Expand Up @@ -94,6 +96,8 @@ def lst_bin(data_list, lst_list, flags_list=None, nsamples_list=None, dlst=None,
median=True is not supported when nsamples_list is not None.
truncate_empty : type=boolean, if True, truncate output time bins that have
no averaged data in them.
weight_by_nsamples : type=boolean, if True, weight by nsamples during lst binning;
if False, only weight by flags.
sig_clip : type=boolean, if True, perform a sigma clipping algorithm of the LST bins on the
real and imag components separately. Resultant clip flags are OR'd between real and imag.
Warning: This is considerably slow.
Expand Down Expand Up @@ -211,9 +215,9 @@ def lst_bin(data_list, lst_list, flags_list=None, nsamples_list=None, dlst=None,
# iterate over keys in d
klist = list(d.keys())
for j, key in enumerate(klist):
if j % max(1, (len(klist)//100)) == 0:
if j % max(1, (len(klist) // 100)) == 0:
logger.info(f"Doing key {key} [{j+1}/{len(klist)}]")

# if bl_list is not None, use it to determine conjugation:
# this is to prevent situations where conjugation of bl in
# data_list is different from bl in data which can cause
Expand Down Expand Up @@ -316,7 +320,6 @@ def lst_bin(data_list, lst_list, flags_list=None, nsamples_list=None, dlst=None,
# wrap lst_bins if needed
lst_bins = lst_bins % (2 * np.pi)


# return un-averaged data if desired
if return_no_avg:
# return all binned data instead of just the bin average
Expand Down Expand Up @@ -346,7 +349,7 @@ def lst_bin(data_list, lst_list, flags_list=None, nsamples_list=None, dlst=None,
bin_count = []
# iterate over sorted LST grid indices in data[key]
for j, ind in enumerate(sorted(data[key].keys())):

# make data and flag arrays from lists
d = np.array(data[key][ind]) # shape = (Ndays x Nfreqs)
f = np.array(flags[key][ind])
Expand Down Expand Up @@ -400,18 +403,22 @@ def lst_bin(data_list, lst_list, flags_list=None, nsamples_list=None, dlst=None,
# (inverse variance weighted sum).
isfinite = np.isfinite(d)
n[~isfinite] = 0.0

norm = np.sum(n, axis=0).clip(1e-99, np.inf)
if weight_by_nsamples:
wgt = np.copy(n)
else:
wgt = np.ones_like(n)
wgt[~isfinite] = 0.0
norm = np.sum(wgt, axis=0).clip(1e-99, np.inf)
mask = norm > 0
real_avg_t = np.full(d.shape[1:], np.nan)
imag_avg_t = np.full(d.shape[1:], np.nan)
sm = np.nansum(d * n, axis=0)

sm = np.nansum(d * wgt, axis=0)
sm[mask] /= norm[mask]

real_avg_t[mask] = sm.real[mask]
imag_avg_t[norm>0] = sm.imag[mask]
imag_avg_t[norm > 0] = sm.imag[mask]

# add back nans as np.nansum sums nan slices to 0
flagged_f = np.logical_not(isfinite).all(axis=0)
real_avg_t[flagged_f] = np.nan
Expand Down Expand Up @@ -542,6 +549,7 @@ def lst_bin_arg_parser():
a.add_argument("--outdir", default=None, type=str, help="directory for writing output")
a.add_argument("--overwrite", default=False, action='store_true', help="overwrite output files")
a.add_argument("--lst_start", type=float, default=None, help="starting LST for binner as it sweeps across 2pi LST. Default is first LST of first file.")
a.add_argument("--weight_by_nsamples", default=True, action='store_true', help="Weight by nsamples during LST binning. If set to False, weight by flags only. Default True.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this will work. As far as I know, there's no way to set the flag to False on the command line. So I think you'll need to use something like --weight-by-flags-only

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha, thanks! I will switch all the options to weight_by_flags_only.

a.add_argument("--sig_clip", default=False, action='store_true', help="perform robust sigma clipping before binning")
a.add_argument("--sigma", type=float, default=4.0, help="sigma threshold for sigma clipping")
a.add_argument("--min_N", type=int, default=4, help="minimum number of points in bin needed to proceed with sigma clipping")
Expand Down Expand Up @@ -644,7 +652,7 @@ def config_lst_bin_files(data_files, dlst=None, atol=1e-10, lst_start=None, verb

def lst_bin_files(data_files, input_cals=None, dlst=None, verbose=True, ntimes_per_file=60,
file_ext="{type}.{time:7.5f}.uvh5", outdir=None, overwrite=False, history='', lst_start=None,
atol=1e-6, sig_clip=True, sigma=5.0, min_N=5, flag_below_min_N=False, rephase=False,
atol=1e-6, weight_by_nsamples=True, sig_clip=True, sigma=5.0, min_N=5, flag_below_min_N=False, rephase=False,
output_file_select=None, Nbls_to_load=None, ignore_flags=False, average_redundant_baselines=False,
bl_error_tol=1.0, include_autos=True, ex_ant_yaml_files=None, Nblgroups=None, **kwargs):
"""
Expand Down Expand Up @@ -674,6 +682,8 @@ def lst_bin_files(data_files, input_cals=None, dlst=None, verbose=True, ntimes_p
overwrite : type=bool, if True overwrite output files
history : history to insert into output files
rephase : type=bool, if True, rephase data points in LST bin to center of bin
weight_by_nsamples : type=boolean, if True, weight by nsamples during lst binning;
if False, only weight by flags.
bin_kwargs : type=dictionary, keyword arguments for lst_bin.
atol : type=float, absolute tolerance for LST bin float comparison
output_file_select : type=int or integer list, list of integer indices of the output files to run on.
Expand Down Expand Up @@ -763,7 +773,7 @@ def lst_bin_files(data_files, input_cals=None, dlst=None, verbose=True, ntimes_p
if Nblgroups is None:
if Nbls_to_load in [None, 'None', 'none']:
Nbls_to_load = len(bl_nightly_dicts) + 1

Nblgroups = len(bl_nightly_dicts) // Nbls_to_load + 1
else:
Nbls_to_load = len(bl_nightly_dicts) // Nblgroups
Expand Down Expand Up @@ -865,7 +875,7 @@ def lst_bin_files(data_files, input_cals=None, dlst=None, verbose=True, ntimes_p
uvc.select(times=uvc.time_array[tinds])
gains, cal_flags, _, _ = uvc.build_calcontainers()
apply_cal.calibrate_in_place(data, gains, data_flags=flags, cal_flags=cal_flags,
gain_convention=uvc.gain_convention)
gain_convention=uvc.gain_convention)
utils.echo("Done with calibration.", verbose=verbose)

# redundantly average baselines, keying to baseline group key
Expand Down Expand Up @@ -920,7 +930,8 @@ def lst_bin_files(data_files, input_cals=None, dlst=None, verbose=True, ntimes_p
utils.echo("About to run LST binning...")
(bin_lst, bin_data, flag_data, std_data,
num_data) = lst_bin(data_list, lst_list, flags_list=flgs_list, dlst=dlst, begin_lst=begin_lst,
lst_low=fmin, lst_hi=fmax, truncate_empty=False, sig_clip=sig_clip, nsamples_list=nsamples_list,
lst_low=fmin, lst_hi=fmax, truncate_empty=False, sig_clip=sig_clip,
weight_by_nsamples=weight_by_nsamples, nsamples_list=nsamples_list,
sigma=sigma, min_N=min_N, flag_below_min_N=flag_below_min_N, rephase=rephase, freq_array=freq_array,
antpos=antpos, bl_list=all_blgroup_baselines)
# append to lists
Expand Down Expand Up @@ -973,9 +984,9 @@ def lst_bin_files(data_files, input_cals=None, dlst=None, verbose=True, ntimes_p


def make_lst_grid(
dlst: float,
begin_lst: float | None = None,
lst_width: float = 2*np.pi,
dlst: float,
begin_lst: float | None = None,
lst_width: float = 2 * np.pi,
) -> np.ndarray:
"""
Make a uniform grid in local sidereal time.
Expand All @@ -986,17 +997,17 @@ def make_lst_grid(

Parameters:
-----------
dlst :
dlst :
The width of a single LST bin in radians. 2pi must be equally divisible
by dlst. If not, will default to the closest dlst that satisfies this criterion that
is also greater than the input dlst. There is a minimum allowed dlst of 6.283e-6 radians,
or .0864 seconds.
begin_lst
Beginning point for lst_grid. ``begin_lst`` must fall exactly on an LST bin
given a dlst, within 0-2pi. If not, it is replaced with the closest bin.
Beginning point for lst_grid. ``begin_lst`` must fall exactly on an LST bin
given a dlst, within 0-2pi. If not, it is replaced with the closest bin.
Default is zero radians.
lst_width
The width of the LST grid (including all bins) in radians.
The width of the LST grid (including all bins) in radians.
Default is 2pi radians. Note that regardless of this value, the final grid
will always be equally divisible by 2pi.

Expand All @@ -1010,7 +1021,7 @@ def make_lst_grid(

# check 2pi is equally divisible by dlst
if not (
np.isclose((2 * np.pi / dlst) % 1, 0.0, atol=1e-5)
np.isclose((2 * np.pi / dlst) % 1, 0.0, atol=1e-5)
or np.isclose((2 * np.pi / dlst) % 1, 1.0, atol=1e-5)
):
# generate array of appropriate dlsts
Expand All @@ -1027,7 +1038,7 @@ def make_lst_grid(
dlst = new_dlst

# make an lst grid from [0, 2pi), with the first bin having a left-edge at 0 radians.
lst_grid = np.arange(0, 2*np.pi - 1e-7, dlst) + dlst / 2
lst_grid = np.arange(0, 2 * np.pi - 1e-7, dlst) + dlst / 2

# shift grid by begin_lst
if begin_lst is not None:
Expand All @@ -1041,17 +1052,17 @@ def make_lst_grid(
begin_lst = 0.0

lst_grid = lst_grid[lst_grid < (begin_lst + lst_width)]

return lst_grid


def sigma_clip(array: np.ndarray | np.ma.MaskedArray, sigma: float=4.0, min_N: int=4, axis: int = 0):
def sigma_clip(array: np.ndarray | np.ma.MaskedArray, sigma: float = 4.0, min_N: int = 4, axis: int = 0):
"""
One-iteration robust sigma clipping algorithm.

Parameters
----------
array
array
ndarray of *real* data.
sigma
sigma threshold to cut above.
Expand All @@ -1063,8 +1074,8 @@ def sigma_clip(array: np.ndarray | np.ma.MaskedArray, sigma: float=4.0, min_N: i

Output
------
clip_flags
A boolean array with same shape as input array,
clip_flags
A boolean array with same shape as input array,
with clipped values set to True.
"""
# ensure array is an array
Expand Down Expand Up @@ -1092,28 +1103,28 @@ def sigma_clip(array: np.ndarray | np.ma.MaskedArray, sigma: float=4.0, min_N: i


def gen_bl_nightly_dicts(
hds: list[io.HERAData],
bl_error_tol: float=1.0,
include_autos: bool=True,
redundant: bool=False,
ex_ant_yaml_files: list[str] | None=None
hds: list[io.HERAData],
bl_error_tol: float = 1.0,
include_autos: bool = True,
redundant: bool = False,
ex_ant_yaml_files: list[str] | None = None
) -> list[dict[int, list[tuple[int, int]]]]:
"""
Helper function to generate baseline dicts to keep track of reds between nights.

Parameters
----------
hds
A list of HERAData objects. Can have no data loaded (preferable) and should
A list of HERAData objects. Can have no data loaded (preferable) and should
refer to single files.
bl_error_tol
Baselines whose vector difference are within this tolerance are considered
redundant. In units of m.
include_autos
Whether to include autos in bl_nightly_dicts.
redundant
If True, ``bl_nightly_dict`` stores redundant group. If False, each
``bl_nightly_dict`` will contain a length-1 group for each baseline over all
If True, ``bl_nightly_dict`` stores redundant group. If False, each
``bl_nightly_dict`` will contain a length-1 group for each baseline over all
the nights.
ex_ant_yaml_files
A list of paths to yaml files with antennas to throw out on each night.
Expand All @@ -1122,7 +1133,7 @@ def gen_bl_nightly_dicts(
Outputs:
--------
If redundant:
list of dictionaries of the form
list of dictionaries of the form
``{0: [(a0, b0), (a0, c0)...], 1: [(a1, b1),.., ], ... Nnight: [(ANnight, BNnight), ...,]}.
Each dictionary represents a unique baseline length and orientation.
where the key of each dictionary is an index for each night to be LST binned and each value
Expand Down
41 changes: 40 additions & 1 deletion hera_cal/tests/test_lstbin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from ..datacontainer import DataContainer
from ..data import DATA_PATH
import shutil


@pytest.mark.filterwarnings("ignore:The default for the `center` keyword has changed")
@pytest.mark.filterwarnings("ignore:Degrees of freedom <= 0 for slice")
@pytest.mark.filterwarnings("ignore:Mean of empty slice")
Expand Down Expand Up @@ -134,7 +136,7 @@ def test_lstbin(self):
output = lstbin.lst_bin(self.data_list, self.lst_list, flags_list=None, dlst=0.01,
verbose=False, sig_clip=True, min_N=15, flag_below_min_N=True, sigma=2)
# test wrapping
lst_list = [(copy.deepcopy(l) + 6) % (2 * np.pi) for l in self.lst_list]
lst_list = [(copy.deepcopy(_lst) + 6) % (2 * np.pi) for _lst in self.lst_list]
output = lstbin.lst_bin(self.data_list, lst_list, dlst=0.001, begin_lst=np.pi)
assert output[0][0] > output[0][-1]
assert len(output[0]) == 175
Expand Down Expand Up @@ -168,6 +170,33 @@ def test_lstbin(self):
assert np.all(np.isclose(output[2][k], output2[2][k]))
assert np.all(np.isclose(output[1][k], output2[1][k]))

    # test weight_by_nsamples: nsamples are still propagated, but data is not weighted by nsamples when set to False
output1 = lstbin.lst_bin(self.data_list, self.lst_list, dlst=dlst,
flags_list=self.flgs_list, nsamples_list=self.nsmp_list,
weight_by_nsamples=True)

nsmps1 = copy.deepcopy(self.nsmps1)
nsmps1[(24, 25, 'ee')][:, 32] = 0
nsmps2 = copy.deepcopy(self.nsmps2)
nsmps2[(24, 25, 'ee')][:, 32] = 0
nsmps3 = copy.deepcopy(self.nsmps3)
nsmps3[(24, 25, 'ee')][:, 32] = 0
nsmps_list = [nsmps1, nsmps2, nsmps3]
output = lstbin.lst_bin(self.data_list, self.lst_list, dlst=dlst,
flags_list=self.flgs_list, nsamples_list=nsmps_list,
weight_by_nsamples=True)
# Check Nsamples are all 0
assert np.allclose(output[-1][(24, 25, 'ee')].real[:, 32], 0)
# Check data got weighted sum to 0
assert np.allclose(output[1][(24, 25, 'ee')].real[100, 32], 0)
output = lstbin.lst_bin(self.data_list, self.lst_list, dlst=dlst,
flags_list=self.flgs_list, nsamples_list=nsmps_list,
weight_by_nsamples=False)
# Check Nsamples are all 0
assert np.allclose(output[-1][(24, 25, 'ee')].real[:, 32], 0)
# Check data is the same as before
assert np.allclose(output[1][(24, 25, 'ee')].real, output1[1][(24, 25, 'ee')].real)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we please extract this into its own test?

def test_lstbin_vary_nsamps(self):
# test execution
pytest.raises(NotImplementedError, lstbin.lst_bin, self.data_list, self.lst_list, flags_list=self.flgs_list,
Expand Down Expand Up @@ -244,6 +273,16 @@ def test_lst_bin_files(self):
os.remove(output_lst_file)
os.remove(output_std_file)

# test weight_by_nsamples
lstbin.lst_bin_files(self.data_files, ntimes_per_file=250, outdir="./", overwrite=True,
verbose=False, rephase=True, weight_by_nsamples=False, file_ext=file_ext)
output_lst_file = "./zen.ee.LST.0.20124.uvh5"
output_std_file = "./zen.ee.STD.0.20124.uvh5"
assert os.path.exists(output_lst_file)
assert os.path.exists(output_std_file)
os.remove(output_lst_file)
os.remove(output_std_file)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also extract this into its own test

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, for this one I did not add anything new — I just verified that running with this new option raises no error, so it falls into the "# basic execution" category, similar to the tests above that exercise the rephase option. Does this still need to be its own separate test?

# test data_list is empty
data_files = [[sorted(glob.glob(DATA_PATH + '/zen.2458043.*XRAA.uvh5'))[0]],
[sorted(glob.glob(DATA_PATH + '/zen.2458045.*XRAA.uvh5'))[-1]]]
Expand Down
Loading