# Season Data Quality Summary

**Josh Dillon**, Last Revised January 2022

This notebook parses information from each nightly `rtp_summary` notebook (as saved to .csvs) and builds a variety of useful tables. It is designed to highlight general trends across days.

In [26]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 1000)
import matplotlib.pyplot as plt
import matplotlib
import glob
import re
import os
from pathlib import Path
from astropy.time import Time
from IPython.display import display, HTML
from hera_notebook_templates.utils import status_colors, Antenna
import yaml
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
display(HTML("<style>.container { width:100% !important; }</style>"))

In [6]:
# Default location of the a-priori flag YAMLs, used only when A_PRIORI_YAML_FOLDER is not
# already set in the environment. To run this notebook locally, either export that variable
# before starting Jupyter or copy the output of the next cell into the next line of this cell.
yaml_folder = Path('/lustre/aoc/projects/hera/h6c-analysis/day_flags/0/')
# setdefault (rather than plain assignment) keeps a folder supplied through the environment
# instead of silently overwriting it with the hard-coded default above.
os.environ.setdefault("A_PRIORI_YAML_FOLDER", str(yaml_folder))

In [7]:
# Use the A_PRIORI_YAML_FOLDER environment variable to locate the a-priori flag YAMLs,
# and echo the result in a form that can be pasted into the local-run cell.
folder_from_env = os.environ["A_PRIORI_YAML_FOLDER"]
yaml_folder = Path(folder_from_env)
print(f'yaml_folder = "{yaml_folder}"')

yaml_folder = "/lustre/aoc/projects/hera/h6c-analysis/day_flags/0"


In [29]:
# Collect every '*_flags.yaml' file in the folder. Sorting the paths is what puts them in
# chronological order (presumably because each filename embeds the night's JD -- confirm).
yamls = list(yaml_folder.glob('*_flags.yaml'))
yamls.sort()
print(f'Found {len(yamls)} a-priori YAMLS in {yaml_folder}')

Found 25 a-priori YAMLS in /lustre/aoc/projects/hera/h6c-analysis/day_flags/0


In [None]:
# 

In [None]:
# Per-season options
def jd_to_summary_url(jd):
    """Return the htmlpreview.github.io link to the rtp_summary HTML page for `jd`.

    `jd` is interpolated verbatim into the URL, so any value with a suitable
    string form (e.g. an integer JD) can be passed.
    """
    summary_url = f'https://htmlpreview.github.io/?https://github.com/HERA-Team/H6C_Notebooks/blob/main/_rtp_summary_/rtp_summary_{jd}.html'
    return summary_url

def ant_to_report_url(ant):
    """Return the htmlpreview.github.io link to the antenna report HTML page for `ant`.

    `ant` is interpolated verbatim into the URL, so any value with a suitable
    string form (e.g. an integer antenna number) can be passed.
    """
    report_url = f'https://htmlpreview.github.io/?https://github.com/HERA-Team/H6C_Notebooks/blob/main/antenna_report/antenna_{ant}_report.html'
    return report_url

## Load Data

In [37]:
# Parse every flag YAML into `data`, keyed by the integer JD embedded in its filename.
data = {}
for yaml_path in yamls:
    fname = yaml_path.name
    # The JD is taken as the 7 characters starting at the first occurrence of "245".
    jd_start = fname.index("245")
    int_jd = int(fname[jd_start:jd_start + 7])
    with open(yaml_path, 'r') as fl:
        data[int_jd] = yaml.load(fl, Loader=yaml.SafeLoader)

In [38]:
# Show the integer JDs for which a flag YAML was loaded
data.keys()

dict_keys([2459847, 2459849, 2459851, 2459853, 2459854, 2459857, 2459858, 2459859, 2459860, 2459861, 2459862, 2459863, 2459864, 2459865, 2459866, 2459867, 2459868, 2459869, 2459870, 2459871, 2459872, 2459873, 2459874, 2459875, 2459876])

In [39]:
# Calculate the total amount of time that is flagged
hours_flagged = 0
for jd, d in data.items():
    for flg in d['JD_flags']:
        hours_flagged += (flg[1] - flg[0])*24

In [40]:
# Flagged fraction = flagged hours / (number of nights * 8).
# NOTE(review): 8 is presumably the nominal number of observing hours per night -- confirm.
total_hours = len(data) * 8
hours_flagged_frac = hours_flagged / total_hours

In [41]:
# Denominator of the flagged fraction: number of loaded nights times 8
# (presumably hours per night -- confirm)
len(data)*8

200

In [44]:
# Report the flagged time both in absolute hours and as a percentage of len(data)*8 hours
print(f"Total number of hours flagged: {hours_flagged:.2f}")
print(f"Percentage of hours flagged: { hours_flagged_frac * 100:.2f}%")

Total number of hours flagged: 26.62
Percentage of hours flagged: 13.31%


In [45]:
# Total flagged antenna-nights: the length of each night's 'ex_ants' entry, summed over nights
ant_nights_flagged = sum(len(night['ex_ants']) for night in data.values())

In [46]:
# Show the summed length of 'ex_ants' over all loaded nights
ant_nights_flagged

2672

In [47]:
# Mean number of 'ex_ants' entries per loaded night
print("Average number of ants flagged per night: ", ant_nights_flagged / len(data))

Average number of ants flagged per night:  106.88


In [None]:
def antenna_flag_plot():
    """Plot antennas in the data and unflagged antenna-days versus date and JD.

    Takes no arguments. It reads the notebook-level names `jds` and `to_show`
    (using the 'Ants in Data' and 'Unflagged Antenna-Days' entries of the latter),
    so both must exist when the function is called. The bottom x-axis shows UTC
    dates; a twin top x-axis shows JD minus `floor_jd`.

    NOTE(review): neither `jds` nor a `to_show` with these two keys is assigned
    anywhere in the visible notebook -- confirm where they are meant to come from.
    """
    fig, ax = plt.subplots(1, 1, figsize=(14,7), dpi=100)
    utc = Time(jds, format='jd').datetime
    # Smallest JD rounded down to a multiple of 1000, used as the offset on the top axis
    floor_jd = int(np.floor(np.min(jds) / 1000) * 1000)
    # Axes.plot handles datetime x-values directly; the older Axes.plot_date is deprecated.
    ax.plot(utc, to_show['Ants in Data'], '.-', ms=10, label='Antennas in Data')
    ax.plot(utc, to_show['Unflagged Antenna-Days'], 'r.-', ms=10, label='Unflagged Antenna-Days')
    # Re-plot the unflagged series against JD on a twin axis so the top axis gets JD ticks
    ax2 = ax.twiny()
    ax2.plot(np.array(jds) - floor_jd, to_show['Unflagged Antenna-Days'], 'r.', ms=10)
    ax2.set_xlabel(f'JD - {floor_jd}')
    ax.set_ylabel('Number of Antennas')
    ax.set_xlabel('Date')
    ax.legend()
    ax2.grid()
    ax.grid(axis='y')

# Figure 1: (Unflagged) Antennas in the Array

This plot shows the progression of the number of antennas in the data and the number of antenna-days that are not automatically flagged.

In [None]:
# Draw Figure 1; the function reads the notebook-level `jds` and `to_show`
antenna_flag_plot()    

# Table 1: Per Night Flagging

This table shows how many antennas were in the data for each night, the effective number of unflagged antenna-days, and the fraction of antenna-days flagged by:
* `auto_metrics` for bad autocorrelation shape, amplitude, or temporal structure,
* `ant_metrics` for low/no correlations or for having polarizations swapped, or
* `redcal` for non-redundancy. 

Note that an antenna can be flagged by both `auto_metrics` and `ant_metrics`, so those quantities may add up to more than 100%. However, an antenna flagged by either `auto_metrics` (for the night) or `ant_metrics` (for the file) will be excluded from `redcal`, so these percentages will tend to be small. Dashes indicate missing metric data, likely because part of the RTP failed to run. Also included are links to individual `rtp_summary` notebooks.

In [None]:
# Render the styled table as HTML.
# NOTE(review): `table` is not assigned anywhere above this cell in the visible notebook
# (its first assignment is in a later cell), so this cell relies on state that a fresh
# top-to-bottom run would not have -- confirm where the Table 1 build step belongs.
HTML(table.render(render_links=True, escape=False))

In [None]:
# Build table for antenna flagging
# NOTE(review): `antennas` and `jds` are not assigned anywhere in the visible notebook;
# `antennas` is used as a mapping from antenna number to an object exposing .node, .statuses,
# .total_flags, .unflagged_days() and .most_common_flag_rationale() (presumably the imported
# `Antenna` class -- confirm).
antnums = sorted(antennas.keys())
# First column: antenna number rendered as a link to that antenna's report page
to_show = {'Ant': [f'<a href="{ant_to_report_url(antnum)}" target="_blank">{antnum}</a>' for antnum in antnums]}
to_show['Node'] = [antennas[ant].node for ant in antnums]

# Status stored under the largest key of each antenna's `statuses` mapping
to_show['Most Recent Status'] = [antennas[ant].statuses[np.max(list(antennas[ant].statuses.keys()))] for ant in antnums]

# Season-wide columns: number of entries in total_flags, unflagged days, and 1 - unflagged/total
to_show['Total Days This Season'] = [len(antennas[ant].total_flags) for ant in antnums]
to_show['Unflagged Days This Season'] = [antennas[ant].unflagged_days() for ant in antnums]
to_show['Flag % This Season'] = 1 - np.array(to_show['Unflagged Days This Season']) / np.array(to_show['Total Days This Season'])
to_show['Top Flag Rationale (Season)'] = [antennas[ant].most_common_flag_rationale() for ant in antnums]

# "This Week" columns repeat the above restricted to the first (up to) seven entries of `jds`
# (presumably `jds` is ordered most-recent-first -- confirm)
to_show['Total Days This Week'] = [len([antennas[ant].total_flags[jd] for jd in jds[0:min(len(jds), 7)] if jd in antennas[ant].total_flags]) for ant in antnums]
to_show['Unflagged Days This Week'] = [antennas[ant].unflagged_days(jds[0:min(len(jds), 7)]) for ant in antnums]
to_show['Flag % This Week'] = 1 - np.array(to_show['Unflagged Days This Week']) / np.array(to_show['Total Days This Week'])
to_show['Top Flag Rationale (Week)'] = [antennas[ant].most_common_flag_rationale(jds[0:min(len(jds), 7)]) for ant in antnums]

# One column per JD: the header is an HTML link to that night's summary, the values are
# total_flags[jd], or NaN when the antenna has no entry for that JD
jd_links = []
for jd in jds:
    jd_links.append(f'<a href="{jd_to_summary_url(jd)}" target="_blank">{jd}</a>')
    to_show[jd_links[-1]] = [antennas[ant].total_flags[jd] if jd in antennas[ant].total_flags else np.nan for ant in antnums ]
df = pd.DataFrame(to_show)

# Style the table: color the status cell from `status_colors`, format the per-JD and
# 'Flag %' columns as percentages, show '-' for missing values, shade the per-JD columns
# on a fixed 0-1 scale, and draw bars in the two 'Flag %' columns.
# NOTE(review): Styler.hide_index and Styler.applymap are deprecated in newer pandas --
# confirm against the pandas version this notebook is pinned to.
table = df.style.hide_index()\
          .applymap(lambda val: f'background-color: {status_colors[val]}' if val in status_colors else '', subset=['Most Recent Status']) \
          .format('{:,.0%}', na_rep='-', subset=jd_links + ['Flag % This Season', 'Flag % This Week']) \
          .format('{:}', na_rep='-', subset=['Top Flag Rationale (Season)', 'Top Flag Rationale (Week)']) \
          .format('{:,.1f}', na_rep='-', subset=['Unflagged Days This Season', 'Unflagged Days This Week'] ) \
          .background_gradient(cmap='plasma', vmax=1, vmin=0, axis=None, subset=jd_links) \
          .bar(subset=['Flag % This Season', 'Flag % This Week'], vmin=0, vmax=1) 

# Table 2: Per-Antenna, Per-Night Flagging

This table summarizes information about individual antennas for the whole season and in the last seven days. This includes the number of days the antenna was in the data, what fraction of them were flagged, and how many unflagged days that leaves. It also attempts to figure out which metric led to the most flagging, by seeing which step produced the most flags and which median metric was largest within that step. Finally, it includes per-night flagging fractions.

In [None]:
# Render the styled per-antenna table as HTML.
# NOTE(review): Styler.render is deprecated in newer pandas in favor of Styler.to_html --
# confirm against the pandas version this notebook is pinned to.
HTML(table.render(render_links=True, escape=False))

In [None]:
# Build table for node flagging
# NOTE(review): `nodes` and `jds` are not assigned anywhere in the visible notebook; `nodes`
# is used as a mapping from node key to a collection of antenna objects -- confirm.
# Only string-typed keys of `nodes` are kept.
nodenums = sorted([k for k in nodes.keys() if type(k) == str])

to_show = {'Node': nodenums, 'Number of Ants': [len(nodes[n]) for n in nodenums]}

# Season-wide columns: count of (antenna, jd) pairs present in total_flags, the summed
# unflagged days, and 1 - unflagged/total
to_show['Total Antenna-Days This Season'] = [np.sum([jd in antenna.total_flags for antenna in nodes[n] for jd in jds]) for n in nodenums]
to_show['Unflagged Antenna-Days This Season'] = [np.sum([antenna.unflagged_days() for antenna in nodes[n]]) for n in nodenums]
to_show['Flag % This Season'] = 1 - np.array(to_show['Unflagged Antenna-Days This Season']) / np.array(to_show['Total Antenna-Days This Season'])

# "This Week" columns repeat the above restricted to the first (up to) seven entries of `jds`
# (presumably `jds` is ordered most-recent-first -- confirm)
to_show['Total Antenna-Days This Week'] = [np.sum([jd in antenna.total_flags for antenna in nodes[n] for jd in jds[0:min(len(jds), 7)]]) for n in nodenums]
to_show['Unflagged Antenna-Days This Week'] = [np.sum([antenna.unflagged_days(jds[0:min(len(jds), 7)]) for antenna in nodes[n]]) for n in nodenums]
to_show['Flag % This Week'] = 1 - np.array(to_show['Unflagged Antenna-Days This Week']) / np.array(to_show['Total Antenna-Days This Week'])

# One column per JD: the header is an HTML link to that night's summary, the value is the
# NaN-ignoring sum of total_flags[jd] over the node's antennas divided by the number of
# antennas in that node that have an entry for that JD
jd_links = []
for jd in jds:
    jd_links.append(f'<a href="{jd_to_summary_url(jd)}" target="_blank">{jd}</a>')
    to_show[jd_links[-1]] = [np.nansum([antenna.total_flags[jd] if jd in antenna.total_flags else np.nan for antenna in nodes[n]]) / 
                             np.sum([jd in antenna.total_flags for antenna in nodes[n]]) for n in nodenums]
df = pd.DataFrame(to_show)
# Style the table: percentages for the per-JD and 'Flag %' columns, one decimal for the
# unflagged antenna-day counts, '-' for missing values, a fixed 0-1 gradient on the per-JD
# columns, and bars in the two 'Flag %' columns.
# NOTE(review): Styler.hide_index is deprecated in newer pandas -- confirm against the
# pandas version this notebook is pinned to.
table = df.style.hide_index() \
          .format('{:,.0%}', na_rep='-', subset=jd_links + ['Flag % This Season', 'Flag % This Week'] ) \
          .format('{:,.1f}', na_rep='-', subset=['Unflagged Antenna-Days This Season', 'Unflagged Antenna-Days This Week'] ) \
          .background_gradient(cmap='plasma', vmax=1, vmin=0, axis=None, subset=jd_links) \
          .bar(subset=['Flag % This Season', 'Flag % This Week'], vmin=0, vmax=1) 


# Table 3: Per-Node, Per-Night Flagging

This table summarizes flagging by node, showing how many antennas are in the data from each node, how many antenna-days were observed in each node, and how many of them were unflagged (both for the whole season and the last 7 nights).

In [None]:
# Render the styled per-node table as HTML.
# NOTE(review): Styler.render is deprecated in newer pandas in favor of Styler.to_html --
# confirm against the pandas version this notebook is pinned to.
HTML(table.render(render_links=True, escape=False))