# Season Data Quality Summary

**Josh Dillon**, Last Revised January 2022

This notebook parses information from each nightly `rtp_summary` notebook (as saved to .csvs) and builds a variety of useful tables. It is designed to highlight general trends across days.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 1000)
import matplotlib.pyplot as plt
import matplotlib
import glob
import re
import os
from astropy.time import Time
from IPython.display import display, HTML
from hera_notebook_templates.utils import status_colors
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# If you want to run this notebook locally, copy the output of the next cell into the next line of this cell.
# csv_folder = '/lustre/aoc/projects/hera/H5C/H5C_Notebooks/_rtp_summary_'
# os.environ["CSV_FOLDER"] = csv_folder

In [None]:
# Use environment variables to figure out path to the csvs
try:
    csv_folder = os.environ["CSV_FOLDER"]
except KeyError:
    # Same exception type as before, but with a hint on how to fix it
    raise KeyError('CSV_FOLDER is not set: export it before launching the notebook, '
                   'or set os.environ["CSV_FOLDER"] in the cell above.') from None
# Printed in a copy-pasteable form so it can be reused for a local run
print(f'csv_folder = "{csv_folder}"')

In [None]:
# Collect every csv in csv_folder, newest first.
# NOTE(review): "newest first" relies on the file names sorting lexicographically in JD order -- confirm.
csv_pattern = os.path.join(csv_folder, '*.csv')
csvs = sorted(glob.glob(csv_pattern), reverse=True)
print(f'Found {len(csvs)} csvs in {csv_folder}')

In [None]:
# Per-season options
def jd_to_summary_url(jd):
    '''Returns the htmlpreview.github.io link to the rtp_summary notebook html for the given JD.'''
    preview_prefix = 'https://htmlpreview.github.io/?'
    summary_folder = 'https://github.com/HERA-Team/H5C_Notebooks/blob/main/_rtp_summary_'
    return f'{preview_prefix}{summary_folder}/rtp_summary_{jd}.html'

In [None]:
class Antenna:
    '''Data structure for season data for an individual antenna.

    Every per-night quantity is kept in its own dict keyed by JD. When the csv for a
    night lacks the columns of a given step (auto_metrics, ant_metrics, or redcal),
    that step's quantities are stored as np.nan for that night.
    '''

    # (attribute name, csv column) pairs, grouped by the step that produces them.
    # The FIRST column of each group is the one whose presence decides whether the
    # group is read from the row or filled with np.nan.
    _AUTO_METRICS_COLUMNS = (
        ('auto_flags', 'Auto Metrics Flags'),
        ('ee_shape_zs', 'ee Shape Modified Z-Score'),
        ('nn_shape_zs', 'nn Shape Modified Z-Score'),
        ('ee_power_zs', 'ee Power Modified Z-Score'),
        ('nn_power_zs', 'nn Power Modified Z-Score'),
        ('ee_temp_var_zs', 'ee Temporal Variability Modified Z-Score'),
        ('nn_temp_var_zs', 'nn Temporal Variability Modified Z-Score'),
        ('ee_temp_discon_zs', 'ee Temporal Discontinuties Modified Z-Score'),
        ('nn_temp_discon_zs', 'nn Temporal Discontinuties Modified Z-Score'),
    )
    _ANT_METRICS_COLUMNS = (
        ('dead_flags_Jee', 'Dead Fraction in Ant Metrics (Jee)'),
        ('dead_flags_Jnn', 'Dead Fraction in Ant Metrics (Jnn)'),
        ('crossed_flags', 'Crossed Fraction in Ant Metrics'),
        ('Jee_dead_metrics', 'Average Dead Ant Metric (Jee)'),
        ('Jnn_dead_metrics', 'Average Dead Ant Metric (Jnn)'),
        ('crossed_metrics', 'Average Crossed Ant Metric'),
    )
    _REDCAL_COLUMNS = (
        ('flags_before_redcal', 'Flag Fraction Before Redcal'),
        ('redcal_flags', 'Flagged By Redcal chi^2 Fraction'),
        ('Jee_chisqs', 'Median chi^2 Per Antenna (Jee)'),
        ('Jnn_chisqs', 'Median chi^2 Per Antenna (Jnn)'),
    )

    def __init__(self, number, node=None):
        '''Creates an antenna with no days of data.

        Arguments:
            number: the antenna number
            node: the node this antenna belongs to (optional)
        '''
        self.number = number
        self.node = node

        # a priori status and flag fractions, keyed by JD
        self.statuses = {}
        self.auto_flags = {}
        self.dead_flags_Jee = {}
        self.dead_flags_Jnn = {}
        self.crossed_flags = {}
        self.flags_before_redcal = {}
        self.redcal_flags = {}
        self.total_flags = {}

        # underlying metrics, keyed by JD
        self.ee_shape_zs = {}
        self.nn_shape_zs = {}
        self.ee_power_zs = {}
        self.nn_power_zs = {}
        self.ee_temp_var_zs = {}
        self.nn_temp_var_zs = {}
        self.ee_temp_discon_zs = {}
        self.nn_temp_discon_zs = {}
        self.Jee_dead_metrics = {}
        self.Jnn_dead_metrics = {}
        self.crossed_metrics = {}
        self.Jee_chisqs = {}
        self.Jnn_chisqs = {}

    def _store_columns(self, csv_row, jd, columns):
        '''Files one group of csv columns under jd, or np.nan for the whole group
        if the group's first column is not in csv_row.'''
        available = columns[0][1] in csv_row
        for attr, column in columns:
            getattr(self, attr)[jd] = csv_row[column] if available else np.nan

    def add_day(self, csv_row, jd=None):
        '''Parses row from rtp_summary csv into this object.

        Arguments:
            csv_row: one antenna's row for one night, indexable by column name and
                supporting `in` on column names (e.g. a pandas Series or a dict).
                Must contain 'A Priori Status'.
            jd: key (the night's JD) under which to file the row. If None, the
                module-level variable `jd` is used, which keeps calls of the form
                add_day(row) working. Passing it explicitly is preferred.

        Raises:
            ValueError: if jd is None and there is no module-level `jd`.
        '''
        if jd is None:
            try:
                jd = globals()['jd']
            except KeyError:
                raise ValueError('No jd was passed to add_day and no module-level jd is defined.') from None

        self.statuses[jd] = csv_row['A Priori Status']

        # Add auto_metrics, ant_metrics, and redcal info, if available
        self._store_columns(csv_row, jd, self._AUTO_METRICS_COLUMNS)
        self._store_columns(csv_row, jd, self._ANT_METRICS_COLUMNS)
        self._store_columns(csv_row, jd, self._REDCAL_COLUMNS)

        # Compute final flagging percentage. An auto_metrics flag of exactly 1 is final.
        # Otherwise the ant_metrics and redcal fractions are summed and capped at 1.
        # NOTE: if that sum is NaN (some metric missing), the builtin min(1, nan) evaluates
        # to 1, so such a day counts as fully flagged.
        self.total_flags[jd] = self.auto_flags[jd]
        if self.total_flags[jd] != 1:
            self.total_flags[jd] = min(1, max(self.dead_flags_Jee[jd], self.dead_flags_Jnn[jd]) + self.crossed_flags[jd] + self.redcal_flags[jd])

    def unflagged_days(self, jds=None):
        '''Computes the number of effective unflagged days, either from a fixed set of days or all days.
        Days in jds for which this antenna has no data are ignored.'''
        if jds is None:
            jds = self.total_flags.keys()
        return np.sum([1 - self.total_flags[jd] for jd in jds if jd in self.total_flags])

    def is_dead(self, jd):
        '''Returns the larger of Jee and Jnn dead flags (which should be the same) on a given JD.
        Returns np.nan if this antenna is not in the data for that day.'''
        if jd not in self.dead_flags_Jee:
            return np.nan
        return np.max([self.dead_flags_Jee[jd], self.dead_flags_Jnn[jd]])

    def ant_metrics_flag_frac(self, jd):
        '''Returns the fraction of the time this antenna is flagged as dead or crossed, capped at 1.
        Returns np.nan if this antenna is not in the data for that day or if its
        ant_metrics values for that day are NaN.'''
        if jd not in self.dead_flags_Jee:
            return np.nan
        frac = np.max([self.dead_flags_Jee[jd], self.dead_flags_Jnn[jd]]) + self.crossed_flags[jd]
        if np.isnan(frac):
            # the builtin min(1, nan) would return 1 and report missing data as fully flagged
            return np.nan
        return min(1, frac)

    @staticmethod
    def _nan_reduce(reducer, values):
        '''Applies a NaN-ignoring reducer (np.nanmean, np.nanmedian), returning np.nan
        without a RuntimeWarning when values is empty or all NaN.'''
        values = np.asarray(values, dtype=float)
        if values.size == 0 or np.all(np.isnan(values)):
            return np.nan
        return reducer(values)

    @staticmethod
    def _largest_metric(metrics):
        '''Returns the key of the largest value. NaNs rank lowest; ties go to the later key.'''
        ranked = sorted(metrics.items(), key=lambda item: -np.inf if np.isnan(item[1]) else item[1])
        return ranked[-1][0]

    def most_common_flag_rationale(self, jds=None):
        '''Returns a string describing the worst metric for the most common flag rationale,
        either for a fixed set of days or for all days.

        The step (ant_metrics, auto_metrics, redcal) with the largest mean flag fraction is
        picked first, then the metric with the largest mean (ant_metrics) or median (others)
        within that step. Returns 'No Flags' if all three mean flag fractions are exactly 0,
        and np.nan if no step has a positive mean flag fraction otherwise (e.g. no data).'''
        jds = list(self.total_flags.keys() if jds is None else jds)

        def per_day(store):
            # values of a per-JD dict over jds, with np.nan for days this antenna lacks
            return [store.get(jd, np.nan) for jd in jds]

        def mean_flag_frac(values):
            # steps without any finite data must never win the comparisons below
            mean = self._nan_reduce(np.nanmean, values)
            return -np.inf if np.isnan(mean) else mean

        ant_metrics_ff = mean_flag_frac([self.ant_metrics_flag_frac(jd) for jd in jds])
        auto_metrics_ff = mean_flag_frac(per_day(self.auto_flags))
        redcal_ff = mean_flag_frac(per_day(self.redcal_flags))

        if 0 == ant_metrics_ff == auto_metrics_ff == redcal_ff:
            return 'No Flags'

        if (ant_metrics_ff > 0) and (ant_metrics_ff >= auto_metrics_ff) and (ant_metrics_ff >= redcal_ff):
            ffs = {'Low_Corr': self._nan_reduce(np.nanmean, [self.is_dead(jd) for jd in jds])}
            ffs['Crossed'] = self._nan_reduce(np.nanmean, per_day(self.crossed_flags))
            return self._largest_metric(ffs)

        if (auto_metrics_ff > 0) and (auto_metrics_ff >= ant_metrics_ff) and (auto_metrics_ff >= redcal_ff):
            mms = {}  # metric medians
            mms['ee_Auto_Shape'] = self._nan_reduce(np.nanmedian, per_day(self.ee_shape_zs))
            mms['nn_Auto_Shape'] = self._nan_reduce(np.nanmedian, per_day(self.nn_shape_zs))
            mms['ee_Auto_Power'] = self._nan_reduce(np.nanmedian, per_day(self.ee_power_zs))
            mms['nn_Auto_Power'] = self._nan_reduce(np.nanmedian, per_day(self.nn_power_zs))
            mms['ee_Auto_T_Var'] = self._nan_reduce(np.nanmedian, per_day(self.ee_temp_var_zs))
            mms['nn_Auto_T_Var'] = self._nan_reduce(np.nanmedian, per_day(self.nn_temp_var_zs))
            mms['ee_Auto_T_Discon'] = self._nan_reduce(np.nanmedian, per_day(self.ee_temp_discon_zs))
            mms['nn_Auto_T_Discon'] = self._nan_reduce(np.nanmedian, per_day(self.nn_temp_discon_zs))
            return self._largest_metric(mms)

        if (redcal_ff > 0) and (redcal_ff >= ant_metrics_ff) and (redcal_ff >= auto_metrics_ff):
            mms = {}  # metric medians
            mms['Jee_Redcal_chisq'] = self._nan_reduce(np.nanmedian, per_day(self.Jee_chisqs))
            mms['Jnn_Redcal_chisq'] = self._nan_reduce(np.nanmedian, per_day(self.Jnn_chisqs))
            return self._largest_metric(mms)

        return np.nan

## Load Data

In [None]:
antennas = {}  # antenna number -> Antenna
nodes = {}  # node -> list of that node's Antenna objects, sorted by antenna number
jds = []  # one JD per csv, in the same order as csvs

# parse information about antennas and nodes
for csv in csvs:
    # the JD is the last all-digit token of the path once it is split on '_' and '.'
    jd = [int(s) for s in re.split(r'_|\.', csv) if s.isdigit()][-1]
    jds.append(jd)
    df = pd.read_csv(csv)
    # NOTE: `jd` must stay a module-level name with this spelling, because add_day(row)
    # is not passed the JD and looks it up in module scope.
    for n, row in df.iterrows():
        # Add this day to the antenna
        antnum = row['Ant']
        if antnum not in antennas:
            antennas[antnum] = Antenna(row['Ant'], row['Node'])
        antenna = antennas[antnum]
        antenna.add_day(row)

        # Register the antenna with its node the first time it is seen
        node_antennas = nodes.setdefault(antenna.node, [])
        if antenna not in node_antennas:
            node_antennas.append(antenna)
            # all antennas in this list share one node, so order them by antenna number
            node_antennas.sort(key=lambda ant: ant.number)

In [None]:
# build per-night flagging table
antnums = sorted(antennas.keys())
to_show = {'JDs': [f'<a href="{jd_to_summary_url(jd)}">{jd}</a>' for jd in jds]}
to_show['Ants in Data'] = [np.sum([jd in antennas[ant].statuses for ant in antnums]) for jd in jds]
to_show['Unflagged Antenna-Days'] = [np.sum([antennas[ant].unflagged_days([jd]) for ant in antnums]) for jd in jds]

# Each flag fraction is (sum of per-antenna fractions) / (number of antennas with finite values).
# On nights where a step has no finite values the denominator is 0; those nights end up as NaN
# and render as '-', so the resulting divide/invalid warnings are silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    to_show['auto_metrics Flags'] = [
        np.nansum([antennas[ant].auto_flags[jd] for ant in antnums if jd in antennas[ant].auto_flags])
        / np.sum([np.isfinite(antennas[ant].auto_flags[jd]) for ant in antnums if jd in antennas[ant].auto_flags])
        for jd in jds
    ]
    ant_metrics_fracs = [
        np.nansum([antennas[ant].ant_metrics_flag_frac(jd) for ant in antnums])
        / np.sum([np.isfinite(antennas[ant].dead_flags_Jee[jd]) for ant in antnums if jd in antennas[ant].dead_flags_Jee])
        for jd in jds
    ]
    # a positive numerator over a zero denominator gives inf; show it as missing data too
    to_show['ant_metrics Flags'] = [x if np.isfinite(x) else np.nan for x in ant_metrics_fracs]
    to_show['redcal Flags'] = [
        np.nansum([antennas[ant].redcal_flags[jd] for ant in antnums if jd in antennas[ant].redcal_flags])
        / np.sum([np.isfinite(antennas[ant].redcal_flags[jd]) for ant in antnums if jd in antennas[ant].redcal_flags])
        for jd in jds
    ]

df = pd.DataFrame(to_show)
styler = df.style
# Styler.hide_index() was superseded by Styler.hide(axis='index') in pandas 1.4 and later removed
styler = styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()
flag_columns = ['ant_metrics Flags', 'auto_metrics Flags', 'redcal Flags']
table = (
    styler
    .format('{:,.1f}', na_rep='-', subset=['Unflagged Antenna-Days'])
    .background_gradient(cmap='gray', axis=None, subset=['Unflagged Antenna-Days'])
    .format('{:,.0%}', na_rep='-', subset=flag_columns)
    .background_gradient(cmap='plasma', axis=None, subset=flag_columns)
)

In [None]:
def antenna_flag_plot():
    '''Plots the number of antennas in the data and the number of unflagged antenna-days per night.

    Reads the module-level `jds` and `to_show`, using the 'Ants in Data' and
    'Unflagged Antenna-Days' entries of `to_show`. Call it while `to_show` still holds the
    per-night table, since `to_show` is a module-level name that other code may rebind.
    The bottom x-axis shows calendar dates; the top x-axis shows JD minus `floor_jd`.
    '''
    fig, ax = plt.subplots(1, 1, figsize=(14,7), dpi=100)
    utc = Time(jds, format='jd').datetime
    # round the earliest JD down to a multiple of 1000 to keep the top-axis labels short
    floor_jd = int(np.floor(np.min(jds) / 1000) * 1000)
    # Axes.plot accepts datetime x-values directly; Axes.plot_date is deprecated in recent matplotlib
    ax.plot(utc, to_show['Ants in Data'], '.-', ms=10, label='Antennas in Data')
    ax.plot(utc, to_show['Unflagged Antenna-Days'], 'r.-', ms=10, label='Unflagged Antenna-Days')
    # second x-axis in JD units; replotting the points there gives that axis its data range
    ax2 = ax.twiny()
    ax2.plot(np.array(jds) - floor_jd, to_show['Unflagged Antenna-Days'], 'r.', ms=10)
    ax2.set_xlabel(f'JD - {floor_jd}')
    ax.set_ylabel('Number of Antennas')
    ax.set_xlabel('Date')
    ax.legend()
    ax2.grid()
    ax.grid(axis='y')

# Figure 1: (Unflagged) Antennas in the Array

This plot shows the progression of the number of antennas in the data and the number of antenna-days that are not automatically flagged.

In [None]:
# Draw Figure 1 from the per-night `to_show` table built above
antenna_flag_plot()

# Table 1: Per Night Flagging

This table shows how many antennas were in the data for each night, the effective number of unflagged antenna-days, and the fraction of antenna-days flagged by:
* `auto_metrics` for bad autocorrelation shape, amplitude, or temporal structure,
* `ant_metrics` for low/no correlations or for having polarizations swapped, or
* `redcal` for non-redundancy. 

Note that an antenna can be flagged by both `auto_metrics` and `ant_metrics`, so those quantities may add up to more than 100%. However an antenna flagged by either `auto_metrics` (for the night) or `ant_metrics` (for the file) will be excluded from `redcal`, so these percentages will tend to be small. Dashes indicate missing metric data, likely because part of the RTP failed to run. Also included are links to individual `rtp_summary` notebooks.

In [None]:
# Styler.render() was deprecated in pandas 1.4 and later removed; prefer to_html() when it exists
HTML(table.to_html() if hasattr(table, 'to_html') else table.render(render_links=True, escape=False))

In [None]:
# Build table for antenna flagging
antnums = sorted(antennas.keys())
last_week = jds[:7]  # jds follows the newest-first csv order; the slice is simply shorter when there are fewer than 7 nights
to_show = {'Ant': antnums, 'Node': [antennas[ant].node for ant in antnums]}

# status on the latest JD for which each antenna has data
to_show['Most Recent Status'] = [antennas[ant].statuses[max(antennas[ant].statuses)] for ant in antnums]

# NOTE: insertion order below is the column order of the rendered table
to_show['Total Days This Season'] = [len(antennas[ant].total_flags) for ant in antnums]
to_show['Unflagged Days This Season'] = [antennas[ant].unflagged_days() for ant in antnums]
with np.errstate(divide='ignore', invalid='ignore'):
    to_show['Flag % This Season'] = 1 - np.array(to_show['Unflagged Days This Season']) / np.array(to_show['Total Days This Season'])
to_show['Top Flag Rationale (Season)'] = [antennas[ant].most_common_flag_rationale() for ant in antnums]

to_show['Total Days This Week'] = [sum(jd in antennas[ant].total_flags for jd in last_week) for ant in antnums]
to_show['Unflagged Days This Week'] = [antennas[ant].unflagged_days(last_week) for ant in antnums]
# antennas absent from all of last_week give 0/0 = NaN (rendered as '-'); silence that warning
with np.errstate(divide='ignore', invalid='ignore'):
    to_show['Flag % This Week'] = 1 - np.array(to_show['Unflagged Days This Week']) / np.array(to_show['Total Days This Week'])
to_show['Top Flag Rationale (Week)'] = [antennas[ant].most_common_flag_rationale(last_week) for ant in antnums]

# one column per night, headed by a link to that night's summary notebook
jd_links = []
for jd in jds:
    jd_links.append(f'<a href="{jd_to_summary_url(jd)}">{jd}</a>')
    to_show[jd_links[-1]] = [antennas[ant].total_flags.get(jd, np.nan) for ant in antnums]
df = pd.DataFrame(to_show)

styler = df.style
# Styler.hide_index() was superseded by Styler.hide(axis='index') in pandas 1.4 and later removed
styler = styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()
table = (
    styler
    .applymap(lambda val: f'background-color: {status_colors[val]}' if val in status_colors else '', subset=['Most Recent Status'])
    .format('{:,.0%}', na_rep='-', subset=jd_links + ['Flag % This Season', 'Flag % This Week'])
    .format('{:}', na_rep='-', subset=['Top Flag Rationale (Season)', 'Top Flag Rationale (Week)'])
    .format('{:,.1f}', na_rep='-', subset=['Unflagged Days This Season', 'Unflagged Days This Week'])
    .background_gradient(cmap='plasma', vmax=1, vmin=0, axis=None, subset=jd_links)
    .bar(subset=['Flag % This Season', 'Flag % This Week'], vmin=0, vmax=1)
)

# Table 2: Per-Antenna, Per-Night Flagging

This table summarizes information about individual antennas for the whole season and in the last seven days. This includes the number of days the antenna was in the data, what fraction of them were flagged, and how many unflagged days that leaves. It also attempts to figure out which metric led to the most flagging, by seeing which step produced the most flags and then which median metric was largest within that step. Finally, it includes per-night flagging fractions.

In [None]:
# Styler.render() was deprecated in pandas 1.4 and later removed; prefer to_html() when it exists
HTML(table.to_html() if hasattr(table, 'to_html') else table.render(render_links=True, escape=False))

In [None]:
# Build table for node flagging
nodenums = sorted(nodes.keys())
last_week = jds[:7]  # jds follows the newest-first csv order; the slice is simply shorter when there are fewer than 7 nights

# NOTE: insertion order below is the column order of the rendered table
to_show = {'Node': nodenums, 'Number of Ants': [len(nodes[n]) for n in nodenums]}

to_show['Total Antenna-Days This Season'] = [np.sum([jd in antenna.total_flags for antenna in nodes[n] for jd in jds]) for n in nodenums]
to_show['Unflagged Antenna-Days This Season'] = [np.sum([antenna.unflagged_days() for antenna in nodes[n]]) for n in nodenums]
with np.errstate(divide='ignore', invalid='ignore'):
    to_show['Flag % This Season'] = 1 - np.array(to_show['Unflagged Antenna-Days This Season']) / np.array(to_show['Total Antenna-Days This Season'])

to_show['Total Antenna-Days This Week'] = [np.sum([jd in antenna.total_flags for antenna in nodes[n] for jd in last_week]) for n in nodenums]
to_show['Unflagged Antenna-Days This Week'] = [np.sum([antenna.unflagged_days(last_week) for antenna in nodes[n]]) for n in nodenums]
# nodes with no antenna-days in last_week give 0/0 = NaN (rendered as '-'); silence that warning
with np.errstate(divide='ignore', invalid='ignore'):
    to_show['Flag % This Week'] = 1 - np.array(to_show['Unflagged Antenna-Days This Week']) / np.array(to_show['Total Antenna-Days This Week'])

# one column per night: mean flag fraction over the node's antennas that have data that night
jd_links = []
with np.errstate(divide='ignore', invalid='ignore'):
    for jd in jds:
        jd_links.append(f'<a href="{jd_to_summary_url(jd)}">{jd}</a>')
        to_show[jd_links[-1]] = [
            np.nansum([antenna.total_flags.get(jd, np.nan) for antenna in nodes[n]])
            / np.sum([jd in antenna.total_flags for antenna in nodes[n]])
            for n in nodenums
        ]
df = pd.DataFrame(to_show)

styler = df.style
# Styler.hide_index() was superseded by Styler.hide(axis='index') in pandas 1.4 and later removed
styler = styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()
table = (
    styler
    .format('{:,.0%}', na_rep='-', subset=jd_links + ['Flag % This Season', 'Flag % This Week'])
    .format('{:,.1f}', na_rep='-', subset=['Unflagged Antenna-Days This Season', 'Unflagged Antenna-Days This Week'])
    .background_gradient(cmap='plasma', vmax=1, vmin=0, axis=None, subset=jd_links)
    .bar(subset=['Flag % This Season', 'Flag % This Week'], vmin=0, vmax=1)
)

          

# Table 3: Per-Node, Per-Night Flagging

This table summarizes flagging by node, showing how many antennas are in the data from each node, how many antenna-days were observed in each node, and how many of them were unflagged (both for the whole season and the last 7 nights).

In [None]:
# Styler.render() was deprecated in pandas 1.4 and later removed; prefer to_html() when it exists
HTML(table.to_html() if hasattr(table, 'to_html') else table.render(render_links=True, escape=False))