In [20]:
import os
import pandas as pd
import numpy as np
from netCDF4 import Dataset
from datetime import datetime
from pathlib import Path

# Accepted level types and conventional diag types. Neither list is
# referenced in the code visible here -- presumably used by other
# diagnostic classes; confirm before changing.
_VALID_LVL_TYPES = ["pressure", "height"]
_VALID_CONV_DIAG_TYPES = ["omf", "oma", "observation", "hofx"]

# Radiance diag types: the conventional ones followed by the surface and
# cloud fraction fields. Unpacking builds an independent list.
_VALID_RADIANCE_DIAG_TYPES = [
    *_VALID_CONV_DIAG_TYPES,
    "water_fraction",
    "land_fraction",
    "cloud_fraction",
    "snow_fraction",
    "ice_fraction",
]


class Radiance(GSIdiag):
    """
    Reader for GSI radiance diagnostic files.

    Attributes set while the file is read:
        chan_info : (dict) one dimensional variables sized by the
                    `nchans` dimension of the file
        data_df : (pandas dataframe) one dimensional variables sized by
                  the `nobs` dimension, indexed by ('Channel', 'QC_Flag')
    """

    def __init__(self, path):
        """
        Initialize a Radiance GSI diagnostic object

        Args:
            path : (str) path to radiance GSI diagnostic file
        Returns:
            self : GSI diag radiance object containing the path
                   to extract data
        """
        super().__init__(path)

        self._read_obs()
        # NOTE(review): self.metadata is not created in this class; it is
        # assumed to be set up by GSIdiag.__init__ -- confirm.
        self.metadata['Diag File Type'] = 'radiance'

    def __str__(self):
        return "Radiance GSI diagnostic object"

    def _read_obs(self):
        """
        Reads the data from the radiance diagnostic file during
        initialization into a multidimensional pandas dataframe.

        One dimensional variables whose length equals the `nchans`
        dimension are stored in self.chan_info. One dimensional
        variables whose length equals the `nobs` dimension become the
        columns of self.data_df, which is indexed by
        ('Channel', 'QC_Flag'). All other variables are skipped.
        """
        df_dict = {}
        chan_info = {}

        with Dataset(self.path, mode='r') as f:

            # Grab dimension lengths
            nchans = len(f.dimensions['nchans'])
            nobs = len(f.dimensions['nobs'])

            for name, var in f.variables.items():
                # Only one dimensional variables are kept
                if len(var.shape) != 1:
                    continue

                # Compare against the variable shape so the data of each
                # variable is only read from the file once.
                if var.shape[0] == nchans:
                    # Add channel info to own dict
                    chan_info[name] = var[:]
                elif var.shape[0] == nobs:
                    df_dict[name] = var[:]

        self.chan_info = chan_info

        # Sets correct channel number to indexed channel: every
        # consecutive block of nchans entries is overwritten with the
        # sensor channel numbers. This relies on the observations being
        # stored in blocks of nchans entries in channel order.
        nchans = len(chan_info['chaninfoidx'])
        iters = len(df_dict['Channel_Index']) // nchans

        for a in range(iters):
            df_dict['Channel_Index'][a*nchans:(a+1)*nchans] = \
                chan_info['sensor_chan']
        # 'Channel' shares the (already overwritten) Channel_Index array,
        # so the channel_index column holds sensor channel numbers too.
        df_dict['Channel'] = df_dict['Channel_Index']

        # Create pandas dataframe from dict
        df = pd.DataFrame(df_dict)

        # Creates multidimensional indexed dataframe
        indices = ['Channel', 'QC_Flag']
        df.set_index(indices, inplace=True)

        # Rename columns
        df.columns = df.columns.str.lower()
        df = df.rename(columns={
            'cloud_frac': 'cloud_fraction',
            'obs_minus_forecast_unadjusted': 'omf_unadjusted',
            'obs_minus_forecast_adjusted': 'omf_adjusted',
            })

        # Create hofx columns
        for bias_type in ['unadjusted', 'adjusted']:
            df[f'hofx_{bias_type}'] = df['observation'] - \
                df[f'omf_{bias_type}']

        self.data_df = df

    def get_data(self, diag_type, channel=None, qcflag=None, use_flag=False,
                 separate_channels=False, separate_qc=False, errcheck=True,
                 bias_correction=True):
        """
        Given parameters, get the data from a radiance diagnostic file

        Args:
            diag_type : (str; Required) type of data to extract
                        i.e. observation, omf, oma, hofx
            channel : (list of ints; default=None) observation channel number
            qcflag : (list of ints; default=None) qc flag number
            separate_channels : (bool; default=False) if True, returns
                                dict of separate data by specified channels
            separate_qc : (bool; default=False) if True, returns dict of
                          separate data by specified qc flag
            use_flag : (bool; default=False) if True, will only return where
                       use_flag==1
            errcheck : (bool; default=True) when True and qcflag==0, will
                       toss out obs where inverse obs error is zero (i.e.
                       not assimilated in GSI)
            bias_correction : (bool; default=True) If True, will return bias
                              corrected data.
        Returns:
            data : requested indexed data; a dict keyed by 'Channel <n>'
                   and/or 'QC Flag <n>' when separate_channels or
                   separate_qc is True
        Raises:
            ValueError : if diag_type is not a valid radiance diag type
            TypeError : if a requested channel is not in the selected data
        """
        if diag_type not in _VALID_RADIANCE_DIAG_TYPES:
            raise ValueError((f'{diag_type} is not a valid diag_type. '
                              'Valid choices are: '
                              f'{" | ".join(_VALID_RADIANCE_DIAG_TYPES)}'))

        self.metadata['Variable'] = 'brightness_temperature'
        self.metadata['Diag Type'] = diag_type
        self.metadata['Anl Use'] = False

        # If no channels given, return all channels
        self.metadata['Channels'] = 'All Channels' if channel is None \
            else channel

        # If no qc flags given, return all qc flags
        all_qcflags = self.data_df.index.get_level_values(
                'QC_Flag').unique().to_numpy()

        self.metadata['QC Flag'] = all_qcflags if qcflag is None \
            else qcflag

        if separate_channels or separate_qc:
            return self._get_data_special(
                diag_type, channel, qcflag, use_flag, separate_channels,
                separate_qc, errcheck, bias_correction)

        return self._extract_radiance_data(
            diag_type, channel, qcflag, use_flag, errcheck, bias_correction)

    def _extract_radiance_data(self, diag_type, channel, qcflag, use_flag,
                               errcheck, bias_correction):
        """
        Selects the requested rows, queries the requested diag type and
        replaces values greater than 1e5 with NaN.
        """
        indexed_df = self._select_radiance(
            channel, qcflag, use_flag, errcheck)
        # NOTE(review): _query_diag_type is not defined in this class; it
        # is assumed to be inherited from GSIdiag -- confirm.
        data = self._query_diag_type(
            indexed_df, diag_type, bias_correction)

        data[data > 1e5] = np.nan

        return data

    def _select_radiance(self, channel=None, qcflag=None, use_flag=False,
                         errcheck=True):
        """
        Given parameters, get the rows of the observation dataframe
        that match from a radiance diagnostic file.

        Filters are applied in order: use_flag, channel, qcflag and
        finally the inverse observation error check.

        Raises:
            TypeError : if a requested channel is not present in the data
                        (after the use_flag filter, when it is applied)
        """
        df = self.data_df

        # Index df by use_flag: keep only channels with use_flag == 1
        if use_flag:
            use_flag_indx = np.where(self.chan_info['use_flag'] == 1)
            use_flag_channel = np.asarray(
                self.chan_info['sensor_chan'][use_flag_indx])

            df = df.loc[np.isin(
                df.index.get_level_values('Channel'), use_flag_channel)]

        # index dataframe by channel
        if channel is not None:
            chan_values = df.index.get_level_values('Channel')
            valid_chans = chan_values.unique().to_numpy()

            # Every requested channel is validated on its own, so an
            # invalid channel is reported even if others are valid
            for chan in channel:
                # If channel not valid, raise TypeError
                if chan not in valid_chans:
                    raise TypeError(f'Channel {chan} is not a valid channel. '
                                    'Valid channels include: '
                                    f'{", ".join(str(i) for i in valid_chans)}')

            df = df.loc[np.isin(chan_values, channel)]

        # index dataframe by qcflag
        if qcflag is not None:
            df = df.loc[np.isin(
                df.index.get_level_values('QC_Flag'), qcflag)]

            # remove obs where inverse obs error is zero
            if errcheck and 0 in qcflag:
                err_indx = df['inverse_observation_error'].to_numpy() != 0
                df = df.loc[err_indx]

        return df

    def _get_data_special(self, diag_type, channel, qcflag, use_flag,
                          separate_channels, separate_qc, errcheck,
                          bias_correction):
        """
        Creates a dictionary that separates channels and qc flags
        depending on the conditions of separate_channels and
        separate_qc

        Keys are 'Channel <n>' and/or 'QC Flag <n>'; when both
        separations are requested the qc flag dictionaries are nested
        inside the channel dictionary. When channel (or qcflag) is None
        and has to be separated, every channel (qc flag) present in the
        data is used.
        """
        data_dict = {}

        # If no channels given, separate all channels left after the
        # use_flag filter
        if separate_channels and channel is None:
            channel = self._select_radiance(
                use_flag=use_flag).index.get_level_values(
                    'Channel').unique().to_numpy()

        # If no qc flags given, separate all qc flags
        if separate_qc and qcflag is None:
            qcflag = self.data_df.index.get_level_values(
                'QC_Flag').unique().to_numpy()

        if separate_channels and separate_qc:
            for c in channel:
                qc_dict = {}
                for qc in qcflag:
                    qc_dict['QC Flag %s' % qc] = \
                        self._extract_radiance_data(
                            diag_type, [c], [qc], use_flag, errcheck,
                            bias_correction)
                data_dict['Channel %s' % c] = qc_dict

        elif separate_channels:
            for c in channel:
                data_dict['Channel %s' % c] = self._extract_radiance_data(
                    diag_type, [c], qcflag, use_flag, errcheck,
                    bias_correction)

        elif separate_qc:
            for qc in qcflag:
                data_dict['QC Flag %s' % qc] = self._extract_radiance_data(
                    diag_type, channel, [qc], use_flag, errcheck,
                    bias_correction)

        return data_dict

    def get_lat_lon(self, channel=None, qcflag=None, errcheck=True,
                    use_flag=False):
        """
        Gets lats and lons with desired indices.

        Args:
            channel : (list of ints; default=None) observation channel number
            qcflag : (list of ints; default=None) qc flag number
            errcheck : (bool; default=True) when True and qcflag==0, will
                       toss out obs where inverse obs error is zero (i.e.
                       not assimilated in GSI)
            use_flag : (bool; default=False) if True, will only return where
                       use_flag==1
        Returns:
            lat, lon : (array like) indexed latitude and longitude values
        """
        indexed_df = self._select_radiance(
            channel, qcflag, use_flag=use_flag, errcheck=errcheck)
        lats = indexed_df['latitude'].to_numpy()
        lons = indexed_df['longitude'].to_numpy()

        return lats, lons


#### Test driver

In [3]:
# file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_abi_g16_ges.2020092200.nc4'
file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_airs_aqua_ges.2020092200.nc4'

# Read the radiance diagnostic file
diag = Radiance(file)

diag.chan_info

# Selection parameters: None selects every channel / qc flag
diag_type = 'omf'
channel = None
qcflag = None
errcheck = True

# errcheck is passed explicitly so the setting above takes effect
data = diag.get_data(diag_type, channel=channel, qcflag=qcflag,
                     separate_channels=False, separate_qc=False,
                     errcheck=errcheck)

data

# lat, lon = diag.get_lat_lon(channel=channel, qcflag=qcflag)
# lat

array([ 0.45512748, -1.6221615 , -1.4767848 , ...,  0.08683787,
       -0.01648418,  0.08822461], dtype=float32)

#### Make dataframe

In [7]:
# file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_amsua_metop-a_ges.2020092200.nc4'
file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_abi_g16_ges.2020092200.nc4'


# Print the dataset summary (global attributes, dimensions, variables)
with Dataset(file, 'r') as nc:
    # for name in nc.variables:
    #     print(name)
    print(nc)

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    Satellite_Sensor: abi_g16
    Satellite: g16
    Observation_type: abi
    Outer_Loop_Iteration: 1
    Number_of_channels: 10
    Number_of_Predictors: 12
    date_time: 2020092200
    ireal_radiag: 30
    ipchan_radiag: 8
    iextra: 0
    jextra: 0
    idiag: 23
    angord: 4
    iversion_radiag: 40000
    New_pc4pred: 1
    ioff0: 23
    ijacob: 0
    dimensions(sizes): nchans(10), nobs(43410), Observation_Class_maxstrlen(7), BC_angord_arr_dim(4)
    variables(dimensions): int32 chaninfoidx(nchans), float64 frequency(nchans), int32 polarization(nchans), float64 wavenumber(nchans), float64 error_variance(nchans), float64 mean_lapse_rate(nchans), int32 use_flag(nchans), int32 sensor_chan(nchans), int32 satinfo_chan(nchans), int32 Channel_Index(nobs), |S1 Observation_Class(nobs,Observation_Class_maxstrlen), float32 Latitude(nobs), float32 Longitude(nobs), float32 Elevation(nobs), float32 Obs_Time(

  


In [24]:
# file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_amsua_metop-a_ges.2020092200.nc4'
file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_abi_g16_ges.2020092200.nc4'


df_dict = {}
chan_info = {}

with Dataset(file, mode='r') as f:

    # Grab dimension lengths
    nchans = len(f.dimensions['nchans'])
    nobs = len(f.dimensions['nobs'])

    for name, var in f.variables.items():
        # Only one dimensional variables are kept
        if len(var.shape) != 1:
            continue

        # Compare against the variable shape so the data of each
        # variable is only read from the file once
        if var.shape[0] == nchans:
            # Add channel info to own dict
            chan_info[name] = var[:]
        elif var.shape[0] == nobs:
            df_dict[name] = var[:]

# Sets correct channel number to indexed channel: every consecutive
# block of nchans entries is overwritten with the sensor channel numbers
nchans = len(chan_info['chaninfoidx'])
iters = len(df_dict['Channel_Index']) // nchans

for a in range(iters):
    df_dict['Channel_Index'][a*nchans:(a+1)*nchans] = chan_info['sensor_chan']

# 'Channel' shares the (already overwritten) Channel_Index array
df_dict['Channel'] = df_dict['Channel_Index']

# Create pandas dataframe from dict
df = pd.DataFrame(df_dict)

# Creates multidimensional indexed dataframe
indices = ['Channel', 'QC_Flag']
df.set_index(indices, inplace=True)

# Rename columns
df.columns = df.columns.str.lower()
df = df.rename(columns={
    'obs_minus_forecast_unadjusted': 'omf_unadjusted',
    'obs_minus_forecast_adjusted': 'omf_adjusted',
    })
# Create hofx columns
df['hofx_unadjusted'] = df['observation'] - df['omf_unadjusted']
df['hofx_adjusted'] = df['observation'] - df['omf_adjusted']

data_df = df
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,channel_index,latitude,longitude,elevation,obs_time,scan_position,sat_zenith_angle,sat_azimuth_angle,sol_zenith_angle,sol_azimuth_angle,...,bc_scan_angle,bc_cloud_liquid_water,bc_lapse_rate_squared,bc_lapse_rate,bc_cosine_latitude_times_node,bc_sine_latitude,bc_emissivity,bc_fixed_scan_position,hofx_unadjusted,hofx_adjusted
Channel,QC_Flag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
7,-0.0,7,-29.434389,233.841507,0.0,-2.661944,66.0,64.989998,1.191536,33.540001,330.709991,...,0.0,0.0,0.008534,-0.006477,0.0,0.0,0.0,-0.953676,287.165192,287.200012
8,-0.0,8,-29.434389,233.841507,0.0,-2.661944,66.0,64.989998,1.191536,33.540001,330.709991,...,0.0,0.0,0.003573,0.046582,0.0,0.0,0.0,0.310679,230.493591,230.315445
9,-0.0,9,-29.434389,233.841507,0.0,-2.661944,66.0,64.989998,1.191536,33.540001,330.709991,...,0.0,0.0,0.005917,0.062065,0.0,0.0,0.0,0.783660,242.835907,243.692566
10,-0.0,10,-29.434389,233.841507,0.0,-2.661944,66.0,64.989998,1.191536,33.540001,330.709991,...,0.0,0.0,0.045094,-0.057100,0.0,0.0,0.0,0.287807,256.265442,256.022247
11,-0.0,11,-29.434389,233.841507,0.0,-2.661944,66.0,64.989998,1.191536,33.540001,330.709991,...,0.0,0.0,0.004269,-0.027456,0.0,0.0,0.0,-0.829823,283.676117,283.036682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,-50.0,12,33.039959,333.535370,0.0,2.504722,66.0,64.669998,4.266109,144.610001,22.799999,...,0.0,0.0,0.003032,0.016335,0.0,0.0,0.0,-1.068744,253.617264,253.075211
13,-50.0,13,33.039959,333.535370,0.0,2.504722,66.0,64.669998,4.266109,144.610001,22.799999,...,0.0,0.0,-0.052760,0.082246,0.0,0.0,0.0,-0.588254,289.407745,289.260468
14,-50.0,14,33.039959,333.535370,0.0,2.504722,66.0,64.669998,4.266109,144.610001,22.799999,...,0.0,0.0,-0.086218,0.131680,0.0,0.0,0.0,-0.754972,288.669525,288.783691
15,-50.0,15,33.039959,333.535370,0.0,2.504722,66.0,64.669998,4.266109,144.610001,22.799999,...,0.0,0.0,-0.076825,0.117584,0.0,0.0,0.0,-0.969829,285.086609,285.017426


In [3]:
# Display the column labels of data_df
data_df.columns

Index(['channel_index', 'latitude', 'longitude', 'elevation', 'obs_time',
       'scan_position', 'sat_zenith_angle', 'sat_azimuth_angle',
       'sol_zenith_angle', 'sol_azimuth_angle', 'sun_glint_angle',
       'scan_angle', 'water_fraction', 'land_fraction', 'ice_fraction',
       'snow_fraction', 'water_temperature', 'land_temperature',
       'ice_temperature', 'snow_temperature', 'soil_temperature',
       'soil_moisture', 'land_type_index', 'tsavg5', 'sstcu', 'sstph', 'sstnv',
       'dta', 'dqa', 'dtp_avh', 'vegetation_fraction', 'snow_depth', 'tpwc',
       'clw_guess_retrieval', 'sfc_wind_speed', 'cloud_frac', 'ctp', 'clw',
       'tpwc', 'clw_obs', 'clw_guess', 'foundation_temperature',
       'sst_warm_layer_dt', 'sst_cool_layer_tdrop', 'sst_dtz_dtfound',
       'observation', 'omf_unadjusted', 'omf_adjusted',
       'inverse_observation_error', 'emissivity', 'weighted_lapse_rate',
       'dtb_dts', 'bc_constant', 'bc_scan_angle', 'bc_cloud_liquid_water',
       'bc_lapse_r

In [9]:
# Count the observations whose inverse observation error is exactly zero
inv_ob = data_df['inverse_observation_error'].to_numpy()
np.count_nonzero(inv_ob == 0)

19943

#### Get indexed df using `_select_radiance()`

In [70]:
# Selection settings for the `_select_radiance()` walk-through
diag_type, channel, qcflag = 'omf', [8], None
use_flag, errcheck = True, True

In [71]:
df = data_df

# Index df by use_flag: keep only channels with use_flag == 1
if use_flag:
    use_flag_indx = np.where(chan_info['use_flag'] == 1)
    use_flag_channel = np.asarray(chan_info['sensor_chan'][use_flag_indx])

    df = df.iloc[np.isin(
        df.index.get_level_values('Channel'), use_flag_channel)]

# index dataframe by channel
if channel is not None:
    chan_values = df.index.get_level_values('Channel')
    valid_chans = chan_values.unique().to_numpy()

    # Every requested channel is validated on its own, so an invalid
    # channel is reported even if other requested channels are valid
    for chan in channel:
        # If channel not valid, raise TypeError
        if chan not in valid_chans:
            raise TypeError(f'Channel {chan} is not a valid channel. '
                             'Valid channels include: '
                             f'{", ".join(str(i) for i in valid_chans)}')

    df = df.iloc[np.isin(chan_values, channel)]


# index dataframe by qcflag
if qcflag is not None:
    qc_values = df.index.get_level_values('QC_Flag')
    valid_qc = qc_values.unique().to_numpy()

    for qcf in qcflag:
        # If qcflag not valid, raise TypeError
        if qcf not in valid_qc:
            raise TypeError(f'QC flag {qcf} is not valid. '
                             'Valid QC flags include: '
                             f'{", ".join(str(i) for i in valid_qc)}')

    df = df.iloc[np.isin(qc_values, qcflag)]

    # remove obs where inverse obs error is zero
    if errcheck and 0 in qcflag:
        # Grab index where inverse ob error is not zero
        err_indx = np.isin(df['inverse_observation_error'], 0, invert=True)

        df = df.iloc[err_indx]


df

Unnamed: 0_level_0,Unnamed: 1_level_0,channel_index,latitude,longitude,elevation,obs_time,scan_position,sat_zenith_angle,sat_azimuth_angle,sol_zenith_angle,sol_azimuth_angle,...,bc_scan_angle,bc_cloud_liquid_water,bc_lapse_rate_squared,bc_lapse_rate,bc_cosine_latitude_times_node,bc_sine_latitude,bc_emissivity,bc_fixed_scan_position,hofx_unadjusted,hofx_adjusted
Channel,QC_Flag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
8,-0.0,8,-29.434389,233.841507,0.000000e+00,-2.661944,66.0,64.989998,1.191536,33.540001,330.709991,...,0.0,0.0,0.003573,0.046582,0.0,0.0,0.0,0.310679,230.493591,230.315445
8,-0.0,8,-29.024220,234.818192,0.000000e+00,2.671389,65.0,63.959999,1.184031,95.989998,266.869995,...,0.0,0.0,0.014050,0.092369,0.0,0.0,0.0,0.306475,229.433212,229.307129
8,-0.0,8,-28.014570,234.447403,0.000000e+00,-1.828611,65.0,63.880001,1.199565,39.740002,311.130005,...,0.0,0.0,0.000815,0.022252,0.0,0.0,0.0,0.306475,232.718475,232.509018
8,-0.0,8,-26.627541,234.952927,0.000000e+00,-2.495278,64.0,62.889999,1.209339,32.950001,322.589996,...,0.0,0.0,0.000035,0.004630,0.0,0.0,0.0,0.302323,232.936493,232.704498
8,-0.0,8,-24.791389,232.074677,7.622797e-17,2.838056,66.0,64.900002,1.261873,95.980003,267.429993,...,0.0,0.0,0.000756,0.021433,0.0,0.0,0.0,0.310679,233.829987,233.623856
8,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,-0.0,8,27.926451,336.342621,0.000000e+00,-2.328611,66.0,64.919998,4.356342,118.610001,287.589996,...,0.0,0.0,0.028516,0.131592,0.0,0.0,0.0,0.310679,231.690063,231.621872
8,-0.0,8,30.871149,333.403534,0.000000e+00,0.504722,65.0,63.560001,4.287576,144.929993,329.100006,...,0.0,0.0,0.028655,-0.131911,0.0,0.0,0.0,0.306475,233.160660,232.824890
8,-0.0,8,31.290701,334.365082,0.000000e+00,0.838056,66.0,64.570000,4.295779,146.789993,339.230011,...,0.0,0.0,0.060119,-0.191069,0.0,0.0,0.0,0.310679,232.686874,232.327621
8,-0.0,8,32.011139,334.409424,0.000000e+00,2.838056,66.0,64.919998,4.288797,143.210007,32.639999,...,0.0,0.0,0.089984,-0.233759,0.0,0.0,0.0,0.310679,237.596375,237.224289


In [8]:
# df['inverse_observation_error']
# True where the inverse observation error is not zero
errindx = df['inverse_observation_error'].to_numpy() != 0

errindx[:10]

array([False, False, False, False,  True, False, False,  True, False,
       False])

In [119]:
# Unique channel numbers found in the 'Channel' index level
chan_level = df.index.get_level_values('Channel')
VALIDCHANS = chan_level.unique().to_numpy()

# Print them as one comma separated line
print(", ".join(map(str, VALIDCHANS)))

7, 8, 9, 10, 11, 12, 13, 14, 15, 16


In [117]:
# Display the array of unique channel numbers
VALIDCHANS

array([ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16])