In [1]:
import os
import pandas as pd
import numpy as np
from netCDF4 import Dataset
from datetime import datetime
from pathlib import Path

_VALID_LVL_TYPES = ["pressure", "height"]
_VALID_CONV_DIAG_TYPES = ["omf", "oma", "observation", "hofx"]
_VALID_RADIANCE_DIAG_TYPES = ["omf", "oma", "observation", "hofx",
                              "water_fraction", "land_fraction",
                              "cloud_fraction", "snow_fraction",
                              "ice_fraction"]


class GSIdiag:

    def __init__(self, path):
        """
        Initialize a GSI diagnostic object
        INPUT:
            path : path to GSI diagnostic object
        """

        self.path = path
        self.filename = os.path.splitext(Path(self.path).stem)[0]
        self.obs_type = self.filename.split('_')[1]
        self.variable = self.filename.split('_')[2]
        self.ftype = self.filename.split('_')[-1]

        _str_date = os.path.basename(self.path).split('.')[1]
        # Checks if '_ensmean' is included in file name
        self.date = datetime.strptime(_str_date.split('_')[0], '%Y%m%d%H')

        _var_key_name = 'Variable' if self.obs_type == 'conv' else 'Satellite'
        self.metadata = {'Obs Type': self.obs_type,
                         _var_key_name: self.variable,
                         'Date': self.date,
                         'File Type': self.ftype
                         }

    def __len__(self):
        return len(self.lats)

    def _query_diag_type(self, df, diag_type, bias_correction):
        """
        Query the data type being requested and returns
        the appropriate indexed data
        """

        bias = 'adjusted' if bias_correction else 'unadjusted'

        if self.variable == 'uv':
            if diag_type in ['Observation']:
                u = df[f'u_{diag_type}']
                v = df[f'v_{diag_type}']
            else:
                u = df[f'u_{diag_type}_{bias}']
                v = df[f'v_{diag_type}_{bias}']

            return u.to_numpy(), v.to_numpy()

        else:
            if diag_type in ['Observation']:
                data = df[f'{diag_type}']
            else:
                data = df[f'{diag_type}_{bias}']

            return data.to_numpy()


class Conventional(GSIdiag):

    def __init__(self, path):
        """
        Initialize a conventional GSI diagnostic object

        Args:
            path : (str) path to conventional GSI diagnostic object
        Returns:
            self : GSI diag conventional object containing the path
                   to extract data
        """
        super().__init__(path)

        self._read_obs()
        self.metadata['Diag File Type'] = 'conventional'

    def __str__(self):
        return "Conventional GSI diagnostic object"

    def _read_obs(self):
        """
        Reads the data from the conventional diagnostic file during
        initialization into a multidimensional pandas dataframe.
        """
        df_dict = {}

        # Open netCDF file, store data into dictionary
        with Dataset(self.path, mode='r') as f:
            for var in f.variables:
                # Station_ID and Observatio_Class variables need
                # to be converted from byte string to string
                if var in ['Station_ID', 'Observation_Class']:
                    data = f.variables[var][:]
                    data = [i.tobytes(fill_value='/////', order='C')
                            for i in data]
                    data = np.array(
                        [''.join(i.decode('UTF-8', 'ignore').split())
                         for i in data])
                    df_dict[var] = data

                # Grab variables with only 'nobs' dimension
                elif len(f.variables[var].shape) == 1:
                    df_dict[var] = f.variables[var][:]

        # Create pandas dataframe from dict
        df = pd.DataFrame(df_dict)

        # Creates multidimensional indexed dataframe
        indices = ['Station_ID', 'Observation_Class', 'Observation_Type',
                   'Observation_Subtype', 'Pressure', 'Height', 'Analysis_Use_Flag']
        df.set_index(indices, inplace=True)

        # Rename columns
        df.columns = df.columns.str.lower()
        if self.variable == 'uv':
            for wind_type in ['u', 'v']:
                for bias_type in ['unadjusted', 'adjusted']:
                    df = df.rename(columns={
                        f'{wind_type}_obs_minus_forecast_{bias_type}': f'{wind_type}_omf_{bias_type}'
                        })
                    # Create hofx columns
                    df[f'{wind_type}_hofx_{bias_type}'] = df[f'{wind_type}_observation'] - \
                        df[f'{wind_type}_omf_{bias_type}']

        else:
            for bias_type in ['unadjusted', 'adjusted']:
                df = df.rename(columns={
                    f'obs_minus_forecast_{bias_type}': f'omf_{bias_type}',
                    })
                # Create hofx columns
                df[f'hofx_{bias_type}'] = df['observation'] - df[f'omf_{bias_type}']

        self.data_df = df

    def get_data(self, diag_type, obsid=None, subtype=None, station_id=None,
                 analysis_use=False, lvls=None, lvl_type='pressure',
                 bias_correction=True):
        """
        Given parameters, get the data from a conventional diagnostic file

        Args:
            diag_type : (str; Required) type of data to extract
                        i.e. observation, omf, oma, hofx
            obsid : (list of ints; optional; default=None) observation
                    measurement ID number; default=None
            subtype : (list of ints; optional; default=None) observation
                      measurement ID subtype number, default=None
            station_id : (list of str; optional; default=None)
                         station id, default=None
            analysis_use : (bool; defaul=False) if True, will return
                           three sets of data:
                           assimlated (analysis_use_flag=1, qc<7),
                           rejected (analysis_use_flag=-1, qc<8),
                           monitored (analysis_use_flag=-1, qc>7)
            lvls : (list type; default=None) List of pressure or height
                   levels i.e. [250,500,750,1000]. List must be arranged
                   low to high.
            lvl_type : (str; default='pressure') lvls definition as
                       'pressure' or 'height'.
            bias_correction : (bool; default=True) If True, will return bias
                              corrected data.
        Returns:
            data : requested indexed data
        """

        if diag_type not in _VALID_CONV_DIAG_TYPES:
            raise ValueError((f'{diag_type} is not a valid diag_type. '
                              'Valid choices are: '
                              f'{" | ".join(_VALID_CONV_DIAG_TYPES)}'))

        self.metadata['Diag Type'] = diag_type
        self.metadata['ObsID'] = obsid
        self.metadata['Subtype'] = subtype
        self.metadata['Station ID'] = station_id
        self.metadata['Anl Use'] = analysis_use
        self.metadata['Levels'] = lvls
        self.metadata['Levels Type'] = lvl_type

        # Selects proper levels
        if lvls is not None:
            # Check if level type is valid
            if lvl_type not in _VALID_LVL_TYPES:
                raise ValueError((f'{lvl_type} is not a valid lvl_type. '
                                  'Valid choices are: '
                                  f'{" | ".join(_VALID_LVL_TYPES)}'))

            data = self._get_lvl_data(
                diag_type, obsid, subtype, station_id,
                analysis_use, lvls, lvl_type, bias_correction)

            return data

        else:
            if analysis_use:
                assimilated_df, rejected_df, monitored_df = self._select_conv(
                    obsid, subtype, station_id, analysis_use)

                if self.variable == 'uv':
                    u_assimilated, v_assimilated = self._query_diag_type(
                        assimilated_df, diag_type, bias_correction)
                    u_rejected, v_rejected = self._query_diag_type(
                        rejected_df, diag_type, bias_correction)
                    u_monitored, v_monitored = self._query_diag_type(
                        monitored_df, diag_type, bias_correction)

                    u = {'assimilated': u_assimilated,
                         'rejected': u_rejected,
                         'monitored': u_monitored}
                    v = {'assimilated': v_assimilated,
                         'rejected': v_rejected,
                         'monitored': v_monitored}

                    return u, v

                else:
                    assimilated_data = self._query_diag_type(
                        assimilated_df, diag_type, bias_correction)
                    rejected_data = self._query_diag_type(
                        rejected_df, diag_type, bias_correction)
                    monitored_data = self._query_diag_type(
                        monitored_df, diag_type, bias_correction)

                    data = {'assimilated': assimilated_data,
                            'rejected': rejected_data,
                            'monitored': monitored_data
                            }

                    return data

            else:
                indexed_df = self._select_conv(obsid, subtype, station_id)

                if self.variable == 'uv':
                    u, v = self._query_diag_type(
                        indexed_df, diag_type, bias_correction)

                    return u, v

                else:
                    data = self._query_diag_type(
                        indexed_df, diag_type, bias_correction)

                    return data

    def _select_conv(self, obsid=None, subtype=None, station_id=None,
                     analysis_use=False):
        """
        Given parameters, multidimensional dataframe is indexed
        to only include selected locations from a conventional
        diagnostic file.

        Args:
            obsid : (list of ints; default=None) observation measurement
                    ID number
            subtype : (list of ints; default=None) subtype number
            station_id : (list of str; default=None) station id tag
            analysis_use : (bool; deafault=False) if True, will separate
                           into three indexed dataframes: assimilated,
                           rejected, monitored
        Returns:
            df : (pandas dataframe) indexed multidimentsional
                 dataframe from selected data
        """

        df = self.data_df

        # select data by obsid, subtype, and station ids
        if obsid is not None:
            idx_col = 'Observation_Type'
            indx = df.index.get_level_values(idx_col) == ''
            for obid in obsid:
                indx = np.ma.logical_or(
                    indx, df.index.get_level_values(idx_col) == obid)
            df = df.iloc[indx]
        if subtype is not None:
            idx_col = 'Observation_Subtype'
            indx = df.index.get_level_values(idx_col) == ''
            for stype in subtype:
                indx = np.ma.logical_or(
                    indx, df.index.get_level_values(idx_col) == stype)
            df = df.iloc[indx]
        if station_id is not None:
            idx_col = 'Station_ID'
            indx = df.index.get_level_values(idx_col) == ''
            for stn_id in station_id:
                indx = np.ma.logical_or(
                    indx, df.index.get_level_values(idx_col) == stn_id)
            df = df.iloc[indx]

        if analysis_use:
            # Separate into 3 dataframes; assimilated, rejected, and monitored
            indx = df.index.get_level_values('Analysis_Use_Flag') == ''

            assimilated_indx = np.ma.logical_or(
                indx, df.index.get_level_values('Analysis_Use_Flag') == 1)
            rejected_indx = np.ma.logical_or(
                indx, df.index.get_level_values('Analysis_Use_Flag') == -1)
            monitored_indx = np.ma.logical_or(
                indx, df.index.get_level_values('Analysis_Use_Flag') == -1)

            assimilated_df = df.iloc[assimilated_indx]
            rejected_df = df.iloc[rejected_indx]
            monitored_df = df.iloc[monitored_indx]

            # Find rejected and monitored based on Prep_QC_Mark
            try:
                assimilated_df = assimilated_df.loc[
                    assimilated_df['prep_qc_mark'] < 7]
                rejected_df = rejected_df.loc[
                    rejected_df['prep_qc_mark'] < 8]
                monitored_df = monitored_df.loc[
                    monitored_df['prep_qc_mark'] > 7]
            except KeyError:
                assimilated_df = assimilated_df.loc[
                    assimilated_df['setup_qc_mark'] < 7]
                rejected_df = rejected_df.loc[
                    rejected_df['setup_qc_mark'] < 8]
                monitored_df = monitored_df.loc[
                    monitored_df['setup_qc_mark'] > 7]

            return assimilated_df, rejected_df, monitored_df

        else:
            return df
        
        
    def _get_lvl_data(self, diag_type, obsid=None, subtype=None, station_id=None,
                 analysis_use=False, lvls=None, lvl_type='pressure',
                 bias_correction=True):
        """
        Given a list of levels, will create a dictionary of data that is selected between
        each level. Will return a dictionary with subsetted pressure or height levels where
        data is separated within those levels:

       dict = {250-500: <data>,
               500-750: <data>,
               750-1000: <data>} 
        """
        binned_data = {}
        
        for i, low_bound in enumerate(lvls[:-1]):
            high_bound = lvls[i+1]

            if analysis_use:
                assimilated_df, rejected_df, monitored_df = self._select_conv(
                    obsid, subtype, station_id, analysis_use)

                assimilated_lvl_df = self._select_levels(assimilated_df, low_bound, high_bound, lvl_type)
                rejected_lvl_df = self._select_levels(rejected_df, low_bound, high_bound, lvl_type)
                monitored_lvl_df = self._select_levels(monitored_df, low_bound, high_bound, lvl_type)

                if self.variable == 'uv':
                    u_assimilated, v_assimilated = self._query_diag_type(
                        assimilated_lvl_df, diag_type, bias_correction)
                    u_rejected, v_rejected = self._query_diag_type(
                        rejected_lvl_df, diag_type, bias_correction)
                    u_monitored, v_monitored = self._query_diag_type(
                        monitored_lvl_df, diag_type, bias_correction)

                    u = {'assimilated': u_assimilated,
                         'rejected': u_rejected,
                         'monitored': u_monitored}
                    v = {'assimilated': v_assimilated,
                         'rejected': v_rejected,
                         'monitored': v_monitored}

                    data = {'u': u,
                            'v': v}
                else:
                    assimilated_data = self._query_diag_type(
                        assimilated_lvl_df, diag_type, bias_correction)
                    rejected_data = self._query_diag_type(
                        rejected_lvl_df, diag_type, bias_correction)
                    monitored_data = self._query_diag_type(
                        monitored_lvl_df, diag_type, bias_correction)

                    data = {'assimilated': assimilated_data,
                            'rejected': rejected_data,
                            'monitored': monitored_data
                            }

                binned_data[f'{low_bound}-{high_bound}'] = data

            else:
                indexed_df = self._select_conv(obsid, subtype, station_id)

                lvl_df = self._select_levels(indexed_df, low_bound, high_bound, lvl_type)

                if self.variable == 'uv':
                    u, v = self._query_diag_type(
                        lvl_df, diag_type, bias_correction)

                    data = {'u': u,
                            'v': v}
                else:
                    data = self._query_diag_type(
                        lvl_df, diag_type, bias_correction)


                binned_data[f'{low_bound}-{high_bound}'] = data
            
        return binned_data

    def _select_levels(self, df, low_bound, high_bound, lvl_type):
        """
        Selects data between two level bounds from given dataframe.
        """
        
        if lvl_type == 'pressure':
            # Grab data greater than low bound and less or than equal to high_bound
            df = df.query(f'(Pressure > {low_bound}) and (Pressure <= {high_bound})')

        else:
            # Grab data greater than or equal to low bound and less than high_bound
            df = df.query(f'(Height >= {low_bound}) and (Height < {high_bound})')

        return df
    
    def get_pressure(self, obsid=None, subtype=None, station_id=None,
                     analysis_use=False):
        """
        Grabs indexed pressure data.
        """
        if analysis_use:
            assimilated_df, rejected_df, monitored_df = self._select_conv(
                    obsid, subtype, station_id, analysis_use)

            pressure = {'assimilated': assimilated_df.reset_index()['Pressure'].to_numpy(),
                        'rejected': rejected_df.reset_index()['Pressure'].to_numpy(),
                        'monitored': monitored_df.reset_index()['Pressure'].to_numpy()}
            
        else:
            indexed_df = self._select_conv(obsid, subtype, station_id)
            pressure = indexed_df.reset_index()['Pressure'].to_numpy()
            
        return pressure
    
    def get_height(self, obsid=None, subtype=None, station_id=None,
                   analysis_use=False):
        """
        Grabs indexed height data.
        """
        if analysis_use:
            assimilated_df, rejected_df, monitored_df = self._select_conv(
                    obsid, subtype, station_id, analysis_use)

            height = {'assimilated': assimilated_df.reset_index()['Height'].to_numpy(),
                      'rejected': rejected_df.reset_index()['Height'].to_numpy(),
                      'monitored': monitored_df.reset_index()['Height'].to_numpy()}
            
        else:
            indexed_df = self._select_conv(obsid, subtype, station_id)
            height = indexed_df.reset_index()['Height'].to_numpy()
            
        return height
    
    
    def get_lat_lon(self, obsid=None, subtype=None, station_id=None,
                    analysis_use=False, lvls=None, lvl_type='pressure'):
        """
        Grabs indexed lats and lons from inputs.
        
        Args:
            obsid : (list of ints; optional; default=None) observation
                    measurement ID number; default=None
            subtype : (list of ints; optional; default=None) observation
                      measurement ID subtype number, default=None
            station_id : (list of str; optional; default=None)
                         station id, default=None
            analysis_use : (bool; defaul=False) if True, will return
                           three sets of data:
                           assimlated (analysis_use_flag=1, qc<7),
                           rejected (analysis_use_flag=-1, qc<8),
                           monitored (analysis_use_flag=-1, qc>7)
            lvls : (list type; default=None) List of pressure or height
                   levels i.e. [250,500,750,1000]. List must be arranged
                   low to high.
            lvl_type : (str; default='pressure') lvls definition as
                       'pressure' or 'height'.
        Returns:
            lat, lon : (array like) requested indexed latitude and longitude
        """
        # Selects proper levels
        if lvls is not None:
            # Check if level type is valid
            if lvl_type not in _VALID_LVL_TYPES:
                raise ValueError((f'{lvl_type} is not a valid lvl_type. '
                                  'Valid choices are: '
                                  f'{" | ".join(_VALID_LVL_TYPES)}'))
                
        if analysis_use:
            assimilated_df, rejected_df, monitored_df = self._select_conv(
                        obsid, subtype, station_id, analysis_use)
            
            # select by levels
            if lvls is not None:
                binned_lats = {}
                binned_lons = {}
                
                for i, low_bound in enumerate(lvls[:-1]):
                    high_bound = lvls[i+1]

                    assimilated_lvl_df = self._select_levels(
                        assimilated_df, low_bound, high_bound, lvl_type)
                    rejected_lvl_df = self._select_levels(
                        rejected_df, low_bound, high_bound, lvl_type)
                    monitored_lvl_df = self._select_levels(
                        monitored_df, low_bound, high_bound, lvl_type)
                    
                    lats = {'assimilated': assimilated_lvl_df['latitude'].to_numpy(),
                            'rejected': rejected_lvl_df['latitude'].to_numpy(),
                            'monitored': monitored_lvl_df['latitude'].to_numpy()
                            }
                    lons = {'assimilated': assimilated_lvl_df['longitude'].to_numpy(),
                            'rejected': rejected_lvl_df['longitude'].to_numpy(),
                            'monitored': monitored_lvl_df['longitude'].to_numpy()
                            }
                    
                    binned_lats[f'{low_bound}-{high_bound}'] = lats
                    binned_lons[f'{low_bound}-{high_bound}'] = lons
                
                return binned_lats, binned_lons
            
            else:
                
                lats = {'assimilated': assimilated_df['latitude'].to_numpy(),
                        'rejected': rejected_df['latitude'].to_numpy(),
                        'monitored': monitored_df['latitude'].to_numpy()
                        }
                lons = {'assimilated': assimilated_df['longitude'].to_numpy(),
                        'rejected': rejected_df['longitude'].to_numpy(),
                        'monitored': monitored_df['longitude'].to_numpy()
                        }
                
                return lats, lons
            
        else:
            indexed_df = self._select_conv(obsid, subtype, station_id)
            
            # select by levels
            if lvls is not None:
                binned_lats = {}
                binned_lons = {}
                
                for i, low_bound in enumerate(lvls[:-1]):
                    high_bound = lvls[i+1]
                    
                    lvl_df = self._select_levels(
                        indexed_df, low_bound, high_bound, lvl_type)
                    
                    lats = lvl_df['latitude'].to_numpy()
                    lons = lvl_df['longitude'].to_numpy()
                    
                    binned_lats[f'{low_bound}-{high_bound}'] = lats
                    binned_lons[f'{low_bound}-{high_bound}'] = lons
                
                return binned_lats, binned_lons
            
            else:
                return indexed_df['latitude'].to_numpy(), indexed_df['longitude'].to_numpy()
                        
                        
                        
                
                
        
        

In [50]:
file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_conv_t_ges.2020092200.nc4'

diag = Conventional(file)

obsid = [120]
subtype = [0]
analysis_use = True
lvls = [250, 500, 750, 1000]
bias_correction = True

data = diag.get_data('omf', obsid=obsid, subtype=subtype, analysis_use=analysis_use,
                     lvls=lvls, lvl_type='pressure', bias_correction=bias_correction)
lat, lon = diag.get_lat_lon(obsid=obsid, subtype=subtype, analysis_use=analysis_use,
                     lvls=lvls, lvl_type='pressure')

lat


{'250-500': {'assimilated': array([27.23568, 27.23375, 27.22843, ..., 60.27967, 60.28578, 60.29608],
        dtype=float32),
  'rejected': array([27.69483, 27.693  , 27.69234, 27.68982, 27.68906, 27.68762,
         27.68686, 27.68663, 27.68569, 27.68547, 27.68542, 27.68527,
         27.68512, 27.68507, 27.68483, 27.68482, 27.68617, 27.68957,
         27.6988 , 27.69955, 27.71105], dtype=float32),
  'monitored': array([53.23   , 53.23   , 53.23   , 53.23   , 53.23   , 42.51   ,
         42.51   , 42.51   , 42.51   , 42.51   , 42.51   , 42.51   ,
         42.51   , 42.51   , 42.51   , 42.51   , 42.51   , 42.51   ,
         42.51   , 42.51   , 64.30192, 64.30192, 64.30192, 64.30192,
         64.30192, 64.30192, 64.30192, 68.33762, 68.33762, 68.33762,
         68.33762, 68.33762, 68.33762, 68.33762, 68.33762,  5.25195,
          5.25272,  5.26144], dtype=float32)},
 '500-750': {'assimilated': array([27.22636, 27.23463, 27.2377 , ..., 60.15147, 60.15855, 60.17303],
        dtype=float32),
 

In [30]:
diag = Conventional(file)

pressure = diag.get_pressure(obsid=obsid, subtype=subtype, analysis_use=analysis_use)
pressure

array([980., 977., 951., ..., 337., 327., 311.])

In [33]:
diag = Conventional(file)

height = diag.get_height(obsid=obsid, subtype=subtype, analysis_use=analysis_use)
height

array([2.6900000e+02, 9.9999998e+10, 9.9999998e+10, ..., 9.9999998e+10,
       9.9999998e+10, 9.9999998e+10])

In [16]:
file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_conv_t_ges.2020092200.nc4'

df_dict = {}

with Dataset(file, mode='r') as f:
    
    for var in f.variables:
        print(var)

Station_ID
Observation_Class
Observation_Type
Observation_Subtype
Latitude
Longitude
Station_Elevation
Pressure
Height
Time
Prep_QC_Mark
Setup_QC_Mark
Prep_Use_Flag
Analysis_Use_Flag
Nonlinear_QC_Rel_Wgt
Errinv_Input
Errinv_Adjust
Errinv_Final
Observation
Obs_Minus_Forecast_adjusted
Obs_Minus_Forecast_unadjusted
Data_Pof
Data_Vertical_Velocity
Bias_Correction_Terms


### Create dataframe

In [2]:
file = '/scratch2/NCEPDEV/stmp1/Kevin.Dougherty/ncDiags/gdas.20200922/00/atmos/diag_conv_t_ges.2020092200.nc4'
variable = 't'

df_dict = {}

# Open netCDF file, store data into dictionary
with Dataset(file, mode='r') as f:
    for var in f.variables:
        # Station_ID and Observatio_Class variables need
        # to be converted from byte string to string
        if var in ['Station_ID', 'Observation_Class']:
            data = f.variables[var][:]
            data = [i.tobytes(fill_value='/////', order='C')
                    for i in data]
            data = np.array(
                [''.join(i.decode('UTF-8', 'ignore').split())
                 for i in data])
            df_dict[var] = data

        # Grab variables with only 'nobs' dimension
        elif len(f.variables[var].shape) == 1:
            df_dict[var] = f.variables[var][:]

# Create pandas dataframe from dict
df = pd.DataFrame(df_dict)

# Creates multidimensional indexed dataframe
indices = ['Station_ID', 'Observation_Class', 'Observation_Type',
           'Observation_Subtype', 'Pressure', 'Height', 'Analysis_Use_Flag']
df.set_index(indices, inplace=True)

# Rename columns
df.columns = df.columns.str.lower()
if variable == 'uv':
    df = df.rename(columns={
        'u_obs_minus_forecast_unadjusted': 'u_omf_unadjusted',
        'u_obs_minus_forecast_adjusted': 'u_omf_adjusted',
        'v_obs_minus_forecast_unadjusted': 'v_omf_unadjusted',
        'v_obs_minus_forecast_adjusted': 'v_omf_adjusted'
        })
    # Create hofx columns
    df['u_hofx_unadjusted'] = df['u_observation'] - \
        df['u_omf_unadjusted']
    df['v_hofx_unadjusted'] = df['v_observation'] - \
        df['v_omf_unadjusted']
    df['u_hofx_adjusted'] = df['u_observation'] - \
        df['u_omf_adjusted']
    df['v_hofx_adjusted'] = df['v_observation'] - \
        df['v_omf_adjusted']

else:
    df = df.rename(columns={
        'obs_minus_forecast_unadjusted': 'omf_unadjusted',
        'obs_minus_forecast_adjusted': 'omf_adjusted',
        })
    # Create hofx columns
    df['hofx_unadjusted'] = df['observation'] - df['omf_unadjusted']
    df['hofx_adjusted'] = df['observation'] - df['omf_adjusted']
    
data_df = df

In [3]:
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,latitude,longitude,station_elevation,time,prep_qc_mark,setup_qc_mark,prep_use_flag,nonlinear_qc_rel_wgt,errinv_input,errinv_adjust,errinv_final,observation,omf_adjusted,omf_unadjusted,data_pof,data_vertical_velocity,hofx_unadjusted,hofx_adjusted
Station_ID,Observation_Class,Observation_Type,Observation_Subtype,Pressure,Height,Analysis_Use_Flag,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
ZSNO,t,180,1,1031.599976,0.0,1.0,-35.000000,6.800000,0.0,0.00000,2.0,0.0,0.0,4.0,4.000000e-01,4.000000e-01,1.339194e-01,289.750000,1.425495,1.425495,1.000000e+09,1.000000e+09,288.324493,288.324493
VRWR7,t,180,1,1030.000000,0.0,1.0,-35.200001,8.900000,0.0,-3.00000,2.0,0.0,0.0,4.0,4.000000e-01,4.000000e-01,3.102377e-01,288.450012,0.605408,0.605408,1.000000e+09,1.000000e+09,287.844604,287.844604
VRWR7,t,180,1,1030.000000,0.0,1.0,-35.200001,9.000000,0.0,-2.00000,2.0,0.0,0.0,4.0,4.000000e-01,4.000000e-01,3.097822e-01,288.450012,0.583633,0.583633,1.000000e+09,1.000000e+09,287.866394,287.866394
64501,t,181,0,1013.299988,4.0,-1.0,-0.700000,8.750000,4.0,-3.00000,9.0,0.0,101.0,4.0,1.000000e-11,1.000000e-11,1.000000e-11,301.750000,1.256365,1.256365,1.000000e+09,1.000000e+09,300.493622,300.493622
FOOG,t,187,0,1013.599976,3.0,-1.0,-0.700000,8.750000,3.0,-1.00000,9.0,0.0,101.0,4.0,1.000000e-11,1.000000e-11,1.000000e-11,301.350006,1.052835,1.052835,1.000000e+09,1.000000e+09,300.297180,300.297180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/////,t,180,0,995.700012,0.0,1.0,80.629997,357.359985,0.0,-1.91667,2.0,1.0,0.0,4.0,4.000000e-01,4.000000e-01,3.128819e-01,272.350006,-0.186130,-0.186130,1.000000e+09,1.000000e+09,272.536133,272.536133
/////,t,180,0,996.599976,0.0,1.0,80.610001,357.309998,0.0,0.08333,2.0,1.0,0.0,4.0,4.000000e-01,4.000000e-01,3.130049e-01,271.950012,-0.301150,-0.301150,1.000000e+09,1.000000e+09,272.251160,272.251160
/////,t,180,0,995.200012,0.0,1.0,80.639999,357.380005,0.0,-2.91667,2.0,1.0,0.0,4.0,4.000000e-01,4.000000e-01,3.127697e-01,272.450012,-0.197209,-0.197209,1.000000e+09,1.000000e+09,272.647217,272.647217
/////,t,180,0,997.799988,0.0,1.0,80.599998,357.209991,0.0,2.06667,2.0,1.0,0.0,4.0,4.000000e-01,4.000000e-01,3.135123e-01,271.850006,-0.254984,-0.254984,1.000000e+09,1.000000e+09,272.104980,272.104980


In [7]:
indx = data_df.index[df['prep_qc_mark'] == 2]

lats = df['latitude'][indx].to_numpy()
lons = df['longitude'][indx].to_numpy()

lons[3]

8.9

### Select data based on inputs (`_select_conv()`)

In [11]:
diag_type = 'omf'
obsid = [120]
subtype = [0]
station_id = None
analysis_use = True
bias_correction = True

In [14]:
df = data_df

# select data by obsid, subtype, and station ids
if obsid is not None:
    idx_col = 'Observation_Type'
    indx = df.index.get_level_values(idx_col) == ''
    for obid in obsid:
        indx = np.ma.logical_or(
            indx, df.index.get_level_values(idx_col) == obid)
    df = df.iloc[indx]
if subtype is not None:
    idx_col = 'Observation_Subtype'
    indx = df.index.get_level_values(idx_col) == ''
    for stype in subtype:
        indx = np.ma.logical_or(
            indx, df.index.get_level_values(idx_col) == stype)
    df = df.iloc[indx]
if station_id is not None:
    idx_col = 'Station_ID'
    indx = df.index.get_level_values(idx_col) == ''
    for stn_id in station_id:
        indx = np.ma.logical_or(
            indx, df.index.get_level_values(idx_col) == stn_id)
    df = df.iloc[indx]

if analysis_use:
    # Separate into 3 dataframes; assimilated, rejected, and monitored
    indx = df.index.get_level_values('Analysis_Use_Flag') == ''

    assimilated_indx = np.ma.logical_or(
        indx, df.index.get_level_values('Analysis_Use_Flag') == 1)
    rejected_indx = np.ma.logical_or(
        indx, df.index.get_level_values('Analysis_Use_Flag') == -1)
    monitored_indx = np.ma.logical_or(
        indx, df.index.get_level_values('Analysis_Use_Flag') == -1)

    assimilated_df = df.iloc[assimilated_indx]
    rejected_df = df.iloc[rejected_indx]
    monitored_df = df.iloc[monitored_indx]

    # Find rejected and monitored based on Prep_QC_Mark
    try:
        assimilated_df = assimilated_df.loc[
            assimilated_df['prep_qc_mark'] < 7]
        rejected_df = rejected_df.loc[
            rejected_df['prep_qc_mark'] < 8]
        monitored_df = monitored_df.loc[
            monitored_df['prep_qc_mark'] > 7]
    except KeyError:
        assimilated_df = assimilated_df.loc[
            assimilated_df['setup_qc_mark'] < 7]
        rejected_df = rejected_df.loc[
            rejected_df['setup_qc_mark'] < 8]
        monitored_df = monitored_df.loc[
            monitored_df['setup_qc_mark'] > 7]

In [15]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,latitude,longitude,station_elevation,time,prep_qc_mark,setup_qc_mark,prep_use_flag,nonlinear_qc_rel_wgt,errinv_input,errinv_adjust,errinv_final,observation,omf_adjusted,omf_unadjusted,data_pof,data_vertical_velocity,hofx_unadjusted,hofx_adjusted
Station_ID,Observation_Class,Observation_Type,Observation_Subtype,Pressure,Height,Analysis_Use_Flag,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
60630,t,120,0,980.0,2.690000e+02,1.0,27.230000,2.500000,269.0,-0.77000,2.0,0.0,0.0,4.0,0.833333,0.632971,0.058988,303.450012,-4.367577,-4.367577,1.000000e+09,1.000000e+09,307.817596,307.817596
60630,t,120,0,977.0,1.000000e+11,1.0,27.229879,2.499850,269.0,-0.76847,2.0,0.0,0.0,4.0,0.833333,0.632971,0.202508,308.450012,0.632872,0.632872,1.000000e+09,1.000000e+09,307.817139,307.817139
60630,t,120,0,951.0,1.000000e+11,1.0,27.229460,2.495120,269.0,-0.75489,2.0,0.0,0.0,4.0,0.909091,0.909091,0.727273,310.750000,1.055756,1.055756,1.000000e+09,1.000000e+09,309.694244,309.694244
60630,t,120,0,925.0,7.950000e+02,1.0,27.229019,2.487980,269.0,-0.74078,2.0,0.0,0.0,4.0,1.000000,0.779562,0.623650,309.250000,0.815466,0.815466,1.000000e+09,1.000000e+09,308.434540,308.434540
60630,t,120,0,921.0,1.000000e+11,1.0,27.228861,2.487030,269.0,-0.73860,2.0,0.0,0.0,4.0,1.000000,0.779946,0.623957,309.049988,0.929095,0.929095,1.000000e+09,1.000000e+09,308.120880,308.120880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
03005,t,120,0,400.0,7.250000e+03,1.0,60.244991,359.470947,84.0,-0.35189,2.0,0.0,0.0,4.0,1.250000,1.205588,0.964470,250.449997,-0.591626,-0.591626,1.000000e+09,1.000000e+09,251.041626,251.041626
03005,t,120,0,375.0,1.000000e+11,1.0,60.257370,359.539429,84.0,-0.32576,2.0,0.0,0.0,4.0,1.250000,1.250000,1.000000,247.149994,-1.231759,-1.231759,1.000000e+09,1.000000e+09,248.381760,248.381760
03005,t,120,0,337.0,1.000000e+11,1.0,60.279671,359.662964,84.0,-0.28320,2.0,0.0,0.0,4.0,1.250000,1.250000,1.000000,242.350006,-1.119568,-1.119568,1.000000e+09,1.000000e+09,243.469574,243.469574
03005,t,120,0,327.0,1.000000e+11,1.0,60.285782,359.696777,84.0,-0.27135,2.0,0.0,0.0,4.0,1.250000,1.250000,1.000000,240.949997,-1.032963,-1.032963,1.000000e+09,1.000000e+09,241.982956,241.982956


### Index by pressure/height

In [31]:
lvls = [250, 500, 750, 1000]
lvl_type = 'pressure'

In [32]:
# analysis_use=False
def _select_levels(df, low_bound, high_bound, lvl_type):
    
    if lvl_type == 'pressure':
        df = df.query(f'(Pressure <= {high_bound}) and (Pressure > {low_bound})')
        
    else:
        df = df.query(f'(Height <= {high_bound}) and (Height > {low_bound})')
        
    return df


binned_data = {}

for i, low_bound in enumerate(lvls[:-1]):
    high_bound = lvls[i+1]
    
    lvl_df = _select_levels(indexed_df, low_bound, high_bound, lvl_type)

In [32]:

for i, low_bound in enumerate(lvls[:-1]):
    high_bound = lvls[i+1]
    
    if analysis_use:
        assimilated_df, rejected_df, monitored_df = self._select_conv(
            obsid, subtype, station_id, analysis_use)

        assimilated_lvl_df = self._select_levels(assimilated_df, low_bound, high_bound, lvl_type)
        rejected_lvl_df = self._select_levels(rejected_df, low_bound, high_bound, lvl_type)
        monitored_lvl_df = self._select_levels(assmonitored, low_bound, high_bound, lvl_type)

        if self.variable == 'uv':
            u_assimilated, v_assimilated = self.query_diag_type(
                    assimilated_lvl_df, diag_type, bias_correction)
            u_rejected, v_rejected = self.query_diag_type(
                rejected_lvl_df, diag_type, bias_correction)
            u_monitored, v_monitored = self.query_diag_type(
                monitored_lvl_df, diag_type, bias_correction)

            u = {'assimilated': u_assimilated,
                 'rejected': u_rejected,
                 'monitored': u_monitored}
            v = {'assimilated': v_assimilated,
                 'rejected': v_rejected,
                 'monitored': v_monitored}

            data = {'u': u,
                    'v': v}
        else:
            assimilated_data = self.query_diag_type(
                assimilated_lvl_df, diag_type, bias_correction)
            rejected_data = self.query_diag_type(
                rejected_lvl_df, diag_type, bias_correction)
            monitored_data = self.query_diag_type(
                monitored_lvl_df, diag_type, bias_correction)

            data = {'assimilated': assimilated_data,
                    'rejected': rejected_data,
                    'monitored': monitored_data
                    }
            
    else:
        indexed_df = self._select_conv(obsid, subtype, station_id)

        for i, low_bound in enumerate(lvls[:-1]):
            high_bound = lvls[i+1]

            lvl_df = self._select_levels(indexed_df, low_bound, high_bound, lvl_type)

            if self.variable == 'uv':
                u, v = self._query_diag_type(
                    lvl_df, diag_type, bias_correction)

                data = {'u': u,
                        'v': v}
            else:
                data = self._query_diag_type(
                    lvl_df, diag_type, bias_correction)
            
        
    binned_data[f'{low_bound}-{high_bound}'] = data
    
    
    
# else:
#     indexed_df = self._select_conv(obsid, subtype, station_id)

#     for i, low_bound in enumerate(lvls[:-1]):
#         high_bound = lvls[i+1]

#         lvl_df = _select_levels(indexed_df, low_bound, high_bound, lvl_type)

#         if self.variable == 'uv':
#             u, v = self._query_diag_type(
#                 lvl_df, diag_type, bias_correction)

#             data = {'u': u,
#                     'v': v}
#         else:
#             data = self._query_diag_type(
#                 lvl_df, diag_type, bias_correction)

#         binned_data[f'{low_bound}-{high_bound}'] = data
    

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,latitude,longitude,station_elevation,time,prep_qc_mark,setup_qc_mark,prep_use_flag,nonlinear_qc_rel_wgt,errinv_input,errinv_adjust,errinv_final,observation,omf_adjusted,omf_unadjusted,data_pof,data_vertical_velocity,hofx_unadjusted,hofx_adjusted
Station_ID,Observation_Class,Observation_Type,Observation_Subtype,Pressure,Height,Analysis_Use_Flag,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
60630,t,120,0,980.0,2.690000e+02,1.0,27.230000,2.500000,269.0,-0.77000,2.0,0.0,0.0,4.0,0.833333,0.632971,0.058988,303.450012,-4.367577,-4.367577,1.000000e+09,1.000000e+09,307.817596,307.817596
60630,t,120,0,977.0,1.000000e+11,1.0,27.229879,2.499850,269.0,-0.76847,2.0,0.0,0.0,4.0,0.833333,0.632971,0.202508,308.450012,0.632872,0.632872,1.000000e+09,1.000000e+09,307.817139,307.817139
60630,t,120,0,951.0,1.000000e+11,1.0,27.229460,2.495120,269.0,-0.75489,2.0,0.0,0.0,4.0,0.909091,0.909091,0.727273,310.750000,1.055756,1.055756,1.000000e+09,1.000000e+09,309.694244,309.694244
60630,t,120,0,925.0,7.950000e+02,1.0,27.229019,2.487980,269.0,-0.74078,2.0,0.0,0.0,4.0,1.000000,0.779562,0.623650,309.250000,0.815466,0.815466,1.000000e+09,1.000000e+09,308.434540,308.434540
60630,t,120,0,921.0,1.000000e+11,1.0,27.228861,2.487030,269.0,-0.73860,2.0,0.0,0.0,4.0,1.000000,0.779946,0.623957,309.049988,0.929095,0.929095,1.000000e+09,1.000000e+09,308.120880,308.120880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
03005,t,120,0,808.0,1.000000e+11,1.0,60.147350,358.908020,84.0,-0.65468,2.0,0.0,0.0,4.0,1.250000,1.009974,0.807979,279.250000,1.863482,1.863482,1.000000e+09,1.000000e+09,277.386505,277.386505
03005,t,120,0,782.0,1.000000e+11,1.0,60.146740,358.924072,84.0,-0.63989,2.0,0.0,0.0,4.0,1.250000,0.990414,0.792331,276.649994,0.373853,0.373853,1.000000e+09,1.000000e+09,276.276154,276.276154
03005,t,120,0,778.0,1.000000e+11,1.0,60.146629,358.926514,84.0,-0.63758,2.0,0.0,0.0,4.0,1.250000,0.706513,0.565210,276.350006,0.207206,0.207206,1.000000e+09,1.000000e+09,276.142792,276.142792
03005,t,120,0,772.0,1.000000e+11,1.0,60.146488,358.930298,84.0,-0.63410,2.0,0.0,0.0,4.0,1.250000,0.773927,0.619141,275.649994,-0.291039,-0.291039,1.000000e+09,1.000000e+09,275.941040,275.941040


In [None]:
lvl_df