## Test indexing

### Create a mock dataset

In [1]:
import xarray as xr
import numpy as np
import pandas as pd

# Define dimensions
Nt = 10
time = pd.date_range('2024-01-01', periods=Nt, freq='D')
pres = [100, 600, 2000, 6000, 11000]  # Pressure levels

# Create data for TEMP(TIME, PRES)
temp_data = 15 + 8 * np.random.randn(Nt, len(pres))  # Example temperature data

# Create data for STATION(TIME) and PLACE(TIME)
station_data = [f'st{stnum:02.0f}' for stnum in np.arange(1, Nt+1)]
ocean_data = ['Atlantic', 'Arctic', 'Pacific', 'Mediterranean', 'Southern', 
                              'Baltic', 'Indian', 'Caribbean', 'Weddell', 'Ross']

# Create data for ZONE(PRES)
zone_data = ['epipelagic', 'mesopelagic', 'bathypelagic', 'abyssopelagic', 'hadopelagic']

# Create the Dataset
ds = xr.Dataset(
    {
        'TEMP': (['TIME', 'PRES'], temp_data),
        'OCEAN': (['TIME'], ocean_data),
        'STATION': (['TIME'], station_data),
        'ZONE': (['PRES'], zone_data)
    },
    coords={
        'TIME': time,
        'PRES': pres
    }
)

In [2]:
ds

In [3]:
def pick(ds, squeeze=True, **conditions):
    """
    Filter an xarray.Dataset based on conditions applied to its one-dimensional variables.

    This function is equivalent to `.isel()` but works with non-coordinate variables.
    For example, if we have a variable `STATION(TIME)`, we can select by station: 
    `pick(ds, STATION='sta01')` or by multiple stations: `pick(ds, STATION=['sta01', 'sta02'])`.

    The function selects and returns the subset of the dataset where the specified 
    condition(s) on the given variable(s) are met. The dimension along which the filtering 
    occurs is determined dynamically based on the variable(s) provided in `conditions`.

    Parameters
    ----------
    ds : xarray.Dataset
        The input dataset to be filtered.
    **conditions : dict
        Key-value pairs where the key is the name of a one-dimensional variable in the 
        dataset, and the value is the condition. The condition can be a single value 
        (e.g., STATION='sta02') or a list of values (e.g., STATION=['sta02', 'sta03']).
    squeeze : bool, optional
        If True (default), the returned dataset will be squeezed to remove any singleton 
        dimensions. If False, the original dimensions will be preserved.

    Returns
    -------
    xarray.Dataset
        A dataset filtered to only include the indices that match the condition(s). The 
        dimension along which filtering is applied is inferred from the condition variable.
        The returned dataset may be squeezed depending on the `squeeze` parameter.

    Raises
    ------
    ValueError
        If the specified variable does not exist in the dataset or is not one-dimensional.

    Examples
    --------
    >>> ds = xr.Dataset(
    ...     {
    ...         'TEMP': (['TIME', 'PRES'], temp_data),
    ...         'OCEAN': (['TIME'], ocean_data),
    ...         'STATION': (['TIME'], station_data)
    ...     },
    ...     coords={
    ...         'TIME': time,
    ...         'PRES': pres
    ...     }
    ... )
    >>> pick(ds, STATION='st02')
    <xarray.Dataset>
    Dimensions:  (TIME: 1, PRES: 5)
    Coordinates:
      * TIME     (TIME) datetime64[ns] 2024-01-02
      * PRES     (PRES) float64 1e+03 875.0 750.0 625.0 500.0
    Data variables:
        TEMP     (TIME, PRES) float64 14.5 15.3 12.7 17.6 8.67
        OCEAN    (TIME) <U13 'Arctic'
        STATION  (TIME) <U3 'st02'

    >>> pick(ds, STATION=['st02', 'st03'])
    <xarray.Dataset>
    Dimensions:  (TIME: 2, PRES: 5)
    Coordinates:
      * TIME     (TIME) datetime64[ns] 2024-01-02 2024-01-03
      * PRES     (PRES) float64 1e+03 875.0 750.0 625.0 500.0
    Data variables:
        TEMP     (TIME, PRES) float64 14.5 15.3 12.7 17.6 8.67 16.8 12.4 ...
        OCEAN    (TIME) <U13 'Arctic' 'Pacific'
        STATION  (TIME) <U3 'st02' 'st03'
    """

    # Iterate over the conditions
    for var_name, value in conditions.items():
        # Check if the variable exists in the dataset
        if var_name not in ds:
            raise ValueError(f"Variable '{var_name}' not found in the dataset.")
        
        # Find the dimension that the variable depends on
        var_dims = ds[var_name].dims
        
        # Ensure the variable is one-dimensional
        if len(var_dims) != 1:
            raise ValueError(f"Variable '{var_name}' must be one-dimensional.")
        
        dim = var_dims[0]  # Get the dimension name
        
        # Handle cases where 'value' is a list or array of values
        if isinstance(value, (list, np.ndarray)):
            indices = ds[var_name].isin(value)
        else:
            indices = ds[var_name] == value
        
        # Filter the dataset using the indices
        ds = ds.isel({dim: indices})
    
    if squeeze:
        ds = ds.squeeze()

    return ds


In [4]:
ds

In [5]:
pick(ds, ZONE=['epipelagic', 'bathypelagic'])

In [10]:
import kval
import importlib as imp

In [11]:
from kval.file import xr_funcs

In [12]:
imp.reload(xr_funcs)

<module 'kval.file.xr_funcs' from '/home/oyvindl/work/code/python/kval/src/kval/file/xr_funcs.py'>

In [17]:
xr_funcs.pick(ds, STATION=['st02', 'st04'],  ZONE=['epipelagic', 'bathypelagic'])

In [18]:
mock_dataset = ds

In [29]:
    result = xr_funcs.pick(mock_dataset, STATION=['st02', 'st03'])
    assert result.sizes['TIME'] == 2
    assert 'st02' in result.STATION.values
    assert 'st03' in result.STATION.values


In [30]:
result

In [31]:
import pytest

In [43]:
result = xr_funcs.pick(mock_dataset, STATION='st11')
list(result.dims) == ['TIME', 'PRES']

True

In [52]:
    result = xr_funcs.pick(mock_dataset, ZONE=['epipelagic', 'hadopelagic'])
    assert result.sizes['PRES'] == 2
    assert 'epipelagic' in result.ZONE.values
    assert 'hadopelagic' in result.ZONE.values

In [47]:
    result = xr_funcs.pick(mock_dataset, ZONE='epipelagic')
    assert list(result.dims)== ['TIME']
    assert 'epipelagic' in result.ZONE.values


In [54]:
    result = xr_funcs.pick(mock_dataset, STATION='st02', squeeze=False)
    assert result.sizes['TIME'] == 1
    assert result.sizes['PRES'] == 5  # PRES dimension should remain unchanged

In [59]:
    with pytest.raises(ValueError):
        xr_funcs.pick(mock_dataset, TEMP=15)

In [60]:
xr_funcs.pick(mock_dataset, TEMP=15)

ValueError: Variable 'TEMP' must be one-dimensional.

In [62]:

if __name__ == "__main__":
    pytest.main()

[31mERROR: usage: ipykernel_launcher.py [options] [file_or_dir] [file_or_dir] [...]
ipykernel_launcher.py: error: unrecognized arguments: -f
  inifile: None
  rootdir: /home/oyvindl
[0m


In [64]:
result

In [73]:
result.TIME

In [77]:
pd.Timestamp('2024-01-02'), pd.Timestamp('2024-01-03'), pd.Timestamp('2024-01-05')] in result.TIME

SyntaxError: unmatched ']' (1957186885.py, line 1)

In [75]:
    # Use multiple conditions: both STATION and ZONE
    result = xr_funcs.pick(mock_dataset, 
                           STATION=['st02', 'st03', 'st05'], 
                           ZONE=['epipelagic', 'bathypelagic'])
    
    # Check that only the entries that match both conditions are present
    assert result.sizes['TIME'] == 3  # Only one TIME index should match the condition
    assert result.sizes['PRES'] == 2  # Only one PRES index should match the condition
    
    # Verify the results
    assert ['epipelagic', 'bathypelagic'] in result.ZONE.values
    assert ['st02', 'st03', 'st05'] in result.STATION.values
    assert 'st01' not in result.STATION.values    
    assert 'abyssopelagic' not in result.ZONE.values
    
    # Check if `TIME` dimension is correctly filtered
    assert len(result.TIME) == 3
    pd.Timestamp('2024-01-02') in result.TIME
    assert result.TIME.values == [pd.Timestamp('2024-01-02'), pd.Timestamp('2024-01-03'), pd.Timestamp('2024-01-05')]

['2024-01-02T00:00:00.000000000', '2024-01-03T00:00:00.000000000',
       '2024-01-05T00:00:00.000000000']

    # Check if `PRES` dimension is correctly filtered
    assert result.PRES.values[0] == 100

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()