In [None]:
import pandas as pd
import numpy as np
import anndata as ad
from alphapepttools.pp import filter_by_metadata

In [None]:
# Filtering AnnData objects with familiar pandas/numpy slicing is possible, but
# combining several filters (especially range-based ones) quickly becomes
# cumbersome. This cell builds a small example and contrasts both approaches.

# Seeded generator so the example data is identical on every Restart & Run All.
rng = np.random.default_rng(42)

X = pd.DataFrame(
    {f"gene_{i}": rng.standard_normal(6) for i in range(5)},
    index=[f"cell_{i}" for i in range(6)],
)

# Index the metadata by the same cell names as X. Assigning a frame with a
# default RangeIndex to `.obs` would replace the "cell_*" obs names with
# "0".."5", silently dropping the sample labels.
sample_metadata = pd.DataFrame(
    {
        "column1": ["A", "B", "C", "D", "E", "F"],
        "column2": [50, 200, 50, 200, 50, 200],
    },
    index=X.index,
)

test_adata = ad.AnnData(X, obs=sample_metadata)

# Instead of this, which gets convoluted as more columns are added
adata_filtered_1 = test_adata[
    (test_adata.obs["column1"].isin(["A", "B", "C"]))
    | ((test_adata.obs["column2"] > 20) & (test_adata.obs["column2"] <= 100)),  # NOQA: PLR2004
    :,
]

# We use this for easy and transparent filtering: a list selects matching values
# and a tuple selects a numeric range (here intended to reproduce the manual
# `> 20` and `<= 100` mask above, which the check below verifies).
adata_filtered_2 = filter_by_metadata(
    test_adata, {"column1": ["A", "B", "C"], "column2": (20, 100)}, axis=0, logic="or", action="keep"
)

# Sanity check: both approaches must select exactly the same observations.
if not adata_filtered_1.obs.equals(adata_filtered_2.obs):
    raise ValueError("The two filtering methods did not produce the same result!")