In [1]:
from pathlib import Path

import pandas as pd
import scanpy as sc
from scipy.io import mmwrite

In [17]:
pipe_folders = (
    Path("./data/PIPE-0011-ATAC+GEX_counts"),
    Path("./data/PIPE-0010-ATAC+GEX_counts"),
)

In [18]:
# Load all the CellBender-adjusted count matrices
datas = {
    cellbent_filtered_matrix_path: sc.read_10x_h5(cellbent_filtered_matrix_path)
    for pipe_folder in pipe_folders
    for cellbent_filtered_matrix_path in pipe_folder.glob("**/cellbent_filtered.h5")
}
datas

{}

In [6]:
# Map in the ATAC barcodes
for k, v in datas.items():
    v.obs["atac_barcode"] = pd.read_csv(
        k.parent / "per_barcode_metrics.csv",
        usecols=["atac_barcode", "gex_barcode"],
    ).set_index("gex_barcode")["atac_barcode"]

In [7]:
# Drop the Amulet-called-multiplets, also print out the count
for k, v in datas.items():
    multiplets = (
        (k.parent / "amulet" / "MultipletBarcodes_01.txt").read_text().splitlines()
    )
    print(k, len(multiplets))
    v = v[~v.obs.index.isin(multiplets)]

In [8]:
# Load the corresponding raw count matrix and add this as a layer
for k, v in datas.items():
    raw_counts = sc.read_10x_h5(k.parent / "raw_feature_bc_matrix.h5")
    v.layers["raw"] = raw_counts[v.obs.index].X

In [9]:
# Prefix barcodes using the sample tag (e.g., "E14_WT_2")
for k, v in datas.items():
    sample_tag = k.parent.parent.stem
    sample_tag = sample_tag.replace("WT", "Wt").replace("Wt", "Con")
    if sample_tag.startswith(("Hom", "Con")):
        # We have an E18 sample that wasn't prefixed in cellranger-arc
        sample_tag = "E18_" + sample_tag

    v.obs.index = sample_tag + ":" + v.obs.index
    v.obs["atac_barcode"] = sample_tag + ":" + v.obs["atac_barcode"]
    v.obs["sample"] = sample_tag
    v.obs[["age", "condition", "replicate"]] = sample_tag.split("_")

In [10]:
# Check for duplicated gene shorthands
{k: v.var[v.var.index.duplicated(keep=False)] for k, v in datas.items()}

{}

In [11]:
# Suffix duplicate var indices (gene shorthands)
for k, v in datas.items():
    v.var_names_make_unique()

In [12]:
# Check that we no longer have duplicate gene names
{k: v.var[v.var.index.duplicated(keep=False)] for k, v in datas.items()}

{}

In [13]:
# Make sure we don't have any duplicated `obs` barcodes in any samples
{k: v.obs[v.obs.index.duplicated(keep=False)] for k, v in datas.items()}

{}

In [14]:
concatted_data = sc.concat(datas.values(), join="outer", merge="same")
concatted_data

ValueError: No objects to concatenate

In [16]:
# Check our `obs` barcodes are unique
concatted_data.obs[concatted_data.obs.index.duplicated(keep=False)]

NameError: name 'concatted_data' is not defined

In [15]:
# Add a column to var indicating if the gene is mitochondrial
concatted_data.var["mt"] = concatted_data.var_names.str.startswith("mt-")

NameError: name 'concatted_data' is not defined

In [None]:
# Save just the count matrix and names for the R Azimuth workflow
Path("./gex_barcodes.txt").write_text("\n".join(concatted_data.obs_names))
Path("./gex_gene_names.txt").write_text("\n".join(concatted_data.var_names))
mmwrite("./gex.mtx", concatted_data.X.T)

In [None]:
concatted_data = sc.read("gex.h5ad")

In [None]:
# Load in the Azimuth labels
label_dfs = [
    pd.read_csv(label_file, index_col=0).rename(
        columns={
            "predicted.celltype": f"label_{label_file.stem}",
            "predicted.celltype.score": f"label_{label_file.stem}_score",
        }
    )
    for label_file in Path("./azimuth/labels").glob("*.csv")
]

concatted_data.obs = concatted_data.obs.join(label_dfs)

In [None]:
# Save the AnnData object for the Python workflow
concatted_data.write("gex.h5ad")