In [None]:
import anndata
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import pickle
import numpy as np
import gzip

import matplotlib.pyplot as plt

from pathlib import Path

from mdl.sc_isoform_paper.constants import MASSEQ_KEYS, SAMPLE_COLORS, SHORTREAD_KEYS
from mdl.isoscelles.leiden import cluster_leaf_nodes, cluster_labels
from mdl.sc_isoform_paper.pipseq_barcodes import barcode_to_sequence

We are going to run the standard Scanpy embedding/clustering and then integrate our different libraries to see how well they match.

In [None]:
# Base locations: inputs are read from ~/data, figures are written to ~/202501_figures.
root_dir = Path.home()
data_path = root_dir / "data"
figure_path = root_dir / "202501_figures"
# plt.savefig does not create missing directories, so make sure the
# output folder exists before any figure is written.
figure_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Raw count-matrix directories, one per library, all located under data_path.
input_3p, input_5p, input_PIPseq = (
    data_path / matrix_dir
    for matrix_dir in (
        "10x_3p_pbmc/outs/raw_feature_bc_matrix/",
        "10x_5p_pbmc/outs/raw_feature_bc_matrix/",
        "pipseq_pbmc/raw_matrix/",
    )
)

In [None]:
# Load the 10x 3' raw matrix directory (matrix/features/barcodes) with scanpy.
data_3p = sc.read_10x_mtx(input_3p)

In [None]:
# Load the 10x 5' raw matrix directory (matrix/features/barcodes) with scanpy.
data_5p = sc.read_10x_mtx(input_5p)

In [None]:
# Load the PIPseq raw matrix directory using the same 10x-style reader.
data_pip = sc.read_10x_mtx(input_PIPseq)

In [None]:
# Show the freshly loaded 10x 3' object (dimensions and annotations).
data_3p

In [None]:
# Show the freshly loaded 10x 5' object (dimensions and annotations).
data_5p

In [None]:
# Show the freshly loaded PIPseq object (dimensions and annotations).
data_pip

In [None]:
# De-duplicate gene names in every library before any name-based lookups.
for _library in (data_3p, data_5p, data_pip):
    _library.var_names_make_unique()

In [None]:
# Flag the gene sets used as QC variables, identically for each library:
#   mt   - mitochondrial genes (names starting with "MT-")
#   ribo - ribosomal genes (names starting with "RPS" or "RPL")
#   hb   - hemoglobin genes (names matching the regex "^HB[^(P)]")
for _library in (data_3p, data_5p, data_pip):
    _gene_names = _library.var_names
    _library.var["mt"] = _gene_names.str.startswith("MT-")
    _library.var["ribo"] = _gene_names.str.startswith(("RPS", "RPL"))
    _library.var["hb"] = _gene_names.str.contains("^HB[^(P)]")

In [None]:
# Compute per-cell and per-gene QC metrics in place for each library,
# including the percentage of counts in the mt/ribo/hb gene sets.
for _library in (data_3p, data_5p, data_pip):
    sc.pp.calculate_qc_metrics(
        _library,
        qc_vars=["mt", "ribo", "hb"],
        inplace=True,
        log1p=True,
    )

In [None]:
# QC distributions for the 10x 3' library, before any filtering.
sc.pl.violin(data_3p, keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"], jitter=0.4, multi_panel=True)

In [None]:
# QC distributions for the 10x 5' library, before any filtering.
sc.pl.violin(data_5p, keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"], jitter=0.4, multi_panel=True)

In [None]:
# QC distributions for the PIPseq library, before any filtering.
sc.pl.violin(data_pip, keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"], jitter=0.4, multi_panel=True)

In [None]:
# Genes detected vs. total counts per barcode (10x 3'), colored by mitochondrial percentage.
sc.pl.scatter(
    data_3p,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Genes detected vs. total counts per barcode (10x 5'), colored by mitochondrial percentage.
sc.pl.scatter(
    data_5p,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Genes detected vs. total counts per barcode (PIPseq), colored by mitochondrial percentage.
sc.pl.scatter(
    data_pip,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Filter the 10x 3' library in place. Cells are filtered first so that the
# gene filter counts only the barcodes that survive.
sc.pp.filter_cells(data_3p, min_counts=1000)  # drop barcodes with fewer than 1000 counts
sc.pp.filter_cells(data_3p, min_genes=100)  # drop barcodes with fewer than 100 detected genes
sc.pp.filter_genes(data_3p, min_cells=3)  # drop genes seen in fewer than 3 remaining cells

In [None]:
# Filter the 10x 5' library in place with the same thresholds as the 3' library.
sc.pp.filter_cells(data_5p, min_counts=1000)  # drop barcodes with fewer than 1000 counts
sc.pp.filter_cells(data_5p, min_genes=100)  # drop barcodes with fewer than 100 detected genes
sc.pp.filter_genes(data_5p, min_cells=3)  # drop genes seen in fewer than 3 remaining cells

In [None]:
# Filter the PIPseq library in place with the same thresholds as the 10x libraries.
sc.pp.filter_cells(data_pip, min_counts=1000)  # drop barcodes with fewer than 1000 counts
sc.pp.filter_cells(data_pip, min_genes=100)  # drop barcodes with fewer than 100 detected genes
sc.pp.filter_genes(data_pip, min_cells=3)  # drop genes seen in fewer than 3 remaining cells

In [None]:
# Same QC violins for the 10x 3' library, now restricted to the cells that passed filtering.
sc.pl.violin(data_3p, keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"], jitter=0.4, multi_panel=True)

In [None]:
# Same QC violins for the 10x 5' library, now restricted to the cells that passed filtering.
sc.pl.violin(data_5p, keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"], jitter=0.4, multi_panel=True)

In [None]:
# Same QC violins for the PIPseq library, now restricted to the cells that passed filtering.
sc.pl.violin(data_pip, keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"], jitter=0.4, multi_panel=True)

In [None]:
# Sum of all entries in the filtered 10x 3' matrix (total counts retained).
data_3p.X.sum()

In [None]:
# Sum of all entries in the filtered 10x 5' matrix (total counts retained).
data_5p.X.sum()

In [None]:
# Sum of all entries in the filtered PIPseq matrix (total counts retained).
data_pip.X.sum()

In [None]:
# Show the 10x 3' object after filtering (remaining cells and genes).
data_3p

In [None]:
# Show the 10x 5' object after filtering (remaining cells and genes).
data_5p

In [None]:
# Show the PIPseq object after filtering (remaining cells and genes).
data_pip

In [None]:
# Stack the three filtered libraries into a single object.
# AnnData.concatenate is deprecated, so use anndata.concat with arguments that
# reproduce its defaults:
#   join="inner"      - keep only genes present in all three libraries
#   label / keys      - obs["batch"] holds "0" (10x 3'), "1" (10x 5'), "2" (PIPseq)
#   index_unique="-"  - append "-<batch>" to each barcode so obs names stay unique
#   merge="same"      - keep var annotations that agree across all libraries
adata_combined = anndata.concat(
    [data_3p, data_5p, data_pip],
    join="inner",
    label="batch",
    keys=["0", "1", "2"],
    index_unique="-",
    merge="same",
)

In [None]:
# Show the combined object to check the cell/gene counts after concatenation.
adata_combined

In [None]:
# Scale every cell to 10,000 total counts, then log1p-transform in place.
sc.pp.normalize_total(adata_combined, target_sum=1e4)
sc.pp.log1p(adata_combined)

In [None]:
# Select the 2000 most variable genes across the combined (not yet integrated) data.
sc.pp.highly_variable_genes(adata_combined, n_top_genes=2000)

In [None]:
# PCA with scanpy defaults; the result is read back as "X_pca" by the neighbors step.
sc.pp.pca(adata_combined)

In [None]:
# Pre-integration embedding and clustering, built on the uncorrected PCA.
# Order matters: UMAP and Leiden both use the neighbor graph computed first.
sc.pp.neighbors(adata_combined, use_rep="X_pca")
sc.tl.umap(adata_combined)
sc.tl.leiden(adata_combined, flavor="igraph", n_iterations=2)

We can check the UMAP before integration.

In [None]:
# Pre-integration UMAP colored by Leiden cluster.
sc.pl.umap(adata_combined, color=["leiden"])

In [None]:
# Pre-integration UMAP colored by library of origin (the "batch" column).
sc.pl.umap(adata_combined, color=["batch"])

In [None]:
# UMAP with cluster labels from standalone clusterings
# NOTE(review): this cell contains no code; it looks like a placeholder for a plot — fill in or remove.

The different libraries show almost no overlap.

We will run Harmony integration now, but first we make a backup of the UMAP so we can compare against it later.

In [None]:
# Snapshot the pre-integration results under "*_pre" keys, because the UMAP
# and Leiden runs that follow overwrite the original entries.
if 'X_umap' in adata_combined.obsm:
    adata_combined.obsm['X_umap_pre'] = adata_combined.obsm['X_umap'].copy()

_has_leiden = 'leiden' in adata_combined.obs
if _has_leiden:
    adata_combined.obs['leiden_pre'] = adata_combined.obs['leiden'].copy()
# The palette is only copied when the cluster labels themselves were copied.
if _has_leiden and 'leiden_colors' in adata_combined.uns:
    adata_combined.uns['leiden_pre_colors'] = [*adata_combined.uns['leiden_colors']]

In [None]:
# Run Harmony with "batch" as the integration key; the corrected
# representation is read back below as "X_pca_harmony".
sce.pp.harmony_integrate(adata_combined, "batch")

In [None]:
# Rebuild the neighbor graph on the Harmony-corrected representation.
sc.pp.neighbors(adata_combined, use_rep="X_pca_harmony")

In [None]:
# Recompute the UMAP from the new graph; the earlier embedding is still available as "X_umap_pre".
sc.tl.umap(adata_combined)

In [None]:
# Tidy the existing Leiden column and discard its stored palette before
# clustering is run again on the integrated graph.
adata_combined.obs['leiden'] = adata_combined.obs['leiden'].cat.remove_unused_categories()

# pop() with a default so a missing palette is not an error; the return value is unused.
null = adata_combined.uns.pop('leiden_colors', None)

In [None]:
# Re-cluster on the integrated neighbor graph with the same Leiden settings as before.
sc.tl.leiden(adata_combined, flavor="igraph", n_iterations=2)

In [None]:
# Post-integration UMAP colored by the new Leiden clusters.
sc.pl.umap(adata_combined, color=["leiden"])

In [None]:
# Post-integration UMAP colored by library of origin.
sc.pl.umap(adata_combined, color=["batch"])

In [None]:
# adding library labels
# The rename is positional: the list must follow the order of the existing
# batch categories, which is presumably the concatenation order
# (10x 3', 10x 5', PIPseq) — verify if the concatenation step changes.
adata_combined.obs["batch"] = adata_combined.obs["batch"].cat.rename_categories(["10x 3'", "10x 5'", "PIPseq"])

In [None]:

# Draw the integrated UMAP by library without showing it, so that the
# still-open figure can be written to disk.
sc.pl.umap(
    adata_combined,
    color=["batch"],
    show=False,
    legend_loc="right margin",
)
# bbox_inches="tight" makes the saved bounding box include the legend in the right margin.
plt.savefig(
    figure_path / "integrated_UMAP_colored_by_batch.svg",
    bbox_inches="tight",
)

To get a better idea of what the Harmony integration did, we will load the clustering results from earlier.

In [None]:
# Pick entries 1, 3 and 4 of MASSEQ_KEYS, in that order; they are paired
# positionally with the library directory names further down.
sample_order = [MASSEQ_KEYS[1], MASSEQ_KEYS[3], MASSEQ_KEYS[4]]
sample_order

In [None]:
# Load the short-read clustering results and per-library stats saved earlier.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project's own pipeline.
with open(data_path / "shortread_clustering_100k.pickle", "rb") as fh:
    sr_clustering = pickle.load(fh)

# Read the stats pickle once and take both entries from the same object,
# instead of opening and unpickling the file twice.
with open(data_path / "shortread_stats_100k.pickle", "rb") as fh:
    sr_stats = pickle.load(fh)

ix_dict = sr_stats["ix_dict"]
sr_numis = sr_stats["numis"]

In [None]:
# For every short-read library, convert the clustering into an integer label
# per entry: the position of its leaf node among the sorted leaf nodes, or -1
# when the label is not a leaf node.
c_arrays = {}
for key in SHORTREAD_KEYS:
    _tree = sr_clustering[key][0]
    _leaves = cluster_leaf_nodes(_tree)
    _labels = cluster_labels(_tree, _leaves)
    _leaf_rank = {leaf: rank for rank, leaf in enumerate(sorted(_leaves))}
    c_arrays[key] = np.array([_leaf_rank.get(label, -1) for label in _labels])

In [None]:
# Barcode lists per library, keyed by the library's directory name.
bc_dict = {}

# 10x libraries: one barcodes.tsv.gz under each "10x*" directory.
for _barcode_file in data_path.glob("10x*/outs/raw_feature_bc_matrix/barcodes.tsv.gz"):
    fp = _barcode_file.parent
    print(fp)
    with gzip.open(_barcode_file, "rt") as fh:
        # drop the last two characters (the "-1" suffix) from each barcode
        _trimmed = [line.strip()[:-2] for line in fh]
    # fp is .../<library>/outs/raw_feature_bc_matrix, so two levels up is the library directory
    bc_dict[fp.parent.parent.name] = _trimmed

# PIPseq: barcodes are kept exactly as written in the file.
fp = data_path / "pipseq_pbmc"
print(fp)
with gzip.open(fp / "raw_matrix" / "barcodes.tsv.gz", "rt") as fh:
    bc_dict[fp.name] = [line.strip() for line in fh]


In [None]:
# Map barcode -> cluster index for each sample. sample_order is paired
# positionally with the library directory names below.
_library_dirs = ["pipseq_pbmc", "10x_3p_pbmc", "10x_5p_pbmc"]

bc_to_cluster = {}
for _sample, _lib in zip(sample_order, _library_dirs):
    # Keep only barcodes whose ix_dict entry is truthy (presumably the cells
    # that went into the clustering — confirm against the upstream notebook),
    # then pair them in order with that library's cluster labels.
    _kept_barcodes = (bc for bc, keep in zip(bc_dict[_lib], ix_dict[_lib]) if keep)
    bc_to_cluster[_sample] = dict(zip(_kept_barcodes, c_arrays[_lib]))

In [None]:
# same as the labels from notebook 01
# Each list position is the cluster index for that sample, so
# dict(enumerate(...)) yields {cluster index: cell-type name}.
cluster_names = {
    ("PIPseq", "0.8x"): dict(enumerate([
        'CD4 T cells 1',
        'CD4 T cells 2',
        'Naïve CD4',
        'Cytotoxic T cells',
        'Innate Lymphoid',
        'CD16 Monocytes',
        'CD14 Monocytes',
        'B cells',
    ])),
    ("10x 3'",): dict(enumerate([
        'CD4 T cells 1',
        'CD4 T cells 2',
        'Naïve CD4',
        'Cytotoxic T cells',
        'B cells',
        'CD14 Monocytes',
        'CD16 Monocytes',
        'DC',
    ])),
    ("10x 5'",): dict(enumerate([
        'CD4 T cells 1',
        'CD4 T cells 2',
        'Naïve CD4',
        'Cytotoxic T cells',
        'Innate Lymphoid',
        'B cells',
        'CD14 Monocytes',
        'CD16 Monocytes',
    ])),
}

# Inverse lookup per sample: cell-type name -> cluster index.
cluster_reverse_labels = {
    sample: {name: index for index, name in names_by_index.items()}
    for sample, names_by_index in cluster_names.items()
}


Reorganize the data to add as a column to our scanpy object.

In [None]:
# Re-key both lookups by the first element of each sample key, which is the
# label used for the "batch" column (e.g. "PIPseq", "10x 3'").
# Barcodes are coerced to str and cluster ids to plain int.
batch_to_bccluster = {
    sample_key[0]: {str(bc): int(cluster_id) for bc, cluster_id in barcode_map.items()}
    for sample_key, barcode_map in bc_to_cluster.items()
}

batch_to_names = {
    sample_key[0]: names_by_index
    for sample_key, names_by_index in cluster_names.items()
}

In [None]:
# Take the barcode of every cell from obs["barcode"] when that column exists,
# otherwise from the obs names.
if 'barcode' in adata_combined.obs.columns:
    _barcode_source = adata_combined.obs['barcode'].astype(str)
else:
    _barcode_source = pd.Series(
        adata_combined.obs_names, index=adata_combined.obs_names
    ).astype(str)

# Remove every trailing "-<digits>" group (e.g. "-1-0") to recover the bare barcode.
raw_bc = _barcode_source.str.replace(r'(?:-\d+)+$', '', regex=True)

In [None]:
# Flatten into {(batch, barcode): cell-type name}. Cluster ids without a
# name for that batch (such as -1) are labelled 'Unassigned'.
pairs_to_label = {
    (batch, bc): batch_to_names.get(batch, {}).get(int(cid), 'Unassigned')
    for batch, bc2cid in batch_to_bccluster.items()
    for bc, cid in bc2cid.items()
}

# The tuple keys become a two-level (batch, barcode) index.
lookup = pd.Series(pairs_to_label)


In [None]:
# Build one (batch, barcode) key per cell, in obs order, and look each one up.
_batch_values = adata_combined.obs['batch'].astype(str).values
_barcode_values = raw_bc.astype(str).values
keys = pd.MultiIndex.from_arrays([_batch_values, _barcode_values])

# reindex leaves NaN for cells that have no entry in the lookup.
labels = lookup.reindex(keys).values

In [None]:
# Store the external labels as a categorical obs column; `labels` is already in obs order.
adata_combined.obs['celltype'] = pd.Categorical(labels)

We can now compare the library and cell type before and after Harmony integration.

In [None]:

fig, axes = plt.subplots(2, 2, figsize=(16, 10), constrained_layout=True)

# One entry per panel: (plotting function, target axes, panel-specific options).
# Top row uses the saved pre-integration embedding ('X_umap_pre'); bottom row
# uses the current (post-Harmony) UMAP. Left column is colored by batch,
# right column by the external cell-type labels.
_panels = [
    (sc.pl.embedding, axes[0, 0],
     dict(basis='X_umap_pre', color='batch', title='Pre-Harmony: batch')),
    (sc.pl.embedding, axes[0, 1],
     dict(basis='X_umap_pre', color='celltype', title='Pre-Harmony: cell type', legend_fontsize=8)),
    (sc.pl.umap, axes[1, 0],
     dict(color='batch', title='Post-Harmony: batch', legend_fontsize=8)),
    (sc.pl.umap, axes[1, 1],
     dict(color='celltype', title='Post-Harmony: cell type', legend_fontsize=8)),
]

# show=False keeps the figure open so that all four panels are saved together.
for _draw, _panel_ax, _options in _panels:
    _draw(adata_combined, ax=_panel_ax, show=False, legend_loc='right margin', **_options)

plt.savefig(figure_path / "umaps_combined.svg", format="svg")