In [None]:
import anndata
import scanpy as sc
import scanpy.external as sce
import pandas as pd

import matplotlib.pyplot as plt

from pathlib import Path

We are going to run the standard Scanpy embedding/clustering and then integrate our different libraries to see how well they match.

In [None]:
# Everything is anchored at the user's home directory:
# a data directory and the directory the exported figures are written to.
root_dir = Path.home()
data_path = root_dir.joinpath("data")
figure_path = root_dir.joinpath("202501_figures")

In [None]:
# Matrix directories for the three PBMC libraries (10x 3', 10x 5', PIPseq).
# Anchored at `data_path` (defined in the paths cell).
# NOTE(review): presumably the datasets live under ~/data — confirm; if they
# sit directly under the home directory, use `root_dir` instead.
input_3p = data_path / "10x_3p_pbmc/outs/raw_feature_bc_matrix/"
input_5p = data_path / "10x_5p_pbmc/outs/raw_feature_bc_matrix/"
input_PIPseq = data_path / "pipseq_pbmc/raw_matrix/"

In [None]:
# Read the 10x 3' matrix directory (matrix / barcodes / features files).
data_3p = sc.read_10x_mtx(path=input_3p)

In [None]:
# Read the 10x 5' matrix directory (matrix / barcodes / features files).
data_5p = sc.read_10x_mtx(path=input_5p)

In [None]:
# Read the PIPseq matrix directory with the same 10x-format reader.
data_pip = sc.read_10x_mtx(path=input_PIPseq)

In [None]:
# Show the summary of the freshly loaded 10x 3' library as the cell output.
data_3p

In [None]:
# Show the summary of the freshly loaded 10x 5' library as the cell output.
data_5p

In [None]:
# Show the summary of the freshly loaded PIPseq library as the cell output.
data_pip

In [None]:
# De-duplicate gene names in every library.
for library in (data_3p, data_5p, data_pip):
    library.var_names_make_unique()

In [None]:
# Flag the same three gene families in every library; the boolean columns
# are passed as `qc_vars` when the QC metrics are computed.
for library in (data_3p, data_5p, data_pip):
    gene_names = library.var_names
    # mitochondrial genes, "MT-" for human, "Mt-" for mouse
    library.var["mt"] = gene_names.str.startswith("MT-")
    # ribosomal genes
    library.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))
    # hemoglobin genes
    library.var["hb"] = gene_names.str.contains("^HB[^(P)]")

In [None]:
# Compute the QC metrics in place for each library, including the
# mt / ribo / hb summaries; the plots below use n_genes_by_counts,
# total_counts and pct_counts_mt.
for library in (data_3p, data_5p, data_pip):
    sc.pp.calculate_qc_metrics(
        library,
        qc_vars=["mt", "ribo", "hb"],
        inplace=True,
        log1p=True,
    )

In [None]:
# QC distributions of the 10x 3' library before any filtering.
qc_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(data_3p, keys=qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# QC distributions of the 10x 5' library before any filtering.
qc_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(data_5p, keys=qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# QC distributions of the PIPseq library before any filtering.
qc_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(data_pip, keys=qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# Genes detected vs. total counts per barcode (10x 3'), coloured by mito fraction.
sc.pl.scatter(
    data_3p,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Genes detected vs. total counts per barcode (10x 5'), coloured by mito fraction.
sc.pl.scatter(
    data_5p,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Genes detected vs. total counts per barcode (PIPseq), coloured by mito fraction.
sc.pl.scatter(
    data_pip,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Filtering thresholds for the 10x 3' library: cells are filtered first so the
# per-gene cell count is taken over the retained cells only.
min_counts, min_genes, min_cells = 1000, 100, 3
sc.pp.filter_cells(data_3p, min_counts=min_counts)
sc.pp.filter_cells(data_3p, min_genes=min_genes)
sc.pp.filter_genes(data_3p, min_cells=min_cells)

In [None]:
# Filtering thresholds for the 10x 5' library: cells are filtered first so the
# per-gene cell count is taken over the retained cells only.
min_counts, min_genes, min_cells = 1000, 100, 3
sc.pp.filter_cells(data_5p, min_counts=min_counts)
sc.pp.filter_cells(data_5p, min_genes=min_genes)
sc.pp.filter_genes(data_5p, min_cells=min_cells)

In [None]:
# Filtering thresholds for the PIPseq library: cells are filtered first so the
# per-gene cell count is taken over the retained cells only.
min_counts, min_genes, min_cells = 1000, 100, 3
sc.pp.filter_cells(data_pip, min_counts=min_counts)
sc.pp.filter_cells(data_pip, min_genes=min_genes)
sc.pp.filter_genes(data_pip, min_cells=min_cells)

In [None]:
# QC distributions of the 10x 3' library after the cell/gene filters.
qc_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(data_3p, keys=qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# QC distributions of the 10x 5' library after the cell/gene filters.
qc_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(data_5p, keys=qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# QC distributions of the PIPseq library after the cell/gene filters.
qc_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(data_pip, keys=qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# Sum of all entries of X for the filtered 10x 3' library
# (X has not been normalised at this point in the notebook).
data_3p.X.sum()

In [None]:
# Sum of all entries of X for the filtered 10x 5' library
# (X has not been normalised at this point in the notebook).
data_5p.X.sum()

In [None]:
# Sum of all entries of X for the filtered PIPseq library
# (X has not been normalised at this point in the notebook).
data_pip.X.sum()

In [None]:
# Show the 10x 3' library again, now after the cell/gene filters.
data_3p

In [None]:
# Show the 10x 5' library again, now after the cell/gene filters.
data_5p

In [None]:
# Show the PIPseq library again, now after the cell/gene filters.
data_pip

In [None]:
# Stack the three filtered libraries along the cell axis.
# `AnnData.concatenate` is deprecated; `anndata.concat` is its replacement.
# The arguments below reproduce the old defaults:
#   * join="inner"           -> keep only genes present in all three libraries
#   * label="batch"          -> per-cell column recording the source library
#   * keys "0", "1", "2"     -> batch categories in input order
#                               (3' = "0", 5' = "1", PIPseq = "2")
#   * index_unique="-"       -> barcodes get a "-<key>" suffix so they stay unique
#   * merge="same"           -> keep gene annotations that agree across libraries
adata_combined = anndata.concat(
    [data_3p, data_5p, data_pip],
    join="inner",
    label="batch",
    keys=["0", "1", "2"],
    index_unique="-",
    merge="same",
)

In [None]:
# Show the summary of the combined object as the cell output.
adata_combined

In [None]:
# Scale each cell to the same total (1e4), then apply log1p; both steps
# overwrite adata_combined.X.
target_sum = 1e4
sc.pp.normalize_total(adata_combined, target_sum=target_sum)
sc.pp.log1p(adata_combined)

In [None]:
# Select the top variable genes on the pooled, log-normalised matrix.
# NOTE(review): no `batch_key` is passed, so selection is not done per
# library — confirm this is intended before integration.
n_top_genes = 2000
sc.pp.highly_variable_genes(adata_combined, n_top_genes=n_top_genes)

In [None]:
# PCA with scanpy's default settings; the "X_pca" representation is what the
# neighbour graph and the Harmony step use afterwards.
sc.pp.pca(adata_combined)

In [None]:
# Pre-integration embedding and clustering, all driven by the neighbour graph
# built on the uncorrected principal components.
leiden_options = {"flavor": "igraph", "n_iterations": 2}
sc.pp.neighbors(adata_combined, use_rep="X_pca")
sc.tl.umap(adata_combined)
sc.tl.leiden(adata_combined, **leiden_options)

We can check the UMAP before integration.

In [None]:
# Pre-integration UMAP coloured by Leiden cluster.
sc.pl.umap(adata_combined, color="leiden")

In [None]:
# Pre-integration UMAP coloured by source library.
sc.pl.umap(adata_combined, color="batch")

The different libraries show almost no overlap.

We will run Harmony integration now.

In [None]:
# Harmony batch correction keyed on the library label; the corrected
# representation ("X_pca_harmony") feeds the neighbour graph in the next step.
sce.pp.harmony_integrate(adata_combined, key="batch")

In [None]:
# Rebuild the neighbour graph on the Harmony-corrected representation.
sc.pp.neighbors(adata_combined, use_rep="X_pca_harmony")

In [None]:
# Recompute the UMAP from the neighbour graph currently stored on the object.
sc.tl.umap(adata_combined)

In [None]:
# Tidy up the pre-integration clustering before re-running Leiden:
# drop category levels no cell uses, and discard the cached colour list.
leiden_column = adata_combined.obs["leiden"]
adata_combined.obs["leiden"] = leiden_column.cat.remove_unused_categories()

# Assigned to a throwaway name so the popped value is not echoed as cell output.
null = adata_combined.uns.pop("leiden_colors", None)

In [None]:
# Re-cluster on the neighbour graph currently stored on the object, using the
# same Leiden settings as before integration.
leiden_options = {"flavor": "igraph", "n_iterations": 2}
sc.tl.leiden(adata_combined, **leiden_options)

In [None]:
# Post-integration UMAP coloured by the new Leiden clusters.
sc.pl.umap(adata_combined, color="leiden")

In [None]:
# Post-integration UMAP coloured by source library.
sc.pl.umap(adata_combined, color="batch")

In [None]:
# adding library labels
# The names are applied positionally to the existing batch categories, so the
# order must match the order in which the libraries were concatenated.
library_labels = ["10x 3'", "10x 5'", "PIPseq"]
batch_column = adata_combined.obs["batch"]
adata_combined.obs["batch"] = batch_column.cat.rename_categories(library_labels)

In [None]:
# Export the integrated UMAP coloured by library as an SVG.
# The output directory is created first: savefig does not create missing
# parent directories and would otherwise fail on a fresh machine.
figure_path.mkdir(parents=True, exist_ok=True)

# show=False keeps the figure open so it can be written to disk below.
sc.pl.umap(adata_combined, color=["batch"], show=False, legend_loc="right margin")
plt.savefig(figure_path / "integrated_UMAP_colored_by_batch.svg", bbox_inches="tight")