## Collect single-cell and bulk counts from source directories

In [None]:
import re
import gtfparse

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as adata
import plotly.express as px

from tqdm import tqdm
from pathlib import Path

%load_ext blackcellmagic

In [None]:
# Working directory: every data and figure path in this notebook is built from it
# NOTE(review): "???" looks like a placeholder for the machine-specific parent
# folder — replace it with a real path before running
prefix = "???/deconvolution_benchmarking/04_tcga_bulk_validation"

In [None]:
# Training patient IDs (matched against the "Patient" column of the single-cell
# metadata to select the reference cells)
train_p_ids = [
    "CID3586",
    "CID3941",
    "CID3963",
    "CID44041",
    "CID4530N",
    "CID3838",
    "CID3946",
    "CID4040",
    "CID4461",
    "CID44991",
    "CID45171",
    "CID4535",
    "CID3948",
    "CID4398",
    "CID4463",
    "CID4495",
    "CID4513",
    "CID4465",
]
# Test patient IDs (held out; not referenced by any cell visible in this section)
test_p_ids = [
    "CID4067",
    "CID4290A",
    "CID4471",
    "CID3921",
    "CID4066",
    "CID4523",
    "CID44971",
    "CID4515",
]

## QC TCGA
Keep only samples with:
- Intragenic rate > 0.95
- More than 90% of counts mapping to protein-coding genes

In [None]:
# Pattern for the UUID-shaped TCGA sample identifiers (hyphens optional, version
# nibble "4", variant nibble 8/9/a/b), anchored at the end of the string
tcga_uuid_regex = (
    r"[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}\Z"
)

# RNA-SeQC report: tab-separated, first column used as the row index
seqc_report_path = (
    Path(prefix)
    / "data/raw/tcga_bulk/TCGA_-_breast_cancer_GRCh38_v84.RNA-SeQC_Report.tsv"
)
tcga_seqc_report_df = pd.read_csv(seqc_report_path, sep="\t", index_col=0)

In [None]:
# Load raw (expected) counts; the first CSV column becomes the row index
raw_counts_path = (
    Path(prefix)
    / "data/raw/tcga_bulk/TCGA_-_breast_cancer_GRCh38_v84.genes.ExpectedCounts.csv"
)
tcga_raw_df = pd.read_csv(raw_counts_path, index_col=0, sep=",")

# Columns with duplicated uuid are read as "1b907925-b33c-4e4a-96e0-65f15b4712b9.1"
# Remove the ".*" so that every column of the same sample carries the bare uuid
tcga_raw_df.columns = [col.split(".")[0] for col in tcga_raw_df.columns]

# Split raw into counts (uuid-named columns) and metadata (everything else).
# A positional boolean mask is used on purpose: after the suffix stripping the
# column labels are no longer unique, and selecting by a list of labels would
# return every duplicated uuid once per occurrence in the list (n x n copies).
is_sample_column = np.array(
    [re.match(tcga_uuid_regex, col) is not None for col in tcga_raw_df.columns],
    dtype=bool,
)
tcga_raw_counts_df = tcga_raw_df.loc[:, is_sample_column]
tcga_raw_meta_df = tcga_raw_df.loc[:, ~is_sample_column]

# Drop duplicated sample columns, i.e. columns whose counts are identical for
# every gene (done row-wise on the transposed frame, then transposed back)
tcga_raw_counts_df = tcga_raw_counts_df.T.drop_duplicates().T

In [None]:
# Some samples appear more than once in the RNA-SeQC report (11 of them,
# according to the original note). Keep a single row per sample: the one with
# the highest Intragenic Rate.
report_rows_df = tcga_seqc_report_df.reset_index()

# Descending sort on both keys puts the best row of each sample first
report_rows_df = report_rows_df.sort_values(
    by=["Sample", "Intragenic Rate"], ascending=[False, False]
)
report_rows_df = report_rows_df.drop_duplicates(subset="Sample", keep="first")

tcga_seqc_report_df = report_rows_df.set_index("Sample")

#### Plot stacked bar chart of Intergenic Rate & Intragenic Rate (Intronic Rate, Exonic Rate)

In [None]:
# Long-format table for the stacked bar chart: one row per (sample, rate type)
rate_columns = ["Intergenic Rate", "Intronic Rate", "Exonic Rate"]

# Sample order on the x axis: highest intergenic rate first, ties broken by
# exonic rate (both descending)
sorted_rates_df = tcga_seqc_report_df[rate_columns].sort_values(
    ["Intergenic Rate", "Exonic Rate"], ascending=False
)

plot_inter_intra_df = (
    sorted_rates_df.reset_index()
    .melt(id_vars=["Sample"], value_vars=rate_columns)
    .rename(columns={"variable": "Type", "value": "Proportion"})
)

In [None]:
# Stacked bar chart: one bar per sample, segments coloured by rate type
fig = px.bar(
    plot_inter_intra_df,
    x="Sample",
    y="Proportion",
    color="Type",
    category_orders={"Type": ["Exonic Rate", "Intronic Rate", "Intergenic Rate"]},
)

# Draw the bars fully opaque
fig.update_traces(opacity=1)

# Horizontal reference line at 95%
fig.add_hline(y=0.95, line_width=0.5)

# Axis styling shared by both axes
shared_axis_style = dict(
    title_standoff=5,
    title_font_size=10,
    linecolor="black",
    linewidth=0.5,
    tickfont_size=8,
    ticklen=2,
    tickwidth=0.5,
)

# x axis: sample labels hidden, no outside tick marks
fig.update_xaxes(showticklabels=False, **shared_axis_style)

# y axis: labelled ticks every 0.1
fig.update_yaxes(
    ticks="outside",
    showticklabels=True,
    dtick=0.1,
    **shared_axis_style,
)

# Layout: transparent plot background, small legend, no outer margin
fig.layout.update(
    font_size=10,
    plot_bgcolor="rgba(0,0,0,0)",
    legend=dict(title_font_size=8, font_size=6),
    showlegend=True,
    newshape=dict(opacity=1),
    margin=dict(t=0, l=0, r=0, b=0),
)

# Export as PNG under <prefix>/figures
fig.write_image(
    Path(prefix) / "figures/intragenic_filtering.png",
    scale=4,
    width=1000,
    height=250,
)

#### Based on the stacked bar chart, filter out samples whose intragenic rate is not above 95%

In [None]:
# Keep the RNA-SeQC rows whose intragenic rate is strictly above 0.95
passes_intragenic_qc = tcga_seqc_report_df["Intragenic Rate"] > 0.95
filtered_tcga_seqc_report_df = tcga_seqc_report_df.loc[passes_intragenic_qc]

# Select the matching sample columns from the raw counts
# (every retained sample must be present there, otherwise this raises KeyError)
filtered_tcga_raw_counts_df = tcga_raw_counts_df.loc[
    :, filtered_tcga_seqc_report_df.index
]

#### Plot stacked bar chart of reads mapped to gene biotypes

In [None]:
# Attach each gene's biotype to its counts (inner join on the gene index)
gene_biotype_df = tcga_raw_meta_df[["Gene Biotype"]]
filtered_tcga_raw_df = gene_biotype_df.merge(
    filtered_tcga_raw_counts_df, left_index=True, right_index=True
)

# Sum counts per biotype, then transpose to samples (rows) x biotypes (columns)
counts_by_biotype_df = filtered_tcga_raw_df.groupby(["Gene Biotype"]).sum()
gene_biotype_counts_df = counts_by_biotype_df.sort_index().T

# Convert to per-sample proportions and order samples by increasing
# protein-coding share
per_sample_totals = gene_biotype_counts_df.sum(axis=1)
gene_biotype_pct_df = gene_biotype_counts_df.div(
    per_sample_totals, axis=0
).sort_values(["protein_coding"], ascending=True)

In [None]:
# Reshape gene_biotype_pct_df from wide (sample x biotype) to long format:
# one row per (sample, gene biotype) with its proportion
wide_gene_biotype_pct_df = gene_biotype_pct_df.reset_index()
long_gene_biotype_pct_df = wide_gene_biotype_pct_df.melt(id_vars="index")

# reset_index() names the former index column "index"; expose it as "Sample"
pivoted_gene_biotype_pct_df = long_gene_biotype_pct_df.rename(
    columns={"index": "Sample", "value": "Proportion"}
)

In [None]:
# Category order of the "Gene Biotype" colour groups in the biotype stacked bar
# chart (passed as category_orders): protein_coding first, then the remaining
# biotypes
order = [
    "protein_coding",
    "vault_RNA",
    "unprocessed_pseudogene",
    "unitary_pseudogene",
    "translated_unprocessed_pseudogene",
    "translated_processed_pseudogene",
    "transcribed_unprocessed_pseudogene",
    "transcribed_unitary_pseudogene",
    "transcribed_processed_pseudogene",
    "snoRNA",
    "snRNA",
    "scaRNA",
    "scRNA",
    "sRNA",
    "ribozyme",
    "rRNA_pseudogene",
    "rRNA",
    "pseudogene",
    "processed_pseudogene",
    "polymorphic_pseudogene",
    "misc_RNA",
    "miRNA",
    "lncRNA",
    "TR_V_pseudogene",
    "TR_V_gene",
    "TR_J_pseudogene",
    "TR_J_gene",
    "TR_D_gene",
    "TR_C_gene",
    "TEC",
    "Mt_tRNA",
    "Mt_rRNA",
    "IG_V_pseudogene",
    "IG_V_gene",
    "IG_J_pseudogene",
    "IG_J_gene",
    "IG_D_gene",
    "IG_C_pseudogene",
    "IG_C_gene",
]

In [None]:
# Only plot the first 120 samples of gene_biotype_pct_df, i.e. the 120 samples
# with the lowest protein-coding proportion (the frame is sorted ascending)
samples_to_plot = gene_biotype_pct_df.index[:120]
plot_gene_biotype_pct_df = pivoted_gene_biotype_pct_df.loc[
    pivoted_gene_biotype_pct_df["Sample"].isin(samples_to_plot)
]

# Stacked bar chart: one bar per sample, segments coloured by gene biotype
fig = px.bar(
    plot_gene_biotype_pct_df,
    x="Sample",
    y="Proportion",
    color="Gene Biotype",
    category_orders={"Gene Biotype": order},
    color_discrete_sequence=px.colors.qualitative.Dark24_r,
)

# Draw the bars fully opaque
fig.update_traces(opacity=1)

# Horizontal reference line at 90%
fig.add_hline(y=0.90, line_width=0.5)

# Axis styling shared by both axes
shared_axis_style = dict(
    title_standoff=5,
    title_font_size=10,
    linecolor="black",
    linewidth=0.5,
    tickfont_size=8,
    ticklen=2,
    tickwidth=0.5,
)

# x axis: sample labels hidden, no outside tick marks
fig.update_xaxes(showticklabels=False, **shared_axis_style)

# y axis: labelled ticks every 0.1
fig.update_yaxes(
    ticks="outside",
    showticklabels=True,
    dtick=0.1,
    **shared_axis_style,
)

# Layout: transparent plot background, horizontal legend below the plot,
# no outer margin
fig.layout.update(
    font_size=10,
    plot_bgcolor="rgba(0,0,0,0)",
    legend=dict(
        title_font_size=8,
        font_size=6,
        y=-0.05,
        orientation="h",
    ),
    showlegend=True,
    newshape=dict(opacity=1),
    margin=dict(t=0, l=0, r=0, b=0),
)

# Export as PNG under <prefix>/figures
fig.write_image(
    Path(prefix) / "figures/gene_biotype_filtering.png",
    scale=4,
    width=1000,
    height=600,
)

#### Based on the stacked bar chart, filter out samples in which 90% or less of the counts map to protein-coding genes

In [None]:
# Keep samples whose protein-coding proportion is strictly above 0.9
passes_protein_coding_qc = gene_biotype_pct_df["protein_coding"] > 0.9
filtered_gene_biotype_pct_df = gene_biotype_pct_df.loc[passes_protein_coding_qc]

# Narrow the already intragenic-filtered counts down to those samples
filtered_tcga_raw_counts_df = filtered_tcga_raw_counts_df.loc[
    :, filtered_gene_biotype_pct_df.index
]

#### Save QC-ed counts

In [None]:
# Samples that failed QC: columns of the unfiltered counts that are absent from
# the fully filtered counts, kept in their original column order
all_sample_labels = tcga_raw_counts_df.columns
did_not_pass_qc = ~all_sample_labels.isin(filtered_tcga_raw_counts_df.columns)

failed_qc_38_df = pd.DataFrame(
    data=all_sample_labels[did_not_pass_qc].tolist(),
    columns=["sample_label"],
)

## Single cell reference

In [None]:
# First load single-cell data from the 10x matrix directory under the working
# directory. The sub-directory must be joined onto the path *before* the read:
# joining it onto the object returned by read_10x_mtx fails, because that
# object is the loaded data, not a path.
sc_adata = sc.read_10x_mtx(Path(prefix).joinpath("data/raw/sc_ref"))

# Cells x genes count table
sc_df = sc_adata.to_df()

# Normalize by Counts-per-10,000: divide each cell (row) by its total count,
# then scale to 10,000
normalized_sc_df = sc_df.div(sc_df.sum(axis=1), axis=0) * 10000

# Get list of single-cell HUGO genes
# NOTE(review): assumes the matrix was read with gene symbols as column names
# — confirm against the reader's defaults
sc_hugo_genes = sc_df.columns.tolist()

In [None]:
# Load single-cell labels (first CSV column used as the row index)
sc_labels_path = Path(prefix) / "data/raw/sc_ref/Whole_miniatlas_meta.csv"
sc_labels_df = pd.read_csv(sc_labels_path, sep=",", index_col=0)

# Remove the row whose index label is "TYPE"
# NOTE(review): presumably a second header row describing column types — confirm
sc_labels_df = sc_labels_df.drop(index="TYPE")

In [None]:
# Get HUGO-Ensembl mapping (header-less, tab-separated: HUGO symbol, Ensembl ID)
genes_mapping_df = pd.read_csv(
    Path(prefix).joinpath("data/raw/hugo_ensembl_maps.tsv"),
    sep="\t",
    header=None,
    names=["hugo", "ensembl"],
)

# Restrict the mapping to genes present in the single-cell reference and build
# the HUGO -> Ensembl lookup directly from the two columns (no row-by-row
# iteration). If a HUGO symbol occurs more than once, the last row wins.
sc_genes_mapping_df = genes_mapping_df[genes_mapping_df["hugo"].isin(sc_hugo_genes)]
genes_mapping_d = dict(
    zip(sc_genes_mapping_df["hugo"], sc_genes_mapping_df["ensembl"])
)

# Rename HUGO by Ensembl gene names in single-cell reference
# (columns without an entry in the mapping keep their original name)
for df in tqdm([sc_df, normalized_sc_df]):
    df.rename(columns=genes_mapping_d, inplace=True)

#### Filter out counts and metadata by training patient ids

In [None]:
# Metadata rows belonging to the training patients
is_training_patient = sc_labels_df["Patient"].isin(train_p_ids)
training_sc_labels_df = sc_labels_df.loc[is_training_patient]

# Keep the same cells in the raw and in the normalized count tables
training_cell_ids = training_sc_labels_df.index
training_sc_df = sc_df.loc[sc_df.index.isin(training_cell_ids)]
training_normalized_sc_df = normalized_sc_df.loc[
    normalized_sc_df.index.isin(training_cell_ids)
]

#### Save into AnnData objects

In [None]:
# Observations of AnnData objects are cell-type labels
# Variables are Ensembl IDs
cell_annotation_columns = [
    "Patient",
    "celltype_major",
    "celltype_minor",
    "celltype_subset",
]

# Start from an empty frame that carries only the cell index, then attach the
# annotations by index. Building the frame directly from the index avoids
# dropping a helper column by a hard-coded label (0 / 1), which only works when
# the index happens to have exactly that name.
adata_obs_df = pd.DataFrame(index=training_sc_df.index).merge(
    training_sc_labels_df[cell_annotation_columns],
    left_index=True,
    right_index=True,
)

# Variables carry no annotation columns, only the gene index
adata_var_df = pd.DataFrame(index=training_sc_df.columns)

# Raw counts
ens_sc_adata = adata.AnnData(
    X=training_sc_df.values,
    obs=adata_obs_df,
    var=adata_var_df,
    dtype="float64",
)
ens_sc_adata.write_h5ad(
    Path(prefix).joinpath("data/filtered/non_intersect/scRNA_ref_raw.h5ad")
)

# Normalized counts (same observations and variables as the raw object)
ens_normalized_sc_adata = adata.AnnData(
    X=training_normalized_sc_df.values,
    obs=adata_obs_df,
    var=adata_var_df,
    dtype="float64",
)
ens_normalized_sc_adata.write_h5ad(
    Path(prefix).joinpath("data/filtered/non_intersect/scRNA_ref_normalized.h5ad")
)

In [None]:
# Output folder for the non-intersected bulk tables
non_intersect_dir = Path(prefix) / "data/filtered/non_intersect"

# Save raw bulk counts (tab-separated, despite the .csv extension)
filtered_tcga_raw_counts_df.to_csv(
    non_intersect_dir / "tcga_raw_counts.csv", sep="\t"
)

# Load TPM counts from TCGA and only keep samples that passed QC in
# filtered_tcga_raw_counts_df
tcga_tpm_path = (
    Path(prefix) / "data/raw/tcga_bulk/TCGA_-_breast_cancer_GRCh38_v84.genes.TPM.csv"
)
tcga_tpm_df = pd.read_csv(tcga_tpm_path, sep=",", index_col=0)
filtered_tcga_tpm_counts_df = tcga_tpm_df.loc[:, filtered_tcga_raw_counts_df.columns]

# Save TPM bulk counts (tab-separated, despite the .csv extension)
filtered_tcga_tpm_counts_df.to_csv(
    non_intersect_dir / "tcga_tpm_counts.csv", sep="\t"
)

## Intersect genes between QC-ed bulk counts and single cell

In [None]:
# Genes shared by the single-cell reference (columns) and the QC-ed bulk counts
# (row index), listed in single-cell column order
sc_gene_ids = training_sc_df.columns
intersect_genes = sc_gene_ids[
    sc_gene_ids.isin(filtered_tcga_raw_counts_df.index)
].tolist()

#### Intersect TCGA bulk counts and save it

In [None]:
# Restrict the TCGA TPM and raw count tables to the shared genes (rows)
intersect_tpm_counts_df = filtered_tcga_tpm_counts_df.loc[intersect_genes]
intersect_raw_counts_df = filtered_tcga_raw_counts_df.loc[intersect_genes]

# Write both tables tab-separated (file names keep the .csv extension)
intersect_dir = Path(prefix) / "data/filtered/intersect"
intersect_tpm_counts_df.to_csv(intersect_dir / "tcga_tpm_counts.csv", sep="\t")
intersect_raw_counts_df.to_csv(intersect_dir / "tcga_raw_counts.csv", sep="\t")

#### Intersect single-cell reference and save it

In [None]:
# Restrict the single-cell tables to the shared genes (columns)
# Raw counts
intersect_sc_df = training_sc_df.loc[:, intersect_genes]

# Normalized counts
intersect_normalized_sc_df = training_normalized_sc_df.loc[:, intersect_genes]

In [None]:
# Observations of AnnData objects are cell-type labels
# Variables are Ensembl IDs
cell_annotation_columns = [
    "Patient",
    "celltype_major",
    "celltype_minor",
    "celltype_subset",
]

# Start from an empty frame that carries only the cell index, then attach the
# annotations by index. Building the frame directly from the index avoids
# dropping a helper column by a hard-coded label (0 / 1), which only works when
# the index happens to have exactly that name.
intersect_adata_obs_df = pd.DataFrame(index=intersect_sc_df.index).merge(
    training_sc_labels_df[cell_annotation_columns],
    left_index=True,
    right_index=True,
)

# Variables carry no annotation columns, only the (intersected) gene index
intersect_adata_var_df = pd.DataFrame(index=intersect_sc_df.columns)

# Raw counts
intersect_sc_adata = adata.AnnData(
    X=intersect_sc_df.values,
    obs=intersect_adata_obs_df,
    var=intersect_adata_var_df,
    dtype="float64",
)
intersect_sc_adata.write_h5ad(
    Path(prefix).joinpath("data/filtered/intersect/scRNA_ref_raw.h5ad")
)

# Normalized counts (same observations and variables as the raw object)
intersect_normalized_sc_adata = adata.AnnData(
    X=intersect_normalized_sc_df.values,
    obs=intersect_adata_obs_df,
    var=intersect_adata_var_df,
    dtype="float64",
)
intersect_normalized_sc_adata.write_h5ad(
    Path(prefix).joinpath("data/filtered/intersect/scRNA_ref_normalized.h5ad")
)