# Prep environment

Portions of this code are adapted from the Theis Lab "Single-cell best practices" book; for more details see: https://www.sc-best-practices.org/conditions/differential_gene_expression.html

Ensure that the active conda environment has both Python and R installed.

In [None]:
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import sc_toolbox
import anndata
from statannot import add_stat_annotation

import os
# Point R_HOME at the R installation inside this conda environment.
# NOTE(review): this is set before the rpy2 imports below, presumably because
# rpy2 looks up R_HOME when it is first imported -- keep this ordering.
os.environ['R_HOME']='/hpc/group/goldsteinlab/envs/Python_R_4_env/lib/R'

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
from rpy2.robjects import r

# Lowest scanpy verbosity level (presumably errors only -- see scanpy settings docs)
sc.settings.verbosity = 0
# Only let messages of level ERROR or above through rpy2's R-console callback logger
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Switch on the pandas and AnnData converters for objects passed between Python and R
pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

In [None]:
# Let pandas show up to 100 columns and 100 rows of a DataFrame before truncating
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)

In [None]:
%%R
# Put this environment's R directory at the front of R's library search path.
# NOTE(review): the path added here is .../lib/R, while the setwd() cell below
# uses .../lib/R/library -- confirm which directory actually holds the packages.
.libPaths( c( "/hpc/group/goldsteinlab/envs/Python_R_4_env/lib/R" , .libPaths() ) )

In [None]:
%%R
# Make the environment's R library directory the R working directory
setwd('/hpc/group/goldsteinlab/envs/Python_R_4_env/lib/R/library')

In [None]:
%%R
# Attach the R packages used by the cells below, one after another in this order
r_packages <- c("Rcpp", "ggplot2", "ggrepel", "repr",
                "edgeR", "SingleCellExperiment", "scater", "Seurat")
for (pkg in r_packages) {
  library(pkg, character.only = TRUE)
}

# Read in and prep dataset

In [None]:
# Change into the ONB project directory so the relative .h5ad name below resolves,
# then load the AnnData object into adata_tumor
os.chdir('/hpc/group/goldsteinlab/Python/ONB')
adata_tumor=sc.read_h5ad('Primary_ONB_tumors_only_scvi.h5ad')

In [None]:
# UMAP of adata_tumor coloured by obs['tumor_clusters'] -- the grouping used for
# the DE below; it should nicely show RPM and RPMA
fig, ax = plt.subplots(figsize=(6, 6))
umap_options = {
    'color': ['tumor_clusters'],
    'legend_loc': 'on data',
    'save': False,
    'frameon': False,
    'ax': ax,
}
sc.pl.umap(adata_tumor, **umap_options)

# Create pseudobulks

In [None]:
# Number of cells in each (mouse_ident, tumor_clusters) group, displayed as the cell output
adata_tumor.obs.groupby(['mouse_ident', 'tumor_clusters']).apply(len)

In [None]:
# NOTE: this binds a second name to the same AnnData object (no copy is made),
# so the changes made to adata.obs and adata.X in later cells also apply to adata_tumor.
adata=adata_tumor

In [None]:
# Three pieces of metadata are needed: tumor_type (condition label),
# mouse_ident (replicate), and tumor_clusters (cell_type)

# Build one "<mouse_ident>_<tumor_type>" string per cell to identify each
# mouse-condition combination
mouse_ids = adata.obs["mouse_ident"]
tumor_types = adata.obs["tumor_type"]
adata.obs["sample"] = [
    f"{mouse}_{tumor}" for mouse, tumor in zip(mouse_ids, tumor_types)
]

In [None]:
# Store each source column under the name used by the pseudobulk code, as a
# categorical (new column name -> source column)
category_sources = {
    "replicate": "mouse_ident",
    "label": "tumor_type",
    "sample": "sample",
    "cell_type": "tumor_clusters",
}
for target_col, source_col in category_sources.items():
    adata.obs[target_col] = adata.obs[source_col].astype("category")

In [None]:

# A donor is kept for a cell identity only if it has MORE than this many cells of it.
NUM_OF_CELL_PER_DONOR = 20


def aggregate_and_filter(
    adata,
    cell_identity,
    donor_key="sample",
    condition_key="label",
    cell_identity_key="cell_type",
    obs_to_keep=None,  # which additional metadata to keep, e.g. gender, age, etc.
    replicates_per_patient=3,
    random_state=None,
):
    """Build pseudobulk profiles for one cell identity.

    The cells whose ``adata.obs[cell_identity_key]`` equals ``cell_identity``
    are grouped by ``donor_key``. Donors with ``NUM_OF_CELL_PER_DONOR`` cells
    or fewer are reported and skipped. The cells of every remaining donor are
    shuffled, split into ``replicates_per_patient`` chunks, and the values of
    ``adata.X`` are summed per gene within each chunk.

    Parameters
    ----------
    adata
        AnnData object; ``adata.X`` holds the values that get summed.
    cell_identity
        Value of ``adata.obs[cell_identity_key]`` to aggregate.
    donor_key
        ``obs`` column identifying the donor/sample.
    condition_key
        Not read by this function; kept so existing calls that pass it still work.
    cell_identity_key
        ``obs`` column holding the cell identity.
    obs_to_keep
        ``obs`` columns copied into the result (first value within each
        chunk). ``None`` means no extra columns.
    replicates_per_patient
        Number of chunks each donor's cells are split into.
    random_state
        Optional seed. When given, shuffling uses a dedicated
        ``random.Random(random_state)`` so the chunk assignment is repeatable;
        when ``None`` the module-level ``random.shuffle`` is used.

    Returns
    -------
    AnnData
        One row per donor chunk, named ``donor_<donor>_<chunk index>``, with
        the summed values as ``X`` and the ``obs_to_keep`` columns as ``obs``.
    """
    # Never share one list between calls (the old default was a mutable []).
    obs_to_keep = [] if obs_to_keep is None else list(obs_to_keep)
    # The per-chunk groupby needs the donor column even if the caller did not
    # ask to keep it in the output.
    obs_to_join = obs_to_keep if donor_key in obs_to_keep else [*obs_to_keep, donor_key]
    shuffle = random.shuffle if random_state is None else random.Random(random_state).shuffle

    # subset adata to the given cell identity
    adata_cell_pop = adata[adata.obs[cell_identity_key] == cell_identity].copy()
    adata_cell_pop.obs[donor_key] = adata_cell_pop.obs[donor_key].astype("category")

    # check which donors to keep according to NUM_OF_CELL_PER_DONOR.
    # observed=False keeps donors with zero cells in the size table whatever the
    # pandas default is, so they are dropped here instead of failing further down.
    size_by_donor = adata_cell_pop.obs.groupby(donor_key, observed=False).size()
    donors_to_drop = [
        donor
        for donor in size_by_donor.index
        if size_by_donor[donor] <= NUM_OF_CELL_PER_DONOR
    ]
    if donors_to_drop:
        print("Dropping the following samples:")
        print(donors_to_drop)
    dropped = set(donors_to_drop)

    var_names = adata_cell_pop.var_names
    # how to aggregate: sum every gene, keep the first value of each metadata column
    agg_dict = {gene: "sum" for gene in var_names}
    agg_dict.update({obs: "first" for obs in obs_to_keep})
    df = pd.DataFrame(columns=[*var_names, *obs_to_keep])

    donors = adata_cell_pop.obs[donor_key].cat.categories
    for donor_number, donor in enumerate(donors, start=1):
        print(f"\tProcessing donor {donor_number} out of {len(donors)}...", end="\r")
        if donor in dropped:
            continue
        adata_donor = adata_cell_pop[adata_cell_pop.obs[donor_key] == donor]
        # create replicates for each donor
        cell_names = list(adata_donor.obs_names)
        shuffle(cell_names)
        chunks = np.array_split(np.array(cell_names), replicates_per_patient)
        for chunk_index, chunk_cells in enumerate(chunks):
            if len(chunk_cells) == 0:
                # fewer cells than requested replicates: nothing to aggregate
                continue
            adata_replicate = adata_donor[chunk_cells]
            # Densify X; toarray() covers sparse matrices (the .A shortcut is
            # gone from recent SciPy) and np.asarray covers already-dense X.
            counts = adata_replicate.X
            counts = counts.toarray() if hasattr(counts, "toarray") else np.asarray(counts)
            # create a df with all genes, donor and condition info
            df_donor = pd.DataFrame(
                counts,
                index=adata_replicate.obs_names,
                columns=adata_replicate.var_names,
            )
            df_donor = df_donor.join(adata_replicate.obs[obs_to_join])
            # aggregate; only this donor's own group is needed
            df_donor = df_donor.groupby(donor_key, observed=True).agg(agg_dict)
            if donor_key in obs_to_keep:
                df_donor[donor_key] = donor
            df.loc[f"donor_{donor}_{chunk_index}"] = df_donor.loc[donor]
    print("\n")
    # create AnnData object from the df
    adata_cell_pop = sc.AnnData(
        df[adata_cell_pop.var_names], obs=df.drop(columns=adata_cell_pop.var_names)
    )
    return adata_cell_pop

In [None]:
# obs columns carried over from the single cells into every pseudobulk row
obs_to_keep = ["label", "cell_type", "replicate", "sample"]

In [None]:
#Use raw counts matrix for edgeR anndata object
# This overwrites adata.X with a copy of the "counts" layer; whatever X held before is replaced.
adata.X = adata.layers["counts"].copy()

In [None]:
#Create anndata object with pseudobulks
# The first cell_type category is aggregated into adata_pb.
cell_type = adata.obs["cell_type"].cat.categories[0]
print(
    f'Processing {cell_type} (1 out of {len(adata.obs["cell_type"].cat.categories)})...'
)
adata_pb = aggregate_and_filter(adata, cell_type, obs_to_keep=obs_to_keep)
# NOTE(review): every pass of this loop rebinds adata_cell_type and nothing is
# appended to adata_pb, so after the loop adata_cell_type holds only the LAST
# cell type's pseudobulks. That covers all the data only when there are exactly
# two cell_type categories -- confirm that holds for this dataset.
for i, cell_type in enumerate(adata.obs["cell_type"].cat.categories[1:]):
    print(
        f'Processing {cell_type} ({i+2} out of {len(adata.obs["cell_type"].cat.categories)})...'
    )
    adata_cell_type = aggregate_and_filter(adata, cell_type, obs_to_keep=obs_to_keep)

In [None]:
# Number of pseudobulk rows per sample in adata_cell_type
adata_cell_type.obs.groupby('sample').apply(len)

In [None]:
# Number of pseudobulk rows per sample in adata_pb
adata_pb.obs.groupby('sample').apply(len)

In [None]:
# code above does not appropriately concatenate the two sample categories
# so can easily fix this with:

# Cast the metadata columns of both pseudobulk objects to categorical prior to
# concatenation
for pseudobulk in (adata_pb, adata_cell_type):
    for obs_col in ("replicate", "label", "sample", "cell_type"):
        pseudobulk.obs[obs_col] = pseudobulk.obs[obs_col].astype("category")

In [None]:
#Concatenate datasets
# adata_cell_type is listed first, so its rows come before those of adata_pb in adata_pb1;
# the hard-coded group labels in the R section depend on this row order.
adata_pb1 = anndata.concat([adata_cell_type, adata_pb])

In [None]:
# Table for export to R: build a pseudobulks x genes frame from adata_pb1.X and
# transpose it so genes are rows and pseudobulks are columns
df = pd.DataFrame(
    data=adata_pb1.X,
    index=adata_pb1.obs_names,
    columns=adata_pb1.var_names,
).T

In [None]:
# write df to csv
# This exact path is read back by read.csv() in the R section below.
df.to_csv('/hpc/group/goldsteinlab/R/Working_directory/RPM_vs_RPMA_gene_counts_psuedobulked.csv')

# edgeR processing

In [None]:
%%R
# Relative paths used by later R cells (e.g. the write.csv at the end) resolve against this directory
setwd('/hpc/group/goldsteinlab/R/Working_directory/')

In [None]:
%%R
# Read in data with R
# row.names=1 uses the first CSV column as row names, leaving one column per pseudobulk
data <- read.csv('/hpc/group/goldsteinlab/R/Working_directory/RPM_vs_RPMA_gene_counts_psuedobulked.csv', row.names=1)

In [None]:
%%R
# Show the first rows to check the column order before group labels are assigned
head(data)

In [None]:
%%R
# Condition label for each pseudobulk column of `data`, in column order.
# NOTE(review): these labels are hard-coded -- check them against colnames(data).
datagroups <- c("RPMA", "RPMA", "RPMA", "RPMA", "RPMA", "RPMA",
               "RPM", "RPM", "RPM")

# Fail early if the number of labels and the number of count columns disagree
stopifnot(length(datagroups) == ncol(data))

# Build the edgeR object that all following cells refer to as `d`;
# without this step `d` is never defined and the filtering cell cannot run.
d <- DGEList(counts=data, group=factor(datagroups))

In [None]:
%%R
# Filter lowly expressed genes: keep a gene only if its CPM exceeds 100 in at
# least 2 samples, then report the remaining dimensions
# NOTE(review): d is subset without keep.lib.sizes=FALSE, so the stored library
# sizes are presumably not recomputed after filtering -- confirm this is intended.
keep <- rowSums(cpm(d)>100) >= 2
d <- d[keep,]
dim(d)

In [None]:
%%R
#Normalize data (sc best practice textbook does this)
# NOTE(review): per the edgeR documentation calcNormFactors computes per-sample
# normalization factors (TMM by default) rather than rewriting the counts -- confirm
# against the installed edgeR version.
d <- calcNormFactors(d)

In [None]:
%%R
#Plot data
# MDS plot of the samples (method="bcv"), one colour per group.
group_levels <- levels(d$samples$group)
plotMDS(d, method="bcv", col=as.numeric(d$samples$group))
# as.numeric() on the group factor gives level indices, so the legend must list
# the levels in level order with matching colour indices. unique() returns the
# groups in order of appearance, which can pair a label with the other group's colour.
legend("bottomleft", group_levels, col=seq_along(group_levels), pch=20)

In [None]:
%%R
# estimate GLM dispersion
# No-intercept design: one indicator column per group, renamed to the group level names
design.mat <- model.matrix(~0 + d$samples$group)
colnames(design.mat) <- levels(d$samples$group)
# Common dispersion first, then a trended dispersion fitted with method="power"
d2 <- estimateGLMCommonDisp(d,design.mat)
d2 <- estimateGLMTrendedDisp(d2,design.mat, method="power")

In [None]:
%%R
# now run DE using GLM
# Quasi-likelihood fit, then a QL F-test of (first design column) - (second design column).
# NOTE(review): the design columns follow the factor level order of the group,
# so this is presumably RPM - RPMA -- confirm the level order.
fit <- glmQLFit(d2, design.mat)
qlf <- glmQLFTest(fit, contrast=c(1, -1))
# n=Inf keeps every gene; only the results table is kept in tt
tt <- topTags(qlf, n=Inf) #Note FDR is p-value adj, this is default BH correction
tt <-tt$table

In [None]:
%%R
#table
# Test the same contrast against a log-fold-change threshold (lfc=1.5) instead of zero
tr <- glmTreat(fit, contrast=c(1, -1), lfc=1.5)
# Ask topTags for 30 rows directly: with its default n it returns only the top 10,
# so wrapping it in head(..., 30) could never show 30 genes.
print(topTags(tr, n=30))

# generate associated plots

In [None]:
%%R
# generate volcano plot

# Colour categories: FDR < 0.05 with logFC > 0.6 is UP, FDR < 0.05 with
# logFC < -0.6 is DOWN, everything else is NO
sig <- tt$FDR < 0.05
tt$Change <- "NO"
tt$Change[sig & tt$logFC > 0.6] <- "UP"
tt$Change[sig & tt$logFC < -0.6] <- "DOWN"
mycolors <- c(DOWN = "#ff7f0e", NO = "gray", UP = "#984ea3")

# Gene names (stored in rownames) go into a column called label; a gene keeps
# its label only if it is in specific_labels AND has FDR < 0.05
specific_labels <- c('Nhlh2','Neurod6', 'Myt1l', 'Neurod1',
                     'Lhx2', 'Ebf1', 'Sox11', 'Insm1',
                     'Mycl', 'Gap43', 'Tubb3', 'Grp',
                     'Uchl1', 'Runx1t1', 'Ncam1',
                     'Sox4', 'Sox2', 'Ezh2', 'Hes6',
                     'Atrx', 'Cdk4', 'Nfib', 'Myc', 'Fosb',
                     'Nfe2l2', 'Vim', 'Junb', 'Fos',
                     'Meis2', 'Atf3', 'Krt18', 'Epcam',
                     'Cdk6', 'Egfr', 'Plcg2', 'Meis1',
                     'Foxi1', 'Calcr')
labels <- rownames(tt)
show_label <- (labels %in% specific_labels) & sig
tt$label <- NA_character_
tt$label[show_label] <- labels[show_label]

# Never let ggrepel drop a label because of overlaps
options(ggrepel.max.overlaps = Inf)
p <- ggplot(data=tt, aes(x=logFC, y=-log10(FDR), col=Change, label=label)) +
  geom_point(alpha = 1/1.3) +
  theme_classic() +
  geom_vline(xintercept=c(-0.6, 0.6), linetype='dashed') +
  geom_hline(yintercept=-log10(0.05), linetype='dashed') +
  scale_colour_manual(values = mycolors) +
  geom_text_repel(color="black", size=6, min.segment.length = 0.05,
                  nudge_y = 0.5, nudge_x = 0.5) +
  labs(title = "RPM vs. RPMA") +
  theme(plot.title = element_text(hjust = 0.5, size=20, face='bold'),
        text = element_text(size = 25, face="bold"),
        axis.text.x = element_text(color = "black"),
        axis.text.y = element_text(color = "black"))

p


In [None]:
%%R
# write de genes to csv
# Preview the table, then save all of tt; the file name is relative, so it is
# written to R's current working directory
head(tt)
write.csv(tt, "RPM_v_RPMA_glmQLF_model_de.csv")