In [1]:
library(reticulate)
library(tidyverse)
library(sceasy)
library(Seurat)
library(SoupX)

“package ‘reticulate’ was built under R version 4.4.3”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.4     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: SeuratObject

“package ‘SeuratObject’ was built under R version 4.4.3”
Loading required 

In [None]:
# Bind reticulate to the project's conda environment (required = TRUE is
# passed so that an unusable environment is reported instead of ignored).
reticulate::use_condaenv(
  condaenv = "/software/cellgen/team274/lr26/miniforge3/envs/my-r",
  required = TRUE
)

# Python modules accessed through reticulate.
loompy <- import("loompy")
anndata <- import("anndata")

In [None]:
# Sample IDs to process; each is a directory name under the scRNA data folder.
samples <- c(
  "CG_SB_NB13960948",
  "CG_SB_NB13960949",
  "CG_SB_NB13960950",
  "CG_SB_NB13960951",
  "CG_SB_NB14449539",
  "CG_SB_NB14449540",
  "CG_SB_NB14449541"
)

In [None]:
# Estimate the SoupX ambient-RNA contamination fraction for one sample and
# record it in the Seurat object's metadata.
#
# Args:
#   sobj:      Seurat object whose cells are barcodes of the sample's filtered
#              matrix. It must carry a `soup_group` metadata column (per-cell
#              cluster labels): SoupX needs clusters to estimate contamination.
#   sample_id: Name of the sample directory under the scRNA base directory.
#
# Returns:
#   `sobj` with a per-cell `ambient_contamination` metadata column holding the
#   contamination fraction (rho) estimated by SoupX::autoEstCont().
make_soup <- function(sobj, sample_id) {
  base_dir <- "/lustre/scratch126/casm/team274sb/lr26/scRNA/"
  path_filt <- paste0(base_dir, sample_id, "/filtered_feature_bc_matrix/")
  path_raw <- paste0(base_dir, sample_id, "/raw_feature_bc_matrix/")

  filt_counts <- Read10X(data.dir = path_filt)
  raw_counts <- Read10X(data.dir = path_raw)

  # Scalar condition, so use the short-circuit `||` rather than elementwise `|`.
  if (is.null(filt_counts) || is.null(raw_counts)) {
    stop("Error loading the raw or filtered matrices. Check the file paths.")
  }

  if (!"soup_group" %in% colnames(sobj@meta.data)) {
    stop(
      "`sobj` has no `soup_group` metadata column; cluster the cells ",
      "before estimating contamination.",
      call. = FALSE
    )
  }

  cells <- colnames(sobj)
  if (!all(cells %in% colnames(filt_counts))) {
    stop(
      "Some cells of `sobj` are absent from the filtered matrix of sample ",
      sample_id, ".",
      call. = FALSE
    )
  }

  # Build the channel on exactly the cells present in `sobj` so that cluster
  # labels and the estimated rho line up with the Seurat object.
  sc <- SoupChannel(
    raw_counts,
    filt_counts[, cells, drop = FALSE],
    calcSoupProfile = FALSE
  )
  sc <- estimateSoup(sc)

  # A SoupChannel is an S3 list (no S4 slots); the contamination fraction only
  # exists in `sc$metaData$rho` once it has been estimated from clusters.
  clusters <- setNames(as.character(sobj@meta.data[cells, "soup_group"]), cells)
  sc <- setClusters(sc, clusters)
  sc <- autoEstCont(sc, doPlot = FALSE)

  rho <- setNames(sc$metaData[cells, "rho"], cells)
  sobj <- AddMetaData(sobj, metadata = rho, col.name = "ambient_contamination")

  sobj
}


In [2]:
# Raw and filtered 10x matrix directories for sample CG_SB_NB13960950.
path_raw <- "/lustre/scratch126/casm/team274sb/lr26/scRNA/CG_SB_NB13960950/raw_feature_bc_matrix/"
path_filt <- "/lustre/scratch126/casm/team274sb/lr26/scRNA/CG_SB_NB13960950/filtered_feature_bc_matrix/"

# Load both matrices with Seurat's 10x reader.
tod <- Seurat::Read10X(data.dir = path_raw)   # raw data
toc <- Seurat::Read10X(data.dir = path_filt)  # filtered data

# Build the SoupChannel without an automatic soup profile, then estimate the
# profile explicitly.
soc <- SoupChannel(tod, toc, calcSoupProfile = FALSE)
soc <- estimateSoup(soc)
# `soc` is the SoupChannel object for this sample, with its soup profile estimated




#print(dim(tod))  # Should print something like (genes, cells)
#print(dim(toc))  # Should print something like (genes, cells)
#identical(rownames(tod), rownames(toc))  # Should return TRUE
# Check if there are any all-zero rows (genes) in `tod` or `toc`
# Check for empty columns (cells) in `tod` and `toc`



NULL

In [3]:
# Print the SoupChannel to see how many genes and cells it holds
soc

Channel with 33694 genes and 1566 cells



In [None]:
# Flag cells whose QC metric lies more than `nmads` median absolute deviations
# away from the median.
#
# Args:
#   sobj:   Object with a `meta.data` slot holding per-cell metrics.
#   metric: Name of the metadata column to test.
#   nmads:  Number of MADs tolerated on either side of the median.
#
# Returns:
#   Logical vector, one entry per cell; TRUE marks an outlier.
mad_outlier <- function(sobj, metric, nmads) {
  values <- sobj@meta.data[[metric]]
  centre <- median(values, na.rm = TRUE)
  spread <- mad(values, na.rm = TRUE)

  lower_bound <- centre - nmads * spread
  upper_bound <- centre + nmads * spread

  (values < lower_bound) | (values > upper_bound)
}

# Load one sample's filtered 10x matrix into a Seurat object with QC metadata.
#
# Args:
#   sample_id: Name of the sample directory under `base_dir` (single string).
#   base_dir:  Directory holding one sub-directory per sample; defaults to the
#              project's scRNA directory.
#
# Returns:
#   A Seurat object with `sample_id`, `log1p_total_counts`,
#   `log1p_n_genes_by_counts` and `percent.mt` metadata columns added.
pp <- function(sample_id,
               base_dir = "/lustre/scratch126/casm/team274sb/lr26/scRNA/") {
  stopifnot(is.character(sample_id), length(sample_id) == 1L)

  # Strip trailing slashes so file.path() does not produce "//".
  path <- file.path(
    sub("/+$", "", base_dir),
    sample_id,
    "filtered_feature_bc_matrix"
  )

  counts <- Read10X(data.dir = path)

  # Read10X returns a named list for multi-modal runs; prefer the gene
  # expression matrix and fall back to the first element otherwise.
  if (is.list(counts)) {
    counts <- if ("Gene Expression" %in% names(counts)) {
      counts[["Gene Expression"]]
    } else {
      counts[[1]]
    }
  }

  if (is.null(counts) || !inherits(counts, c("matrix", "dgCMatrix"))) {
    stop("The data loaded is not in the expected matrix format.")
  }

  # Keep every gene and every barcode; filtering happens downstream.
  sobj <- CreateSeuratObject(counts = counts, min.cells = 0, min.features = 0)

  # Track which sample each cell came from.
  sobj$sample_id <- sample_id

  # QC metrics.
  sobj$log1p_total_counts <- log1p(sobj@meta.data$nCount_RNA)
  sobj$log1p_n_genes_by_counts <- log1p(sobj@meta.data$nFeature_RNA)
  sobj[["percent.mt"]] <- PercentageFeatureSet(sobj, pattern = "^MT-")

  sobj
}

# Apply pp to each sample. The input vector is named by sample ID so that the
# resulting list of Seurat objects can be indexed by sample name.
data_list <- lapply(setNames(samples, samples), pp)


# Cluster the cells of a Seurat object with a standard Seurat workflow and
# return the cluster labels (used as cell groups for SoupX).
#
# Args:
#   sobj: Seurat object with raw counts.
#
# Returns:
#   Factor of `seurat_clusters` labels, one entry per cell.
get_soup_groups <- function(sobj) {
  clustered <- sobj |>
    NormalizeData(verbose = FALSE) |>
    FindVariableFeatures(
      nfeatures = 2000,
      verbose = FALSE,
      selection.method = "vst"
    ) |>
    ScaleData(verbose = FALSE) |>
    RunPCA(npcs = 20, verbose = FALSE) |>
    FindNeighbors(dims = 1:20, verbose = FALSE) |>
    FindClusters(resolution = 0.5, verbose = FALSE)

  # Labels are returned as a factor, exactly as Seurat stores them.
  as.factor(clustered@meta.data[["seurat_clusters"]])
}

# Attach the cluster labels from get_soup_groups() to a Seurat object.
#
# Args:
#   sobj: Seurat object to annotate.
#
# Returns:
#   `sobj` with a `soup_group` metadata column.
add_soup_groups <- function(sobj) {
  soup_groups <- get_soup_groups(sobj)
  sobj$soup_group <- soup_groups
  sobj
}

# Apply add_soup_groups to each Seurat object in data_list. lapply() always
# returns a list (and keeps any names), whereas sapply()'s return type depends
# on what the function happens to return.
data_list <- lapply(data_list, add_soup_groups)

In [None]:
# Show the metadata of sample CG_SB_NB13960949. The object is located through
# its position in `samples`, so this works whether or not data_list is named.
data_list[[match("CG_SB_NB13960949", samples)]][[]]

In [None]:
# Remove ambient-RNA contamination from one sample with SoupX and attach the
# corrected counts to its Seurat object.
#
# Args:
#   sobj:      Seurat object whose cells are barcodes of the sample's filtered
#              matrix; must carry a `soup_group` metadata column with per-cell
#              cluster labels (SoupX needs them to estimate contamination).
#   sample_id: Name of the sample directory under the scRNA base directory.
#
# Returns:
#   `sobj` with two extra assays: `original.counts` (copy of the RNA counts,
#   created only if absent) and `soupx_adjusted` (SoupX-corrected counts).
#   The RNA assay itself is left untouched.
make_soup <- function(sobj, sample_id) {
  base_dir <- "/lustre/scratch126/casm/team274sb/lr26/scRNA/"
  path_filt <- paste0(base_dir, sample_id, "/filtered_feature_bc_matrix/")
  path_raw <- paste0(base_dir, sample_id, "/raw_feature_bc_matrix/")

  # Read10X returns a named list for multi-modal runs; keep gene expression.
  pick_expression <- function(x) {
    if (!is.list(x)) {
      return(x)
    }
    if ("Gene Expression" %in% names(x)) x[["Gene Expression"]] else x[[1]]
  }
  raw <- pick_expression(Read10X(data.dir = path_raw))
  filtered <- pick_expression(Read10X(data.dir = path_filt))

  # Cell names come from the Seurat object itself: Seurat v5 layer matrices do
  # not carry dimnames, so colnames() on the layer would be NULL.
  cells <- colnames(sobj)
  missing_cells <- setdiff(cells, colnames(filtered))
  if (length(missing_cells) > 0) {
    stop(
      length(missing_cells), " cell(s) of `sobj` are absent from the filtered ",
      "matrix of sample ", sample_id, ".",
      call. = FALSE
    )
  }
  if (!"soup_group" %in% colnames(sobj@meta.data)) {
    stop(
      "`sobj` has no `soup_group` metadata column; cluster the cells ",
      "before running SoupX.",
      call. = FALSE
    )
  }

  # Table of counts: the filtered matrix restricted to the cells in `sobj`.
  toc <- filtered[, cells, drop = FALSE]

  # The raw matrix keeps ALL barcodes: the empty droplets are what SoupX uses
  # to estimate the soup profile. Only the genes are aligned, in one order.
  if (!identical(rownames(raw), rownames(toc))) {
    common_genes <- intersect(rownames(raw), rownames(toc))
    raw <- raw[common_genes, , drop = FALSE]
    toc <- toc[common_genes, , drop = FALSE]
  }

  # Estimate the soup profile exactly once (estimateSoup drops the droplet
  # table afterwards, so it cannot be called a second time).
  sc <- SoupChannel(raw, toc, calcSoupProfile = FALSE)
  sc <- estimateSoup(sc)

  # adjustCounts() needs a contamination fraction; estimate it from clusters.
  clusters <- setNames(as.character(sobj@meta.data[cells, "soup_group"]), cells)
  sc <- setClusters(sc, clusters)
  sc <- autoEstCont(sc, doPlot = FALSE)

  out <- adjustCounts(sc, roundToInt = TRUE)

  # Keep a copy of the untouched counts unless one is already stored.
  if (!"original.counts" %in% Assays(sobj)) {
    sobj[["original.counts"]] <- CreateAssayObject(
      counts = GetAssayData(sobj, assay = "RNA", layer = "counts")
    )
  }

  # Store the corrected counts in their own assay.
  sobj[["soupx_adjusted"]] <- CreateAssayObject(counts = out)

  sobj
}

# Apply make_soup to each Seurat object together with its sample ID.
# data_list is in the same order as `samples`; the result is named by sample
# ID so that later steps can look objects up by name.
stopifnot(length(data_list) == length(samples))
data_list <- setNames(
  lapply(seq_along(samples), function(i) {
    make_soup(data_list[[i]], samples[[i]])
  }),
  samples
)


# Sanity check on the first object: total counts before correction, and the
# fraction of counts retained after SoupX correction. The RNA assay is never
# modified, so the comparison is soupx_adjusted against original.counts.
original_total <- sum(
  GetAssayData(data_list[[1]], assay = "original.counts", layer = "counts")
)
adjusted_total <- sum(
  GetAssayData(data_list[[1]], assay = "soupx_adjusted", layer = "counts")
)
original_total                   # Sum of original counts for the first object
adjusted_total / original_total  # Ratio of adjusted to original counts


In [None]:
# Add SampleID to each Seurat object in the list.
sample_names <- names(data_list)
if (is.null(sample_names)) {
  # data_list is built in the same order as `samples`, so fall back to those
  # IDs when the list carries no names.
  stopifnot(length(data_list) == length(samples))
  sample_names <- samples
}

data_list_with_sampleID <- Map(
  function(sobj, sample_name) {
    sobj$SampleID <- sample_name  # Add SampleID to metadata
    sobj
  },
  data_list,
  sample_names
)

# Merge all Seurat objects into one in a single call. A pairwise Reduce()
# would re-apply the first sample's prefix to already-merged cells on every
# step; passing all objects at once prefixes each cell exactly once.
merge_ids <- unname(vapply(
  data_list_with_sampleID,
  function(sobj) as.character(sobj$SampleID[1]),
  character(1)
))

combined_sobj <- if (length(data_list_with_sampleID) == 1L) {
  # Nothing to merge with a single sample.
  data_list_with_sampleID[[1]]
} else {
  merge(
    x = data_list_with_sampleID[[1]],
    y = unname(data_list_with_sampleID[-1]),
    add.cell.ids = merge_ids,
    project = "Combined_Sample"
  )
}

# Preview the first rows of the merged per-cell metadata.
head(x = combined_sobj@meta.data)

# Write the merged object to disk; the path below is a placeholder.
saveRDS(
  object = combined_sobj,
  file = "/path/to/save/combined_seurat_object.rds"
)
