In [1]:
library(future)
library(Seurat)
library(data.table)
library(dplyr)
library(ggplot2)
library(Matrix)

Loading required package: SeuratObject

Loading required package: sp


Attaching package: ‘SeuratObject’


The following objects are masked from ‘package:base’:

    intersect, t


“package ‘data.table’ was built under R version 4.4.2”
“package ‘dplyr’ was built under R version 4.4.1”

Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


“package ‘ggplot2’ was built under R version 4.4.1”
“package ‘Matrix’ was built under R version 4.4.3”


In [2]:
# With 800GB RAM available, we can be much more generous
RAM.thresh <- 200000 * 1024^2  # 200GB limit for future operations
options(future.globals.maxSize = RAM.thresh)

In [3]:
# Use all 12 CPUs since you have abundant memory
plan(multisession, workers = 12)

In [4]:
# Load object
cat("Loading Seurat object...\n")
sce1 <- readRDS("/tscc/lustre/ddn/scratch/pkudtarkar/FNIH/250429_Reference_heart_preprocessed.rds")

Loading Seurat object...


In [5]:
# Set identity
Idents(sce1) <- "celltype"
sce1$Cell_Type <- Idents(sce1)

cat("Dataset info:\n")
cat("Number of cells:", ncol(sce1), "\n")
cat("Number of genes:", nrow(sce1), "\n")
cat("Cell types:\n")
print(table(Idents(sce1)))

Loading required package: Signac



Dataset info:
Number of cells: 329255 
Number of genes: 36510 
Cell types:

       Fibroblast       Endothelial               vCM       Macrophages 
            67157             52039             68449             36401 
        Pericytes              Mast       Endocardial                 T 
            22489              1986              8927             14760 
               SM                DC                 B          Neuronal 
             6561             12968              1535              3962 
               NK lymph_Endothelial         Adipocyte        Epicardial 
             3726              1611              1122              1716 
              aCM 
            23846 


In [6]:
# Get normalized data
cat("Extracting normalized data...\n")
normalized_matrix <- GetAssayData(sce1, assay = "RNA", layer = "data")

Extracting normalized data...


In [7]:
cat("Sample values from normalized matrix:\n")
print(normalized_matrix[1:5, 1:5])

Sample values from normalized matrix:
5 x 5 sparse Matrix of class "dgCMatrix"
            QY_2193_1_2_QY_2192_1_2_AAACAGCCAACTAGGG-1
MIR1302-2HG                                          .
FAM138A                                              .
OR4F5                                                .
AL627309.1                                           .
AL627309.3                                           .
            QY_2193_1_2_QY_2192_1_2_AAACAGCCACTTACAG-1
MIR1302-2HG                                          .
FAM138A                                              .
OR4F5                                                .
AL627309.1                                           .
AL627309.3                                           .
            QY_2193_1_2_QY_2192_1_2_AAACAGCCAGTTTGTG-1
MIR1302-2HG                                          .
FAM138A                                              .
OR4F5                                                .
AL627309.1                               

In [8]:
cat("Estimated dense matrix size: ~450GB\n")
cat("Available RAM: 800GB\n")
cat("Using chunked processing for safety...\n")

Estimated dense matrix size: ~450GB
Available RAM: 800GB
Using chunked processing for safety...


In [9]:
# Process by gene chunks to stay within memory limits
cat("Starting chunked dense matrix export...\n")
output_file <- "/tscc/lustre/ddn/scratch/pkudtarkar/FNIH_scRNA_log_normalized_gene_by_cell_matrix-heart.tsv"

# Conservative chunk size for 1.5M cells
chunk_size <- 500  # Process 500 genes at a time (~30GB per chunk)

n_genes <- nrow(normalized_matrix)
n_chunks <- ceiling(n_genes / chunk_size)

cat("Processing", n_genes, "genes in", n_chunks, "chunks of", chunk_size, "genes each\n")
cat("Each chunk will use approximately 30GB RAM\n")

Starting chunked dense matrix export...
Processing 36510 genes in 74 chunks of 500 genes each
Each chunk will use approximately 30GB RAM


In [10]:
# Write header first
cell_names <- colnames(normalized_matrix)
header <- c("Gene", cell_names)
cat("Writing header with", length(cell_names), "cell names...\n")
write.table(t(header), file = output_file, sep = "\t", quote = FALSE, 
            row.names = FALSE, col.names = FALSE)

Writing header with 329255 cell names...


In [11]:
# Process each chunk
start_time <- Sys.time()

for (i in 1:n_chunks) {
  chunk_start_time <- Sys.time()
  
  start_idx <- ((i-1) * chunk_size) + 1
  end_idx <- min(i * chunk_size, n_genes)
  
  cat("Processing chunk", i, "of", n_chunks, "(genes", start_idx, "to", end_idx, ")")
  
  # Extract chunk and convert to dense
  chunk_sparse <- normalized_matrix[start_idx:end_idx, , drop = FALSE]
  chunk_dense <- as.matrix(chunk_sparse)
  chunk_df <- as.data.frame(chunk_dense)
  chunk_dt <- as.data.table(chunk_df, keep.rownames = "Gene")
  
  # Append to file
  fwrite(chunk_dt, file = output_file, sep = "\t", quote = FALSE, 
         append = TRUE, col.names = FALSE)
  
  # Clean up chunk immediately
  rm(chunk_sparse, chunk_dense, chunk_df, chunk_dt)
  gc()
  
  # Time estimation
  chunk_end_time <- Sys.time()
  chunk_duration <- as.numeric(difftime(chunk_end_time, chunk_start_time, units = "mins"))
  
  cat(" - completed in", round(chunk_duration, 2), "minutes\n")
  
  # Progress and time estimation
  if (i %% 10 == 0 || i == n_chunks) {
    elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "mins"))
    avg_time_per_chunk <- elapsed_time / i
    remaining_chunks <- n_chunks - i
    estimated_remaining <- remaining_chunks * avg_time_per_chunk
    
    cat("Progress: ", round(100 * i / n_chunks, 1), "% complete\n")
    cat("Elapsed time: ", round(elapsed_time, 1), " minutes\n")
    cat("Estimated remaining time: ", round(estimated_remaining, 1), " minutes\n")
    cat("Estimated total time: ", round(elapsed_time + estimated_remaining, 1), " minutes\n\n")
  }
}

total_time <- as.numeric(difftime(Sys.time(), start_time, units = "mins"))
cat("Dense matrix export completed in", round(total_time, 1), "minutes!\n")

# Clean up the large normalized matrix
rm(normalized_matrix)
gc()


Processing chunk 1 of 74 (genes 1 to 500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.22 minutes
Processing chunk 2 of 74 (genes 501 to 1000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 3 of 74 (genes 1001 to 1500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 4 of 74 (genes 1501 to 2000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 5 of 74 (genes 2001 to 2500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 6 of 74 (genes 2501 to 3000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 7 of 74 (genes 3001 to 3500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 8 of 74 (genes 3501 to 4000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 9 of 74 (genes 4001 to 4500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 10 of 74 (genes 4501 to 5000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Progress:  13.5 % complete
Elapsed time:  2  minutes
Estimated remaining time:  12.8  minutes
Estimated total time:  14.7  minutes

Processing chunk 11 of 74 (genes 5001 to 5500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 12 of 74 (genes 5501 to 6000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 13 of 74 (genes 6001 to 6500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 14 of 74 (genes 6501 to 7000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 15 of 74 (genes 7001 to 7500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 16 of 74 (genes 7501 to 8000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 17 of 74 (genes 8001 to 8500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 18 of 74 (genes 8501 to 9000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 19 of 74 (genes 9001 to 9500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 20 of 74 (genes 9501 to 10000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Progress:  27 % complete
Elapsed time:  4  minutes
Estimated remaining time:  10.7  minutes
Estimated total time:  14.7  minutes

Processing chunk 21 of 74 (genes 10001 to 10500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 22 of 74 (genes 10501 to 11000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 23 of 74 (genes 11001 to 11500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 24 of 74 (genes 11501 to 12000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 25 of 74 (genes 12001 to 12500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 26 of 74 (genes 12501 to 13000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 27 of 74 (genes 13001 to 13500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.21 minutes
Processing chunk 28 of 74 (genes 13501 to 14000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 29 of 74 (genes 14001 to 14500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 30 of 74 (genes 14501 to 15000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Progress:  40.5 % complete
Elapsed time:  6  minutes
Estimated remaining time:  8.7  minutes
Estimated total time:  14.7  minutes

Processing chunk 31 of 74 (genes 15001 to 15500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 32 of 74 (genes 15501 to 16000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 33 of 74 (genes 16001 to 16500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 34 of 74 (genes 16501 to 17000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 35 of 74 (genes 17001 to 17500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 36 of 74 (genes 17501 to 18000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 37 of 74 (genes 18001 to 18500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 38 of 74 (genes 18501 to 19000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 39 of 74 (genes 19001 to 19500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 40 of 74 (genes 19501 to 20000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Progress:  54.1 % complete
Elapsed time:  7.9  minutes
Estimated remaining time:  6.7  minutes
Estimated total time:  14.6  minutes

Processing chunk 41 of 74 (genes 20001 to 20500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 42 of 74 (genes 20501 to 21000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 43 of 74 (genes 21001 to 21500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 44 of 74 (genes 21501 to 22000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 45 of 74 (genes 22001 to 22500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 46 of 74 (genes 22501 to 23000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 47 of 74 (genes 23001 to 23500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 48 of 74 (genes 23501 to 24000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 49 of 74 (genes 24001 to 24500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 50 of 74 (genes 24501 to 25000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Progress:  67.6 % complete
Elapsed time:  9.9  minutes
Estimated remaining time:  4.7  minutes
Estimated total time:  14.6  minutes

Processing chunk 51 of 74 (genes 25001 to 25500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 52 of 74 (genes 25501 to 26000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 53 of 74 (genes 26001 to 26500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 54 of 74 (genes 26501 to 27000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 55 of 74 (genes 27001 to 27500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.18 minutes
Processing chunk 56 of 74 (genes 27501 to 28000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 57 of 74 (genes 28001 to 28500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.18 minutes
Processing chunk 58 of 74 (genes 28501 to 29000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 59 of 74 (genes 29001 to 29500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.21 minutes
Processing chunk 60 of 74 (genes 29501 to 30000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Progress:  81.1 % complete
Elapsed time:  11.8  minutes
Estimated remaining time:  2.8  minutes
Estimated total time:  14.6  minutes

Processing chunk 61 of 74 (genes 30001 to 30500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 62 of 74 (genes 30501 to 31000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 63 of 74 (genes 31001 to 31500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.18 minutes
Processing chunk 64 of 74 (genes 31501 to 32000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.21 minutes
Processing chunk 65 of 74 (genes 32001 to 32500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.18 minutes
Processing chunk 66 of 74 (genes 32501 to 33000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 67 of 74 (genes 33001 to 33500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.18 minutes
Processing chunk 68 of 74 (genes 33501 to 34000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 69 of 74 (genes 34001 to 34500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 70 of 74 (genes 34501 to 35000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Progress:  94.6 % complete
Elapsed time:  13.8  minutes
Estimated remaining time:  0.8  minutes
Estimated total time:  14.6  minutes

Processing chunk 71 of 74 (genes 35001 to 35500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 72 of 74 (genes 35501 to 36000 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.2 minutes
Processing chunk 73 of 74 (genes 36001 to 36500 )

"sparse->dense coercion: allocating vector of size 1.2 GiB"


 - completed in 0.19 minutes
Processing chunk 74 of 74 (genes 36501 to 36510 ) - completed in 0.12 minutes
Progress:  100 % complete
Elapsed time:  14.5  minutes
Estimated remaining time:  0  minutes
Estimated total time:  14.5  minutes

Dense matrix export completed in 14.5 minutes!


Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,14519122,775.5,28915812,1544.3,28915812,1544.3
Vcells,9913428722,75633.5,16654213030,127061.6,11009850188,83998.5


In [13]:
sce1

An object of class Seurat 
392885 features across 329255 samples within 4 assays 
Active assay: RNA (36510 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 3 other assays present: RNA_raw, SCT, ATAC
 10 dimensional reductions calculated: pca, harmony.rna, umap.rna, lsi, harmony.atac, umap.atac, umap.wnn, joint.umap, spca, umap.harmony.rna

In [14]:
# Extract and save UMAP coordinates
cat("Saving UMAP coordinates...\n")
umap_coordinates <- Embeddings(sce1, "umap.rna")
colnames(umap_coordinates) <- c("X", "Y")
write.table(umap_coordinates, 
           file = "/tscc/lustre/ddn/scratch/pkudtarkar/umap-heart.tsv", 
           sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)

Saving UMAP coordinates...


In [17]:
# Extract and save metadata
cat("Saving cell metadata...\n")
cell_metadata <- sce1@meta.data
write.table(cell_metadata, 
           file = "/tscc/lustre/ddn/scratch/pkudtarkar/metadata-heart.tsv", 
           sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)


Saving cell metadata...


In [16]:
# Verify outputs
cat("Verification:\n")
cat("Dense matrix file size:\n")
system(paste("ls -lh", output_file))

cat("First 3 rows and 5 columns of output:\n")
verification <- fread(output_file, nrows = 3, select = 1:min(6, 1528726))
print(verification[, 1:min(5, ncol(verification))])

cat("All processing completed successfully!\n")
cat("Output files:\n")
cat("1. Dense matrix:", output_file, "\n")
cat("2. UMAP coordinates: /tscc/lustre/ddn/scratch/pkudtarkar/umap-heart.tsv\n")
cat("3. Cell metadata: /tscc/lustre/ddn/scratch/pkudtarkar/metadata-heart.tsv\n")

cat("\nFinal summary:\n")
cat("- Dataset: 38,531 genes × 1,528,725 cells\n")
cat("- Processing time:", round(total_time, 1), "minutes\n")
cat("- Peak memory per chunk: ~30GB\n")
cat("- Total chunks processed:", n_chunks, "\n")

Verification:
Dense matrix file size:
First 3 rows and 5 columns of output:
          Gene QY_2193_1_2_QY_2192_1_2_AAACAGCCAACTAGGG-1
        <char>                                      <int>
1: MIR1302-2HG                                          0
2:     FAM138A                                          0
3:       OR4F5                                          0
   QY_2193_1_2_QY_2192_1_2_AAACAGCCACTTACAG-1
                                        <int>
1:                                          0
2:                                          0
3:                                          0
   QY_2193_1_2_QY_2192_1_2_AAACAGCCAGTTTGTG-1
                                        <int>
1:                                          0
2:                                          0
3:                                          0
   QY_2193_1_2_QY_2192_1_2_AAACAGCCATCCGTAA-1
                                        <int>
1:                                          0
2:                                  