<a href="https://colab.research.google.com/github/HardworkingPearl/VCC-state/blob/evo2emb/Experiments/dataload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [None]:
# Mount Google Drive at /content/drive; the dataset folders configured in the
# next cell (raw_dir, h5_dir) are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from pathlib import Path
from datetime import datetime
from rpy2.robjects import r, globalenv
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import numpy as np
from scipy.sparse import csc_matrix
import anndata as ad
import os
from tqdm import tqdm

# Dataset root on the mounted Drive; each dataset keeps its raw files in a
# sub-folder of this directory (e.g. "srivatsam").
raw_dir = "/content/drive/MyDrive/VCC/datasets"
# Converted .h5ad files are written to the "h5" folder inside the dataset root.
h5_dir = f"{raw_dir}/h5"

### Install packages

In [None]:
!pip install pyreadr
!pip install tzlocal
!pip install anndata
!sudo apt-get update -y || apt-get update -y
!sudo apt-get install -y build-essential gfortran cmake \
  libxml2-dev libcurl4-openssl-dev libssl-dev \
  libomp-dev libgsl-dev

In [None]:
# Install and load monocle3 inside the R session embedded by rpy2.
# The R script below, in order:
#   1. sets the CRAN mirror to https://cloud.r-project.org,
#   2. installs the `remotes` package if it is not already available,
#   3. installs the listed CRAN packages first (so failures are easier to read),
#   4. installs monocle3 from GitHub without building vignettes,
#   5. attaches monocle3 and prints a confirmation line.
# Both install calls pass upgrade="never".
from rpy2.robjects import r
r('''
options(repos=c(CRAN="https://cloud.r-project.org"))
if (!requireNamespace("remotes", quietly=TRUE)) install.packages("remotes")
# Install dependencies first to get clearer errors
remotes::install_cran(c("Matrix","Rcpp","RSpectra","uwot","RcppAnnoy","igraph"), upgrade="never")
remotes::install_github("cole-trapnell-lab/monocle3", upgrade="never", build_vignettes=FALSE)
suppressPackageStartupMessages(library(monocle3)); cat("loaded monocle3 from GitHub OK\\n")
''')


### Read data and convert to h5ad files

In [None]:
def readRDS(file_path):
  """Load an ``.rds`` file through R and return its counts and metadata.

  The file is read with R's ``readRDS``. The loaded object must support
  ``SummarizedExperiment::assay(obj, "counts")``, ``colData`` and ``rowData``
  (presumably a monocle3 ``cell_data_set`` -- confirm against the datasets).

  Side effects: the loaded object is bound to ``cds`` and its count assay to
  ``M`` in the R global environment; both remain there after the call.

  Args:
    file_path: Location of the ``.rds`` file (``str`` or path-like).

  Returns:
    Tuple ``(X, obs, var)`` where
      * ``X`` is a ``scipy.sparse.csc_matrix`` with the same shape as the R
        assay (``dim(M)``), i.e. it is NOT transposed here;
      * ``obs`` is ``colData(cds)`` as a pandas DataFrame, indexed by
        ``colnames(cds)`` when those are available;
      * ``var`` is ``rowData(cds)`` as a pandas DataFrame, indexed by
        ``rownames(cds)`` when those are available.
  """
  # Local import: only this function needs rpy2's base conversion rules.
  from rpy2.robjects import default_converter

  # Use a distinct local name so the R callable does not shadow this function.
  r_read_rds = r['readRDS']
  # str() lets callers pass pathlib.Path objects as well as plain strings.
  globalenv['cds'] = r_read_rds(str(file_path))

  # The x/i/p slots read below only exist on column-compressed sparse
  # matrices, so coerce any other representation (dense, triplet, ...) first.
  r('''
  M <- SummarizedExperiment::assay(cds, "counts")
  if (!methods::is(M, "CsparseMatrix")) M <- methods::as(M, "CsparseMatrix")
  ''')
  values  = np.array(r('slot(M, "x")'))  # non-zero entries
  row_idx = np.array(r('slot(M, "i")'))  # 0-based row index of each entry
  col_ptr = np.array(r('slot(M, "p")'))  # column start offsets into values
  n_rows, n_cols = (int(d) for d in r('dim(M)'))

  X = csc_matrix((values, row_idx, col_ptr), shape=(n_rows, n_cols))

  # Metadata tables and dimension names. The pandas rules are combined with
  # default_converter, as in the rpy2 documentation, so that R objects which
  # are not data frames are still converted while R code is evaluated.
  with localconverter(default_converter + pandas2ri.converter):
    obs = r('as.data.frame(SummarizedExperiment::colData(cds))')
    var = r('as.data.frame(SummarizedExperiment::rowData(cds))')
    obs_names = [str(name) for name in r('as.character(colnames(cds))')]
    var_names = [str(name) for name in r('as.character(rownames(cds))')]

  # R dimnames may be NULL (seen here as an empty list); in that case keep the
  # index produced by the data-frame conversion instead of raising.
  if len(obs_names) == len(obs):
    obs.index = obs_names
  if len(var_names) == len(var):
    var.index = var_names

  return X, obs, var

In [None]:
def to_h5ad(raw_path, h5_path, overwrite=True):
  """Convert every ``.rds`` file in ``raw_path`` into a gzip-compressed ``.h5ad``.

  Each file is loaded with ``readRDS`` and written to
  ``<h5_path>/<file name without extension>.h5ad``. The list of files found is
  printed before conversion starts.

  Args:
    raw_path: Directory that is scanned (non-recursively) for files whose name
      ends in ``.rds``, case-insensitively.
    h5_path: Output directory; it is created if it does not exist yet.
    overwrite: When ``True`` (default, the previous behaviour) existing outputs
      are rewritten. When ``False`` files whose output already exists are
      skipped, which makes re-running the notebook cheap.

  Returns:
    None. The function is used for its side effect of writing files.
  """
  # Sort so that the processing order does not depend on os.listdir ordering.
  rds_files = sorted(f for f in os.listdir(raw_path) if f.lower().endswith('.rds'))
  print(rds_files)

  # Make sure the destination exists before the first write.
  os.makedirs(h5_path, exist_ok=True)

  for rds_file in tqdm(rds_files):
    stem = os.path.splitext(rds_file)[0]
    out_path = f"{h5_path}/{stem}.h5ad"
    if not overwrite and os.path.exists(out_path):
      continue

    X, obs, var = readRDS(f"{raw_path}/{rds_file}")
    # readRDS returns X in the R assay layout. AnnData needs one row of X per
    # row of obs, so transpose when the columns of X line up with obs.
    if X.shape[1] == obs.shape[0]:
      adata = ad.AnnData(X=X.T, obs=obs, var=var)
    else:
      adata = ad.AnnData(X=X, obs=obs, var=var)
    adata.write_h5ad(out_path, compression="gzip")

In [None]:
# srivatsam
# srivatsam dataset: read the .rds files from <raw_dir>/srivatsam and write the
# converted .h5ad files to <h5_dir>/srivatsam.
srivatsam_raw_path = f"{raw_dir}/srivatsam"
srivatsam_h5_dir = f"{h5_dir}/srivatsam"

to_h5ad(raw_path=srivatsam_raw_path, h5_path=srivatsam_h5_dir)