In [3]:
# Load necessary libraries
library(tools)
library(yaml)
library(SummarizedExperiment)
library(httr)
library(xml2)
library(jsonlite)

In [2]:
# Define the root directory where your subfolders are located
#root_dir <- "./"
#root_dir <- "/home/jovyan/MPRAhub/MPRAbase-Builder/data/RDS/"
root_dir <- 'RDS/'
# Get the list of all subfolders
#subfolders <- list.dirs(root_dir, recursive = FALSE)

# Initialize an empty list to store the data from .rds files
rds_data_list <- list()
subfolder <- root_dir
# Loop through each subfolder
#for (subfolder in subfolders) {
  
  # Find all .rds files in the current subfolder
  rds_files <- list.files(subfolder, pattern = "\\.rds$", full.names = TRUE)
  rds_files
  # If any .rds files are found, loop through them and read the data
  if (length(rds_files) > 0) {
    for (rds_file in rds_files) {
      # Read the .rds file and append it to the rds_data_list
      rds_data <- readRDS(rds_file)
      rds_data_list[[file_path_sans_ext(rds_file)]] <- rds_data
    }
  }
#}

# The rds_data_list now contains data from all .rds files in the subfolders


In [20]:
#rds_data_list

In [4]:
# Get more info from PMID
get_reference_from_PMID <- function(pmid) {
           if (!is.na(pmid) && !is.null(pmid)) {
        url <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&version=2.0&id=", pmid)
        response <- GET(url)
      
        json <- content(response, as="text")
        json_data <- fromJSON(json)
        titles=json_data$result[[2]]$title
        authors=json_data$result[[2]]$sortfirstauthor
        source = json_data$result[[2]]$source
        volumn = json_data$result[[2]]$volume
        pages = json_data$result[[2]]$pages
        year = json_data$result[[2]]$pubdate
        reference=paste0(authors,". et al. ",titles,source," ",volumn,",",pages,"(",year,")")

    }else{reference <- "Not published"}
           
    return(reference)
    }

In [5]:
### read passed_files
#passed_rds_files=readLines("../testthat/passed_files.txt")
passed_rds_files=rds_data_list
passed_rds_files=as.list(passed_rds_files)
head(passed_rds_files)

$`RDS//SRP018414-Liver_SE`
class: RangedSummarizedExperiment 
dim: 4966 3 
metadata(10): PMID GEO_number ... Cell_line_tissue DNA_RNA_reps
assays(1): ActivityScore
rownames(4966): 2:210861483-210861650 2:210861483-210861650 ...
  9:82902419-82902586 9:82902419-82902586
rowData names(3): genome_build sequence genomic_note
colnames(3): rep1 rep2 rep3
colData names(5): REP SRP_ID sample_status replicate_status cell_lines

$`RDS//SRP018414-Robin_Hepg2_SE`
class: RangedSummarizedExperiment 
dim: 4966 3 
metadata(10): PMID GEO_number ... Cell_line_tissue DNA_RNA_reps
assays(1): ActivityScore
rownames(4966): 2:210861483-210861650 2:210861483-210861650 ...
  9:82902419-82902586 9:82902419-82902586
rowData names(3): genome_build sequence genomic_note
colnames(3): rep1 rep2 rep3
colData names(5): REP SRP_ID sample_status replicate_status cell_lines

$`RDS//SRP044727-Hela_SE`
class: RangedSummarizedExperiment 
dim: 1665 1 
metadata(10): PMID GEO_number ... Cell_line_tissue DNA_RNA_reps
assays(1):

In [9]:
passed_rds_files

$`RDS//SRP018414-Liver_SE`
class: RangedSummarizedExperiment 
dim: 4966 3 
metadata(10): PMID GEO_number ... Cell_line_tissue DNA_RNA_reps
assays(1): ActivityScore
rownames(4966): 2:210861483-210861650 2:210861483-210861650 ...
  9:82902419-82902586 9:82902419-82902586
rowData names(3): genome_build sequence genomic_note
colnames(3): rep1 rep2 rep3
colData names(5): REP SRP_ID sample_status replicate_status cell_lines

$`RDS//SRP018414-Robin_Hepg2_SE`
class: RangedSummarizedExperiment 
dim: 4966 3 
metadata(10): PMID GEO_number ... Cell_line_tissue DNA_RNA_reps
assays(1): ActivityScore
rownames(4966): 2:210861483-210861650 2:210861483-210861650 ...
  9:82902419-82902586 9:82902419-82902586
rowData names(3): genome_build sequence genomic_note
colnames(3): rep1 rep2 rep3
colData names(5): REP SRP_ID sample_status replicate_status cell_lines

$`RDS//SRP044727-Hela_SE`
class: RangedSummarizedExperiment 
dim: 1665 1 
metadata(10): PMID GEO_number ... Cell_line_tissue DNA_RNA_reps
assays(1):

In [10]:
#pilot_list <- head(rds_data_list,n=2)
pilot_list <- passed_rds_files
#pilot_list <- passed_rds_files
yaml_data_output <- list()
yaml_initial_output <- list()
pmid_list=list()
for(i in 1:length(pilot_list)){
    
  if (i %% 3 == 0) {
    # Introduce a 1-second delay
    Sys.sleep(1)
  }
        
        
    my_string <- names(pilot_list[i])
    new_string <- sub(".*/", "###", my_string)  # replace last / with ###
    parts <- strsplit(new_string, "###")[[1]]  # split at ###
    last_folder <- parts[2] 
  #last_folder=my_string
    # sample meta data in info box
    yaml_data_output[[i]] <- list(
        
        id = paste0('BFC',i),
        uri = paste0('localhost://../../MPRAbase-Builder/', sub("\\./", "", names(pilot_list[i])), ".rds"),
        #uri = paste0('localhost:')
        "title" = last_folder,
        # paste0('BFC',i), #metadata(pilot_list[[i]])$ENCODE_ID,
        
        "provider" = metadata(pilot_list[[i]])$labs,
        "contact" = metadata(pilot_list[[i]])$labs,
        
        "geo" =  colData(pilot_list[[i]])$GEO_ID[[1]],
        "encode" = metadata(pilot_list[[i]])$ENCODE_ID,
        "PMID" = as.character(metadata(pilot_list[[i]])$PMID),
        
        "Organism" = metadata(pilot_list[[i]])$Organism,
        "LibraryType" = metadata(pilot_list[[i]])$Library_strategy,
        
        "ResourceName" = metadata(pilot_list[[i]])$Cell_line_tissue,
        "SampleName" = metadata(pilot_list[[i]])$sample_name,

        "description" = "In Development",
        "reference" = get_reference_from_PMID(metadata(pilot_list[[i]])$PMID)
        
        )
     
    yaml_initial_output[[i]] <- list(
        id = paste0('config', i),
        "datasets" = list(paste0('BFC',i)),
        "title" = "MPRA Secondary Analysis",
        
        "uri" = "localhost://../../MPRAbase-Builder/app-configs/analysis-secondary.R",
        "description" = "Test `SampleAssay` panel for biological replicate correlation analysis.") 
}


yaml::write_yaml(list(datasets=yaml_data_output, initial=yaml_initial_output), "MPRAbase-v0.1.3.yaml")



In [36]:
rds_data_list=list()
#### read from passed list
for (rds_file in passed_rds_files) {
    my_string <- as.character(rds_file)
    new_string <- sub(".*/", "###", my_string)  # replace last / with ###
    parts <- strsplit(new_string, "###")[[1]]  # split at ###
    last_folder <- parts[2] 
    path=paste0(as.character(rds_file),".rds")
    print(path)
    rds_data=readRDS(path)
    
    rds_data_list[[as.character(last_folder)]]<- rds_data
}


ERROR: Error in as.vector(x, mode = "character"): no method for coercing this S4 class to a vector


In [26]:
#rds_data_list

In [12]:

pilot_list=rds_data_list

yaml_data_output <- list()
yaml_initial_output <- list()
pmid_list=list()
for(i in 1:length(pilot_list)){
    
  if (i %% 3 == 0) {
    # Introduce a 1-second delay
    Sys.sleep(1)
  }

    # sample meta data in info box
    yaml_data_output[[i]] <- list(
        
        id = paste0('BFC',i),
        uri = paste0('localhost://../../MPRAbase-Builder/data//', sub("\\./", "", names(pilot_list[i])), ".rds"),
        "title" =  names(pilot_list[i]),
        # paste0('BFC',i), #metadata(pilot_list[[i]])$ENCODE_ID,
        
        "provider" = metadata(pilot_list[[i]])$labs,
        "contact" = metadata(pilot_list[[i]])$labs,
        
        "geo" =  colData(pilot_list[[i]])$GEO_ID[[1]],
        "encode" = metadata(pilot_list[[i]])$ENCODE_ID,
        "PMID" = as.character(metadata(pilot_list[[i]])$PMID),
        
        "Organism" = metadata(pilot_list[[i]])$Organism,
        "LibraryType" = metadata(pilot_list[[i]])$Library_strategy,
        
        "ResourceName" = metadata(pilot_list[[i]])$Cell_line_tissue,
        "SampleName" = metadata(pilot_list[[i]])$sample_name,

        "description" = "In Development",
        "reference" = get_reference_from_PMID(metadata(pilot_list[[i]])$PMID)
        )
     
    yaml_initial_output[[i]] <- list(
        id = paste0('config', i),
        "datasets" = list(paste0('BFC',i)),
        "title" = "MPRA Secondary Analysis",
        "uri" = "localhost://../configs/analysis-secondary.R",
        "description" = "Test `SampleAssay` panel for biological replicate correlation analysis.") 
}


yaml::write_yaml(list(datasets=yaml_data_output, initial=yaml_initial_output), "MPRAbase-v0.1.8-passed.yaml")


In [11]:

# Read the YAML file as character lines
file_lines <- readLines("MPRAbase-v0.1.3.yaml")

# Remove single quotes and ~ from each line
modified_lines <- gsub("'", "", file_lines)
modified_lines <- gsub("~", "Not found", modified_lines)
modified_lines <- gsub("\\[|\\]", "Not found", modified_lines)
modified_lines <- gsub("Not foundNot found", "Not found", modified_lines)
modified_lines <- gsub("Not foundNot found", "Not found", modified_lines)

#modified_lines <- gsub("dataRDS", "data/RDS", modified_lines)

# Write the modified contents back to the file
writeLines(modified_lines, "MPRAbase-v0.1.3.yaml")
