## coloc for eQTL and mQTL

In [None]:
library(pacman)
library(arrow)
library(coloc)
library(dplyr)

In [None]:
# Run configuration for the expression (eQTL) side of the analysis.
# These values are pasted together to build set names and file paths.
cohort <- "nabec"
version <- "July_2024"
# Expression target. Alternatives noted by the author: 'RNA', 'cpg_islands_cov**'.
target <- "rna_TPM"
# Methylation feature set.
meth <- "promoters2Kb"
# Variant set. Alternatives noted by the author: SV, SNV, SNV_SV.
varianttype_caller <- "SV_harmonized_SNV"

In [None]:
# Inputs handed to coloc.abf(): the trait type shared by both datasets and
# the sample size of each QTL scan.
type <- "quant"
eQTL_samplesize <- 205
mQTL_samplesize <- 205

### set exp ~ meth files

In [None]:
# ---- Inputs for the expression side (eQTL and expression ~ methylation) ----

# Naming
modality <- "RNAB"
cohort_version_target <- paste(cohort, version, target, sep = "_")
set_name <- paste(cohort, version, target, varianttype_caller, "prun", sep = "_")

# Thresholds and switches
alpha_value <- 0.05
min_nominal_alpha <- 1e-5
MIN_MAF <- 0
DEBUG <- FALSE

# Directories
in_dir <- "/data/CARDPB/data/NABEC/projects/QTL_paper_2024/SV-eQTL"
geno_dir <- file.path(in_dir, "genotypes", varianttype_caller)
quants_dir <- file.path(in_dir, "expression")
info_dir <- file.path(in_dir, "sample_info")
tensorqtl_dir <- file.path(
  "/data/CARDPB/data/NABEC/projects/QTL_paper_2024/newSV-eQTL/tenosorqtl",
  paste(set_name, "sorted", sep = "_")
)
results_dir <- file.path(
  "/data/CARDPB/data/NABEC/projects/QTL_paper_2024/newSV-eQTL/results",
  paste(set_name, "sorted", sep = "_")
)

# Sample-level inputs
used_samples_list_file <- file.path(info_dir, paste0(set_name, ".samples"))
qtnormalized_expression_pc <- file.path(
  info_dir,
  paste0(cohort_version_target, ".normPC.csv")
)

# eQTL result files. cis_exp_file is only a prefix: the chromosome name and
# ".parquet" are appended where the per-chromosome files are read.
cis_indep_file <- file.path(results_dir, paste0(set_name, ".cis.indep.csv"))
cis_map_file <- file.path(tensorqtl_dir, paste0(set_name, ".cis.map.csv"))
cis_exp_file <- file.path(tensorqtl_dir, paste0(set_name, ".cis_qtl_pairs."))

# Expression ~ methylation result files.
# NOTE(review): as written, these three resolve to exactly the same paths as
# cis_indep_file, cis_map_file and cis_exp_file above (same directories, same
# set_name). Confirm whether the exp ~ meth run should use a different
# set_name or directory.
cis_exp_meth_indep_file <- file.path(
  results_dir,
  paste0(set_name, ".cis.indep.csv")
)
cis_exp_meth_file <- file.path(tensorqtl_dir, paste0(set_name, ".cis.map.csv"))
cis_exp_meth_raw_file <- file.path(
  tensorqtl_dir,
  paste0(set_name, ".cis_qtl_pairs.")
)


## set mQTL files

In [None]:
# Re-point the shared naming variables at the mQTL run. These overwrite any
# values assigned earlier in the session.
varianttype_caller <- "SV_harmonized_SNV"
version <- "Aug_2024"
cohort <- "nabec"

In [None]:
# ---- Inputs for the methylation side (mQTL) ----

# Naming. From here on set_name refers to the mQTL run.
# When using both SV and SNV, use "SNV_SV_{caller}" as the variant set.
modality <- "METH"
# cohort_build <- paste(cohort, version, target, sep = ".")
set_name <- paste(cohort, version, meth, varianttype_caller, sep = "_")
# NOTE(review): this is still built from `target` rather than `meth`;
# confirm that is intended for the methylation run.
cohort_version_target <- paste(cohort, version, target, sep = "_")

# Directories
in_dir <- "/data/CARDPB/data/NABEC/projects/QTL_paper_2024/SV-mQTL"
geno_dir <- file.path(in_dir, "genotypes", varianttype_caller)
quants_dir <- file.path(in_dir, "expression")
info_dir <- file.path(in_dir, "sample_info")
tensorqtl_dir <- file.path(
  "/data/CARDPB/data/NABEC/projects/QTL_paper_2024/newSV-mQTL/tenosorqtl",
  set_name
)
results_dir <- file.path(
  "/data/CARDPB/data/NABEC/projects/QTL_paper_2024/newSV-mQTL/results",
  set_name
)

# Files. cis_meth_file is only a prefix: the chromosome name and ".parquet"
# are appended where the per-chromosome files are read.
used_samples_list_file <- file.path(info_dir, paste0(set_name, ".samples"))
cis_meth_indep_file <- file.path(
  results_dir,
  paste0(set_name, ".cis.indep.csv")
)
cis_meth_map_file <- file.path(tensorqtl_dir, paste0(set_name, ".cis.map.csv"))
cis_meth_file <- file.path(tensorqtl_dir, paste0(set_name, ".cis_qtl_pairs."))

In [9]:
# Load the three cis-map summary tables (comma-separated, first line is the
# header): mQTL, eQTL, and expression ~ methylation.
# NOTE(review): cis_map_file and cis_exp_meth_file are built from the same
# directory and set_name where they are defined, so eQTL_data and
# exp_meth_data may be read from the same file. Confirm the intended inputs.
mQTL_data <- read.table(cis_meth_map_file, header = TRUE, sep = ",")
eQTL_data <- read.table(cis_map_file, header = TRUE, sep = ",")
exp_meth_data <- read.table(cis_exp_meth_file, header = TRUE, sep = ",")

## filter exp_data and gene ~ meth data by qval < 0.05

In [10]:
# Keep rows whose q-value is below 0.05 in the expression ~ methylation map
# and in the eQTL map.
filtered_exp_meth_data <- exp_meth_data[exp_meth_data[["qval"]] < 0.05, ]
filtered_exp_data <- eQTL_data[eQTL_data[["qval"]] < 0.05, ]

In [None]:
# Derive chromosome and position columns from the eQTL variant IDs.
# The "napu_" prefix is removed first; the remaining ID is split on "_" or
# ":" and the first two fields are stored as chr and position. Both columns
# are character vectors (position is not converted to a number).
filtered_exp_data$new_variant_id <- gsub(
  "napu_", "", filtered_exp_data$variant_id
)
# Split once and reuse. vapply() keeps the result a character vector even
# when there are zero rows, where sapply() would return an empty list.
variant_fields <- strsplit(filtered_exp_data$new_variant_id, "_|:")
filtered_exp_data$chr <- vapply(variant_fields, `[`, character(1), 1)
filtered_exp_data$position <- vapply(variant_fields, `[`, character(1), 2)

In [14]:
# Overlap between eGenes and significant gene ~ meth hits: keep the
# expression ~ methylation rows with qval < 0.05 whose phenotype is also one
# of the significant eQTL phenotypes.
filtered_exp_meth_data <- dplyr::filter(
  exp_meth_data,
  qval < 0.05,
  phenotype_id %in% filtered_exp_data$phenotype_id
)

In [None]:
# Derive the chromosome column for the gene ~ meth hits.
# The "napu_" prefix is removed and the first field of the remaining ID is
# taken as the chromosome; it is later pasted into per-chromosome parquet
# file names.
filtered_exp_meth_data$new_variant_id <- gsub(
  "napu_", "", filtered_exp_meth_data$variant_id
)
# Split on "_" or ":" (the same delimiters used for the eQTL variant IDs) so
# that colon-delimited IDs also yield a bare chromosome name. vapply() keeps
# the result a character vector even when there are zero rows.
filtered_exp_meth_data$chr <- vapply(
  strsplit(filtered_exp_meth_data$new_variant_id, "_|:"),
  `[`,
  character(1),
  1
)

In [17]:
# Extract the eGenes that also have a significant gene ~ meth hit

In [18]:
# Phenotype IDs of the significant eQTL rows (the eGenes).
eGenes <- filtered_exp_data[["phenotype_id"]]

In [19]:
# Gene / chromosome pairs to iterate over: gene ~ meth hits whose phenotype
# is an eGene, reduced to the two columns needed downstream.
target_data <- filtered_exp_meth_data[
  is.element(filtered_exp_meth_data$phenotype_id, eGenes),
  c("phenotype_id", "chr")
]

## Run colocalization between each common eGene and every methylation region associated with it (gene ~ meth pval_nominal < 0.05)

In [None]:
# ---- Pairwise eQTL / mQTL colocalization ----
# For every gene in target_data:
#   1. take the methylation regions associated with that gene in the
#      expression ~ methylation nominal results (pval_nominal < 0.05);
#   2. run coloc.abf() between the gene's eQTL nominal statistics and each
#      region's mQTL nominal statistics;
#   3. collect the posterior probabilities PP.H0 .. PP.H4.
# The combined table is stored in coloc_results and written to CSV.
# In coloc_results, phenotype_id holds the methylation region and variant_id
# holds the eQTL gene.

# Build the list coloc.abf() expects from a table of nominal cis-QTL pairs.
# Rows with a missing nominal p-value are dropped. P-values are used as the
# association signal and the `af` column is passed as MAF.
make_coloc_dataset <- function(qtl_pairs, sample_size, trait_type) {
  qtl_pairs <- qtl_pairs[!is.na(qtl_pairs$pval_nominal), ]
  list(
    snp = qtl_pairs$variant_id,
    pvalues = qtl_pairs$pval_nominal,
    MAF = qtl_pairs$af,
    N = sample_size,
    type = trait_type
  )
}

# Run coloc for every row of `targets` (columns: phenotype_id, chr) and
# return one row per (methylation region, gene) pair.
# Reads the notebook-level settings cis_exp_file, cis_meth_file,
# cis_exp_meth_raw_file, eQTL_samplesize, mQTL_samplesize and type.
run_pairwise_coloc <- function(targets) {
  # Empty, typed template so the result has the same columns when no pair
  # could be tested.
  empty_results <- data.frame(
    phenotype_id = character(),
    variant_id = character(),
    PP.H0 = numeric(),
    PP.H1 = numeric(),
    PP.H2 = numeric(),
    PP.H3 = numeric(),
    PP.H4 = numeric(),
    stringsAsFactors = FALSE
  )
  result_rows <- list()

  # The per-chromosome parquet files are re-read only when the chromosome
  # changes between consecutive rows.
  loaded_chr <- NULL
  temp_exp <- NULL
  temp_meth <- NULL
  temp_cis_exp_meth <- NULL

  for (i in seq_len(nrow(targets))) {
    chr <- targets$chr[i]
    eQTL_gene <- targets$phenotype_id[i]
    message("eGene: ", eQTL_gene, " (", chr, ")")

    if (!identical(chr, loaded_chr)) {
      temp_exp <- arrow::read_parquet(paste0(cis_exp_file, chr, ".parquet"))
      temp_meth <- arrow::read_parquet(paste0(cis_meth_file, chr, ".parquet"))
      temp_cis_exp_meth <- arrow::read_parquet(
        paste0(cis_exp_meth_raw_file, chr, ".parquet")
      )
      loaded_chr <- chr
    }

    # which() drops comparisons that evaluate to NA instead of turning them
    # into NA entries.
    region_rows <- which(
      temp_cis_exp_meth$phenotype_id == eQTL_gene &
        temp_cis_exp_meth$pval_nominal < 0.05
    )
    target_regions <- temp_cis_exp_meth$variant_id[region_rows]

    exp_dataset <- make_coloc_dataset(
      temp_exp[which(temp_exp$phenotype_id == eQTL_gene), ],
      eQTL_samplesize,
      type
    )
    if (length(exp_dataset$snp) == 0) {
      warning(
        "No eQTL statistics for ", eQTL_gene, "; gene skipped.",
        call. = FALSE
      )
      next
    }

    for (current_phenotype_id in target_regions) {
      message("  methylation region: ", current_phenotype_id)

      meth_dataset <- make_coloc_dataset(
        temp_meth[which(temp_meth$phenotype_id == current_phenotype_id), ],
        mQTL_samplesize,
        type
      )
      # coloc.abf() stops when the two datasets share no variants.
      if (length(intersect(exp_dataset$snp, meth_dataset$snp)) == 0) {
        message("  no shared variants; pair skipped")
        next
      }

      # A failure for one pair is reported and skipped so that the results
      # already collected are not lost.
      result <- tryCatch(
        coloc.abf(dataset1 = exp_dataset, dataset2 = meth_dataset),
        error = function(e) {
          warning(
            "coloc.abf failed for ", eQTL_gene, " / ", current_phenotype_id,
            ": ", conditionMessage(e),
            call. = FALSE
          )
          NULL
        }
      )
      if (is.null(result)) {
        next
      }

      posterior <- result$summary
      print(posterior)

      result_rows[[length(result_rows) + 1]] <- data.frame(
        phenotype_id = current_phenotype_id,
        variant_id = eQTL_gene,
        PP.H0 = unname(posterior["PP.H0.abf"]),
        PP.H1 = unname(posterior["PP.H1.abf"]),
        PP.H2 = unname(posterior["PP.H2.abf"]),
        PP.H3 = unname(posterior["PP.H3.abf"]),
        PP.H4 = unname(posterior["PP.H4.abf"]),
        stringsAsFactors = FALSE
      )
    }
  }

  # Single rbind at the end instead of growing a data frame inside the loop.
  do.call(rbind, c(list(empty_results), result_rows))
}

coloc_results <- run_pairwise_coloc(target_data)

# Save the combined results; the methylation feature set, cohort and variant
# set are part of the file name.
write.csv(
  coloc_results,
  paste0(
    "coloc_expression_", meth, "_", cohort, "_", varianttype_caller,
    "_results_Nov20_pvalues.csv"
  ),
  row.names = FALSE
)

In [5]:
## Count the eGenes that have a significant gene ~ meth hit whose methylation site is also a significant mQTL phenotype

In [None]:
# Rebuild the three qval < 0.05 subsets: mQTL, expression ~ methylation,
# and eQTL.
filtered_meth_data <- mQTL_data[with(mQTL_data, qval < 0.05), ]
filtered_exp_meth_data <- exp_meth_data[with(exp_meth_data, qval < 0.05), ]
filtered_exp_data <- eQTL_data[with(eQTL_data, qval < 0.05), ]

In [None]:
# Load necessary libraries
library(arrow)
library(dplyr)

# Count the rows of `targets` (columns: phenotype_id, chr) for which at least
# one methylation site passing the gene's p-value threshold in the
# expression ~ methylation nominal results is also a phenotype in
# filtered_meth_data.
# Reads the notebook-level objects cis_exp_meth_raw_file,
# filtered_exp_meth_data and filtered_meth_data.
count_egenes_with_mqtl_site <- function(targets) {
  # NA phenotype IDs (e.g. from rows created by NA comparisons) must not
  # count as matches.
  mqtl_phenotypes <- filtered_meth_data$phenotype_id
  mqtl_phenotypes <- mqtl_phenotypes[!is.na(mqtl_phenotypes)]

  n_genes <- 0
  # Only the expression ~ methylation pairs are needed here; the file is
  # re-read only when the chromosome changes between consecutive rows.
  loaded_chr <- NULL
  temp_cis_exp_meth <- NULL

  for (i in seq_len(nrow(targets))) {
    chr <- targets$chr[i]
    eQTL_gene <- targets$phenotype_id[i]

    print(eQTL_gene)
    print(chr)

    if (!identical(chr, loaded_chr)) {
      temp_cis_exp_meth <- arrow::read_parquet(
        paste0(cis_exp_meth_raw_file, chr, ".parquet")
      )
      loaded_chr <- chr
    }

    # Fetch the p-value threshold for this gene.
    # NOTE(review): this uses the gene's pval_nominal from the significant
    # gene ~ meth table as the cut-off; confirm that a dedicated threshold
    # column was not intended.
    pval_threshold <- filtered_exp_meth_data %>%
      filter(phenotype_id == eQTL_gene) %>%
      pull(pval_nominal)

    if (length(pval_threshold) == 0) {
      next
    }
    if (length(pval_threshold) > 1) {
      stop(
        "Expected a single p-value threshold for ", eQTL_gene,
        " but found ", length(pval_threshold), ".",
        call. = FALSE
      )
    }

    # Extract significant methylation sites for this gene
    extract_sig_meth_cites <- temp_cis_exp_meth %>%
      filter(phenotype_id == eQTL_gene, pval_nominal <= pval_threshold) %>%
      pull(variant_id)

    print(extract_sig_meth_cites)  # Optional debug print

    # The gene is counted once, however many of its sites are mQTL hits.
    if (any(extract_sig_meth_cites %in% mqtl_phenotypes)) {
      n_genes <- n_genes + 1
    }
  }

  n_genes
}

gene_count <- count_egenes_with_mqtl_site(target_data)

# After calculating gene_count, write it to a text file
output_file <- paste0(set_name, "_gene_count.txt")
write(gene_count, file = output_file)

## check if the colocalized eQTL mQTL pairs shared the likely effector variant

In [10]:
# Significant (qval < 0.05) subsets of the eQTL, expression ~ methylation
# and mQTL maps, rebuilt for the shared-variant check.
filtered_exp_data <- eQTL_data[eQTL_data[["qval"]] < 0.05, ]
filtered_meth_data <- mQTL_data[mQTL_data[["qval"]] < 0.05, ]
filtered_exp_meth_data <- exp_meth_data[exp_meth_data[["qval"]] < 0.05, ]

In [None]:
# Keep the pairs whose posterior probability PP.H4 exceeds 0.6.
coloc_results_read <- dplyr::filter(coloc_results, PP.H4 > 0.6)

In [12]:
# ---- CAVIAR fine-mapping results for the eQTL and mQTL runs ----

# The eQTL CAVIAR file name depends on the cohort: each cohort has its own
# run version, and only the nabec run carries the "_prun" suffix.
if (cohort == "nabec") {
  coloc_version <- "July_2024"
  RNA_varianttype_caller <- paste0("rna_TPM_", varianttype_caller, "_prun")
} else if (cohort == "hbcc") {
  coloc_version <- "Nov_2024"
  RNA_varianttype_caller <- paste0("rna_TPM_", varianttype_caller)
} else {
  # Fail early instead of using undefined or stale values below.
  stop("Unsupported cohort for CAVIAR lookup: ", cohort, call. = FALSE)
}

# NOTE(review): both result files are read from the newSV-eQTL analysis
# directory; confirm the mQTL CAVIAR results are stored there as well.
caviar_dir <- "/data/CARDPB/data/NABEC/projects/QTL_paper_2024/newSV-eQTL/analysis/CAVIAR/"
caviar_eqtl_file <- paste0(
  caviar_dir, cohort, "_", coloc_version, "_", RNA_varianttype_caller,
  "_results.csv"
)
caviar_mqtl_file <- paste0(
  caviar_dir, cohort, "_", version, "_", meth, "_", varianttype_caller,
  "_results.csv"
)

# Original variable names, kept so other cells keep working. The names are
# crossed: mQTL_coloc holds the eQTL (RNA) file and eQTL_coloc holds the
# mQTL (methylation) file.
mQTL_coloc <- caviar_eqtl_file
eQTL_coloc <- caviar_mqtl_file

CAVIAR_eQTL <- read.csv(caviar_eqtl_file)
CAVIAR_mQTL <- read.csv(caviar_mqtl_file)

# Row indices where the top SV's causal posterior probability is at least
# that of the top SNV. which() drops comparisons that evaluate to NA, which
# would otherwise create all-NA rows.
sv_led_rows <- function(caviar) {
  which(
    caviar$TOP.SV.Causal.Post.Probablity >=
      caviar$TOP.SNV.Causal.Post.Probablity
  )
}

SV_CAVIAR_eQTL <- CAVIAR_eQTL[sv_led_rows(CAVIAR_eQTL), ]
SV_CAVIAR_mQTL <- CAVIAR_mQTL[sv_led_rows(CAVIAR_mQTL), ]

In [None]:
# ---- Do colocalized eQTL / mQTL pairs share the same top SV? ----

# For every colocalized pair, look up the top SV reported by CAVIAR for the
# eQTL gene and for the methylation region, and keep the pairs where the two
# share at least one non-missing SV ID.
# `coloc_hits` carries the methylation region in phenotype_id and the eQTL
# gene in variant_id. Returns a list with one element per matching pair
# (mQTL_hit, eQTL_hit, common_variants).
find_common_sv_pairs <- function(coloc_hits, eqtl_caviar, mqtl_caviar) {
  shared <- list()

  for (i in seq_len(nrow(coloc_hits))) {
    mQTL_hit <- coloc_hits$phenotype_id[i]
    eQTL_hit <- coloc_hits$variant_id[i]

    # which() ignores rows whose phenotype_id is NA.
    eQTL_variant <- eqtl_caviar$TOP.SV.ID[
      which(eqtl_caviar$phenotype_id == eQTL_hit)
    ]
    mQTL_variant <- mqtl_caviar$TOP.SV.ID[
      which(mqtl_caviar$phenotype_id == mQTL_hit)
    ]

    # An empty or all-NA lookup on either side leaves nothing here, so no
    # separate skip branch is needed.
    common_variants <- intersect(eQTL_variant, mQTL_variant)
    common_variants <- common_variants[!is.na(common_variants)]

    if (length(common_variants) > 0) {
      shared[[length(shared) + 1]] <- list(
        mQTL_hit = mQTL_hit,
        eQTL_hit = eQTL_hit,
        common_variants = common_variants
      )
    }
  }

  shared
}

common_pairs <- find_common_sv_pairs(
  coloc_results_read, SV_CAVIAR_eQTL, SV_CAVIAR_mQTL
)
common_count <- length(common_pairs)

# Print the total number of pairs with common variants
print(paste("Number of pairs with common variants:", common_count))

# Convert common pairs to a data frame for easier viewing; an empty result
# is still a data frame with the expected columns.
if (length(common_pairs) > 0) {
  common_pairs_df <- do.call(
    rbind,
    lapply(common_pairs, as.data.frame, stringsAsFactors = FALSE)
  )
} else {
  common_pairs_df <- data.frame(
    mQTL_hit = character(),
    eQTL_hit = character(),
    common_variants = character(),
    stringsAsFactors = FALSE
  )
}
print(common_pairs_df)
