# Generate log2CPM for CMC data

In [1]:
library(tidyverse)
library(synapser)

-- Attaching packages --------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.0.6     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1

-- Conflicts ------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


TERMS OF USE NOTICE:
  When using Synapse, remember that the terms and conditions of use require that you:
  1) Attribute data contributors when discussing these data or results from these data.
  2)

In [2]:
# Log in to Synapse before any synGet() download below.
# No credentials are passed here, so synLogin() has to locate them itself
# (presumably cached credentials or a config file -- confirm for your setup).
synLogin()

Welcome, kj.benjamin!

NULL

## Phenotypes

In [3]:
# Build the sample-level metadata table `md`:
#   clinical metadata + RNASeq QC metadata + genotype MDS components.

# Download clinical metadata
CLINICAL_ID <- "syn3354385"
clinical <- data.table::fread(synGet(CLINICAL_ID, version = 4)$path)

# Download RNASeq metadata
METADATA_QC_DLPFC_ID <- "syn18358379"
metadata <- data.table::fread(synGet(METADATA_QC_DLPFC_ID, version = 3)$path)

# Join clinical and RNASeq metadata. The right join keeps every row of the
# RNASeq metadata; diagnosis labels are then collapsed and only Control / SCZ
# rows with Sex XX or XY are retained.
md <- right_join(clinical, metadata,
                 by = c("Individual ID" = "Individual_ID")) %>%
  mutate(Dx = fct_recode(Dx, AFF_BP = "BP", AFF_BP = "AFF",
                         Other = "undetermined",
                         Control = "Control", SCZ = "SCZ")) %>%
  filter(Dx %in% c("Control", "SCZ"), Sex %in% c("XX", "XY"))

# Compute read pair metrics and add Institution-Dx variable
md <- md %>%
  mutate(MappedRead_Pairs = Mapped_Reads / 2) %>%
  mutate(`Institution-Dx` = paste0(`Institution`, "-", `Dx`)) %>%
  mutate(TotalRead_Pairs = Total_Reads / 2)

# Add MDS from SNPs
mds_file <- paste0("/ceph/users/jbenja13/projects/sex_sz_ria/input/commonMind/",
                   "genotypes/mds/_m/CMC_MSSM-Penn-Pitt_DLPFC_QC.mds")
mds <- data.table::fread(mds_file)
# Rename only the component columns (C1, C2, ...) to snpPC1, snpPC2, ...
# The pattern is anchored so that a "C" occurring inside any other column name
# is left untouched (a bare gsub("C", ...) would rewrite every "C" it finds).
# Assumes the components are named C<number> -- TODO confirm against the file.
colnames(mds) <- sub("^C([0-9]+)$", "snpPC\\1", colnames(mds))

pheno_file <- paste0("/ceph/users/jbenja13/projects/sex_sz_ria/input/commonMind/",
                     "phenotypes/combine_files/_m/CMC_phenotypes_all.csv")
pheno <- read.csv(pheno_file, stringsAsFactors = FALSE)
# Link genotype sample IDs (IID in the MDS table) to individuals
genotypes <- merge(pheno, mds,
                   by.x = "Genotypes.Genotyping_Sample_ID", by.y = "IID")

# Keep one identifier plus the snpPC columns, named to match `md`
genotypes <- genotypes %>%
  dplyr::select("Individual_ID", starts_with("snpPC")) %>%
  rename(`Individual ID` = Individual_ID)

# Attach the components to the metadata and drop exact duplicate rows
md <- md %>%
  left_join(genotypes, by = "Individual ID") %>%
  distinct()

dim(md)

In [4]:
# Extract the phenotype columns used downstream.
phenotypes <- select(md, SampleID, `Individual ID`, Institution,
                     `Reported Gender`, Dx, `Age of Death`)

# The literal age label "90+" is written out as "90"; other values pass through.
phenotypes <- mutate(
  phenotypes,
  `Age of Death` = ifelse(`Age of Death` == "90+", "90", `Age of Death`)
)

# Replace spaces in the column names with underscores before writing.
colnames(phenotypes) <- gsub(" ", "_", colnames(phenotypes), fixed = TRUE)

data.table::fwrite(phenotypes, file = "cmc_phenotypes.csv", sep = ",")
head(phenotypes, 2)

SampleID,Individual_ID,Institution,Reported_Gender,Dx,Age_of_Death
<chr>,<chr>,<chr>,<chr>,<fct>,<chr>
MSSM_RNA_PFC_155,CMC_MSSM_087,MSSM,Female,Control,90
MSSM_RNA_PFC_280,CMC_MSSM_226,MSSM,Female,Control,90


## Gene expression (counts)

### Combined counts

In [5]:
# Build the combined gene x sample count table `NEW.COUNTS` (MSSM + HBCC),
# restricted to the samples present in `md`.

# Non-sample annotation column carried by the count tables. fread() keeps the
# raw header `transcript_id(s)` (as used for the gene-length table below),
# whereas a data.frame()/read.table() round trip turns it into
# `transcript_id.s.`. Both spellings are dropped so that no character column
# can leak into the count matrix and coerce it to character.
TRANSCRIPT_COLS <- c("transcript_id.s.", "transcript_id(s)")

# Download counts (DLPFC - MSSM)
# The file version is pinned explicitly in the synGet() call.
COUNT_ID <- "syn17346208"
count <- data.table::fread(synGet(COUNT_ID, version = 2)$path) %>%
  dplyr::select(-any_of(TRANSCRIPT_COLS))

# Download gene lengths (DLPFC - MSSM) and summarise each gene by the median
# length across samples
genelen_CMC <- data.table::fread(synGet("syn17346397", version = 2)$path) %>%
  as_tibble() %>%
  pivot_longer(cols = -c(gene_id, `transcript_id(s)`),
               names_to = "sampleID", values_to = "Length") %>%
  group_by(gene_id) %>%
  summarise(Length = median(Length, na.rm = TRUE)) %>%
  ungroup() %>%
  data.frame()

# Download counts (DLPFC - HBCC)
COUNT_ID <- "syn17894685"
count_HBCC <- data.table::fread(synGet(COUNT_ID, version = 4)$path) %>%
  dplyr::select(-any_of(TRANSCRIPT_COLS))

# Join HBCC and MSSM counts on gene_id (full join: genes present in either
# table are kept), move gene_id into the row names and convert to a matrix.
joined_counts <- full_join(count, count_HBCC, by = "gene_id") %>%
  column_to_rownames(var = "gene_id") %>%
  as.matrix()

# Keep only the sample columns listed in md$SampleID, in their original order.
# This replaces the transpose -> filter rows -> transpose round trip with a
# direct column subset of the same matrix.
NEW.COUNTS <- as.data.frame(
  joined_counts[, colnames(joined_counts) %in% md$SampleID, drop = FALSE]
)
NEW.COUNTS[1:2, 1:5]

Unnamed: 0_level_0,MSSM_RNA_PFC_1,MSSM_RNA_PFC_2,MSSM_RNA_PFC_3,MSSM_RNA_PFC_4,MSSM_RNA_PFC_5
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000000003.14,124,103,160,366,268
ENSG00000000005.5,1,0,1,4,2


In [6]:
# Number of genes (rows) and samples (columns) in the combined count table
dim(NEW.COUNTS)

### CPM transformation and save

In [38]:
# Transform the counts with edgeR::cpm(log = TRUE), move the gene IDs from the
# row names into a leading "Geneid" column, and write a tab-separated file.
NEW.COUNTS %>%
  edgeR::cpm(log = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column(var = "Geneid") %>%
  data.table::fwrite(file = "cmc_log2cpm.tsv", sep = "\t")

## Gene annotation

In [7]:
# Background genes: every gene in the count table. Each ID is split at the
# dot into `ensembl_gene_id` and `position`, while the full original ID is
# kept in `gene_id`.
backgroundGenes <- data.frame(gene_id = rownames(NEW.COUNTS)) %>%
  separate(gene_id, into = c("ensembl_gene_id", "position"),
           sep = "\\.", remove = FALSE)

# Connect to the Ensembl BioMart human gene dataset.
# NOTE(review): the original comment labelled this host "Ensembl Release 99
# (January 2020)", but the host name is a mirror rather than a dated archive --
# confirm which release it actually serves.
mart <- biomaRt::useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  host = "uswest.ensembl.org",
  dataset = "hsapiens_gene_ensembl"
)

# Look up symbol, GC content, biotype and chromosome for each background gene
Ensemble2HGNC <- biomaRt::getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol",
                 "percentage_gene_gc_content", "gene_biotype",
                 "chromosome_name"),
  filters = "ensembl_gene_id",
  values = backgroundGenes$ensembl_gene_id,
  mart = mart
)

In [8]:
# Write the gene annotation table: background genes that BioMart returned,
# without the GC-content, biotype and `position` columns.
inner_join(backgroundGenes, Ensemble2HGNC, by = "ensembl_gene_id") %>%
  select(-percentage_gene_gc_content, -gene_biotype, -position) %>%
  data.table::fwrite(file = "cmc_gene_annotation.tsv", sep = "\t")