## Latent trait GWAS

Implementation of GenomicSEM to analyze cystatin C production and renal function as latent traits.

### Load prerequisites

In [None]:
# GenomicSEM is the only package this notebook loads; the munge(), ldsc(),
# usermodel(), sumstats() and userGWAS() calls below rely on it.
library(GenomicSEM)
# Start from the SEM analysis directory (absolute, cluster-specific path).
setwd("/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/PRS/sem")

### Prepare data

Loop through each super-population. We found that only the AFR, CSA and EUR super-populations had sufficient trait heritability to perform SEM analysis. The workflow follows the GenomicSEM tutorial available at https://rpubs.com/MichelNivard/565885. We used LD references generated in the UK Biobank population as part of the PanUKB project. The package requires a variant reference file, and as one was only provided for European populations, we generated a variant reference for each super-population using UKB data. This is summarized in the notebook 'Variant_reference.ipynb'.

In [None]:
# Prepare the GenomicSEM inputs for every super-population:
#   1. munge the cystatin C and creatinine GWAS summary statistics,
#   2. run LD score regression on the munged files,
#   3. fit the two-factor SEM without SNP effects as a sanity check,
#   4. build the SNP-level summary statistics used later by userGWAS().
# LDSCoutput.RData and sumstats.RData are written to each population folder.
populations <- c("AFR", "CSA", "EUR")


for (pop in populations) {
    print(pop)
    folder <- paste0("/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/PRS/", pop, "/")
    print(folder)
    # The munged "<trait>.sumstats.gz" files are referred to by bare file name
    # below (see `traits`), so all work happens inside the population folder.
    setwd(folder)
    flush.console()

    trait.names <- c("cystatin", "creatinine")

    # Delete munged files left over from a previous run, if they exist
    for (stale in paste0(folder, trait.names, ".sumstats.gz")) {
        if (file.exists(stale)) {
            file.remove(stale)
        }
    }

    # Variant reference used for munging.
    # NOTE(review): this file uses the "snps0.01" reference while sumstats()
    # below uses "snps0.005" -- confirm the difference is intentional.
    munge_ref <- paste0("/mnt/grid/janowitz/home/references/maf/panukb_snps0.01_", pop, "_dedup.tsv")

    #Munge summary statistics
    munge(paste0(folder, "cystatin_summary.tsv"),
          munge_ref,
          trait.names = "cystatin",
          info.filter = 0.8,
          maf.filter = 0.00)

    munge(paste0(folder, "creatinine_summary.tsv"),
          munge_ref,
          trait.names = "creatinine",
          info.filter = 0.8,
          maf.filter = 0.00)

    flush.console()

    #Perform LD score regression
    traits <- c("cystatin.sumstats.gz", "creatinine.sumstats.gz")
    # Prevalences are NA for both traits (no case/control prevalence supplied)
    sample.prev <- c(NA, NA)
    population.prev <- c(NA, NA)
    # The same population-specific folder is used for LD scores and weights
    ld <- paste0("/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/UKBB.ALL.ldscore/", pop)
    wld <- ld

    print(ld)
    flush.console()

    # NOTE(review): chr=1 -- presumably the LD scores for each population are
    # stored as a single file rather than one per chromosome; confirm.
    LDSCoutput <- ldsc(traits,
                       sample.prev,
                       population.prev,
                       ld,
                       wld,
                       trait.names,
                       chr = 1)

    # Overwrite elements 1 and 4 of the I component with 1.
    # NOTE(review): with two traits these are presumably the diagonal entries
    # (univariate LDSC intercepts) of a 2x2 matrix -- confirm.
    LDSCoutput$I[1] <- 1
    LDSCoutput$I[4] <- 1

    flush.console()

    # The object name matters: the SNP-model script load()s it as LDSCoutput
    save(LDSCoutput, file = paste0(folder, "LDSCoutput.RData"))

    #Define SEM model
    model<-'C=~NA*cystatin
        NC=~NA*creatinine +NA*cystatin


        C~~1*C
        NC~~1*NC
        C~~ 0*NC

        cystatin~~0*cystatin
        cystatin~~0*creatinine
        creatinine~~0*creatinine
    '

    #Estimate model without SNP data at this stage, to confirm model parameters seem reasonable
    output <- usermodel(LDSCoutput, estimation = "DWLS", model = model)
    print(output)

    flush.console()

    #Import variant reference file
    files <- c(paste0(folder, "cystatin_summary.tsv"), paste0(folder, "creatinine_summary.tsv"))
    sumstats_ref <- paste0("/mnt/grid/janowitz/home/references/maf/panukb_snps0.005_", pop, "_dedup.tsv")
    # TRUE/FALSE rather than T/F, which are ordinary (reassignable) variables
    se.logit <- c(FALSE, FALSE)


    p_sumstats <- sumstats(files, sumstats_ref, trait.names, se.logit, info.filter = 0.8, maf.filter = 0.00, OLS = c(TRUE, TRUE), linprob = NULL, prop = NULL, parallel = TRUE, cores = 16)
    # Drop exact duplicate rows before saving
    p_sumstats <- unique(p_sumstats)
    # The object name matters: the SNP-model script load()s it as p_sumstats
    save(p_sumstats, file = paste0(folder, "sumstats.RData"))
    flush.console()
}



#### Run SNP model

In [None]:
#!/usr/bin/env Rscript
# Fit the SNP-level two-factor SEM for one chromosome of one super-population.
#
# Usage: Rscript sem.R <chromosome> <population>
#   <chromosome>  numeric value matched against the CHR column of p_sumstats
#   <population>  super-population folder name (e.g. "EUR")
#
# Reads sumstats.RData (object p_sumstats) and LDSCoutput.RData (object
# LDSCoutput) from the population folder and writes the userGWAS() result
# (object GWAS_final) to sem_output/_chr<chromosome>.RData.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
    stop("Usage: sem.R <chromosome> <population>", call. = FALSE)
}
index <- suppressWarnings(as.numeric(args[1]))
pop <- as.character(args[2])
if (is.na(index)) {
    stop("Chromosome must be numeric, got: ", args[1], call. = FALSE)
}

library(GenomicSEM)

# The preparation step saves both .RData inputs under PRS/<pop>/ and the merge
# step reads PRS/<pop>/sem_output, so this script must run from PRS/<pop>.
setwd(paste0("/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/PRS/", pop))


load(file = "sumstats.RData")
load(file = "LDSCoutput.RData")


# model with SNP

model<-'C=~NA*cystatin
        NC=~NA*creatinine +NA*cystatin

        C~SNP
        NC~SNP

        C~~1*C
        NC~~1*NC
        C~~ 0*NC

        cystatin~~0*cystatin
        cystatin~~0*creatinine
        creatinine~~0*creatinine
        SNP~~SNP'

# Only SNPs on the requested chromosome are modelled; `sub` names the two SNP
# regressions whose results the merge step reads as GWAS_final[[1]] and [[2]].
GWAS_final <- userGWAS(covstruc = LDSCoutput, SNPs = subset(p_sumstats, CHR == index), estimation = "DWLS", model = model, parallel = FALSE, sub = c("C~SNP", "NC~SNP"))

# Make sure the output folder exists before saving (no-op when it already does)
dir.create("sem_output", showWarnings = FALSE)
filename <- paste0("sem_output/", "_chr", index, ".RData")
save(GWAS_final, file = filename)

This script was called using the following bash script:

In [None]:
%%bash
# Run the SNP-model script for one chromosome of one super-population:
# the first argument is the chromosome, the second the population name.
# NOTE(review): $chr and $population are not set in this cell -- presumably
# they are supplied by the job scheduler or a calling loop; confirm.

Rscript --no-save --slave /mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/sem.R $chr $population

### Post-processing

SNP modelling was run in parallel across chromosomes. After completion, the following script is used to merge the output files.

In [None]:
# Merge the per-chromosome userGWAS() results of one super-population into two
# tables (one per latent factor), keep the core columns, drop SNPs without a
# p-value and write both tables as tab-separated files.
pop <- "EUR"
setwd(paste0("/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/PRS/", pop, "/sem_output"))

# list.files() expects a regular expression, not a shell glob
files <- list.files(pattern = "\\.RData$")
print(files)
if (length(files) == 0) {
    stop("No .RData files found in ", getwd(), call. = FALSE)
}

# Load each file into its own environment and keep the first two elements of
# GWAS_final (one result table per latent factor).
per_file <- lapply(files, function(path) {
    print(path)
    flush.console()
    chunk <- new.env()
    load(path, envir = chunk)
    if (!exists("GWAS_final", envir = chunk, inherits = FALSE)) {
        stop("Object GWAS_final not found in ", path, call. = FALSE)
    }
    chunk$GWAS_final[1:2]
})

# Bind once instead of growing a data frame inside the loop. rev() keeps the
# row order of the previous implementation, which prepended each new file.
latent1_full <- do.call(rbind, rev(lapply(per_file, `[[`, 1)))
latent2_full <- do.call(rbind, rev(lapply(per_file, `[[`, 2)))
rm(per_file)

# Keep columns 1-6 and 12-15 (selected by position) and drop rows whose
# Pval_Estimate is missing; prints the row count before and after filtering.
# NOTE(review): the positions presumably cover the SNP annotation columns plus
# est, SE, Z_Estimate and Pval_Estimate -- confirm against the userGWAS output.
keep_cols <- c(1, 2, 3, 4, 5, 6, 12, 13, 14, 15)
tidy_latent <- function(res) {
    print(nrow(res))
    res <- res[, keep_cols]
    res <- subset(res, !is.na(Pval_Estimate))
    print(nrow(res))
    res
}

latent1_full <- tidy_latent(latent1_full)
latent2_full <- tidy_latent(latent2_full)

# Output folder follows `pop` instead of a hard-coded "EUR"
out_dir <- paste0("/mnt/grid/ukbiobank/data/Application58510/skleeman/gwas_cystatinc/", pop, "/")
write.table(latent1_full, paste0(out_dir, "summ_SEM_cystatin.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")
write.table(latent2_full, paste0(out_dir, "summ_SEM_creatinine.tsv"), row.names = FALSE, quote = FALSE, sep = "\t")

### Estimation of effective sample size

As per the GenomicSEM wiki page.

In [None]:
# Keep only SNPs whose MAF lies between 10% and 40% (inclusive)
maf_in_range <- which(latent1_full$MAF >= .1 & latent1_full$MAF <= .4)
latent1_calc <- latent1_full[maf_in_range, ]
# Effective N: mean over the retained SNPs of (Z / est)^2 / (2 * MAF * (1 - MAF))
Effective_N <- with(
    latent1_calc,
    mean(((Z_Estimate / est)^2) / (2 * MAF * (1 - MAF)))
)
Effective_N