In [1]:
.libPaths("/home/groups/candes/Software/miniconda2/envs/ukb/lib/R/library")
suppressMessages(library(tidyverse))
suppressMessages(library(cowplot))
suppressMessages(library(kableExtra))

source("../utils/utils_clumping.R")
source("utils_manhattan.R")

# Root directory under which the "discoveries/" result files are looked up
scratch <- "/scratch/PI/candes/ukbiobank_tmp"

# Fold label substituted into the small-GWAS file names
fold <- "01"
# Resolution label substituted into the knockoffs file names
resolution <- "Radj2"
# Phenotypes whose discovery files are loaded
phenotype.list <- c("platelet")

In [2]:
# Look up the reference discovery that matches one candidate region.
#
# A reference row matches when it lies on chromosome `chr` and its
# [BP.min, BP.max] interval comes within `gap` base pairs of the candidate
# interval [bp.min, bp.max].
#
# Arguments:
#   chr, bp.min, bp.max  Chromosome and base-pair bounds of the candidate
#                        region (scalars).
#   Large.gwas           Data frame of reference discoveries with columns
#                        CHR, BP.min, BP.max, Importance and SNP.lead.
#   gap                  Tolerance in base pairs applied on both sides
#                        (default 1e5, the value that used to be hard-coded).
#
# Returns:
#   SNP.lead of the matching row with the largest Importance (the first such
#   row on ties), or NA when no reference row matches.
verify_discovery <- function(chr, bp.min, bp.max, Large.gwas, gap = 1e5) {
    # Columns are read with `$` rather than through data masking, so a
    # reference column that happens to be called `chr`, `bp.min`, `bp.max`
    # or `gap` cannot shadow the function arguments.
    overlaps <- Large.gwas$CHR == chr &
        Large.gwas$BP.min <= bp.max + gap &
        Large.gwas$BP.max >= bp.min - gap
    # which() discards comparisons that evaluate to NA
    matched <- which(overlaps)
    if (length(matched) == 0) {
        return(NA)
    }
    # Ascending order of the negated key: largest Importance first, NA last,
    # ties kept in their original row order.
    ranking <- order(-xtfrm(Large.gwas$Importance[matched]))
    Large.gwas$SNP.lead[matched[ranking[1]]]
}

# Apply verify_discovery() to each candidate region in turn.
#
# Arguments:
#   chr, bp.min, bp.max  Parallel vectors describing the candidate regions.
#   Large.gwas           Reference discoveries, passed on unchanged.
#   ...                  Further arguments forwarded to verify_discovery().
#
# Returns:
#   A vector with one entry per candidate: the value returned by
#   verify_discovery() for that candidate (NA when nothing matched).
#   Empty input gives a zero-length logical vector.
verify_discoveries <- function(chr, bp.min, bp.max, Large.gwas, ...) {
    # 1:length(chr) would iterate over c(1, 0) for empty input and return two
    # spurious NA entries; return an empty result explicitly instead.
    if (length(chr) == 0) {
        return(logical(0))
    }
    unlist(lapply(seq_along(chr), function(j) {
        verify_discovery(chr[j], bp.min[j], bp.max[j], Large.gwas, ...)
    }))
}

## Load results from small GWAS

In [3]:
# For each phenotype, read whichever discovery tables exist for this fold
# (LMM, LMM with BH, knockoffs), tag every table with its method metadata and
# stack them. A missing file is reported and contributes an empty tibble.
# The per-phenotype tables are then stacked into a single table.
Discoveries <- do.call("rbind", lapply(phenotype.list, function(phenotype) {
    # LMM discoveries, tagged Significance="FWER"
    path.lmm <- sprintf("%s/discoveries/%s_lmm_regions_fold_%s.txt", scratch, phenotype, fold)
    found.lmm <- if(!file.exists(path.lmm)) {
        cat(sprintf("Discoveries for %s made with LMM are not available.\n", phenotype))
        tibble()
    } else {
        loaded <- read_delim(path.lmm, delim=" ", col_types=cols()) %>%
            mutate(Phenotype=phenotype, Method="LMM", Importance=-log10(P), Resolution="GWAS", Significance="FWER") %>%
            select(-c("P"))
        cat(sprintf("Found %d discoveries for %s made with LMM.\n", nrow(loaded), phenotype))
        loaded
    }

    # LMM discoveries with BH, tagged Significance="FDR"
    path.lmm.bh <- sprintf("%s/discoveries/%s_lmm_regions_fold_%s_BH.txt", scratch, phenotype, fold)
    found.lmm.bh <- if(!file.exists(path.lmm.bh)) {
        cat(sprintf("Discoveries for %s made with LMM and BH are not available.\n", phenotype))
        tibble()
    } else {
        loaded <- read_delim(path.lmm.bh, delim=" ", col_types=cols()) %>%
            mutate(Phenotype=phenotype, Method="LMM", Importance=-log10(P), Resolution="GWAS", Significance="FDR") %>%
            select(-c("P"))
        cat(sprintf("Found %d discoveries for %s made with LMM and BH.\n", nrow(loaded), phenotype))
        loaded
    }

    # Knockoffs discoveries at the configured resolution; W becomes Importance
    path.knockoffs <- sprintf("%s/discoveries/%s_knockoffs_%s_fold_%s.txt", scratch, phenotype, resolution, fold)
    found.knockoffs <- if(!file.exists(path.knockoffs)) {
        cat(sprintf("Discoveries for %s made with knockoffs at resolution %s are not available.\n", 
            phenotype, resolution))
        tibble()
    } else {
        loaded <- read_delim(path.knockoffs, delim=" ", col_types=cols()) %>%
            mutate(Phenotype=phenotype, Method="Knockoffs", Importance=W, Resolution=resolution, Significance="FDR") %>%
            select(-c("W", "Group"))
        cat(sprintf("Found %d discoveries for %s made with knockoffs at resolution %s.\n", 
                    nrow(loaded), phenotype, resolution))
        loaded
    }

    # Stack the three tables for this phenotype
    rbind(found.lmm, found.lmm.bh, found.knockoffs)
}))

Found 47 discoveries for platelet made with LMM.
Found 272 discoveries for platelet made with LMM and BH.
Found 81 discoveries for platelet made with knockoffs at resolution Radj2.


In [4]:
# Number of discoveries and mean region width (BP.width rescaled by 1e6)
# for each phenotype / method / significance / resolution combination
Discoveries %>% 
    group_by(Phenotype, Method, Significance, Resolution) %>%
    summarise(Discoveries=n(), BP.width=mean(BP.width)/1e6) %>%
    arrange(Phenotype, Method, Resolution)

# Consolidate discoveries within each of the same combinations
# NOTE(review): consolidate_clumps() is not defined in this notebook;
# presumably it comes from the sourced utils_clumping.R - confirm
Discoveries.clumped <- Discoveries %>% 
    group_by(Phenotype, Method, Significance, Resolution) %>%
    consolidate_clumps()

# Number of consolidated discoveries
# No group_by() here: this relies on Discoveries.clumped still being grouped
# NOTE(review): assumes consolidate_clumps() preserves the grouping - confirm
Discoveries.clumped %>% 
    summarise(Discoveries=n(), BP.width=mean(BP.width)/1e6) %>%
    arrange(Phenotype, Method, Resolution)

Phenotype,Method,Significance,Resolution,Discoveries,BP.width
platelet,Knockoffs,FDR,Radj2,81,0.3188757
platelet,LMM,FDR,GWAS,272,0.4333372
platelet,LMM,FWER,GWAS,47,0.6743524


Phenotype,Method,Significance,Resolution,Discoveries,BP.width
platelet,Knockoffs,FDR,Radj2,77,0.3365592
platelet,LMM,FDR,GWAS,237,0.4103156
platelet,LMM,FWER,GWAS,43,0.7034752


## Check overlap between LMM and knockoff discoveries

In [5]:
# Small-GWAS LMM discoveries with FWER control
Discoveries.lmm <- Discoveries %>% 
    group_by(Phenotype, Method, Significance, Resolution) %>%
    filter(Method=="LMM", Significance=="FWER", Resolution=="GWAS")

# Small-GWAS knockoff discoveries at the configured resolution
Discoveries.knockoffs <- Discoveries %>% 
    group_by(Phenotype, Method, Significance, Resolution) %>%
    filter(Method=="Knockoffs", Significance=="FDR", Resolution==resolution)

# Knockoff discoveries checked against the LMM discoveries.
# The data are grouped by Phenotype, so Phenotype[1] is the phenotype of the
# current group; restricting the reference table to it keeps a discovery from
# being matched to a region found for a different phenotype.
Discoveries.overlapping.1 <- Discoveries.knockoffs %>%
    mutate(Replicated=verify_discoveries(CHR, BP.min, BP.max,
        Discoveries.lmm[which(Discoveries.lmm$Phenotype==Phenotype[1]), ]))

# Replicated holds the matching lead SNP, or NA when nothing overlaps
Overlapping.table.1 <- Discoveries.overlapping.1 %>% 
    group_by(Phenotype, Method, Significance, Resolution) %>%
    summarise(BP.width=mean(BP.width)/1e6, Discoveries=n(),
              Overlapping=sum(!is.na(Replicated)), Distinct=sum(is.na(Replicated)))

Overlapping.table.1

# LMM discoveries checked against the knockoff discoveries, again restricted
# to the same phenotype (Discoveries.lmm is already grouped and filtered)
Discoveries.overlapping.2 <- Discoveries.lmm %>% 
    mutate(Replicated=verify_discoveries(CHR, BP.min, BP.max,
        Discoveries.knockoffs[which(Discoveries.knockoffs$Phenotype==Phenotype[1]), ]))

Overlapping.table.2 <- Discoveries.overlapping.2 %>% 
    group_by(Phenotype, Method, Significance, Resolution) %>%
    summarise(BP.width=mean(BP.width)/1e6, Discoveries=n(),
              Overlapping=sum(!is.na(Replicated)), Distinct=sum(is.na(Replicated)))

Overlapping.table.2

Phenotype,Method,Significance,Resolution,BP.width,Discoveries,Overlapping,Distinct
platelet,Knockoffs,FDR,Radj2,0.3188757,81,44,37


Phenotype,Method,Significance,Resolution,BP.width,Discoveries,Overlapping,Distinct
platelet,LMM,FWER,GWAS,0.6743524,47,42,5


In [6]:
# Stack the two overlap summaries, keeping only the columns that are reported
Overlapping.table <- do.call(rbind, lapply(
    list(Overlapping.table.1, Overlapping.table.2),
    function(overlap.summary) {
        overlap.summary %>%
            ungroup() %>%
            select(Phenotype, Method, Discoveries, Overlapping, Distinct)
    }))

Overlapping.table

# Same table rendered as LaTeX, with repeated values in column 1 collapsed
collapse_rows(
    kable(ungroup(Overlapping.table), format="latex", align="c", digits=3),
    columns = 1, latex_hline = "major", valign = "middle")

Phenotype,Method,Discoveries,Overlapping,Distinct
platelet,Knockoffs,81,44,37
platelet,LMM,47,42,5



\begin{tabular}{c|c|c|c|c}
\hline
Phenotype & Method & Discoveries & Overlapping & Distinct\\
\hline
 & Knockoffs & 81 & 44 & 37\\

\multirow{-2}{*}{\centering\arraybackslash platelet} & LMM & 47 & 42 & 5\\
\hline
\end{tabular}

## Load results from large GWAS

In [7]:
# For each phenotype, read whichever large-GWAS discovery tables exist
# (LMM, knockoffs), tag them with their method metadata and stack them.
# A missing file is reported and contributes an empty tibble.
# The per-phenotype tables are then stacked into a single table.
Discoveries.large <- do.call("rbind", lapply(phenotype.list, function(phenotype) {
    # LMM discoveries
    path.lmm <- sprintf("%s/discoveries/%s_lmm_regions.txt", scratch, phenotype)
    found.lmm <- if(!file.exists(path.lmm)) {
        cat(sprintf("Discoveries for %s made with LMM are not available.\n", phenotype))
        tibble()
    } else {
        loaded <- read_delim(path.lmm, delim=" ", col_types=cols()) %>%
            mutate(Phenotype=phenotype, Method="LMM", Importance=-log10(P), Resolution="GWAS") %>%
            select(-c("P"))
        cat(sprintf("Found %d discoveries for %s made with LMM.\n", nrow(loaded), phenotype))
        loaded
    }

    # Knockoffs discoveries at the configured resolution; W becomes Importance
    path.knockoffs <- sprintf("%s/discoveries/%s_knockoffs_%s.txt", scratch, phenotype, resolution)
    found.knockoffs <- if(!file.exists(path.knockoffs)) {
        cat(sprintf("Discoveries for %s made with knockoffs at resolution %s are not available.\n", 
            phenotype, resolution))
        tibble()
    } else {
        loaded <- read_delim(path.knockoffs, delim=" ", col_types=cols()) %>%
            mutate(Phenotype=phenotype, Method="Knockoffs", Importance=W, Resolution=resolution) %>%
            select(-c("W", "Group"))
        cat(sprintf("Found %d discoveries for %s made with knockoffs at resolution %s.\n", 
                    nrow(loaded), phenotype, resolution))
        loaded
    }

    # Stack the two tables for this phenotype
    rbind(found.lmm, found.knockoffs)
}))

Found 723 discoveries for platelet made with LMM.
Found 1460 discoveries for platelet made with knockoffs at resolution Radj2.


In [8]:
# Number of large-GWAS discoveries and mean region width (BP.width rescaled
# by 1e6) for each phenotype / method / resolution combination
Discoveries.large %>%
    group_by(Phenotype, Method, Resolution) %>%
    summarise(Discoveries=n(), BP.width=mean(BP.width)/1e6) %>%
    arrange(Phenotype, Method, Resolution)

# Consolidate discoveries within each of the same combinations
# NOTE(review): consolidate_clumps() is not defined in this notebook;
# presumably it comes from the sourced utils_clumping.R - confirm
Discoveries.large.clumped <- Discoveries.large %>% 
    group_by(Phenotype, Method, Resolution) %>%
    consolidate_clumps()

# Number of consolidated discoveries
# No group_by() here: this relies on Discoveries.large.clumped still being grouped
# NOTE(review): assumes consolidate_clumps() preserves the grouping - confirm
Discoveries.large.clumped %>%
    summarise(Discoveries=n(), BP.width=mean(BP.width)/1e6) %>%
    arrange(Phenotype, Method, Resolution)

Phenotype,Method,Resolution,Discoveries,BP.width
platelet,Knockoffs,Radj2,1460,0.3076403
platelet,LMM,GWAS,723,0.7822125


Phenotype,Method,Resolution,Discoveries,BP.width
platelet,Knockoffs,Radj2,1063,0.4283797
platelet,LMM,GWAS,428,0.8768171


## Check replicability

In [9]:
# Total number of large-GWAS discoveries (all methods)
nrow(Discoveries.large)

In [10]:
# Reference set for replication: LMM discoveries from the large GWAS
Discoveries.large.lmm <- Discoveries.large %>% filter(Method=="LMM")

# Flag each small-GWAS discovery with the matching large-GWAS lead SNP
# (NA when none matches). Grouping by Phenotype makes Phenotype[1] the
# phenotype of the current group, so a discovery is only compared with
# reference regions found for the same phenotype.
Discoveries.replicated <- Discoveries %>% 
    group_by(Phenotype) %>%
    mutate(Replicated=verify_discoveries(CHR, BP.min, BP.max,
        Discoveries.large.lmm[which(Discoveries.large.lmm$Phenotype==Phenotype[1]), ])) %>%
    ungroup()

# Total number of large-GWAS LMM discoveries over all phenotypes
# (kept as a scalar for any later code that reads it)
n.total <- Discoveries.large %>% filter(Method=="LMM", Resolution=="GWAS") %>% nrow()

# Per-phenotype number of large-GWAS LMM discoveries: the denominator of Power
n.total.by.phenotype <- Discoveries.large %>%
    filter(Method=="LMM", Resolution=="GWAS") %>%
    group_by(Phenotype) %>%
    summarise(N.reference=n())

# True/False count discoveries with/without a matching reference region;
# FDP is the unmatched fraction, Power the matched count divided by the
# number of reference discoveries for the same phenotype.
Replication.table <- Discoveries.replicated %>% 
    group_by(Phenotype, Method, Significance, Resolution) %>%
    summarise(BP.width=mean(BP.width)/1e6, Discoveries=n(),
              True=sum(!is.na(Replicated)), False=sum(is.na(Replicated))) %>%
    left_join(n.total.by.phenotype, by="Phenotype") %>%
    mutate(FDP=False/(True+False), Power=True/N.reference) %>%
    select(-N.reference)

In [11]:
# Keep only the phenotypes that have at least 10 discoveries with FWER control.
# semi_join() filters the rows of Replication.table without ever duplicating
# them, even if a phenotype has more than one qualifying FWER row.
Replication.table <- Replication.table %>% 
    ungroup() %>%
    semi_join(filter(ungroup(Replication.table), Significance=="FWER", Discoveries>=10),
              by = "Phenotype") %>%
    group_by(Phenotype)
Replication.table

Phenotype,Method,Significance,Resolution,BP.width,Discoveries,True,False,FDP,Power
platelet,Knockoffs,FDR,Radj2,0.3188757,81,74,7,0.08641975,0.10235131
platelet,LMM,FDR,GWAS,0.4333372,272,168,104,0.38235294,0.23236515
platelet,LMM,FWER,GWAS,0.6743524,47,47,0,0.0,0.06500692


In [12]:
# kableExtra is also attached in the first cell of this notebook
library(kableExtra)
# Render the replication table as LaTeX: drop the Resolution column, round to
# 3 digits, centre every column and collapse repeated values in column 1
Replication.table %>%
    ungroup() %>% select(-Resolution) %>% 
    kable(format="latex", digits=3, align="c") %>%
    collapse_rows(columns = 1, latex_hline = "major", valign = "middle")


\begin{tabular}{c|c|c|c|c|c|c|c|c}
\hline
Phenotype & Method & Significance & BP.width & Discoveries & True & False & FDP & Power\\
\hline
 & Knockoffs & FDR & 0.319 & 81 & 74 & 7 & 0.086 & 0.102\\

 & LMM & FDR & 0.433 & 272 & 168 & 104 & 0.382 & 0.232\\

\multirow{-3}{*}{\centering\arraybackslash platelet} & LMM & FWER & 0.674 & 47 & 47 & 0 & 0.000 & 0.065\\
\hline
\end{tabular}