## Jupyter notebook for calculating the Bayesian Information Criterion (BIC) for ATAC-seq data

This notebook is the code for calculating BIC for ATAC-seq data used in the paper **"Simultaneous analysis of open chromatin, promoter interactions and gene expression in stimulated T cells implicates causal genes for rheumatoid arthritis"** by *Jing Yang, Amanda McGovern, Paul Martin, Kate Duffus, Peyman Zarrineh, Andrew P Morris, Antony Adamson, Peter Fraser, Magnus Rattray & Stephen Eyre*.

For any questions about the code, please drop me a line at Jing.Yang@manchester.ac.uk

### loading gptk package for Gaussian process regression

In [None]:
library(gptk)
library(ggplot2)
library(gridExtra)

### read ATACseq data

In [None]:
## Read the comma-separated ATAC-seq table; header = TRUE takes the column
## names from the first line of the file. TRUE is spelled out because T is an
## ordinary variable that can be reassigned.
data <- read.table("data/ATACseq_normalized.csv", sep = ",", header = TRUE)

In [None]:
## Preview the first six rows of the loaded table.
head(data, n = 6L)

In [None]:
## Names of the six time-point columns selected from `data`:
## T0, T20, T60, T2H, T4H, T24H.
idx_data <- paste0("T", c("0", "20", "60", "2H", "4H", "24H"))


### normalize ATACseq data (each row is centred and scaled to unit variance across the time points)

In [None]:
## Standardise every row across the selected time-point columns. scale() works
## column-wise, so the table is transposed, centred and scaled, then transposed
## back. TRUE is spelled out because T is an ordinary variable that can be
## reassigned.
## NOTE(review): a row that is constant across the time points has zero
## standard deviation and would come out as NaN -- confirm such rows are absent.
normalized_data <- t(scale(t(data[, idx_data]), center = TRUE, scale = TRUE))


In [None]:
## Preview the first six rows of the standardised matrix.
head(normalized_data, n = 6L)

### use logscaled time points for GP regression 

In [None]:
## Log-scaled sampling times used as GP inputs. The raw times are shifted by 10
## before taking the natural log so that time 0 maps to a finite value.
## (Raw values are presumably minutes, matching the T0 ... T24H column labels.)
times_data <- local({
  raw_times <- c(0, 20, 60, 120, 240, 1440)
  log(raw_times + 10)
})


In [None]:
## Fit two Gaussian process models to every row of normalized_data and store
## the log-likelihood of each optimised model:
##   lld_rbf    - rbf + white noise compound kernel (model 0)
##   lld_static - white noise kernel only (model 1)
x <- matrix(times_data) ## single-column matrix of the log-scaled time points

## Preallocate the result vectors instead of growing them inside the loop.
## NA marks rows that have not been fitted (e.g. if the loop is interrupted).
n_rows <- nrow(normalized_data)
lld_rbf <- rep(NA_real_, n_rows)
lld_static <- rep(NA_real_, n_rows)

## The kernel specifications do not depend on the row, so build them once.
## They are not called `options`, which would mask base::options.
options_rbf <- gpOptions(approx = "ftc")
options_rbf$kern <- list(type = "cmpnd", comp = list(list(type = "rbf"), list(type = "white"))) ### use rbf+white noise kernel for model 0
options_static <- gpOptions(approx = "ftc")
options_static$kern <- list(type = "white") ### use white noise kernel for model 1

## seq_len() yields an empty sequence for a zero-row matrix, whereas 1:0 would
## iterate over c(1, 0) and fail on the first index.
for (ii in seq_len(n_rows)) { ## this calculation will take a long time
  y <- matrix(normalized_data[ii, ])

  ## Create the model, then optimise it. The second argument to gpOptimise is
  ## passed as 0 exactly as before (presumably it switches off progress
  ## output -- confirm against the gptk documentation).
  model0 <- gpCreate(ncol(x), ncol(y), x, y, options_rbf)
  model0 <- gpOptimise(model0, 0)
  lld_rbf[ii] <- gpLogLikelihood(model0) ### log-likelihood of the rbf model

  model1 <- gpCreate(ncol(x), ncol(y), x, y, options_static)
  model1 <- gpOptimise(model1, 0)
  lld_static[ii] <- gpLogLikelihood(model1) ### log-likelihood of the static model
}




### an example for the 4th data point

In [None]:

## Worked example: refit both models to row 4 of normalized_data and plot, for
## each model, the GP posterior mean with a +/- one standard deviation band
## together with the six observed points.
## Relies on `x` (the single-column matrix of log-scaled times) and
## `times_data` created in earlier cells.
y <- matrix(normalized_data[4, ])

## Model 0: rbf + white noise compound kernel.
## The option lists are not called `options`, which would mask base::options.
options_rbf <- gpOptions(approx = "ftc")
options_rbf$kern <- list(type = "cmpnd", comp = list(list(type = "rbf"), list(type = "white"))) ### use rbf+white noise kernel for model 0
model0 <- gpCreate(ncol(x), ncol(y), x, y, options_rbf)
model0 <- gpOptimise(model0, 0)

## Model 1: white noise kernel only (static model).
options_static <- gpOptions(approx = "ftc")
options_static$kern <- list(type = "white") ### use white noise kernel for model 1
model1 <- gpCreate(ncol(x), ncol(y), x, y, options_static)
model1 <- gpOptimise(model1, 0)

## Prediction grid: 100 evenly spaced points extending 0.5 beyond the observed
## (log-scaled) time range on both sides.
tstar <- matrix(seq(min(times_data) - 0.5, max(times_data) + 0.5, length.out = 100), ncol = 1)

## Theme shared by both panels.
## NOTE(review): this theme blanks both axis titles, so the xlab()/ylab() text
## added to each plot below is not drawn. Kept as in the original -- confirm
## which of the two is intended.
gp_fit_theme <- theme(
  legend.position = "none",
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  text = element_text(size = 12),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line = element_line(colour = "black")
)

## ---- rbf model: posterior mean and variance on the grid ----
Kx0 <- kernCompute(model0$kern, x, tstar) ## train-vs-grid covariance
Ktrain0 <- kernCompute(model0$kern, x) ## train-vs-train covariance
invKtrain0 <- solve(Ktrain0)
yPred0 <- t(Kx0) %*% invKtrain0 %*% y
## abs() keeps the diagonal non-negative before the square root is taken below.
yVar0 <- diag(abs(kernCompute(model0$kern, tstar) - t(Kx0) %*% invKtrain0 %*% Kx0))
data_forplot_rbf1 <- data.frame(
  x = tstar,
  yupper = yPred0 + sqrt(yVar0),
  ylower = yPred0 - sqrt(yVar0),
  yPred = yPred0
)
data_forplot_rbf2 <- data.frame(x = x, y = y)

## aes(x = x) maps the grid column of the plotting data frame rather than the
## global `tstar`, so the plot depends only on the data passed to ggplot().
p1 <- ggplot(data = data_forplot_rbf1, aes(x = x)) +
  geom_ribbon(aes(ymin = ylower, ymax = yupper), fill = "lightblue", alpha = 0.6) +
  geom_line(aes(y = yPred), size = 1) +
  geom_point(data = data_forplot_rbf2, aes(x = x, y = y), size = 3) +
  gp_fit_theme +
  xlab("Time (logscaled)") +
  ylab("Normalized data") +
  ggtitle("RBF model fitting illustration")

## ---- static (white noise) model: same computation with model1 ----
Kx1 <- kernCompute(model1$kern, x, tstar)
Ktrain1 <- kernCompute(model1$kern, x)
invKtrain1 <- solve(Ktrain1)
yPred1 <- t(Kx1) %*% invKtrain1 %*% y
yVar1 <- diag(abs(kernCompute(model1$kern, tstar) - t(Kx1) %*% invKtrain1 %*% Kx1))
data_forplot_noise1 <- data.frame(
  x = tstar,
  yupper = yPred1 + sqrt(yVar1),
  ylower = yPred1 - sqrt(yVar1),
  yPred = yPred1
)
data_forplot_noise2 <- data.frame(x = x, y = y)

p2 <- ggplot(data = data_forplot_noise1, aes(x = x)) +
  geom_ribbon(aes(ymin = ylower, ymax = yupper), fill = "lightblue", alpha = 0.6) +
  geom_line(aes(y = yPred), size = 1) +
  geom_point(data = data_forplot_noise2, aes(x = x, y = y), size = 3) +
  gp_fit_theme +
  xlab("Time (logscaled)") +
  ylab("Normalized data") +
  ggtitle("Static model fitting illustration")

## Stack the two panels (rbf on top, static below).
grid.arrange(p1, p2)


### Get log-likelihood ratio results: LR = -2 (ln L<sub>RBF</sub> - ln L<sub>static</sub>) = -2 ln(L<sub>RBF</sub> / L<sub>static</sub>)

In [None]:
## Log-likelihood ratio statistic per row: -2 * (ln L_RBF - ln L_static),
## written here as twice the static-minus-rbf log-likelihood difference.
LR_data <- 2 * (lld_static - lld_rbf)


### Get BIC results: BIC = k ln(n) -2ln(L)
k is the number of parameters used in each model, n is the sample size and L is the maximized likelihood 

In [None]:
## BIC = k * ln(n) - 2 * ln(L), evaluated per row with n = 6 observations.
## k = 2 is used for the rbf model and k = 1 for the static model.
## NOTE(review): the rbf + white compound kernel presumably has more than two
## hyperparameters in gptk -- confirm that k = 2 is the intended count.
BIC_rbf <- -2 * lld_rbf + 2 * log(6)
BIC_static <- -2 * lld_static + log(6)

## Difference between the rbf and static BIC. Smaller BIC is preferred, so a
## negative difference favours the rbf model.
BIC_difference <- BIC_rbf - BIC_static
