## Jupyter notebook for creating links between CHi-C, ATAC-seq and RNA-seq

This notebook is the code for linking CHi-C, ATAC-seq and RNA-seq time course data used in the paper **\"Analysis of chromatin organinization and gene expression in T cells identifies functional genes for rheumatoid arthritis\"** by *Jing Yang, Amanda McGovern, Paul Martin, Kate Duffus, Xiangyu Ge, Peyman Zarrineh, Andrew P Morris, Antony Adamson, Peter Fraser, Magnus Rattray & Stephen Eyre* 

Author : *Jing Yang*  <br />
Date: 01-05-2020 <br />
For any questions about the code, please drop me a line at Jing.Yang@manchester.ac.uk

In [1]:
library(ggplot2)
library(gridExtra)
library(plyr)
library(RColorBrewer)
library(reshape2)
library(tidyverse)
library(GenomicRanges)
library(gtable)


── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mtibble [39m 3.0.1     [32m✔[39m [34mdplyr  [39m 1.0.0
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0
[32m✔[39m [34mpurrr  [39m 0.3.4     

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32marrange()[39m   masks [34mplyr[39m::arrange()
[31m✖[39m [34mdplyr[39m::[32mcombine()[39m   masks [34mgridExtra[39m::combine()
[31m✖[39m [34mpurrr[39m::[32mcompact()[39m   masks [34mplyr[39m::compact()
[31m✖[39m [34mdplyr[39m::[32mcount()[39m     masks [34mplyr[39m::count()
[31m✖[39m [34mdplyr[39m::[32mfailwith()[39m  masks [34mplyr[39m::failwith()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mid()[39m    

### capture hic data (with replicates)

In [2]:

chic_data <- read.table("data/CHiC_data_withENSG.txt", header=TRUE)


In [3]:
dim(unique(chic_data[,c('baitID','otherEndID')]))

In [4]:
head(chic_data,n=5)

Unnamed: 0_level_0,baitID,baitchr,baitstart,baitend,otherEndID,otherEndchr,otherEndstart,otherEndend,ENSG,T0,T20,T1H,T4H,T24H
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<int>,<fct>,<int>,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,100955,chr10,122950347,122956207,92154,chr10,92572726,92594937,ENSG00000119965,1.140669,0.0,1.391779,1.0327093,0.0
2,100955,chr10,122950347,122956207,78222,chr10,45508633,45514336,ENSG00000119965,0.0,0.0,0.0,1.1305003,0.0
3,100955,chr10,122950347,122956207,77679,chr10,43644531,43648045,ENSG00000119965,1.310104,0.0,0.0,0.0,0.0
4,100955,chr10,122950347,122956207,101495,chr10,124901900,124920455,ENSG00000119965,3.882884,2.921624,1.690418,2.9919739,0.0
5,100955,chr10,122950347,122956207,89558,chr10,84128125,84134471,ENSG00000119965,0.0,1.017335,0.0,0.7708398,1.930548


In [5]:
names(chic_data)

In [6]:
dim(chic_data)

In [7]:
names(chic_data)

In [8]:
dim(unique(chic_data[,c('baitID','otherEndID')]))

In [9]:
cat("There are ", dim(chic_data)[1], "paried CHiC-gene datasets, among which there are ", dim(unique(chic_data[,c('baitID', 'otherEndID')]))[1], " CHIC interactions 
", length(unique(chic_data$baitID)), "baits, ", length(unique(chic_data$otherEndID)), "otherEnd and ",length(unique(chic_data$ENSG)), "ENSGs")

There are  290380 paried CHiC-gene datasets, among which there are  253100  CHIC interactions 
 6795 baits,  120621 otherEnd and  4903 ENSGs

### ATACSeq data

In [10]:
atacseq_data <- read.table('data/ATACSeq_data.txt', header=T)

In [11]:
head(atacseq_data)

Unnamed: 0_level_0,ATACchr,ATACstart,ATACend,ATACannotation,AT0,AT20,AT1H,AT2H,AT4H,AT24H
Unnamed: 0_level_1,<fct>,<int>,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,chr10,100006331,100006563,Intron,5.004126,4.659345,5.92887,5.524712,7.401195,6.763596
2,chr10,100009362,100010421,Promoter,8.910829,8.899573,9.033632,8.84895,9.088016,8.780725
3,chr10,100118128,100118310,Intergenic,3.677326,5.423599,3.774964,5.181859,3.997616,4.538095
4,chr10,100148537,100149145,Downstream,6.308615,5.997314,6.371118,7.022634,8.320145,8.144691
5,chr10,100164173,100164282,Intron,3.793428,3.296516,4.600715,4.917097,3.838937,5.077424
6,chr10,100167603,100168094,Intron,6.121348,5.530762,5.770208,6.469869,6.229512,5.80728


In [12]:
names(atacseq_data)

In [13]:
dim(atacseq_data)

In [14]:
atacseq_data_nopromoter <- atacseq_data[atacseq_data$Aannotation!='Promoter',]

In [15]:
cat("There are", dim(atacseq_data)[1], "peaks in Atacseq data  (minoverlap=1), among which", dim(atacseq_data_nopromoter)[1], "peaks are not within promoter region\n")

There are 74583 peaks in Atacseq data  (minoverlap=1), among which 0 peaks are not within promoter region


In [16]:
otherEnd_forcompare_data <- unique(chic_data[,c('otherEndID','otherEndchr','otherEndstart','otherEndend')])
otherEnd_forcompare_withlabel <- cbind(otherEnd_forcompare_data,rep('otherEnd',dim(otherEnd_forcompare_data)[1]))
names(otherEnd_forcompare_withlabel)<- c('ID','chr','start','end','label')
head(otherEnd_forcompare_withlabel)

Unnamed: 0_level_0,ID,chr,start,end,label
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<fct>
1,92154,chr10,92572726,92594937,otherEnd
2,78222,chr10,45508633,45514336,otherEnd
3,77679,chr10,43644531,43648045,otherEnd
4,101495,chr10,124901900,124920455,otherEnd
5,89558,chr10,84128125,84134471,otherEnd
6,101383,chr10,124409095,124414046,otherEnd


In [17]:
dim(otherEnd_forcompare_data)

### link ATACSeq to otherEnd data 

In [18]:
atacseq_forcompare_withlabel <- unique(cbind(1:dim(atacseq_data)[1],atacseq_data[,c(
    'ATACchr','ATACstart','ATACend')],rep('atacseq',dim(atacseq_data)[1])))
names(atacseq_forcompare_withlabel)<- c('ID','chr','start','end','label')
head(atacseq_forcompare_withlabel)

Unnamed: 0_level_0,ID,chr,start,end,label
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<fct>
1,1,chr10,100006331,100006563,atacseq
2,2,chr10,100009362,100010421,atacseq
3,3,chr10,100118128,100118310,atacseq
4,4,chr10,100148537,100149145,atacseq
5,5,chr10,100164173,100164282,atacseq
6,6,chr10,100167603,100168094,atacseq


In [19]:
combined_atacseq_otherEnd <- rbind(atacseq_forcompare_withlabel,otherEnd_forcompare_withlabel)

In [20]:
atacseq_otherEnd_order <- combined_atacseq_otherEnd[with(combined_atacseq_otherEnd, order(combined_atacseq_otherEnd[,'chr'], combined_atacseq_otherEnd[,'start'])),]

In [21]:
len_combined <- dim(atacseq_otherEnd_order)[1]
print(len_combined)

[1] 195204


In [22]:
idx_otherEnd2atacseq <- vector(mode="list", length=dim(otherEnd_forcompare_data)[1])
idx_otherEnd <- which(atacseq_otherEnd_order$label=='otherEnd')
names(idx_otherEnd2atacseq)<-idx_otherEnd

In [23]:
idx_otherEnd_picked <- list()
idx_atacseq_picked <- list()

In [24]:
region <- 0 ## 0kb up or down the selected otherEnd data, so that ATACSeq peaks inside the fragment
range <- 200 ## index search range, [-100,100]
jj <- 1
for (ii in idx_otherEnd){
    
    min_idx <- max(1,ii-range)
    max_idx <- min(ii+range,len_combined)
    idx_range <- which(atacseq_otherEnd_order$chr[min_idx:max_idx]==atacseq_otherEnd_order$chr[ii]) + min_idx -1 
    min_start <- atacseq_otherEnd_order$start[ii]-region
    max_end <- atacseq_otherEnd_order$end[ii]+region
    #print(idx_range)
    #print(atacseq_otherEnd_order$start[idx_range])
    #print(min_start)
    start_idx <- which(atacseq_otherEnd_order$start[idx_range] >= min_start)
    end_idx <- tail(which(atacseq_otherEnd_order$end[idx_range] <= max_end),n=1)
    #print(start_idx)
    #print(end_idx)
    
    if (length(start_idx) & length(end_idx)){
        idx1 <- start_idx[1] + idx_range[1]-1
        idx2 <- tail(end_idx,n=1) + idx_range[1]-1
    idx_atacseq <- which(atacseq_otherEnd_order$label[idx1:idx2]=='atacseq') + idx1[1] -1 
    #print(ii)
    #print(idx_atacseq)
    if (length(idx_atacseq)>0){
        idx_otherEnd_picked <- c(idx_otherEnd_picked, rep(ii,length(idx_atacseq)))
        idx_atacseq_picked <- c(idx_atacseq_picked, idx_atacseq)
        idx_otherEnd2atacseq[[as.character(ii)]] <- as.list(idx_atacseq)}
    }
} 

In [25]:
idx_otherEnd2atacseq_effec <- idx_otherEnd2atacseq[-which(sapply(idx_otherEnd2atacseq,is.null))]

In [26]:
cat("There are", length(idx_otherEnd2atacseq_effec), "otherEnd interactions with atacseq peak in +/-", region)

There are 19361 otherEnd interactions with atacseq peak in +/- 0

In [27]:
tmp_otherEnd <-  atacseq_otherEnd_order[unlist(idx_otherEnd_picked, use.names=FALSE),1:4]
tmp_atacseq <- atacseq_otherEnd_order[unlist(idx_atacseq_picked, use.names=FALSE),1:4]
names(tmp_otherEnd) <- c('otherEndID','otherEndchr','otherEndstart','otherEndend')
names(tmp_atacseq) <- c('ID','ATACchr','ATACstart','ATACend')
tmp_otherEnd_atacseq_picked <- cbind(tmp_otherEnd,tmp_atacseq)

In [28]:
head(tmp_otherEnd_atacseq_picked)

Unnamed: 0_level_0,otherEndID,otherEndchr,otherEndstart,otherEndend,ID,ATACchr,ATACstart,ATACend
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<int>,<fct>,<int>,<int>
237538.0,212,chr1,811328,817481,30072,chr1,817253,817413
724941.0,242,chr1,968260,992014,34539,chr1,975988,976499
724941.1,242,chr1,968260,992014,34614,chr1,984284,984405
724751.0,243,chr1,992014,1001574,34732,chr1,999291,999403
724751.1,243,chr1,992014,1001574,34739,chr1,999952,1000663
724911.0,246,chr1,1008296,1021819,3724,chr1,1012932,1014523


In [29]:
chic_atacseq_merged_picked <- unique(merge(merge(tmp_otherEnd_atacseq_picked,chic_data),merge(tmp_otherEnd_atacseq_picked,atacseq_data)))


In [30]:
head(chic_atacseq_merged_picked)


Unnamed: 0_level_0,otherEndID,otherEndchr,otherEndstart,otherEndend,ID,ATACchr,ATACstart,ATACend,baitID,baitchr,⋯,T1H,T4H,T24H,ATACannotation,AT0,AT20,AT1H,AT2H,AT4H,AT24H
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<int>,<fct>,<int>,<int>,<int>,<fct>,⋯,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,100003,chr10,119702248,119714492,653,chr10,119705280,119705704,102382,chr10,⋯,0.0,1.032709,2.174726,Intergenic,5.683413,5.598023,5.945469,5.562361,6.423752,7.890997
2,100008,chr10,119724542,119727941,654,chr10,119725154,119726711,99879,chr10,⋯,1.7661949,2.691034,2.565841,Promoter,9.68343,9.417568,9.715276,9.584663,9.609715,9.586125
3,100034,chr10,119824560,119831806,661,chr10,119828019,119828186,99879,chr10,⋯,2.5941914,2.354921,2.727703,3'UTR,3.677326,4.891543,4.105506,4.974079,4.872013,4.80289
4,100034,chr10,119824560,119831806,661,chr10,119828019,119828186,77614,chr10,⋯,0.0,0.0,1.636451,3'UTR,3.677326,4.891543,4.105506,4.974079,4.872013,4.80289
5,100044,chr10,119857258,119859791,662,chr10,119859344,119859770,80031,chr10,⋯,0.9665639,0.0,0.0,Intron,6.308615,6.139551,5.788724,6.279547,6.229512,5.776813
6,100044,chr10,119857258,119859791,662,chr10,119859344,119859770,99891,chr10,⋯,3.2718757,2.907554,3.341844,Intron,6.308615,6.139551,5.788724,6.279547,6.229512,5.776813


In [31]:
dim(chic_atacseq_merged_picked)

In [32]:
names(chic_atacseq_merged_picked)

### gene counts data (with replicates)

In [33]:
gene_data <- read.table('data/Gene_data.txt', header=T)

In [34]:
head(gene_data)

Unnamed: 0_level_0,ENSG,ENSGchr,ENSGstart,ENSGend,GeneName,ET0_a,ET20_a,ET1H_a,ET2H_a,ET4H_a,ET24H_a
Unnamed: 0_level_1,<fct>,<fct>,<int>,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,ENSG00000000003,chrX,100627109,100639991,TSPAN6,5.109918,4.375649,3.737093,4.315656,3.106641,4.829884
2,ENSG00000000419,chr20,50934867,50958555,DPM1,10.794659,11.046174,10.776526,10.557018,10.431918,10.598822
3,ENSG00000000457,chr1,169849631,169894267,SCYL3,8.030047,7.872484,9.221314,9.390712,9.31234,8.957959
4,ENSG00000000460,chr1,169662007,169854080,C1orf112,8.658307,8.500433,10.024752,12.261566,14.06062,11.125245
5,ENSG00000000938,chr1,27612064,27635277,FGR,7.579777,7.516027,7.231935,6.887572,7.291655,4.919571
6,ENSG00000000971,chr1,196651878,196747504,CFH,5.956084,5.683208,5.515713,5.156287,4.843552,4.201323


In [35]:
names(gene_data)

In [36]:
names(gene_data)

### merge gene and chic data

In [37]:
head(chic_data)

Unnamed: 0_level_0,baitID,baitchr,baitstart,baitend,otherEndID,otherEndchr,otherEndstart,otherEndend,ENSG,T0,T20,T1H,T4H,T24H
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<int>,<fct>,<int>,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,100955,chr10,122950347,122956207,92154,chr10,92572726,92594937,ENSG00000119965,1.140669,0.0,1.391779,1.0327093,0.0
2,100955,chr10,122950347,122956207,78222,chr10,45508633,45514336,ENSG00000119965,0.0,0.0,0.0,1.1305003,0.0
3,100955,chr10,122950347,122956207,77679,chr10,43644531,43648045,ENSG00000119965,1.310104,0.0,0.0,0.0,0.0
4,100955,chr10,122950347,122956207,101495,chr10,124901900,124920455,ENSG00000119965,3.882884,2.921624,1.690418,2.9919739,0.0
5,100955,chr10,122950347,122956207,89558,chr10,84128125,84134471,ENSG00000119965,0.0,1.017335,0.0,0.7708398,1.930548
6,100955,chr10,122950347,122956207,101383,chr10,124409095,124414046,ENSG00000119965,3.445871,3.466919,2.956566,3.0261252,3.438559


In [38]:
chic_gene_mergedbyENSG <- merge(gene_data,chic_data)

In [39]:
dim(chic_gene_mergedbyENSG)

In [40]:
names(chic_gene_mergedbyENSG)

### link ATACSeq to gene replicate

In [41]:
head(tmp_otherEnd_atacseq_picked)
head(chic_gene_mergedbyENSG)

Unnamed: 0_level_0,otherEndID,otherEndchr,otherEndstart,otherEndend,ID,ATACchr,ATACstart,ATACend
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<int>,<fct>,<int>,<int>
237538.0,212,chr1,811328,817481,30072,chr1,817253,817413
724941.0,242,chr1,968260,992014,34539,chr1,975988,976499
724941.1,242,chr1,968260,992014,34614,chr1,984284,984405
724751.0,243,chr1,992014,1001574,34732,chr1,999291,999403
724751.1,243,chr1,992014,1001574,34739,chr1,999952,1000663
724911.0,246,chr1,1008296,1021819,3724,chr1,1012932,1014523


Unnamed: 0_level_0,ENSG,ENSGchr,ENSGstart,ENSGend,GeneName,ET0_a,ET20_a,ET1H_a,ET2H_a,ET4H_a,⋯,baitend,otherEndID,otherEndchr,otherEndstart,otherEndend,T0,T20,T1H,T4H,T24H
Unnamed: 0_level_1,<fct>,<fct>,<int>,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<fct>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53548908,848879,chrX,146445592,146448497,0.0,0.0,0.0,0.0,1.266637
2,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,645243,chr6,53107181,53110286,3.858361,3.722884,4.446843,4.27528,4.863939
3,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,645412,chr6,53698015,53702395,5.079635,4.912067,5.132516,5.044923,4.964815
4,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,645419,chr6,53717909,53719885,4.832475,5.58835,5.245855,4.892268,5.40861
5,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53548908,645228,chr6,53054800,53055705,3.043141,3.013895,3.521647,2.636451,3.0054
6,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,633365,chr6,11278589,11283318,0.0,0.0,1.088724,0.0,0.0


In [42]:
head(chic_gene_mergedbyENSG)

Unnamed: 0_level_0,ENSG,ENSGchr,ENSGstart,ENSGend,GeneName,ET0_a,ET20_a,ET1H_a,ET2H_a,ET4H_a,⋯,baitend,otherEndID,otherEndchr,otherEndstart,otherEndend,T0,T20,T1H,T4H,T24H
Unnamed: 0_level_1,<fct>,<fct>,<int>,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<fct>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53548908,848879,chrX,146445592,146448497,0.0,0.0,0.0,0.0,1.266637
2,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,645243,chr6,53107181,53110286,3.858361,3.722884,4.446843,4.27528,4.863939
3,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,645412,chr6,53698015,53702395,5.079635,4.912067,5.132516,5.044923,4.964815
4,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,645419,chr6,53717909,53719885,4.832475,5.58835,5.245855,4.892268,5.40861
5,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53548908,645228,chr6,53054800,53055705,3.043141,3.013895,3.521647,2.636451,3.0054
6,ENSG00000001084,chr6,53497341,53616970,GCLC,9.230431,9.287864,9.078914,8.740437,9.547917,⋯,53507135,633365,chr6,11278589,11283318,0.0,0.0,1.088724,0.0,0.0


In [43]:
head(tmp_otherEnd_atacseq_picked)

Unnamed: 0_level_0,otherEndID,otherEndchr,otherEndstart,otherEndend,ID,ATACchr,ATACstart,ATACend
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<int>,<fct>,<int>,<int>
237538.0,212,chr1,811328,817481,30072,chr1,817253,817413
724941.0,242,chr1,968260,992014,34539,chr1,975988,976499
724941.1,242,chr1,968260,992014,34614,chr1,984284,984405
724751.0,243,chr1,992014,1001574,34732,chr1,999291,999403
724751.1,243,chr1,992014,1001574,34739,chr1,999952,1000663
724911.0,246,chr1,1008296,1021819,3724,chr1,1012932,1014523


In [44]:

atacseq_gene <- unique(merge(merge(tmp_otherEnd_atacseq_picked,chic_gene_mergedbyENSG),atacseq_data))

In [45]:
dim(atacseq_gene)

In [46]:
head(atacseq_gene)

Unnamed: 0_level_0,ATACchr,ATACstart,ATACend,otherEndID,otherEndchr,otherEndstart,otherEndend,ID,ENSG,ENSGchr,⋯,T1H,T4H,T24H,ATACannotation,AT0,AT20,AT1H,AT2H,AT4H,AT24H
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<fct>,<int>,<int>,<int>,<fct>,<fct>,⋯,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,chr10,100168185,100168481,94270,chr10,100168142,100171879,7,ENSG00000122873,chr10,⋯,0.0,2.004188,0.0,Intron,5.185079,5.386033,5.549115,5.081706,5.133666,5.469091
2,chr10,100168185,100168481,94270,chr10,100168142,100171879,7,ENSG00000099204,chr10,⋯,0.0,0.0,1.930548,Intron,5.185079,5.386033,5.549115,5.081706,5.133666,5.469091
3,chr10,100267287,100268052,94301,chr10,100265170,100287223,11,ENSG00000052749,chr10,⋯,0.0,0.0,2.174726,Promoter,7.772423,7.507165,7.616713,7.874698,7.849217,7.668432
4,chr10,100286005,100287018,94301,chr10,100265170,100287223,12,ENSG00000052749,chr10,⋯,0.0,0.0,2.174726,Promoter,9.086124,8.986858,9.306845,9.077411,9.375033,8.877224
5,chr10,100481544,100482916,94352,chr10,100476882,100482954,21,ENSG00000213341,chr10,⋯,2.480503,1.799894,1.636451,3'UTR,9.147414,9.275847,9.463431,8.709092,9.302763,8.877224
6,chr10,100508684,100509527,94364,chr10,100507722,100518706,22,ENSG00000052749,chr10,⋯,0.0,0.0,2.174726,Promoter,8.054279,8.223312,8.532925,7.67436,8.88722,8.168356


In [47]:
cat("There are ", dim(atacseq_gene)[1], "paried bait-otherEnd-atacseq-gene datasets, among which ", 
    dim(unique(atacseq_gene[,c('baitID', 'ID')]))[1], "paried bait-ATACSeq links; ", 
    dim(unique(atacseq_gene[,c('ID', 'otherEndID')]))[1], "ATACSeq-otherEnd links; ", 
    dim(unique(atacseq_gene[,c('ID', 'ENSG')]))[1], "ATACSeq-gene links; ", 
   dim(unique(atacseq_gene[,c('baitID', 'ENSG')]))[1], " bait-gene links.  There are ", dim(unique(atacseq_gene[,c('baitID', 'otherEndID')]))[1], " CHIC interactions, 
", length(unique(atacseq_gene$baitID)), "baits, ", length(unique(atacseq_gene$otherEndID)), "otherEnd, ",length(unique(atacseq_gene$ENSG)), "ENSGs and ", length(unique(atacseq_gene$ID)), "ATACSeq peaks")

There are  76914 paried bait-otherEnd-atacseq-gene datasets, among which  68369 paried bait-ATACSeq links;  24295 ATACSeq-otherEnd links;  65827 ATACSeq-gene links;  5706  bait-gene links.  There are  52883  CHIC interactions, 
 5148 baits,  18898 otherEnd,  3516 ENSGs and  24295 ATACSeq peaks

In [48]:


atacseq_gene$corr_atacseq_gene <- sapply(seq.int(dim(atacseq_gene)[1]),function(i) cor(t(atacseq_gene[i,c('AT0','AT20','AT1H','AT2H','AT4H','AT24H')]),t(atacseq_gene[i,c('ET0_a','ET20_a','ET1H_a','ET2H_a','ET4H_a','ET24H_a')])))
atacseq_gene$corr_atacseq_otherEnd <- sapply(seq.int(dim(atacseq_gene)[1]),function(i) cor(t(atacseq_gene[i,c('AT0','AT20','AT1H','AT4H','AT24H')]),t(atacseq_gene[i,c('T0','T20','T1H','T4H','T24H')])))
atacseq_gene$corr_gene_otherEnd <- sapply(seq.int(dim(atacseq_gene)[1]),function(i) cor(t(atacseq_gene[i,c('ET0_a','ET20_a','ET1H_a','ET4H_a','ET24H_a')]),t(atacseq_gene[i,c('T0','T20','T1H','T4H','T24H')])))


In [49]:
atacseq_gene_nopromoter <- atacseq_gene[atacseq_gene$ATACannotation!='Promoter',]
dim(atacseq_gene)
dim(atacseq_gene_nopromoter)

In [50]:
atacseq_BICandLR <- read.table('data/atacseq_BICandLR.txt', header=T)
chic_BICandLR <- read.table('data/chic_BICandLR.txt', header=T)
gene_BICandLR <- read.table('data/gene_BICandLR.txt', header=T)

In [51]:
head(atacseq_BICandLR)

Unnamed: 0_level_0,ATACchr,ATACstart,ATACend,BIC_atacseq,LR_atacseq
Unnamed: 0_level_1,<fct>,<int>,<int>,<dbl>,<dbl>
1,chr10,100168185,100168481,1.791775,1.592097e-05
2,chr10,100267287,100268052,0.991704,-0.8000554
3,chr10,100286005,100287018,1.791826,6.613878e-05
4,chr10,100481544,100482916,1.791772,1.285027e-05
5,chr10,100508684,100509527,1.791769,9.747732e-06
6,chr10,100510838,100511373,1.791792,3.244872e-05


In [52]:
atacseq_gene_withBICandLR <- left_join(left_join(left_join(atacseq_gene, atacseq_BICandLR), chic_BICandLR), gene_BICandLR)

Joining, by = c("ATACchr", "ATACstart", "ATACend")

Joining, by = c("otherEndID", "baitID")

Joining, by = "ENSG"



In [53]:
names(atacseq_gene_withBICandLR)

In [54]:
write.table(atacseq_gene_withBICandLR, file='data/ATACseq_CHiC_RNAseq_linked_withpromoter.txt', quote=F, row.names=F)