# Preparing edge table to test

## Libraries

In [1]:
suppressMessages({
  library(here)
  library(readr)
  library(dplyr)
  library(tidyr)
  library(purrr)
  library(ggplot2)
})

## Edge table to test

This table was manually adapted from `edge_table_original` to match the available GWAS data:

In [2]:
# Load the harmonised edge table from the project's data folder and preview
# its first rows.
edge_table_harmon <- here('data/edge_table_harmon.tsv') |>
  read_tsv(show_col_types = FALSE)
head(edge_table_harmon)

NODE1,CIS,NODE2
<chr>,<lgl>,<chr>
FG,False,AGRP
IFC,False,PON3
HDL,False,HMOX1
HDL,False,APOM
TG,False,APOM
TG,False,LDLR


## Gencode table

In [3]:
# Pull the quoted value of one `key "value";` entry out of a GTF attribute
# string. Returns NA when the key is absent instead of echoing the whole
# attribute string back (which is what a bare gsub() does on a non-match).
extract_gtf_attribute <- function(attributes, key) {
  # The key must sit at the start of the string or right after a ';'
  # separator, so e.g. 'gene_type' cannot match inside a longer key.
  pattern <- paste0('^(.*;[[:space:]]*)?', key, ' "([^"]+)";.*$')
  value <- sub(pattern, '\\2', attributes)
  value[!grepl(pattern, attributes)] <- NA_character_
  value
}

# Read the GENCODE annotation and keep one row per gene with its chromosome,
# coordinates, strand, gene ID, gene type and gene name.
# NOTE(review): the path below is specific to one machine; adjust it (or make
# it configurable) before running elsewhere.
gencode <- read_tsv(
  paste0(
    '/Users/da1078co/Documents/Data/GENCODE/',
    'gencode.v47lift37.basic.annotation.gtf.gz'
  ),
  comment = '#',
  # Nine column specs: the 6th and 8th columns are skipped ('-'), so the seven
  # names below apply to the columns that are kept.
  col_types = 'cccnn-c-c',
  col_names = c(
    'CHROM', 'source', 'feature',
    'start', 'end', 'strand', 'addinfo'
  )
) |>
  # The third column is the record kind; only gene-level records are wanted.
  filter(feature == 'gene') |>
  select(-feature) |>
  mutate(
    # Strip only a leading 'chr' prefix.
    CHROM = sub('^chr', '', CHROM),
    gene_id = extract_gtf_attribute(addinfo, 'gene_id'),
    gene_type = extract_gtf_attribute(addinfo, 'gene_type'),
    gene_name = extract_gtf_attribute(addinfo, 'gene_name')
  ) |>
  select(-addinfo)
head(gencode)

CHROM,source,start,end,strand,gene_id,gene_type,gene_name
<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
1,HAVANA,10370,13118,+,ENSG00000308415.1_1,lncRNA,DDX11L2
1,HAVANA,11121,24894,+,ENSG00000290825.2_2,lncRNA,DDX11L16
1,HAVANA,12010,13670,+,ENSG00000223972.6_6,transcribed_unprocessed_pseudogene,DDX11L1
1,HAVANA,14356,30744,-,ENSG00000310526.1_1,lncRNA,WASH7P
1,HAVANA,14696,24886,-,ENSG00000227232.6_7,transcribed_unprocessed_pseudogene,WASH7P
1,HAVANA,28589,31109,+,ENSG00000243485.6_13,lncRNA,MIR1302-2HG


## Adding cis regions where needed

In [4]:
# Attach genomic coordinates (CHROM, start, end) to cis edges only.
# NODE1J carries the NODE1 gene name for cis edges and NA otherwise; NA keys
# are never matched, so non-cis edges keep empty coordinates.
edge_table_totest <- edge_table_harmon |>
  # if_else() with NA_character_ keeps the key a character column even when
  # no edge is cis (ifelse(..., NA) would return a logical vector then).
  mutate(NODE1J = if_else(CIS, NODE1, NA_character_)) |>
  left_join(
    gencode,
    by = join_by(NODE1J == gene_name),
    na_matches = 'never'
  ) |>
  select(-c(NODE1J, source, strand, gene_id, gene_type)) |>
  # Non-numeric chromosome labels (e.g. 'X') become NA here, with a warning.
  mutate(CHROM = as.numeric(CHROM))

# Gene names are not unique in GENCODE, so the join can duplicate edges;
# surface that instead of letting extra rows slip through unnoticed.
n_extra <- nrow(edge_table_totest) - nrow(edge_table_harmon)
if (n_extra > 0) {
  warning(
    n_extra, ' extra row(s) after joining GENCODE: ',
    'at least one cis gene name matches more than one GENCODE gene.',
    call. = FALSE
  )
}

# Cis edges without coordinates mean the gene name was not found in GENCODE.
n_unmatched <- sum(
  edge_table_totest$CIS & is.na(edge_table_totest$start),
  na.rm = TRUE
)
if (n_unmatched > 0) {
  warning(
    n_unmatched, ' cis edge(s) have no GENCODE coordinates.',
    call. = FALSE
  )
}

head(edge_table_totest)
tail(edge_table_totest)

NODE1,CIS,NODE2,CHROM,start,end
<chr>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>
FG,False,AGRP,,,
IFC,False,PON3,,,
HDL,False,HMOX1,,,
HDL,False,APOM,,,
TG,False,APOM,,,
TG,False,LDLR,,,


NODE1,CIS,NODE2,CHROM,start,end
<chr>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>
IGFBP2,True,LiverFat,2,217497551,217529159
FAM3C,True,HbA1c,7,120988932,121036418
KITLG,True,HDL,12,88886570,88974628
FGF21,True,LiverFat,19,49258781,49261590
TGM2,True,PancreasFat,20,36755787,36794980
MATN2,True,xinsdG30,8,98881068,99048952


## Saving

In [5]:
# Persist the final edge table as a tab-separated file in the project's data
# folder.
edge_table_totest |>
  write_tsv(here('data/edge_table_totest.tsv'))