# Add hg38 LiftOver coordinates for the TSS peaks to Anthony's TSS info table

## Load Anthony's TSS data

In [1]:
.libPaths("/no_backup/jferrer/jmidgley/R_libs")
library(readr)
library(dplyr)
library(stringr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Read the TSS annotation table (gzipped TSV) into a tibble.
# NOTE: this table uses hg19 positions.
TSS_df <- read_tsv(
  file = "/users/jferrer/public-docs/tracksDirectory/mplanas/HI_transcriptome_v2.2.tsv.gz",
  show_col_types = FALSE
)
head(TSS_df, n = 6L)

chromosome,tx_start,tx_end,strand,transcript_id,gene_id,gene_name,Gene class,gene_type,transcript_type,⋯,high_tpm,low_tpm,peak,"TSS expression high glucose (Salmon, TPM)",Relative TSS usage,TSS_type,distance_to_closest_TSS,Distance to closest Gencode TSS,name,new_1st_exon
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>
chr1,14362,29370,-,ENST00000423562.1,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.58083,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes
chr1,14359,29350,-,PBT00037199,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.58083,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes
chr1,14359,29350,-,PBT00037200,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.58083,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes
chr1,14360,29364,-,PBT00037201,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.58083,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes
chr1,14362,29346,-,PBT00037202,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.58083,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes
chr1,14365,29347,-,PBT00037205,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.58083,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes


### Load LiftOver TSS coordinates and add to TSS data

The UCSC LiftOver tool was used to convert the hg19 TSS peak coordinates to hg38, producing a BED file with the new start and end positions.

In [3]:
# Read the LiftOver output (headerless, six tab-separated columns).
# Columns 1-3 hold the lifted-over interval; column 4 holds the original
# interval as "chr:start-end" followed by a ":<suffix>" tag.
liftover_tss <- read_tsv("./HumanIsletTranscriptome/data_raw/tss_liftover.bed", col_names = FALSE, show_col_types = FALSE) %>%
  setNames(c("chr", "new_start", "new_end", "old_coords", "score", "strand")) %>%
  mutate(
    # Join key: the "chr:start-end" prefix of the original coordinates.
    old_base = str_extract(old_coords, "^chr[\\w]+:[0-9]+-[0-9]+"),
    # BED starts are 0-based, so add 1 to get the 1-based start used in the
    # peak names. The coordinates are parsed as doubles, and paste0() would
    # render round values in scientific notation (100000 -> "1e+05"), so they
    # are formatted explicitly in fixed notation without padding.
    new_coords = paste0(
      chr, ":",
      format(new_start + 1, scientific = FALSE, trim = TRUE), "-",
      format(new_end, scientific = FALSE, trim = TRUE)
    )
  ) %>%
  # Keep only the first lifted interval for each original interval so the
  # later join cannot duplicate rows.
  distinct(old_base, .keep_all = TRUE)
head(liftover_tss)

chr,new_start,new_end,old_coords,score,strand,old_base,new_coords
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>
chr1,29335,29359,chr1:29336-29359:.,1,-,chr1:29336-29359,chr1:29336-29359
chr1,629079,629136,chr1:564460-564516:.,1,+,chr1:564460-564516,chr1:629080-629136
chr1,629638,629739,chr1:565019-565119:.,1,+,chr1:565019-565119,chr1:629639-629739
chr1,631069,631078,chr1:566450-566458:.,1,+,chr1:566450-566458,chr1:631070-631078
chr1,632535,632616,chr1:567916-567996:.,1,-,chr1:567916-567996,chr1:632536-632616
chr1,632597,632876,chr1:567978-568256:.,1,+,chr1:567978-568256,chr1:632598-632876


In [4]:
# Attach an hg38 peak name (`new_name`) to every row of the TSS table.
# The "chr:start-end" prefix of `name` is the lookup key into liftover_tss;
# the trailing "Peak_<n>" label is carried over unchanged. Rows whose
# interval has no lifted-over counterpart get "conversion_failed_<label>".
TSS_df <- TSS_df %>%
  mutate(old_base = str_extract(name, "^chr[\\w]+:[0-9]+-[0-9]+")) %>%
  mutate(peak_suffix = str_extract(name, "Peak_\\d+$")) %>%
  left_join(
    dplyr::select(liftover_tss, old_base, new_coords),
    by = "old_base"
  ) %>%
  mutate(
    new_name = if_else(
      !is.na(new_coords),
      paste0(new_coords, "_", peak_suffix),
      paste0("conversion_failed_", peak_suffix)
    )
  ) %>%
  # Drop the helper columns; only `new_name` is added to the table.
  dplyr::select(-c(old_base, peak_suffix, new_coords))
head(TSS_df, n = 6L)

chromosome,tx_start,tx_end,strand,transcript_id,gene_id,gene_name,Gene class,gene_type,transcript_type,⋯,low_tpm,peak,"TSS expression high glucose (Salmon, TPM)",Relative TSS usage,TSS_type,distance_to_closest_TSS,Distance to closest Gencode TSS,name,new_1st_exon,new_name
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
chr1,14362,29370,-,ENST00000423562.1,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
chr1,14359,29350,-,PBT00037199,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
chr1,14359,29350,-,PBT00037200,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
chr1,14360,29364,-,PBT00037201,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
chr1,14362,29346,-,PBT00037202,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
chr1,14365,29347,-,PBT00037205,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,⋯,17.85643,Robust,27.34853,1,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1


In [5]:
# Write the annotated table as a tab-separated file.
# Under the default `scipen`, write.table() renders doubles such as 100000 as
# "1e+05", which would corrupt coordinates held in double columns (e.g.
# tx_start / tx_end). Force fixed notation for the duration of the write and
# restore the previous option afterwards, even if the write fails.
invisible(local({
  saved_opts <- options(scipen = 999)
  tryCatch(
    write.table(
      TSS_df,
      file = "./HumanIsletTranscriptome/data_processed/TSS_info_w_lift.tsv",
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    ),
    finally = options(saved_opts)
  )
}))