This notebook uses an R kernel.

# Sequence Fetch

Author: Zhongyi (James) Guo <br>
Date: 10/29/2024

## Import Packages

In [1]:
getwd()

In [2]:
.libPaths()

In [3]:
library(tidyverse)
library(biomaRt)
library(httr)
library(jsonlite)
library(xml2)
library(stringi)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     


── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


“package ‘httr’ was built under R version 4.3.3”



Attaching package: ‘jsonlite’




The following object is masked from ‘package:purrr’:

    flatten




“package ‘xml2’ was built under R version 4.3.3”


In [4]:
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

## Import Data

### All Genes

In [5]:
count_clean <- read_tsv('../../result/deseq2//count_clean.tsv')
head(count_clean)

[1mRows: [22m[34m58174[39m [1mColumns: [22m[34m9[39m


[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): NAME
[32mdbl[39m (8): Sample_1, Sample_2, Sample_3, Sample_4, Sample_5, Sample_6, Sample_...



[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


NAME,Sample_1,Sample_2,Sample_3,Sample_4,Sample_5,Sample_6,Sample_7,Sample_8
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000223972,13,18,3,1,5,11,17,10
ENSG00000227232,1087,1002,182,531,200,114,319,172
ENSG00000278267,23,33,1,12,6,1,4,1
ENSG00000243485,0,3,0,1,2,2,0,2
ENSG00000284332,0,0,0,0,0,0,0,0
ENSG00000237613,0,0,0,0,0,0,2,0


In [6]:
gene_info <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", 
                   "start_position", "end_position", "strand"),
    filters = "ensembl_gene_id", 
    values = count_clean$NAME, 
    mart = mart
)
head(gene_info)

Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,chromosome_name,start_position,end_position,strand
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>
1,ENSG00000000457,SCYL3,1,169849631,169894267,-1
2,ENSG00000000460,FIRRM,1,169662007,169854080,1
3,ENSG00000000938,FGR,1,27612064,27635185,-1
4,ENSG00000000971,CFH,1,196651754,196752476,1
5,ENSG00000001460,STPG1,1,24356999,24416934,-1
6,ENSG00000001461,NIPAL3,1,24415802,24475252,1


In [7]:
gene_info <- gene_info |> 
    mutate(
        upstream_start = ifelse(strand == 1, start_position - 2001, end_position + 2001),
        upstream_end = ifelse(strand == 1, start_position - 1, end_position + 1),
        temp_upstream_start = upstream_start,
        temp_upstream_end = upstream_end) |>
    mutate(
        upstream_start = ifelse(strand == -1, temp_upstream_end, temp_upstream_start),
        upstream_end = ifelse(strand == -1, temp_upstream_start, temp_upstream_end)) |>
    dplyr::select(-c(temp_upstream_start, temp_upstream_end))

In [8]:
head(gene_info)

Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,chromosome_name,start_position,end_position,strand,upstream_start,upstream_end
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>
1,ENSG00000000457,SCYL3,1,169849631,169894267,-1,169894268,169896268
2,ENSG00000000460,FIRRM,1,169662007,169854080,1,169660006,169662006
3,ENSG00000000938,FGR,1,27612064,27635185,-1,27635186,27637186
4,ENSG00000000971,CFH,1,196651754,196752476,1,196649753,196651753
5,ENSG00000001460,STPG1,1,24356999,24416934,-1,24416935,24418935
6,ENSG00000001461,NIPAL3,1,24415802,24475252,1,24413801,24415801


In [9]:
dim(gene_info)

In [10]:
dim(count_clean)

565238 genes had chromosome coordinates, compared to 581749 total genes present.

Sequence fetching inspired by a post here: https://www.biostars.org/p/354170/

In [11]:
fetch_sequence <- function(chr, start, end, strand) {
    server <- "http://grch37.rest.ensembl.org"
    ext <- paste0("/sequence/region/human/", chr, ":", start, "..", end, ":", strand)
    r <- GET(paste0(server, ext), content_type("text/plain"))
    sequence <- content(r)
    
    # get the reverse complement if on the negative strand
    if (strand == -1) {
        sequence <- chartr(old="ATGC", new="TACG", sequence) |> stri_reverse()
    }
    
    return(sequence)
}

In [12]:
system.time({
  gene_info <- gene_info %>%
    mutate(upstream_region = pmap_chr(
      list(chromosome_name, upstream_start, upstream_end, strand), 
      fetch_sequence
    ))
})

     user    system   elapsed 
  429.225     5.852 11550.221 

In [13]:
head(gene_info)

Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,chromosome_name,start_position,end_position,strand,upstream_start,upstream_end,upstream_region
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<chr>
1,ENSG00000000457,SCYL3,1,169849631,169894267,-1,169894268,169896268,ACATAAAATGTGGTGTATCCCTCTAGACTAGTATATGCAACTATTATAGTACTTTTTCTTTATAGGATTACTTATTTAGTGCCATTTCACCTTTTTCCTTCGTATTCAGATAACATATTTTGGACCTTTGAATCACCAAAATAGTGTCTCTTCCTGAAAATACATAATTCTGGTCCAAAGAAAATGGTTATTGTTTATTGTGAAAAGATAACTAGGAATGATAAGGCCAAAAAGTCAATCTGCCATTAAGTTTTATCTCCCAAGAACAATTATTTCTGGGTACCTAGAAAGAAAAATGACAAATTCTTTAAAAGAGAAAAATTATATTTAAAACCCAACAACAAATAGAAAACCATATCTAGAGTGTGTTTTTTTAAATAAACAAATTTAGAAATCTAATTAGCATTCAGCAAGCAGAAGACTGAATTTTTGCTAATATTTGCAAGACTTCAAACACATTTGTAATAGCAATCATTAGAATTCTAAACTTGATATGATTCAGAGCAGGAACAAAGAAATCTGGAAATGACAAGTGATGCTTCACATGCAGCACATTACAAGGGAATGGCAGCCAGTAAAGAATCTCAGGCACTCACAGGCTCTTATTAAATTTCTGGGCCAGAGTTGACATTCCTGTAGAAAGAGAAGAAAACCACCTACCTCTGGCTGTTTGTGAGGCTTAATTAGTGCGTATTTGTAAAGCACTTTGAAATCCTTGGATGAAAAGTGCTGCATAAGTGAAAAGTATCATTATTTTATTAAAACGAGATAAAAGCAACACAAGCCTTTTTCTTCTTTTAAATTCCTAGGTCAGCAATATCAATGTAGAGCAAGTAATGAATTTATTAATAAATGGCTACAAAAAGAATAAATGCAGGCCAAATTATTTTTTCAAAGGCTCATGGAAGTCTTTGACTTGGGAACATTTTAAGAAATAATATTGGCTACACTTTTATTCACAACATCCTTGACAGGGTCTATCCAGCTCTCTTCTGAACACCTCCAGAAGGAAGTGCTCACTACTTTGTCAAGGCAGTTCATTCCATTTATGGATATGTAAAACTGTTGGAAAGTTCTTTATTACACAGCTCTAAAATCCAGTTCAACTAGTCTTAGAATTACCCTCTGAAACCACACAAGGAGTCCAATTCTTCTTTGATATGACTGCTTCACAACCCTTGTTTATTCAAGCCTCATTACATGGTCTCTAATCTTTACCATCTCAGTTGCTCTCTAAAACCACTTCCATTTTCTAGTGTTTGTCTTCAAATGAAGAAGCTAAAATTAACGTGATACTTATGATGTGATCTGAAACCAGTACAGAATAAAAGATCCTATATCTGACTTTACACTATTGTAGGCTAAAACTGTATTAATCTTAAACAAAAATTTTAAGAAGACATTATCACAAAGTTGCCTAATATTGACTGCTTATAATTGACTGTGTATATATGTGTATATATATATAACATTGAGTTATATATAGTTATATAATATTGAGTTATAGTTACATGTAACATTAAGTTTTTTCTGTATATATGTACAGTTGTGTGTGTACACACACACACACACACACACATTCAAAATTTTACTTATAACTACCATGCTAAATTTCATCTTTTTAGTCTGAGTCTGTCTTTAGGAATTCTATTCTTCCCAAGACAAAATCATTTATGACTACATATTTATAAATAGGCAGCAGTGATTTTTCTTAACAAACCAGTCTTTGTGATCCATCAGGTAATATTGAAGGCTGGAATTCTTTTGGAATTGTCAAGTGAAAGAATTTAACATGAGCTGTTATAGGAAGAAACTTTCCAGGATATTATAAAACTGTATGAAATTACTTCTTCATAATTTTTTGGAATAGTTCATTCTTTTTATCAATTTCTCTAATATATTGATTATAGAATCAAAATCAGTATGTTTATCTTTCTAAGTAGATTGTTAATTGATCATCAGGGTAATTTCAGCTTAAGTATGATTTGTCATGGGAAATTT
2,ENSG00000000460,FIRRM,1,169662007,169854080,1,169660006,169662006,GGGGGCAGGGGAAAGGAGAGCATTTCATTGTGAATCAAGGAATTTCTCCACCTGTTTTAACTCTTCCATATGACATCAAAGAGATGTCACTTGCAGCTAGCATTTCAGTGATGTTTTCTTACTAATAATATCGTGATAAAAGAAACATTGACTATAAGAAATAGGAATGGGTCTCATAAAAGGAAACAGCAAAACCCCCAAACTAAAAAACAGCGCAGGCTATTTCTCTCTTCTCTCCTTTTGCTTGGCACTCATGAGATGCTAGGTGTGGAAGTCAGCCAACTGAAAAAGAGAGGTGGCTGAAGAAGGTGGGGAGGCTGAAGCCAGTTAAATAGGATGGTCCAATTCACAGACGGCGAGGCTACAGTGCAAATAGGACTCTTTCAACTTGAGCAGGACCCCATTACTTCACTGGAGTTAGAAAGAAAGGAGAGCGTAGACTTTTTGAACTTTCTATAAGAGTGTACCTCCACAGTATACAGAAGACGACGTGAAATTTGATCTGCAAGAAAACTGAGTCCATATTCACATATGTATCAAATTTGCACTTCATTTAGAAGTGTCTGTCATCAAGTACAGCACTGAATTGAAACTGAAAACAAGAGTCAAGAAAGAGCAAAGTCAGCCATCTTTATATTCCACATGAATCCTTTCCCTTTATGGTCTTATTTGTTTCTCCTCAGAAAAGACAAAAAGCTGAGCTGTATAAACACCTGTGGGCTGGGGGTTGAGGGATAAATGAGGGGCGAAATGGAAGCTGAAGGAACTGTTGGTCAGGTAGAAATCTTCCCAGATGCACTGAAGGAAACACACTTCATGTTTGACGTAGGAGGTGCCACCACACAAAACGTTTCATGGAAGGATTTAAAGGATCTCATGATTTTTAGTATTCCAAGAATTTTCTTTCACCAAGGGCGATTTAATATGGGTCATTCATACTGAAAGAAAAACAAAAGATAATAAGAGTTTAAAAATTGCAAAACTTGGAGTGTTAGTAGTAAAGGTAAATATTCATTAGAGATGAGAAGAGGAGCAAGGAAATGCTTTCAGCTGGAAATCTCAGACAAGAGGCCAGGCTTTAGGAACCTCTGAAGATGAACAAATGTAAGCAAACCCTAGTAGCAGCACTTCTCAGATTTTCATGTGCTTACCACTCAGAGATGGTGTTAAAATGCAGACTCTGATTCAGTAGGTCTGAGTGGAGCCTGAGATTCTGCACCCCTAACAAGCTCTTTAGTGATGCTTATGCCACTGGCGCACAGACCCCACTTGGAGAAATTTTTGTGGTGCATACGGTCTTTGTCTCCAGATCTAATGAGTCTGAAGGACAGTGTAGATTGATTTTTTAAATTTATGTTTATTTTAATTTAATTTAATTTAATTTATTTATTTATTTATTTTTGAGATGGAGTCTCACTCTGTTGCCCAGTCCGGAGTGCAGTGGCACGGAGGCAGCTCATGCAACCACGGCCTCCTGGGTTCAAGCGATTCTTCCGCCTCAACTTCCTGAGTAGCTGGGAATACAGGCACGTGCCAGCACACCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCACATTGGCCAAGCTAATCTCAAACTCCTGACCTCATGATCCACCTGCCACGGCCTCCGAAAGTGCTGGGATTACAGGCGTGAGCCACCGAGCCCAGCTGTAGATTGATTTTGAGCAGTGGAAAGTCAAGGAATTAGAAGGCATGCTTAAATGGAAAGTGAAATTGGAGAAAATTTAAACTCATGAAATAGTGGTGGTTATAAACTCGTGATAAATTATATCCTGGGATATAATTTAATGAGATGGTAACACATTTAGTTTAAAGAAATAAGTGACACTTTTTTTGTGTGACACAACTGTCTTATTCTTGGAAAGGACAAGGAGAGAATGAAATATGGTATGTCTTCACAGCACCTTTCAAAGGGAGAACCAGATTCTGAGGAGCTGGTCTCATGATGAACTGTCAGGGTAAACCACAG
3,ENSG00000000938,FGR,1,27612064,27635185,-1,27635186,27637186,ACTAAATTGATTTCACATATGCAAGTTTTTGAAGTGCCCTGGATTAGATAATCTCTTAGACTATAGGTTCCTGAACTTTCTTAGCTCACAGCACCATTACCATCTCAGTAATTTTTTCATGGAGCCTCTAAGCCAAAAGAAATAACAGTTCAATTTATTAAGTAGTTAAGTCCAAATCACTTAAAAAGTATTTATGTCCTAACAAGTTAGTTGCCATTTAAAAAATAATACACCTGAATTGGAAGAAACAATATTTATTTCATTCTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTTGCTCTGTTGCCCAGGCTGGAGTGCAGTGGCATGATCTCAGCTCACTGCAACCACCGCCTCCCGGGTTCAAGTGATTCTCCTACCTCAGCCTCCCAAGTAGCTGGGATTACAGGCATGTGCCACCATGCCCAGCTATTTTTTTTTTTTTGCATTTTTATTTATTTTTTATTTTTTATTTTTTTTTTGAGACAGTTTCGTTTTTGTCGCCCAGGCTGGAGTGCAATTGTGCAATTGCGGCTCACTGCAACCTCCGCCTCCTGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGATTACAGGTGCCCGCCACCACACCCAGCTAATTTTTTGTATTTTTAGCAGAGATGGGGTTTCACCACCTTGGCCAGGCTGGTCTCAAACTCCTGACCTCAGGTGATCTGCCCGCCTCAACCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCACCGCGCCCAGCCTTCATTCTTAAACAACTGCAATTACTAATGGGATCTATGCTGCCCATTGGGCACTGAACAACTTGTTAAATTTTCTAATCAAAGCAAACACTGCTGCCTTCATTTCCTGTCCCACATTGATTTTCATTTGCTTTTTATCACAGCAACCACTAAAGACCCAGCTTCTTTTTTTTTTTTTTTTTTTTTTTTGAGGCGGAGTCTTGCTCTGTTGCCCAGGCTGGAGTGCAGTGGTGCGATTTCGGCTCACCGCAAGCTCCACCTTCTGAGTTCATGCCATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCAGCTGCCACCAGACCTGGCTTATTTTTTATGTTTTTAGTAGAGATGGGGTTTCACCACGTTAGCCAAGATGGTCTCGCTCTCCTGACCTTGTGGTCCACCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACTGCGCCCGGCCAAGACCCAGCTTCTTAAAGATGACATCATCAAAAGGATTGTGGCATAATGTAATAATGGAACTGTGAACTACCTTGAGCTAATAGTTCTCTCATTAGCCAAAGATGTTGAATATCTTTCAAAATATCCTACTTTGCCCCCTGTGAATTTGCTGTGGGTGCCCTCAGGCACCTTGATGCACAGTTGGGGAACTATGGTTTAGTCTGTGGGAATAGCAGTTCCACTACTGAAACTACTGAGAGGTCCTGCCGGAAACACAATAGGTGGTATTATAAAACAGATACTGTGGGCTGGTATCATTCCTCTGTGTACCCAAATTAACGTCTGCAGAAGCTTCTTGAAGGCATTTCCAGACCAGACAAAGGAAGGACAATGTTCCAGACCAGGGAGTAGTGCAGAGTCTCAGGTTCCTTCATGAACCCTGAGTGACTCCATCTGGCTGGATTGTAGAGGGTAGACTGGAGGAGCAGTCAAACTGAAGGACTTCCTATGGCTGGTGAGAGACCACAGACTCAATCCTGTAGGCAAGGGAGGCATTGGAAGACTTCACGAGGGTCTCCATTTCATTTGGAGTTGGTACTCCATTTTTTAAAAAATTAATAGCCTTTACTTTCTAGAGCAGTTTTAGGCTTACAGAAAATTGAGCAGATAGTATAATGAGTTTCATATAACCCTCTTTCCCCAGCTCAATTTTCCCAATTATTAACATCTTGCATTAGTGTGTTGTTGTAATTAATGAACCAATATTGATACATTATGATAACTG
4,ENSG00000000971,CFH,1,196651754,196752476,1,196649753,196651753,TCTGCTGCAATTAGTGAAACAAGGAACAGTGTTACCACATATGGTCCAGAAATCTAAAAGGGTGATACTGCCATACTGTGAGAGAAAGAAGAGTTGAACTCTTAAGAGCCCTCTGAACTTTAAATCCCATGATTCATAAATACATGTTTAAACTGTGCTTTAGTAATCCTCTTTTACAAAAGAATAGTTAATTTTGGGTTAATTTTATACACTAAAAAAATTTTGGTTTGAGCCCACAAGTGTCTCCTTGGAAAACATATTTTTCTTCACAATTGCGTGATCTCTACTTTTAGGCACTGTTAAGAGAATATTTTAGAAAGAAATTAATGAACATGAGACAGTAATAACCCACATCAGATCCATGTTTGGCTAATGGTGTGTGGGGTCCAACTATTTGTCTTCATGGCATTCGTAGTTCCTAAGCAAAGGAAGTGCATGCACAAGTCCTTTGGCATTAACTTCTACCTGAAGTATATTATTTCCAGAGCCCCTCTCAAATCTATAGTGAATTTATCCATGCAGTAATCAAAATGTAACTTATAAACCACTTATTTTACATCAGGACCTTGCTGTGTATTGAAGATAAACATTGCAATAAACCATCATTCCAGCTTTCAATGAATTCAAAATAAGACTCTGGGAATTTAGTTTTATTTGAATATGTGACTCTGTAGCATTACTACCTCATTGTAGTCATCTGATTTTTTCGTCTCTTTTACTTCTTCACTTTGATACCTACTTAAAACTCTTCCTTAGCACATATTTGTTTTCACAATGAGAGATCCATCTGTTTTTCCCAAATGTATTACTGCATTTCTATTCTTGCTTCTCTTGAACACTAATTGATTCACACACACACACACACACACACACACCCCTCAAAGACTCTACATTTATAAAACATTTTTCTTTTATAATTTTTTTCCATTTCTCAGTCACAAGTACAAGTAAAACCTGAATATTCAATACTATGCTTGCAGGTCCTGCTTTCTTTAGGAAAAGAGATACCAAGAGAAGGCACTGGGATTGGGTACTCCTTTTCAGAAGACAGTTCACTAATAATCGATATAAATAAAGGAGGATTTATCCCAAATTAGTTAAGCATACACTCTCTCACCACGTTTTATTTTATTGAAATATACTTTACGCTGATTAGATATAGAGAAGTCTTTCTTAATAAAAGAATTCCACTCTTTGTATGCCAACTAATATAGCATAAATATGACTATTTTCTTGGTACAAAGCAAGGATCTACTTGCTGATATAAGATGAATTCAAAATCTATACACATCATTTTCTCTGAAATCAGGTAACTTCAAACTCTCTTTGTTGCTTCTAAATCTATAAGAAGTTTGGAATTAAATAATAAGATAATATTAAAGTGAAATAATCTACTTAAATTTTAAATACTTTAATTTTCTTCAATCTGGTCTATGGCACACACATGTCAGAGGACTGATTTAAAGGTGAATTGTCAAAACTTTGGTCTCTGTGTCCAATAGGGACAATCTGTAACAGAAAGTTATTTATAGAAACAATCAAAGGTACGAAGAACTGCTAAAGGAAAGCTATCTTTGATCCGAATCTATCTCTTCCCTGTACAGATTGGATTTTCTATTGGTTCAAAGCATAGAGTGCTAACCTTGAATTAGTATTCTAACTTCAAATGACCAATGGTTGTTTCCAATAGTAGGAAAATATGGATAGTCTCAAGTATTGTTTTGGATAAAGGTACAGGTTCGGGCAATGTTTTAATAACATGATGTATTTTTAAAAAGATTATGAGTGTAGGAAATTAAGGTTTATCACAGGATTTATGTCTACTGCTTGAGCATAATCACCAGAAGTCTTTCTATTCTTAATTAACAAAATTGTTCCATATTATTTTATTGATTTCTTAGTTAAACTCTTGTTTTTTGCATCTCATAGCTTTTGACTTCAAAAATTGGTATTAAATTGAGCTATATCATAAGGAAAAATAATTGAAGATACATAAACTGA
5,ENSG00000001460,STPG1,1,24356999,24416934,-1,24416935,24418935,GGAGGCCGTGTCCCCGCACTCGAGCTTAAGGACATCTGACAGGTGCTGGGGTAGGGTGCTCCCTCAGACTGGTCTCTCCCATGGACAGGTTTGGTTTCTCCCATTAGACTAGCTGTATCTTCCTTCTCAGGCTAGGGGTTCCTAAGGACCGGATTGTGTCTCCTCCATCAGACTGGAAGTTCTCGAGGGCTGGCCTTTGTCTCCTTCATCAGACTAGTGTTTACTGAGGGCTGGGCTGTGCTTCCTCCACCAGACTCGGGGCTCCTGAGGGCTGGATTGTCTCCTGTGTGAGACAGTCCTGAGGGTTCCCTGACTGATGTTCTAGAAAGTCGCTTAGGACCTCTGTACCCACCTGTCTTCCTCCGGGCTGCATCATGGTCACCCATGACAACCAACTCTGAGGCCTTGGAGGGGACGCTGCTGCCTACCCTGCTGATGGCTCTCACCCGGAACCGATAGCTCTGACCTTCGACGAGGCCTTGGATTGGGCACCGACAAGTCCCTCCGGGGGCCTCATGGCAGGCGATCCATTCCCCAGACTCGCCCTGGCACCTGTTGGAGACAGGCCCCCCCTTTCAGCCCCTGCCCACAATGCACACCTTCCCAATGTGAGCCTCTACAAGGGCCTTTCTCTGACTGGGGGAGCCTGCCCAGCTCATCTCTGCCAATTAAAGTTCTTCTCTCCAATGGAAGAGCCTGGCTCCAACCTCGGGCCCCCGGGAGACCTTTTCACCATCCCTGACACTGACCAGCAGCAGTGCCTTTCATGGGGCATCATCGTGAGCCTGTGACTGTGCTGTGCCCTCCACACCCAGCCCTCGTTTTATCTCCCTGGCAGTCCTAGGGGGCACGTGTGGTCACCTTCACTGCACACATGGAGGATCTGTGGCTCAGAGAGTGACAAGACCTGCCCAAGGTCACACAGTTAGGATTTACCTGAGCTTGTCCCCTCCCTAAGTTGGCATGCATTCCAGGTTGACCCTCCAAGCCTGGGCTCTCTGTACCCTCTTGGTACCATTGTTCCAGGCAGTACGACAGTGCACGTTACCCTTATTGTTACCAGCTGGTTGAGGTAGTCGTCACATGTGGGTCCAAGTCTTGTCTCCACCACTAATGACAAATGTGTCTTTGAGCAAAGTCATCTCCCCACATACTTTGGTTTCCCCATCTATAGAAGAGGGCATTAGATAGTCCCCCTCCCCTTTAGAGACGGGTCATTGGAAGGATGGAAAGAGATGAGGCATGCCCTTAGCACGGTCATGAGCCTGTCCGTGGCAGCATTAGCTATTTTCTCAAGCTGGGCCTTCAGAGCCGGCCACACTGGAAAAAGGCCGGAGGTTCTGGCCTCTCTACCTGTTTGCCTCCTCCCTTGGGGTCAGTGCTATTCCATCTCTCTCTGTCATTAGGAAGGAAACTAGCCCACGAATTCACTGTGCTCAGGGTTCAAGCACTCCCCGTTTTGCCCCTCTGGGTCAGCTCTAGAGGGTCCACAGGGCAGTGGCCGACTCCAGCATAGAATTCCAGTTATCAGCATGACAAAGACTTGGAGGTGTGGATGCCCCTCCATTGCCCCTAAATGACACAGCACAGCCTGTCTCCTGTCCTTTTCTGAGGAGCGGGGGTGTGAGGGTTCTCTGAGGTCAGACATGGGGCCTCCTCTTTGCTGTGGCTCCCACCACCAGACAGAATTCTACCCCTAGGGCCTCTGCCACTCACGAGGTCGACTCACCGCTCAATGGTGTAGGCAGTGATGGGGTTGCCCCGGGTGTCACTGGGCGGGGCCCAAGTCAGGATGAGGCAGTCTCTGTTCACATCCAGGCATCGGACGTTCAGTGGGGAGCCTGGGGCCCCGGGGTTCTCGGCCTCGGCATCTGAAACCCGGGGGTGAGAGGGAGGCCCTTCTAGTCAGTGGCCATTTGGTGACAGGCCCTTCCCACTCCCGCCCAGCATCCTCTTCTCATCCATCCTCCAGGGGCTCAGTTTTGATCTTGAGGACTGGGG
6,ENSG00000001461,NIPAL3,1,24415802,24475252,1,24413801,24415801,TCCATTTCTACCTATAACCCCCTTCCGTCTTGTCTCAAATGGCCATCAGGGGGATCCTTTTATAGCACGTGTCAGACCAGGTCCCTGTCTGCCCAGAACCCTCCCACGTCCCGCATCTCCCTCAGAGCAAAAGCTTCCCCATCGCCTGACCCCTCGGCCTGTCTGGCCACCGTTCCCTCTGCCCTTAGTCTCTGTGCCTCTCTGCTGTTTCCTGAGGGTCCAAGTCCCACCTCTGGGCCTTTGCACTTGCTGGTCCCTCAGCCTGGAATGTTCTTCTCCGAGAAATCTCTTGGCTTCCTGTAGGCCTTTAATAAAAATCCACCTTCCTGGGCAGTGGGGAGGCCATCCCAGCCACTCCATCTAAAAATGTCAGCAGCTCCTCTCCCCACACTTCAAGTTCCCCTCCCTGCTCAATGTCTTTCTCCTTAGCGCCTACCACTCACTACTGTGCTTAATATTCTACTGATTTACTCCATTTATCATTGGCACCCCCTCCCCTGACTGCCGTGTCCCTAGGGCCAAGAACAGGACTGGCACATAGCAGGTGCTCCATCTATGTTTCTGGGACAAATGAATGAACAAATGAATGAATTCTCACAATAACCCTGCAGGAAGGTGTTAATTTCTCTGCCGATGAGGAAAGTCAGGTTCAGAGAGTTTCGGTAACTTGGCCAAAAGTCCTGCATCTAATAACTGGTGGATATAGTATTAAACTTCACTCAGGTGTGGTTGGCTGCGAACCTGTTTACTCTCCACAGCATCAGGGACCCAGAAGGCAGCCAGAAACCCCGAGTCTCTGGGCAGATGGAGGGTGGAGAGCCCAGGGTGAGACAGATGAGTGAGGTACCCAGAGACCCTTTCTGCTGAGTCTGGGCAAGAGAGTTAAGACCTAGAAGCACAAGGGGACCCACTGATTCCACGCTCTGTGGTTTTCCCCCCACAAGCCCATCTGGCTGTCAGTGGCCTAGACCAGGTAGGACAAGAAATTGCCATAAGAAACCATGAACACTGCTGAGCTCCTACTGTGTGCCAGGCTCCATTCTAAGTCTTTATGTGCGCTAATTAATTTGATCCTCACATTAACCCTACAAGCTGTGTATGATTATTATCATTTTATGGAAGAAGAAACAGAGGCACAGAGAAACTAAGTAACTTGCCCAAGGTCACGCAGCTGGTAAGTAGCAGAGATAGAATTTGAACCCAAGCAGTCTGACGCCAACATCTTCACCATGATACACTACCACCTTGTGAAACAAATCCTCAATTATCTACAGCTTTCCTATGTCGGAGAAAAAGGATTACCCATGAGTGTGAAAAGAGAGGCCTGTTTGGATAATCAGAGCCAGAGATTCCTCCCTGGAAATTTTAAATTCATTTATTCCCTAAACATTTTCTGTCCACCTAGTTTGTGCCAGGCATTGTGCTGGGCACAGAAAGTAATGAGAAGATAAACTTAGTCTCTGACTTCAGGGAGATAGTCAAGAACACCTCCTTGGGAAGCACTGACAGATCCCAGGGGTTGCTGGCTTTGGCTGTGGGCTGTGTTTTATTTCATCTTCACAGGGCAGGTTCTCTATTCTATGCTGGAGGAAACTGAGGCTCAGAGAGTTTAGGACACTCACTCAGGATCACACAGCACAAGGAAGGATTTGAATAGATTTTGACACCATGCTGTCTTCTACTCTGCCATACTGCCCCCCCTCCACTGTGAAGTTCACTCCACAAGACCTGCCTCATCCCCGCCGGGGTCTCCCCTGCCCCCACTCCCCAGTGAGAGCCACCTCCTAGTGCTAATCACACCATAGGCTCTGCAAGTGCTGCTGGTGCCTGTCCCTGGCTGGAGGGTACCAGCCTTGTTTCCCTCCCAACATAGTCCCACCAAGAAGTGCTTGGCCCCAATGTGTGGTCTGATGCCATCTGGCTTGGGGGCAGGGAGGTACCCAGGAAGTGGTGGATCGTATTGTCATTGTTGTCTGTGTCTTGTATTTTGTGTATTTCCAC


In [14]:
write_tsv(gene_info, '../../result/deseq2/all_gene_sequence.tsv')

### Significant Genes

In [15]:
sig_gene <- read_csv('../../result/deseq2/sig_gene.csv')
head(sig_gene)

[1mRows: [22m[34m8300[39m [1mColumns: [22m[34m7[39m


[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): gene
[32mdbl[39m (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj



[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000227232,420.856227,1.877341,0.4063936,4.619515,3.846377e-06,3.423083e-05
ENSG00000278267,8.613466,2.615336,0.854557,3.060458,0.002209988,0.007991285
ENSG00000268903,18.716412,5.774899,0.9753198,5.921032,3.199285e-09,6.290752e-08
ENSG00000269981,12.54748,5.752313,1.2084716,4.75999,1.936023e-06,1.891486e-05
ENSG00000239906,3.579557,3.791956,1.4671741,2.584531,0.009751165,0.02777531
ENSG00000279457,551.484304,2.5326,0.3827524,6.616811,3.670306e-11,1.079563e-09


In [16]:
# only keep the gene names
sig_gene <- sig_gene[, 1]

In [17]:
sig_gene_info <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", "start_position", "end_position", "strand"),
    filters = "ensembl_gene_id", 
    values = sig_gene$gene, 
    mart = mart
)
head(sig_gene_info)

Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,chromosome_name,start_position,end_position,strand
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>
1,ENSG00000000457,SCYL3,1,169849631,169894267,-1
2,ENSG00000000971,CFH,1,196651754,196752476,1
3,ENSG00000001084,GCLC,6,53497341,53616970,-1
4,ENSG00000001167,NFYA,6,41072974,41102403,1
5,ENSG00000001617,SEMA3F,3,50155045,50189075,1
6,ENSG00000001629,ANKIB1,7,92245974,92401383,1


In [18]:
sig_gene_info <- sig_gene_info |> 
    mutate(
        upstream_start = ifelse(strand == 1, start_position - 2001, end_position + 2001),
        upstream_end = ifelse(strand == 1, start_position - 1, end_position + 1),
        temp_upstream_start = upstream_start,
        temp_upstream_end = upstream_end) |>
    mutate(
        upstream_start = ifelse(strand == -1, temp_upstream_end, temp_upstream_start),
        upstream_end = ifelse(strand == -1, temp_upstream_start, temp_upstream_end)) |>
    dplyr::select(-c(temp_upstream_start, temp_upstream_end))

In [19]:
head(sig_gene_info)

Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,chromosome_name,start_position,end_position,strand,upstream_start,upstream_end
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>
1,ENSG00000000457,SCYL3,1,169849631,169894267,-1,169894268,169896268
2,ENSG00000000971,CFH,1,196651754,196752476,1,196649753,196651753
3,ENSG00000001084,GCLC,6,53497341,53616970,-1,53616971,53618971
4,ENSG00000001167,NFYA,6,41072974,41102403,1,41070973,41072973
5,ENSG00000001617,SEMA3F,3,50155045,50189075,1,50153044,50155044
6,ENSG00000001629,ANKIB1,7,92245974,92401383,1,92243973,92245973


In [20]:
dim(sig_gene_info)

In [21]:
dim(sig_gene)

8195 significant genes had chromosome coordinates, compared to 8300 total significant genes present.

In [22]:
system.time({
    sig_gene_info <- sig_gene_info %>%
        mutate(upstream_region <-  pmap_chr(
          list(chromosome_name, upstream_start, upstream_end, strand), 
          fetch_sequence
      ))
    })

    user   system  elapsed 
  61.788    0.734 1765.618 

In [23]:
write_tsv(sig_gene_info, '../../result/deseq2/sig_gene_sequence.tsv')

## Conclusion

We extracted upstream 2000 bp sequences for all genes in the RNA-seq count matrix and significant genes identified by DESeq2.

In [24]:
sessionInfo()

R version 4.3.1 (2023-06-16)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS/LAPACK: /home/ubuntu/miniconda3/lib/libopenblasp-r0.3.28.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] stringi_1.8.4   xml2_1.3.6      jsonlite_1.8.9  httr_1.4.7     
 [5] biomaRt_2.58.2  lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1  
 [9] dplyr_1.1.4     purrr_1.0.2     readr_2.1.5     tidyr_1.3.1    
[13] tibble_3.2.1    ggplot2_3.5.1   tidyverse_2.0.0

loaded via a namespace (and not attache