DESeq2 Tutorial — Jason Tsai, 11th November 2021

# Initiate libraries

In [1]:
library("tidyverse")
library("DESeq2")
library("pheatmap")
library("RColorBrewer")
library("apeglm")
library("CancerRNASig")
library("DEGreport")

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects

# Data import
To demonstrate the use of `DESeqDataSetFromMatrix`, we will read in count data and sample metadata from CSV files in `data/PDAC_MICRODISSEC_NAIF/`.
We read in a count matrix, which we will name `cts`, and the sample information table, which we will name `coldata`.

In [2]:
# Load the raw counts; the first CSV column supplies the row names, and the
# resulting data frame is converted to a matrix.
cts <- as.matrix(
  read.csv(file = "data/PDAC_MICRODISSEC_NAIF/rna_seq_selected.csv", row.names = 1)
)

# Load the sample information table; the first CSV column supplies the row names.
coldata <- read.csv(
  file = "data/PDAC_MICRODISSEC_NAIF/rna_seq_metadata_selected.csv",
  row.names = 1
)

# Normalize the counts using upper quartile normalization

In [3]:
# Preview the first rows of the raw count matrix.
cts |> head()

Unnamed: 0,BPDAC_023_19_L1_S92,BPDAC_023_26_L1_S5,BPDAC_023_26_L2_S13,X0823_012,X0823_013,BPDAC_029_26_L1_S70,BPDAC_029_26_L2_S78,X0923_009,X0923_010,BPDAC_029_36_L1_S7,⋯,X0623_015,BPDAC_038_20_L1_S48,BPDAC_042_11_L1_S100,BPDAC_042_11_L2_S101,BPDAC_042_11_L3_S102,BPDAC_086_31_L1_S194_L003,BPDAC_086_31_L2_S202_L003,BPDAC_086_37_L1_S232_L004,BPDAC_086_38_L1_S240_L004,BPDAC_086_38_L2_S247_L004
ENSG00000160072,459,572,0,365,312,77,86,86,67,27,⋯,17,135,188,833,80,687,381,329,574,201
ENSG00000234396,0,0,0,5,0,0,0,0,13,0,⋯,0,1,10,6,0,0,0,0,0,2
ENSG00000225972,3364,3967,0,2501,4641,54,22,72,41,7,⋯,59,8,13,34,5,24,21,36,43,12
ENSG00000224315,30,0,0,25,40,0,0,85,0,0,⋯,14,2,2,2,50,58,36,20,46,0
ENSG00000198744,314,167,0,329,330,23,44,160,128,33,⋯,154,17,51,182,51,135,44,76,103,37
ENSG00000279928,1,0,0,0,0,0,0,0,0,0,⋯,1,1,6,1,0,8,3,4,1,2


In [4]:
# Upper-quartile normalization of a count matrix, returned on a log2 scale.
#
# Every column is divided by a reference quantile computed from that column's
# strictly positive entries, multiplied by `scale`, and transformed as
# log2(1 + value).
#
# Arguments:
#   rawcounts  Count matrix (or data frame of counts); normalization is
#              applied column by column.
#   probs      Single quantile used as the per-column reference. The default,
#              0.75, is the upper quartile.
#   scale      Single multiplier applied after dividing by the reference.
#
# Returns a numeric matrix with the same dimensions and dimnames as
# `rawcounts`.
#
# A column with no positive entry has no reference quantile. Dividing by the
# resulting NA would turn the whole column into NA, so such a column is divided
# by 1 instead: its zero counts stay 0 and any NA entries stay NA.
uqnorm <- function(rawcounts, probs = 0.75, scale = 1000) {
  stopifnot(length(probs) == 1L, length(scale) == 1L)

  # Per-column reference: the `probs` quantile of the strictly positive counts.
  # which() also drops NA entries, so they never reach quantile().
  reference <- apply(rawcounts, 2, function(x) {
    positive <- x[which(x > 0)]
    if (length(positive) == 0L) {
      return(NA_real_)
    }
    quantile(positive, probs = probs, names = FALSE)
  })

  # Columns without any positive count: use a neutral divisor.
  reference[is.na(reference)] <- 1

  # t(x) / reference recycles `reference` down the sample dimension, which
  # divides each original column by its own reference value.
  log2(1 + t(t(rawcounts) / reference) * scale)
}

# Apply upper-quartile normalization to the raw count matrix.
cts_normalized_uqnorm <- cts |> uqnorm()

# Preview the first rows of the normalized matrix.
cts_normalized_uqnorm |> head()

Unnamed: 0,BPDAC_023_19_L1_S92,BPDAC_023_26_L1_S5,BPDAC_023_26_L2_S13,X0823_012,X0823_013,BPDAC_029_26_L1_S70,BPDAC_029_26_L2_S78,X0923_009,X0923_010,BPDAC_029_36_L1_S7,⋯,X0623_015,BPDAC_038_20_L1_S48,BPDAC_042_11_L1_S100,BPDAC_042_11_L2_S101,BPDAC_042_11_L3_S102,BPDAC_086_31_L1_S194_L003,BPDAC_086_31_L2_S202_L003,BPDAC_086_37_L1_S232_L004,BPDAC_086_38_L1_S240_L004,BPDAC_086_38_L2_S247_L004
ENSG00000160072,10.884086,10.931638,0,10.931381,10.24675,9.562055,9.499226,9.137226,9.541957,8.315816,⋯,7.281192,10.021621,11.217198,11.86099,9.130418,11.489586,10.907188,10.476992,10.747826,9.850216
ENSG00000234396,0.0,0.0,0,4.79379,0.0,0.0,0.0,0.0,7.184324,0.0,⋯,0.0,3.119739,6.995284,4.796271,0.0,0.0,0.0,0.0,0.0,3.346859
ENSG00000225972,13.757038,13.724967,0,13.707287,14.14046,9.050969,7.538181,8.881384,8.834647,6.381161,⋯,9.069755,5.966672,7.371184,7.255366,5.168523,6.664176,6.738679,7.293188,7.019507,5.808536
ENSG00000224315,6.959501,0.0,0,7.073497,7.291327,0.0,0.0,9.120382,0.0,0.0,⋯,7.003071,4.03427,4.717893,3.311586,8.453889,7.928831,7.510647,6.452531,7.116079,0.0
ENSG00000198744,10.336709,9.157261,0,10.781653,10.327605,7.823305,8.534295,10.031703,10.474945,8.604499,⋯,10.452241,7.04187,9.336662,9.667999,8.482378,9.144285,7.798715,8.366341,8.273251,7.415529
ENSG00000279928,2.354798,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,3.334984,3.119739,6.265838,2.450033,0.0,5.107387,4.010185,4.195034,1.997839,3.346859


## Verify the integrity of the data

In [5]:
# Preview the first rows of the sample information table.
coldata |> head()

Unnamed: 0_level_0,patient_ID2,sample_ID,ID_DNA,N..histo,ID_Anapath,Idpatient_bloc,Idpatient,sample_ID_4_merge,path_svs,ID_scan,⋯,DECES.1.OUI,date_décès,OS..jours.,OS..360.jours.,OS..mois.,PFS..jours.,PFS.formule,PFS..mois.,Unnamed..27,infiltrat_lymphocytaire
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>
BPDAC_023_19_L1_S92,BPDAC_023,BPDAC_023_19_L1,,B_12AG01290_19_L1,12AG01290_19_L1,12AG01290_19,12AG01290,12AG01290-19,\\bob\histology\datasets\PDAC_Bjn_100\PDAC_MICRODISSEC_NAIF\old_names\12AG01290-19_MDNF01_HES.svs,12AG01290-19_MDNF01_HES.svs,⋯,0,,356,351,11.5082,316,321,10.36066,,fort
BPDAC_023_26_L1_S5,BPDAC_023,BPDAC_023_26_L1_MDNF01,,B_12AG01290_26_L1,12AG01290_26_L1,12AG01290_26,12AG01290,12AG01290-26,\\bob\histology\datasets\PDAC_Bjn_100\PDAC_MICRODISSEC_NAIF\old_names\12AG01290-26_MDNF01_HES.svs,12AG01290-26_MDNF01_HES.svs,⋯,0,,356,351,11.5082,316,321,10.36066,,modere
BPDAC_023_26_L2_S13,BPDAC_023,BPDAC_023_26_L2_MDNF01,,B_12AG01290_26_L2,12AG01290_26_L2,12AG01290_26,12AG01290,12AG01290-26,\\bob\histology\datasets\PDAC_Bjn_100\PDAC_MICRODISSEC_NAIF\old_names\12AG01290-26_MDNF01_HES.svs,12AG01290-26_MDNF01_HES.svs,⋯,0,,356,351,11.5082,316,321,10.36066,,faible
X0823_012,BPDAC_023,BPDAC_023_26_L1_MDNF02,,B_12AG01290_26_L1,12AG01290_26_L1,12AG01290_26,12AG01290,12AG01290-26,\\bob\histology\datasets\PDAC_Bjn_100\PDAC_MICRODISSEC_NAIF\old_names\12AG01290-26_MDNF02_HES.svs,12AG01290-26_MDNF02_HES.svs,⋯,0,,356,351,11.5082,316,321,10.36066,,faible
X0823_013,BPDAC_023,BPDAC_023_26_L2_MDNF02,,B_12AG01290_26_L2,12AG01290_26_L2,12AG01290_26,12AG01290,12AG01290-26,\\bob\histology\datasets\PDAC_Bjn_100\PDAC_MICRODISSEC_NAIF\old_names\12AG01290-26_MDNF02_HES.svs,12AG01290-26_MDNF02_HES.svs,⋯,0,,356,351,11.5082,316,321,10.36066,,modere
BPDAC_029_26_L1_S70,BPDAC_029,BPDAC_029_26_L1_MDNF01,,B_13AG00043_26_L1,13AG00043_26_L1,13AG00043_26,13AG00043,13AG00043-26,\\bob\histology\datasets\PDAC_Bjn_100\PDAC_MICRODISSEC_NAIF\old_names\13AG00043-26_MDNF01_HES.svs,13AG00043-26_MDNF01_HES.svs,⋯,1,,390,385,12.62295,326,330,10.68852,,modere


In [6]:
# Check that the row names of coldata and the column names of cts are the same
# set of sample names, regardless of order. setequal() tests both directions;
# a one-sided `%in%` would still return TRUE when cts contains samples that
# have no row in coldata.
setequal(rownames(coldata), colnames(cts))

In [7]:
# Check that the sample names are also in the same order. identical() compares
# length as well as content, so a length mismatch cannot slip through the
# silent recycling that an elementwise `==` performs.
identical(rownames(coldata), colnames(cts))

# Save the normalized counts

In [8]:
# Write the upper-quartile-normalized matrix to CSV.
write.csv(
  cts_normalized_uqnorm,
  file = "data/PDAC_MICRODISSEC_NAIF/rna_seq_selected_uqnorm.csv"
)

# Construct a DESeqDataSet
Now the row names of `coldata` and the column names of `cts` contain the same sample names in the same order.
With the count matrix, `cts`, and the sample information, `coldata`, we can construct a `DESeqDataSet`:

In [9]:
# Placeholder grouping: label every sample at random as "Positive" or
# "Negative". The seed is fixed so that the labels, and therefore the design
# used below, are the same on every run of the notebook.
set.seed(42)
coldata$condition <- factor(
  sample(c("Positive", "Negative"), size = nrow(coldata), replace = TRUE)
)

# Build the DESeqDataSet from the raw count matrix and the sample table, with
# `condition` as the only design variable, then display the object.
dds <- DESeqDataSetFromMatrix(
  countData = cts,
  colData = coldata,
  design = ~ condition
)
dds

class: DESeqDataSet 
dim: 61806 29 
metadata(1): version
assays(1): counts
rownames(61806): ENSG00000160072 ENSG00000234396 ... ENSG00000210195
  ENSG00000210196
rowData names(0):
colnames(29): BPDAC_023_19_L1_S92 BPDAC_023_26_L1_S5 ...
  BPDAC_086_38_L1_S240_L004 BPDAC_086_38_L2_S247_L004
colData names(94): patient_ID2 sample_ID ... infiltrat_lymphocytaire
  condition

## Extracting transformed values
These transformation functions return an object of class DESeqTransform which is a subclass of RangedSummarizedExperiment.
Here `vst` is run on a newly created `DESeqDataSet`. The `assay` function is used to extract the matrix of normalized values.

In [10]:
# Run vst() on the DESeqDataSet and display the resulting object.
vsd <- vst(dds)
vsd

class: DESeqTransform 
dim: 61806 29 
metadata(1): version
assays(1): ''
rownames(61806): ENSG00000160072 ENSG00000234396 ... ENSG00000210195
  ENSG00000210196
rowData names(4): baseMean baseVar allZero dispFit
colnames(29): BPDAC_023_19_L1_S92 BPDAC_023_26_L1_S5 ...
  BPDAC_086_38_L1_S240_L004 BPDAC_086_38_L2_S247_L004
colData names(95): patient_ID2 sample_ID ... condition sizeFactor

In [11]:
# Extract the transformed matrix and preview its first rows.
head(assay(vsd))

Unnamed: 0,BPDAC_023_19_L1_S92,BPDAC_023_26_L1_S5,BPDAC_023_26_L2_S13,X0823_012,X0823_013,BPDAC_029_26_L1_S70,BPDAC_029_26_L2_S78,X0923_009,X0923_010,BPDAC_029_36_L1_S7,⋯,X0623_015,BPDAC_038_20_L1_S48,BPDAC_042_11_L1_S100,BPDAC_042_11_L2_S101,BPDAC_042_11_L3_S102,BPDAC_086_31_L1_S194_L003,BPDAC_086_31_L2_S202_L003,BPDAC_086_37_L1_S232_L004,BPDAC_086_38_L1_S240_L004,BPDAC_086_38_L2_S247_L004
ENSG00000160072,7.823556,8.057938,3.875114,8.05232,7.554533,8.584848,7.095814,6.905523,7.093728,6.577804,⋯,5.495725,7.654655,7.744428,8.563158,7.11681,7.869435,7.814429,6.845077,7.655784,7.166568
ENSG00000234396,3.875114,3.875114,3.875114,4.547513,3.875114,3.875114,3.875114,3.875114,5.515877,3.875114,⋯,3.875114,4.300203,5.029064,4.468461,3.875114,3.875114,3.875114,3.875114,3.875114,4.278114
ENSG00000225972,10.532665,10.713805,3.875114,10.690521,11.232514,8.117181,5.732987,6.708633,6.549047,5.390825,⋯,6.621571,5.049138,5.181121,5.245442,4.848546,4.864546,5.079943,5.010895,5.186815,4.846989
ENSG00000224315,5.184987,3.875114,3.875114,5.329868,5.493575,3.875114,3.875114,6.892345,3.875114,3.875114,⋯,5.358178,4.474145,4.402109,4.219286,6.592306,5.375091,5.423461,4.731096,5.228806,3.875114
ENSG00000198744,7.353787,6.593791,3.875114,7.918914,7.623993,7.064275,6.366555,7.644273,7.884681,6.795027,⋯,7.74461,5.540183,6.26745,6.695184,6.61338,6.054732,5.570897,5.483687,5.82422,5.523058
ENSG00000279928,4.12226,3.875114,3.875114,3.875114,3.875114,3.875114,3.875114,3.875114,3.875114,3.875114,⋯,4.287756,4.300203,4.778164,4.118768,3.875114,4.453738,4.341808,4.262397,4.081931,4.278114


In [12]:
# Write the variance-stabilized matrix to CSV, converted to a data frame first.
write.csv(
  as.data.frame(assay(vsd)),
  file = "data/PDAC_MICRODISSEC_NAIF/rna_seq_selected_vst.csv"
)