# `pathview` Tutorial

In [1]:
# Attach pacman, then use it to load the packages this tutorial relies on.
library(pacman)
p_load(pathview, gage, tidyverse)

# Example expression data (displayed in the next cell).
data("gse16873.d")
# Human pathways data.
data("paths.hsa")
# Demo pathway-related data: 3 pathway IDs plus related plotting parameters,
# stored as a named list whose elements are accessed with `$`.
data("demo.paths")

## Start

In [2]:
# Display the gse16873.d data loaded above.
gse16873.d

Unnamed: 0,DCIS_1,DCIS_2,DCIS_3,DCIS_4,DCIS_5,DCIS_6
10000,-3.076448e-01,-1.472277e-01,-0.0237848080,-0.0705619313,-1.323087e-03,-0.1502681305
10001,4.158680e-01,-3.347726e-01,-0.5131369072,-0.1665371241,1.111222e-01,0.1340073370
10002,1.985493e-01,3.789588e-02,0.3418653408,-0.0852741998,7.675593e-01,0.1582860892
10003,-2.315530e-01,-9.659311e-02,-0.1047272830,-0.0480140390,-2.080564e-01,0.0334444824
100048912,-4.490724e-02,-5.203146e-02,0.0363903758,0.0480782310,2.720582e-02,0.0544473911
10004,-8.756237e-02,-5.027725e-02,0.0018211331,0.0302383471,8.034394e-03,-0.0686074928
10005,-1.262668e-01,4.777874e-01,-0.1061224804,0.2346640653,9.676428e-02,0.0635200439
10006,6.502821e-01,1.950600e-01,-0.0053586469,-0.2263254562,8.259053e-02,-0.0694435042
10007,1.566912e-01,5.535167e-02,0.0988623520,-0.0257075972,2.126088e-01,0.1593592082
10009,-1.929644e-02,5.141373e-03,-0.0196603248,0.0058828281,-2.317358e-02,-0.0002808255


In [None]:
# Visualise a single sample (the first column of gse16873.d) on
# pathway 04110 for species "hsa"; this produces a single image file.
pv.out <- pathview(
  gene.data = gse16873.d[, 1],
  pathway.id = "04110",
  species = "hsa",
  out.suffix = "gse16873"
)

In [None]:
# Render the first demo pathway for a single sample as a native KEGG graph.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
i <- 1
pv.out <- pathview(
  gene.data = gse16873.d[, 1],
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873",
  kegg.native = TRUE
)
# List files in the working directory whose names match "hsa04110".
list.files(pattern = "hsa04110", full.names = TRUE)

In [None]:
# Show the structure of the object returned by the last pathview() call.
str(pv.out)

In [None]:
# Preview the first rows of the gene plot data returned by pathview().
head(pv.out$plot.data.gene)

In [None]:
# Same single-sample KEGG view, but with same.layer = FALSE so node colours
# are not drawn in the same layer as the pathway graph.
# TRUE/FALSE are spelled out because `T`/`F` can be reassigned.
pv.out <- pathview(
  gene.data = gse16873.d[, 1],
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.2layer",
  kegg.native = TRUE,
  same.layer = FALSE
)

## Integrating Cpd and Gene Data

### Compound and gene data

In [None]:
# Simulate compound ("cpd") data for 3000 molecules.
sim.cpd.data <- sim.mol.data(mol.type = "cpd", nmol = 3000)
# Load the cpd.simtypes dataset.
data("cpd.simtypes")

In [None]:
# Pick the demo pathway by position (third entry) and echo its ID;
# later cells reuse `i` to look up the matching plotting parameters.
i <- 3
print(demo.paths[["sel.paths"]][i])

In [None]:
# Integrate gene and compound data on the selected pathway, rendered as a
# native KEGG graph. Warnings raised by pathview() are suppressed.
# TRUE is spelled out because `T` can be reassigned.
pv.out <- suppressWarnings(pathview(
  gene.data = gse16873.d[, 1],
  cpd.data = sim.cpd.data,
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.cpd",
  keys.align = "y",
  kegg.native = TRUE,
  key.pos = demo.paths$kpos1[i]
))

In [None]:
# Preview the first rows of the compound plot data returned by pathview().
head(pv.out$plot.data.cpd)

In [None]:
# Same gene + compound integration, but with kegg.native = FALSE so the graph
# is laid out by graphviz instead of using the native KEGG image.
# Key, sign and label-offset settings come from the demo parameters for
# pathway `i`. FALSE is spelled out because `F` can be reassigned.
pv.out <- suppressWarnings(pathview(
  gene.data = gse16873.d[, 1],
  cpd.data = sim.cpd.data,
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.cpd",
  keys.align = "y",
  kegg.native = FALSE,
  key.pos = demo.paths$kpos2[i],
  sign.pos = demo.paths$spos[i],
  cpd.lab.offset = demo.paths$offs[i]
))

### Multiple states or samples

In [None]:
# Simulate compound data with multiple replicate samples: resample the
# single-sample simulation (with replacement) into a 6-column matrix.
# The seed is fixed so the resampling is reproducible.
set.seed(10)
sim.cpd.data2 <- matrix(
  sample(sim.cpd.data, 18000, replace = TRUE),
  ncol = 6
)
# Reuse the compound IDs as row names and label the columns exp1..exp6.
rownames(sim.cpd.data2) <- names(sim.cpd.data)
colnames(sim.cpd.data2) <- paste0("exp", 1:6)
head(sim.cpd.data2, 3)

In [None]:
# KEGG view: 3 gene samples and 2 compound samples plotted in one graph
# (multi.state = TRUE), without treating the samples as paired
# (match.data = FALSE). TRUE/FALSE are spelled out because `T`/`F` can be
# reassigned.
pv.out <- suppressWarnings(pathview(
  gene.data = gse16873.d[, 1:3],
  cpd.data = sim.cpd.data2[, 1:2],
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.cpd.3-2s",
  keys.align = "y",
  kegg.native = TRUE,
  match.data = FALSE,
  multi.state = TRUE,
  same.layer = TRUE
))

In [None]:
# KEGG view with data match: same inputs as the unmatched view, but
# match.data = TRUE treats the gene and compound samples as paired.
# TRUE is spelled out because `T` can be reassigned.
pv.out <- suppressWarnings(pathview(
  gene.data = gse16873.d[, 1:3],
  cpd.data = sim.cpd.data2[, 1:2],
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.cpd.3-2s.match",
  keys.align = "y",
  kegg.native = TRUE,
  match.data = TRUE,
  multi.state = TRUE,
  same.layer = TRUE
))

In [4]:
# Open the help page for pathview() to check its arguments and return value.
?pathview::pathview

0,1
pathview {pathview},R Documentation

0,1
gene.data,"either vector (single sample) or a matrix-like data (multiple sample). Vector should be numeric with gene IDs as names or it may also be character of gene IDs. Character vector is treated as discrete or count data. Matrix-like data structure has genes as rows and samples as columns. Row names should be gene IDs. Here gene ID is a generic concepts, including multiple types of gene, transcript and protein uniquely mappable to KEGG gene IDs. KEGG ortholog IDs are also treated as gene IDs as to handle metagenomic data. Check details for mappable ID types. Default gene.data=NULL. numeric, character, continuous"
cpd.data,"the same as gene.data, except named with IDs mappable to KEGG compound IDs. Over 20 types of IDs included in CHEMBL database can be used here. Check details for mappable ID types. Default cpd.data=NULL. Note that gene.data and cpd.data can't be NULL simultaneously."
pathway.id,"character vector, the KEGG pathway ID(s), usually 5 digit, may also include the 3 letter KEGG species code."
species,"character, either the kegg code, scientific name or the common name of the target species. This applies to both pathway and gene.data or cpd.data. When KEGG ortholog pathway is considered, species=""ko"". Default species=""hsa"", it is equivalent to use either ""Homo sapiens"" (scientific name) or ""human"" (common name)."
kegg.dir,"character, the directory of KEGG pathway data file (.xml) and image file (.png). Users may supply their own data files in the same format and naming convention of KEGG's (species code + pathway id, e.g. hsa04110.xml, hsa04110.png etc) in this directory. Default kegg.dir=""."" (current working directory)."
cpd.idtype,"character, ID type used for the cpd.data. Default cpd.idtype=""kegg"" (include compound, glycan and drug accessions)."
gene.idtype,"character, ID type used for the gene.data, case insensitive. Default gene.idtype=""entrez"", i.e. Entrez Gene, which are the primary KEGG gene ID for many common model organisms. For other species, gene.idtype should be set to ""KEGG"" as KEGG use other types of gene IDs. For the common model organisms (to check the list, do: data(bods); bods), you may also specify other types of valid IDs. To check the ID list, do: data(gene.idtype.list); gene.idtype.list."
gene.annotpkg,"character, the name of the annotation package to use for mapping between other gene ID types including symbols and Entrez gene ID. Default gene.annotpkg=NULL."
min.nnodes,"integer, minimal number of nodes of type ""gene"",""enzyme"", ""compound"" or ""ortholog"" for a pathway to be considered. Default min.nnodes=3."
kegg.native,"logical, whether to render pathway graph as native KEGG graph (.png) or using graphviz layout engine (.pdf). Default kegg.native=TRUE."

0,1
plot.data.gene,"data.frame returned by node.map function for rendering mapped gene nodes, including node name, type, positions (x, y), sizes (width, height), and mapped gene.data. This data is also used as input for pseudo-color coding through node.color function. Default plot.data.gene=NULL."
plot.data.cpd,"same as plot.data.gene, except for mapped compound node data. Default plot.data.cpd=NULL. Note that plot.data.gene and plot.data.cpd can't be NULL simultaneously."
cols.ts.gene,vector or matrix of colors returned by node.color function for rendering gene.data. Dimensionality is the same as the latter. Default cols.ts.gene=NULL.
cols.ts.cpd,"same as cols.ts.gene, except corresponding to cpd.data. Default cols.ts.cpd=NULL. Note that cols.ts.gene and cols.ts.cpd can't be NULL simultaneously."
node.data,"list returned by node.info function, which parse KGML file directly or indirectly, and extract the node data."
pathway.name,"character, the full KEGG pathway name in the format of 3-letter species code with 5-digit pathway id, eg ""hsa04612""."
out.suffix,"character, the suffix to be added after the pathway name as part of the output graph file. Sample names or column names of the gene.data or cpd.data are also added when there are multiple samples. Default out.suffix=""pathview""."
multi.state,"logical, whether multiple states (samples or columns) gene.data or cpd.data should be integrated and plotted in the same graph. Default multi.state=TRUE. In other words, gene or compound nodes will be sliced into multiple pieces corresponding to the number of states in the data."
match.data,"logical, whether the samples of gene.data and cpd.data are paired. Default match.data=TRUE. Let the sample sizes of gene.data and cpd.data be m and n; when m>n, extra columns of NA's (mapped to no color) will be added to cpd.data as to make the sample size the same. This will result in the same number of slices in gene nodes and compound nodes when multi.state=TRUE."
same.layer,"logical, control plotting layers: 1) if node colors be plotted in the same layer as the pathway graph when kegg.native=TRUE, 2) if edge/node type legend be plotted in the same page when kegg.native=FALSE."

0,1
kegg.names,standard KEGG IDs/Names for mapped nodes. It's Entrez Gene ID or KEGG Compound Accessions.
labels,Node labels to be used when needed.
all.mapped,All molecule (gene or compound) IDs mapped to this node.
type,"node type, currently 4 types are supported: ""gene"",""enzyme"", ""compound"" and ""ortholog""."
x,x coordinate in the original KEGG pathway graph.
y,y coordinate in the original KEGG pathway graph.
width,node width in the original KEGG pathway graph.
height,node height in the original KEGG pathway graph.
other columns,columns of the mapped gene/compound data and corresponding pseudo-color codes for individual samples


In [None]:
# Graphviz view: multi-state plot of 3 gene samples and 2 compound samples,
# laid out by graphviz (kegg.native = FALSE) rather than the native KEGG image.
# Key and sign positions come from the demo parameters for pathway `i`.
# TRUE/FALSE are spelled out because `T`/`F` can be reassigned.
pv.out <- suppressWarnings(pathview(
  gene.data = gse16873.d[, 1:3],
  cpd.data = sim.cpd.data2[, 1:2],
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.cpd.3-2s",
  keys.align = "y",
  kegg.native = FALSE,
  match.data = FALSE,
  multi.state = TRUE,
  same.layer = TRUE,
  key.pos = demo.paths$kpos2[i],
  sign.pos = demo.paths$spos[i]
))

In [None]:
# Plot samples/states separately (multi.state = FALSE) instead of
# integrating them into one graph.
# Doesn't seem to print out images well.
# TRUE/FALSE are spelled out because `T`/`F` can be reassigned.
pv.out <- suppressWarnings(pathview(
  gene.data = gse16873.d[, 1:3],
  cpd.data = sim.cpd.data2[, 1:2],
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.cpd.3-2s",
  keys.align = "y",
  kegg.native = TRUE,
  match.data = FALSE,
  multi.state = FALSE,
  same.layer = TRUE
))

In [None]:
# KEGG view drawn in 2 layers (same.layer = FALSE). This loses the original
# KEGG gene labels (or EC numbers).
# TRUE/FALSE are spelled out because `T`/`F` can be reassigned.
pv.out <- suppressWarnings(pathview(
  gene.data = gse16873.d[, 1:3],
  cpd.data = sim.cpd.data2[, 1:2],
  pathway.id = demo.paths$sel.paths[i],
  species = "hsa",
  out.suffix = "gse16873.cpd.3-2s.2layer",
  keys.align = "y",
  kegg.native = TRUE,
  match.data = FALSE,
  multi.state = TRUE,
  same.layer = FALSE
))

## Feat. `GAGE`

In [None]:
# Load the expression dataset used for the GAGE analysis.
data(gse16873)
# Column indices of the HN samples (case-insensitive match on column names).
# The original passed `TRUEs`, an undefined object, which raises an error.
hn <- grep("HN", colnames(gse16873), ignore.case = TRUE)
# Column indices of the DCIS samples.
dcis <- grep("DCIS", colnames(gse16873), ignore.case = TRUE)
# Load the KEGG gene sets passed to gage() as `gsets`.
data(kegg.gs)

In [None]:
# Pathway analysis with gage on gene data only:
# HN columns are the reference, DCIS columns are the target samples.
gse16873.kegg.p <- gage(
  gse16873,
  gsets = kegg.gs,
  ref = hn,
  samp = dcis
)

In [None]:
# Prepare the differential expression data with gage's helper.
gse16873.d <- gagePrep(gse16873, ref = hn, samp = dcis)

# Equivalently, for paired samples, subtract the reference columns directly;
# this assignment replaces the value computed above.
gse16873.d <- gse16873[, dcis] - gse16873[, hn]

# Flag pathways whose q-value is present and below 0.1 in the "greater" table.
sel <- !is.na(gse16873.kegg.p$greater[, "q.val"]) &
  gse16873.kegg.p$greater[, "q.val"] < 0.1

# Collect the flagged pathway names, then take entries 1, 2 and 7 and keep
# only the first 8 characters of each as the pathway ID.
path.ids <- rownames(gse16873.kegg.p$greater)[sel]
path.ids2 <- substr(path.ids[c(1, 2, 7)], 1, 8)

In [None]:
# Pathview visualization: draw each selected pathway using the first two
# columns of the differential expression data.
pv.out.list <- sapply(
  path.ids2,
  function(pid) {
    pathview(
      gene.data = gse16873.d[, 1:2],
      pathway.id = pid,
      species = "hsa"
    )
  }
)

In [None]:
# Turn the "greater" results into a tibble (row names kept as `pw_name`),
# drop rows containing missing values, and keep pathways with q.val < 0.1.
x <- gse16873.kegg.p$greater %>%
  as_tibble(rownames = "pw_name") %>%
  drop_na() %>%
  filter(q.val < 0.1)


In [7]:
# Open the help page for gage() to check its arguments and result columns.
?gage::gage

0,1
gage {gage},R Documentation

0,1
exprs,"an expression matrix or matrix-like data structure, with genes as rows and samples as columns."
gsets,"a named list, each element contains a gene set that is a character vector of gene IDs or symbols. For example, type head(kegg.gs). A gene set can also be a ""smc"" object defined in PGSEA package. Please make sure that the same gene ID system is used for both gsets and exprs."
ref,"a numeric vector of column numbers for the reference condition or phenotype (i.e. the control group) in the exprs data matrix. Default ref = NULL, all columns are considered as target experiments."
samp,"a numeric vector of column numbers for the target condition or phenotype (i.e. the experiment group) in the exprs data matrix. Default samp = NULL, all columns other than ref are considered as target experiments."
set.size,"gene set size (number of genes) range to be considered for enrichment test. Tests for too small or too big gene sets are not robust statistically or informative biologically. Default to be set.size = c(10, 500)."
same.dir,"boolean, whether to test for changes in a gene set toward a single direction (all genes up or down regulated) or changes towards both directions simultaneously. For experimentally derived gene sets, GO term groups, etc, coregulation is commonly the case, hence same.dir = TRUE (default); In KEGG, BioCarta pathways, genes frequently are not coregulated, hence it could be informative to let same.dir = FALSE. Although same.dir = TRUE could also be interesting for pathways."
compare,"character, which comparison scheme to be used: 'paired', 'unpaired', '1ongroup', 'as.group'. 'paired' is the default, ref and samp are of equal length and one-on-one paired by the original experimental design; 'as.group', group-on-group comparison between ref and samp; 'unpaired' (used to be '1on1'), one-on-one comparison between all possible ref and samp combinations, although the original experimental design may not be one-on-one paired; '1ongroup', comparison between one samp column at a time vs the average of all ref columns. For PAGE-like analysis, the default is compare='as.group', which is the only option provided in the original PAGE method. All other comparison schemas are set here for direct comparison to gage."
rank.test,"rank.test: Boolean, whether do the optional rank based two-sample t-test (equivalent to the non-parametric Wilcoxon Mann-Whitney test) instead of parametric two-sample t-test. Default rank.test = FALSE. This argument should be used with respect to argument saaTest."
use.fold,"Boolean, whether to use fold changes or t-test statistics as per gene statistics. Default use.fold= TRUE."
FDR.adj,"Boolean, whether to adjust for multiple testing as to control FDR (False discovery rate). Default to be TRUE."

0,1
p.geomean,geometric mean of the individual p-values from multiple single array based gene set tests
stat.mean,"mean of the individual statistics from multiple single array based gene set tests. Normally, its absolute value measures the magnitude of gene-set level changes, and its sign indicates direction of the changes. When saaTest=gs.KSTest, stat.mean is always positive."
p.val,global p-value or summary of the individual p-values from multiple single array based gene set tests. This is the default p-value being used.
q.val,FDR q-value adjustment of the global p-value using the Benjamini & Hochberg procedure implemented in multtest package. This is the default q-value being used.
set.size,"the effective gene set size, i.e. the number of genes included in the gene set test"
other columns,"columns of the individual p-values or statistics, each measures the gene set perturbation in a single experiment (vs its control or all controls, depends on the ""compare"" argument value)"
