-
Notifications
You must be signed in to change notification settings - Fork 0
/
Microarray_Transcriptomics_Analysis.R
137 lines (104 loc) · 3.64 KB
/
Microarray_Transcriptomics_Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
File name: Microarray Transcriptomics Analysis in R
Author: Hasani Perera
Contact: heperera826@gmail.com
Date created: 20/10/2021
Date last modified: 25/11/2021
R Version: 4.0.3
Ref: Taneera et al., 2015 (doi:10.1093/hmg/ddu610)
"""
## set path
setwd("~/Documents/HazGit/Transcriptomics_Analysis")
#install packages
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("GEOquery")
nBiocManager::install("oligo")
BiocManager::install("pd.hugene.1.0.st.v1")
BiocManager::install("hugene10sttranscriptcluster.db")
BiocManager::install("arrayQualityMetrics")
BiocManager::install("GOstats")
install.packages("ggplot2")
##------------------------ QUERY RAW DATA ------------------------------
#load packages
library(GEOquery)
## GEO Accession No: GSE50397
## Platform: [HuGene-1_0-st] Affymetrix Human Gene 1.0 ST Array - GPL6244
#extract .cell files to local machine - GEO Series records (GSExxxxx)
gse <- getGEO("GSE50397",GSEMatrix=FALSE)
#if file is already downloaded
# gse <- file.choose()
head(Meta(gse))
# show(gse)
# names of all the GSM objects contained in the GSE
names(GSMList(gse))
#first GSM object on the list
class(GSMList(gse)[[1]])
head(Meta(GSMList(gse)[[1]]))
# names of the GPLs represented
names(GPLList(gse))
# access raw data (downloaded file paths)
file_paths = getGEOSuppFiles("GPL6244")
head(file_paths)
# choose tar file
tarfile <- file.choose()
# extract tar archives
untar(tarfile, exdir="Raw_Data")
#list of all gz files in the directory
cel.files <- list.files("Raw_Data/", pattern = "[gz]")
length(cel.files)
# list/vector/dataframe ---> vector/matrix
# extract gz archives - gunzip
sapply(paste("Raw_Data", cel.files, sep="/"), gunzip)
##------------------------ 1) DATA PREPROCESSING ------------------------------
#load packages
library(oligo)
# library(affy)
#list of all cel files in the directory
celpath <- "~/Documents/HazGit/Transcriptomics_Analysis/Raw_Data"
celfiles_list <- list.files(celpath,pattern = ".CEL", full.names=TRUE)
length(celfiles_list)
head(celfiles_list)
# import CEL files containing raw probe-level data into an R AffyBatch object
cell_files <- read.celfiles(celfiles_list)
head(cell_files)
#load packages
library(pd.hugene.1.0.st.v1)
#probe set annotation
??pd.hugene.1.0.st.v1
#perform RMA normalization (Robust Multi-Array Average)
# converts an AffyBatch object into an ExpressionSet object
eset <- rma(cell_files)
nrow(eset)
#save the expression data (output - normalized and log2 transformed)
write.exprs(eset,file="rma_norm_expr.txt")
#load packages
library(Biobase)
library(hugene10sttranscriptcluster.db)
#gene annotation
??Biobase
??hugene10sttranscriptcluster.db
#get a list of retrievable data
keytypes(hugene10sttranscriptcluster.db)
#retrieve data for selected objects (ENTREZID and SYMBOL) as a data frame
gns<- select(hugene10sttranscriptcluster.db,keys(hugene10sttranscriptcluster.db),
c("ENTREZID", "SYMBOL"))
head(gns)
tail(gns)
## optional - to keep one match per gene
# gns <- gns[!duplicated(gns[,1]),]
# tail(gns)
#set row names to ProbeID (for convenience)
gns = gns[,-1]
row.names(gns) = keys(hugene10sttranscriptcluster.db)
tail(gns)
#retrieve gene expression matrix from eset as a dataframe
expr <- data.frame(exprs(eset))
head(expr)
#merge gene expression and annotation according to row names (probe IDs)
expr.anno <- merge(x=gns,y=expr,by.y=0, by.x=2,all=TRUE)
head(expr.anno)
#save the annotated gene expression matrix to local file
write.table(expr.anno, file = "rma_norm_expr.anno.txt",sep = "\t",
row.names = FALSE, col.names = TRUE, quote =FALSE)
write.csv(expr.anno, file = "rma_norm_expr.anno.csv")