In [8]:
# set working directory
setwd("/Volumes/Disk/Projects/StrokeCare/GSE16561/data")

library(tidyverse)
library(preprocessCore)

# 1. 데이터 불러오기
raw_data <- read.delim("GSE16561_RAW.txt", header = TRUE, sep = "\t", check.names = FALSE)

# 2. .bgx (probe ↔ gene) 불러오기
bgx <- read.delim("GPL6883_HumanRef-8_V3_0_R0_11282963_A.bgx",
                  header = TRUE,
                  sep = "\t",
                  skip = 8,  # [Probes] 이후
                  comment.char = "#",
                  quote = "",
                  row.names = NULL)

# 3. 병합: probe ID 기준으로 gene symbol 붙이기
merged <- merge(raw_data, bgx, by.x = "ID_REF", by.y = "Probe_Id")

# 4. 발현값 + Symbol만 선택
expr_by_gene <- merged %>%
  select(Symbol, matches("_Stroke|_Control")) %>%
  group_by(Symbol) %>%
  summarise(across(everything(), mean, na.rm = TRUE))

# 5. 행: 유전자, 열: 샘플
expr_mat <- as.data.frame(expr_by_gene)
rownames(expr_mat) <- expr_mat$Symbol
expr_mat <- expr_mat[, -1]

# 6. log2(x + 1)
log_expr <- log2(expr_mat + 1)

# 7. quantile normalization
norm_expr <- normalize.quantiles(as.matrix(log_expr))
rownames(norm_expr) <- rownames(log_expr)
colnames(norm_expr) <- colnames(log_expr)
expr_norm <- as.data.frame(norm_expr)

# 8. z-score 정규화 (샘플 단위)
expr_scaled <- t(scale(t(expr_norm)))  # 행: 샘플, 열: 유전자

# 9. 저장
write.csv(t(expr_scaled), "AE_input.csv", quote = FALSE)
cat("✅ AutoEncoder 입력 데이터 저장 완료 → AE_input.csv\n")

[1m[22m[36mℹ[39m In argument: `across(everything(), mean, na.rm = TRUE)`.
[36mℹ[39m In group 1: `Symbol = "15E1.2"`.
[1m[22m[33m![39m The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))”


✅ AutoEncoder 입력 데이터 저장 완료 → AE_input.csv
