In [61]:
# 加载必要的库
library(dplyr)
library(tidyr)
library(stringr)


In [97]:
# Read the grouped expression table with a header row; every column is read
# as character.
# NOTE(review): sep = "" splits on any run of whitespace although the file is
# named .tsv -- confirm the table has no empty cells or embedded spaces.
data <- read.table(
  file = "GeneExpression_GroupedData.tsv",
  header = TRUE,
  sep = "",
  colClasses = "character",
  stringsAsFactors = FALSE
)


In [98]:
# Collect the names of all columns of the input table
column_names <- names(data)

In [99]:
# Every column after the first one (GeneID) is an expression column
expr_columns <- column_names[seq_along(column_names) > 1L]

In [100]:
# Parse a column name of the form "<stage>__<sub_stage>__<tissue>".
#
# Args:
#   col_name: a single column name (character scalar).
#
# Returns:
#   A list with three character elements:
#     main_stage - leading "Egg", "Larva", "Pupa" or "Adult" of the first
#                  field, NA_character_ when the field starts with none of them
#     sub_stage  - second "__"-separated field (may be "" when the field is
#                  empty, e.g. "Adult____Head")
#     tissue     - third "__"-separated field
#   Fields that are absent are NA_character_. A name that is NA or contains
#   no "__" yields NA_character_ for all three elements.
parse_column_info <- function(col_name) {
  # Always return character NAs so callers get the same type on every path.
  unparsed <- list(
    main_stage = NA_character_,
    sub_stage = NA_character_,
    tissue = NA_character_
  )

  # An NA name would otherwise make the `if` condition below fail.
  if (is.na(col_name) || !grepl("__", col_name, fixed = TRUE)) {
    return(unparsed)
  }

  # Split on the literal two-underscore separator.
  parts <- strsplit(col_name, "__", fixed = TRUE)[[1]]

  # The main stage is the stage keyword at the start of the first field.
  stage_hit <- regexpr("^(Egg|Larva|Pupa|Adult)", parts[1])
  main_stage <- if (is.na(stage_hit) || stage_hit == -1L) {
    NA_character_
  } else {
    regmatches(parts[1], stage_hit)
  }

  # The sub-stage is the second field, the tissue the third.
  sub_stage <- if (length(parts) >= 2) parts[2] else NA_character_
  tissue <- if (length(parts) >= 3) parts[3] else NA_character_

  list(main_stage = main_stage, sub_stage = sub_stage, tissue = tissue)
}

In [101]:
# Parse every expression column name and tabulate the pieces, one row per
# column.
column_info_list <- lapply(expr_columns, parse_column_info)

# vapply() with a character(1) template always produces a character vector,
# also when `expr_columns` is empty or no name could be parsed (sapply() would
# return a list or a logical vector in those cases). as.character() turns a
# logical NA field into NA_character_ so the template is always satisfied.
column_info <- data.frame(
  column = expr_columns,
  main_stage = vapply(
    column_info_list,
    function(x) as.character(x$main_stage),
    character(1)
  ),
  sub_stage = vapply(
    column_info_list,
    function(x) as.character(x$sub_stage),
    character(1)
  ),
  tissue = vapply(
    column_info_list,
    function(x) as.character(x$tissue),
    character(1)
  ),
  stringsAsFactors = FALSE
)

In [102]:
# Keep only the columns for which a tissue was parsed
column_info <- subset(column_info, !is.na(tissue))

In [103]:
# Order of the main stages; each column gets the position of its main stage
# in this vector (NA when the stage is not listed)
stage_order <- c("Egg", "Larva", "Pupa", "Adult")
column_info[["main_stage_order"]] <- match(
  x = column_info[["main_stage"]],
  table = stage_order
)

In [104]:
# Collapse one expression cell into a single number.
#
# Args:
#   expr_str: a single cell value (character scalar), holding one number or
#             several numbers separated by commas.
#
# Returns:
#   The mean of the entries that convert to numbers, as a double.
#   NA_real_ when the cell is NA, empty, or contains no convertible entry.
process_expression <- function(expr_str) {
  # Missing or empty cell: return a double NA so the result type is the same
  # on every path.
  if (is.na(expr_str) || expr_str == "") {
    return(NA_real_)
  }

  # Split on literal commas.
  vals <- strsplit(expr_str, ",", fixed = TRUE)[[1]]

  # Entries that are not numbers become NA; the coercion warning is expected
  # and therefore silenced.
  numeric_vals <- suppressWarnings(as.numeric(vals))

  if (all(is.na(numeric_vals))) {
    return(NA_real_)
  }

  # Average the entries that could be converted.
  mean(numeric_vals, na.rm = TRUE)
}

In [105]:
# Start the numeric table with the gene identifiers. stringsAsFactors = FALSE
# is spelled out, as in the other table constructors of this script, so that
# GeneID stays character on every R version.
processed_data <- data.frame(GeneID = data$GeneID, stringsAsFactors = FALSE)

In [106]:
# Convert every expression column to one numeric value per row.
# vapply() guarantees a double vector of the right length (a logical NA
# returned by process_expression is promoted to double), and
# USE.NAMES = FALSE avoids attaching every raw cell string as an element name.
for (col in expr_columns) {
  processed_data[[col]] <- vapply(
    data[[col]],
    process_expression,
    FUN.VALUE = numeric(1),
    USE.NAMES = FALSE
  )
}


In [107]:
# Reshape to long format (one row per gene and column), attach the parsed
# column annotation and drop rows whose column has no tissue
data_long <- local({
  long_values <- pivot_longer(
    processed_data,
    cols = -GeneID,
    names_to = "column",
    values_to = "expression"
  )
  annotated <- merge(long_values, column_info, by = "column", all.x = TRUE)
  filter(annotated, !is.na(tissue))
})

In [108]:
# Print the first rows of the parsed annotation to check the parsing
print("解析后的数据示例:")
print(
  head(
    data_long[, c("column", "main_stage", "sub_stage", "tissue")],
    n = 5
  )
)

[1] "解析后的数据示例:"
         column main_stage sub_stage tissue
1 Adult____Head      Adult             Head
2 Adult____Head      Adult             Head
3 Adult____Head      Adult             Head
4 Adult____Head      Adult             Head
5 Adult____Head      Adult             Head


In [109]:
# Container for the per-tissue tables of stage-to-stage differences
results <- vector(mode = "list", length = 0L)

In [110]:
# Distinct tissues present in the long table
unique_tissues <- unique(data_long[["tissue"]])

In [111]:
# For each tissue, order every gene's rows by stage and record the change in
# expression between consecutive rows. Tissues with at least one retained row
# are stored in `results` under the tissue name.
for (tissue_name in unique_tissues) {
  tissue_data <- data_long %>%
    # Restrict to the current tissue
    filter(tissue == tissue_name) %>%
    group_by(GeneID) %>%
    # A single sort suffices: with .by_group = TRUE rows are ordered by gene,
    # then by main-stage rank, then by sub-stage.
    arrange(main_stage_order, sub_stage, .by_group = TRUE) %>%
    # lag() is evaluated within each gene, so a gene's first row has no
    # predecessor and gets NA. Despite its name, fold_change is the
    # difference between the current and the previous expression value.
    mutate(
      fold_change = expression - lag(expression),
      prev_main_stage = lag(main_stage),
      prev_sub_stage = lag(sub_stage)
    ) %>%
    # Drop the grouping so the stored table (and anything combined from it)
    # is not left grouped by GeneID.
    ungroup() %>%
    # Keep changes larger than 0.5 in absolute value that have a known
    # previous main stage
    filter(
      !is.na(fold_change),
      abs(fold_change) > 0.5,
      !is.na(prev_main_stage)
    ) %>%
    select(
      GeneID, prev_main_stage, prev_sub_stage, main_stage, sub_stage,
      fold_change, expression, column
    )

  if (nrow(tissue_data) > 0) {
    results[[tissue_name]] <- tissue_data
  }
}

In [112]:
# Stack the per-tissue tables; the list names become the "tissue" column
all_results <- dplyr::bind_rows(results, .id = "tissue")

In [113]:
# Drop the helper column. any_of() tolerates its absence: when no tissue
# produced a result the combined table has no `column` column, and a plain
# select(-column) would stop with an error before the "nothing found" message
# of the summary step could be printed.
all_results <- all_results %>% select(-any_of("column"))

In [114]:
# Report the detected events: overall counts, per-tissue statistics and a few
# example rows; print a notice instead when there is no result row
if (nrow(all_results) == 0) {
  print("未找到满足条件的差异表达基因")
} else {
  print(paste("总共找到", nrow(all_results), "个差异表达事件，涉及",
              n_distinct(all_results$GeneID), "个基因"))

  # Per tissue: number of genes, number of events, mean and maximum of the
  # absolute change
  tissue_summary <- summarise(
    group_by(all_results, tissue),
    gene_count = n_distinct(GeneID),
    event_count = n(),
    avg_fold_change = mean(abs(fold_change), na.rm = TRUE),
    max_fold_change = max(abs(fold_change), na.rm = TRUE)
  )

  print("各组织差异表达统计：")
  print(tissue_summary)

  # First rows of the result in its output layout
  print("结果格式示例:")
  print(
    head(
      all_results[, c("tissue", "GeneID", "prev_main_stage", "prev_sub_stage",
                      "main_stage", "sub_stage", "fold_change")],
      n = 5
    )
  )
}

[1] "总共找到 1494574 个差异表达事件，涉及 16358 个基因"
[1] "各组织差异表达统计："
[90m# A tibble: 15 × 5[39m
   tissue               gene_count event_count avg_fold_change max_fold_change
   [3m[90m<chr>[39m[23m                     [3m[90m<int>[39m[23m       [3m[90m<int>[39m[23m           [3m[90m<dbl>[39m[23m           [3m[90m<dbl>[39m[23m
[90m 1[39m Developmental_tissue      [4m1[24m[4m6[24m031      [4m1[24m[4m0[24m[4m7[24m328            2.23            22.2
[90m 2[39m Digestive                 [4m1[24m[4m5[24m505       [4m8[24m[4m9[24m788            2.09            21.8
[90m 3[39m Egg                       [4m1[24m[4m6[24m259      [4m1[24m[4m5[24m[4m0[24m636            2.46            21.0
[90m 4[39m Excretory                 [4m1[24m[4m5[24m408       [4m6[24m[4m1[24m243            2.37            18.1
[90m 5[39m Fat_body                  [4m1[24m[4m6[24m276      [4m1[24m[4m5[24m[4m9[24m228            2.54            22.4
[90m 6[

In [115]:
# Write the combined result table to a CSV file without row names
write.csv(
  x = all_results,
  file = "gene_expression_fold_changes.csv",
  row.names = FALSE
)

In [None]:
# Display the combined result table
print(all_results)