In [None]:
import os
import pandas as pd

# Input and output locations.
ko_results_dir = "/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/KO_scan_out/filtered_results"
kegg_pathway_table = "/storage/jufengLab/luogaoyang/db/KO/KEGG_ko.txt"
rpkg_dir = "/storage/jufengLab/luogaoyang/metagenome_project/DSR/PRKG/fna"
output_dir = "/storage/jufengLab/luogaoyang/metagenome_project/DSR/combined_results"


def combine_annotations(
    ko_df: pd.DataFrame, kegg_df: pd.DataFrame, rpkg_df: pd.DataFrame
) -> pd.DataFrame:
    """Attach KEGG pathway rows and RPKG rows to a KO annotation table.

    Args:
        ko_df: KO annotation hits; must contain "Gene_Name" and "KO" columns.
        kegg_df: KEGG pathway table; must contain a "KO" column.
        rpkg_df: Per-gene table keyed by a "#Name" column.

    Returns:
        ``ko_df`` left-joined to ``kegg_df`` on "KO", then left-joined to
        ``rpkg_df`` on ``Gene_Name == #Name``. Columns of ``kegg_df`` whose
        names collide with ``ko_df`` columns get a "_KEGG" suffix.
    """
    # Both inputs carry a "Gene_Name" column. With pandas' default suffixes
    # the merge would rename them to "Gene_Name_x"/"Gene_Name_y", and the
    # second merge below would then fail with KeyError: 'Gene_Name'. Keep the
    # left-hand (query gene) name unchanged and suffix only the KEGG side.
    merged_df = pd.merge(
        ko_df, kegg_df, on="KO", how="left", suffixes=("", "_KEGG"))

    # Match on the gene identifier; genes without an RPKG row keep NaN values.
    return pd.merge(
        merged_df, rpkg_df, left_on="Gene_Name", right_on="#Name", how="left")


if __name__ == "__main__":
    # Create the output directory if it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Load the KEGG pathway table (tab-separated, no header row).
    kegg_df = pd.read_csv(kegg_pathway_table, sep="\t", header=None, names=[
        "Category_A", "Category_B", "Pathway_ID", "KO", "Gene_Name", "Description", "EC"])

    # Walk the filtered KO annotation results in a stable, sorted order.
    for ko_file in sorted(os.listdir(ko_results_dir)):
        if not ko_file.endswith(".txt"):
            continue

        ko_file_path = os.path.join(ko_results_dir, ko_file)
        sample_name = ko_file.replace(".contigs_5M_contigs_fna2faa_koScan_out.txt", "")

        # Check for the matching RPKG file first so that samples without one
        # are skipped before any parsing or merging work is done.
        rpkg_file = os.path.join(rpkg_dir, f"{sample_name}_RPKG.csv")
        if not os.path.exists(rpkg_file):
            print(f"RPKG 文件未找到：{rpkg_file}")
            continue

        # Read the KO annotation results (whitespace-separated, no header).
        # The separator is a raw string: "\s" in a normal literal is an
        # invalid escape sequence.
        # NOTE(review): whitespace splitting assumes no field (e.g.
        # KO_Definition) contains spaces -- confirm against the input files.
        ko_df = pd.read_csv(ko_file_path, sep=r"\s+", header=None, names=[
            "Gene_Name", "KO", "Threshold", "Score", "E-value", "KO_Definition"])
        rpkg_df = pd.read_csv(rpkg_file)

        # Map KO hits to KEGG pathways, then attach RPKG values per gene.
        final_df = combine_annotations(ko_df, kegg_df, rpkg_df)

        # Save the combined table for this sample.
        output_file = os.path.join(output_dir, f"{sample_name}_combined_results.csv")
        final_df.to_csv(output_file, index=False)
        print(f"处理完成：{ko_file} -> {output_file}")

    print(f"所有文件均已处理完成，结果保存在 {output_dir}")