In [1]:
import os
import pandas as pd

# Input and output locations used by this cell.
# Directory scanned for per-sample '*.rpkm' files.
rpkm_folder = '/storage/jufengLab/luogaoyang/metagenome_project/DSR/rpkm_contigs/'
# Directory scanned for per-sample '*.AGS.txt' files.
ags_folder = '/storage/jufengLab/luogaoyang/metagenome_project/DSR/AGS/'
# Directory where the per-sample '<sample>_RPKG.csv' files are written.
# NOTE(review): the path component is spelled 'PRKG'; possibly a typo for
# 'RPKG' -- confirm against the directory on disk before renaming anything.
output_dir='/storage/jufengLab/luogaoyang/metagenome_project/DSR/PRKG/contig'

def get_genome_equivalents(ags_file):
    """Read the ``genome_equivalents`` value from an AGS text file.

    The file is scanned line by line for the first line that starts with
    ``genome_equivalents`` and carries a value. The value is the second
    tab-separated field; when the line contains no tab, the second
    whitespace-separated field is used instead.

    Args:
        ags_file: Path of the AGS file to read.

    Returns:
        The value as a ``float``, or ``None`` when no matching line with a
        value is present.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the value field is not a valid number.
    """
    with open(ags_file, 'r') as handle:
        for line in handle:
            if not line.startswith("genome_equivalents"):
                continue
            # Tab-separated lines are parsed exactly as before; the whitespace
            # fallback covers lines that would otherwise raise IndexError.
            fields = line.split('\t') if '\t' in line else line.split()
            if len(fields) < 2:
                # Key present but no value on this line: keep scanning.
                continue
            return float(fields[1].strip())
    return None

def calculate_rpkg(reads, length, genome_equivalents):
    """Compute RPKG as ``(reads * 1e3) / (length * genome_equivalents)``.

    Args:
        reads: Read count for the sequence.
        length: Sequence length (presumably in bases, given the 1e3 factor --
            confirm against the input format).
        genome_equivalents: Genome-equivalents value of the sample.

    Returns:
        The RPKG value, or the integer ``0`` unless both ``length`` and
        ``genome_equivalents`` are greater than zero.
    """
    usable = length > 0 and genome_equivalents > 0
    if not usable:
        return 0
    scaled_reads = reads * 1e3
    normaliser = length * genome_equivalents
    return scaled_reads / normaliser

# Collect the input files. os.listdir returns entries in arbitrary order, so
# both listings are sorted to make the processing order (and the winner of any
# duplicate sample key) the same on every run.
rpkm_files = sorted(f for f in os.listdir(rpkm_folder) if f.endswith('.rpkm'))
# Map sample name (text before the first '.') -> AGS file name.
ags_files = {f.split('.')[0]: f for f in sorted(os.listdir(ags_folder)) if f.endswith('.AGS.txt')}

# Per-sample RPKG tables, keyed by sample name.
result = {}

# Create the destination up front so the run does not fail at the first write.
os.makedirs(output_dir, exist_ok=True)

# Process every .rpkm file: find its AGS file, read genome_equivalents,
# compute RPKG per row, and write one CSV per sample.
for rpkm_file in rpkm_files:
    # Sample name is the text before the first '_' and then before the first
    # '.', e.g. 'S0WOOD.contigs_5M_contigs.rpkm' -> 'S0WOOD'.
    sample_name = rpkm_file.split('_')[0].split('.')[0]

    # Check the lookup result itself. Joining ags_folder with an empty default
    # yields the folder path, which exists, so an existence test on that path
    # can never report a missing AGS file.
    ags_name = ags_files.get(sample_name)
    ags_file = os.path.join(ags_folder, ags_name) if ags_name else None
    if ags_file is None or not os.path.isfile(ags_file):
        print(f"AGS file for {sample_name} not found!")
        continue

    # Skip the sample when the AGS file carries no genome_equivalents value.
    genome_equivalents = get_genome_equivalents(ags_file)
    if genome_equivalents is None:
        print(f"Genome equivalents not found in AGS file for {sample_name}!")
        continue

    # Read the rpkm table, skipping the four lines that precede the header row.
    rpkm_path = os.path.join(rpkm_folder, rpkm_file)
    data = pd.read_csv(rpkm_path, sep='\t', skiprows=4)

    # Print the column names so a malformed header is easy to spot.
    print(f"Columns in {rpkm_file}: {data.columns}")

    # Compute RPKG per row with calculate_rpkg. Iterating the two columns
    # directly avoids building a Series for every row, which row-wise
    # DataFrame.apply does, and also works when the table has no rows.
    data['RPKG'] = [
        calculate_rpkg(reads, length, genome_equivalents)
        for reads, length in zip(data['Reads'], data['Length'])
    ]

    # Keep the key columns in memory for later use.
    result[sample_name] = data[['#Name', 'Length', 'Reads', 'RPKG']]

    # Write the full table (all input columns plus RPKG) for this sample.
    output_file = os.path.join(output_dir, f"{sample_name}_RPKG.csv")
    data.to_csv(output_file, index=False)
    print(f"RPKG for {sample_name} saved to {output_file}.")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Columns in S0WOOD.contigs_5M_contigs.rpkm: Index(['#Name', 'Length', 'Bases', 'Coverage', 'Reads', 'RPKM', 'Frags',
       'FPKM'],
      dtype='object')
RPKG for S0WOOD saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/PRKG/contig/S0WOOD_RPKG.csv.
Columns in S2PHA.contigs_5M_contigs.rpkm: Index(['#Name', 'Length', 'Bases', 'Coverage', 'Reads', 'RPKM', 'Frags',
       'FPKM'],
      dtype='object')
RPKG for S2PHA saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/PRKG/contig/S2PHA_RPKG.csv.
Columns in S1WOOD.contigs_5M_contigs.rpkm: Index(['#Name', 'Length', 'Bases', 'Coverage', 'Reads', 'RPKM', 'Frags',
       'FPKM'],
      dtype='object')
RPKG for S1WOOD saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/PRKG/contig/S1WOOD_RPKG.csv.
Columns in S0WE.contigs_5M_contigs.rpkm: Index(['#Name', 'Length', 'Bases', 'Coverage', 'Reads', 'RPKM', 'Frags',
       'FPKM'],
      dtype='object')
RPKG for S0WE saved to /storage/jufengLab/luogaoyang/metagenome_proj