In [5]:
import os
import pandas as pd

# Input locations.
m8_directory = "/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out"  # directory holding the .m8 files; change to your own path
metadata_file = "/storage/jufengLab/luogaoyang/db/SOS_ROS_gene_full/combined_metadata.tsv"  # metadata table (must contain 'Entry' and 'Gene')
group_file = "/storage/jufengLab/luogaoyang/db/SOS_ROS_gene_full/group.txt"  # group table (must contain 'Gene')

# Hit-filtering thresholds; both are applied as strict '>' comparisons.
# NOTE(review): the identity cutoff is compared directly against the third m8
# column, so it presumably expects a 0-1 fraction, not a percentage -- confirm.
min_identity = 0.6
min_align_len = 40

# Names given to the 12 header-less columns of every .m8 file.
m8_columns = ['id', 'sub_id', 'identity', 'alignLen', 'mismat', 'gapOpens',
              'qStart', 'qEnd', 'sStart', 'sEnd', 'eval', 'bit']

# Load the metadata table and keep only the Entry -> Gene lookup.
metadata = pd.read_csv(metadata_file, sep='\t')
if 'Entry' not in metadata.columns or 'Gene' not in metadata.columns:
    raise ValueError("Metadata file is missing required 'Entry' or 'Gene' columns.")
# Exact duplicate (Entry, Gene) rows would multiply every matching hit in the
# left merge below, so they are removed up front.
metadata = metadata[['Entry', 'Gene']].drop_duplicates()

# Load the group table (joined on 'Gene' further down).
group_df = pd.read_csv(group_file, sep="\t", header=0)
if 'Gene' not in group_df.columns:
    raise ValueError("Group file is missing required 'Gene' column.")
# Same reasoning as for the metadata: identical rows only duplicate hits.
group_df = group_df.drop_duplicates()

# Results go to a sub-directory of the input directory; create it if needed so
# that to_csv() does not fail on a fresh run.
output_directory = os.path.join(m8_directory, "filter_m8")
os.makedirs(output_directory, exist_ok=True)

# Process every .m8 file in the directory (sorted for a reproducible log order).
for filename in sorted(os.listdir(m8_directory)):
    if not filename.endswith(".m8"):
        continue

    m8_file = os.path.join(m8_directory, filename)

    # The sample name is everything before the first '.' of the file name.
    sample_name = filename.split('.')[0]

    # A zero-byte file makes read_csv raise; skip it instead of aborting the
    # whole batch.
    try:
        df = pd.read_csv(m8_file, sep="\t", header=None)
    except pd.errors.EmptyDataError:
        print(f"Warning: {filename} is empty (no hits). Skipping...")
        continue
    df.columns = m8_columns

    # Keep hits above both thresholds.
    filter_df = df[(df['identity'] > min_identity) & (df['alignLen'] > min_align_len)]

    # For every query id keep the single row with the highest identity
    # (idxmax returns the first row on ties).
    best_hits = filter_df.loc[filter_df.groupby('id')['identity'].idxmax()]

    # Every retained hit needs a subject id, since it is the metadata join key.
    if best_hits['sub_id'].isnull().any():
        raise ValueError(f"Missing 'sub_id' values in {filename}")

    # Attach the gene name: sub_id is matched against the metadata 'Entry'.
    merged_df = pd.merge(best_hits, metadata, how='left', left_on='sub_id', right_on='Entry')

    # Report hits whose sub_id has no metadata entry, showing the offending rows.
    missing_gene = merged_df['Gene'].isnull()
    if missing_gene.any():
        print(f"Warning: Some sub_ids in {filename} do not have matching entries in metadata!")
        print(merged_df.loc[missing_gene, ['sub_id', 'Entry', 'Gene']].head())

    # Attach the group information via the gene name, then tag the sample.
    merged_df = pd.merge(merged_df, group_df, how='left', on='Gene')
    merged_df['Sample'] = sample_name

    # Report genes without group information.
    if 'group' not in merged_df.columns or merged_df['group'].isnull().any():
        print(f"Warning: Some Genes in {filename} do not have matching group information!")

    # Write the annotated best hits as a tab-separated file with a header row.
    output_file = os.path.join(output_directory, f"filtered_{filename}")
    merged_df.to_csv(output_file, sep='\t', index=False)

    print(f"Processed {filename} and saved to {output_file}")

Processed S0PHA.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8/filtered_S0PHA.contigs_5M_contigs_fna2faa_all_sos.m8
Processed S1PP.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8/filtered_S1PP.contigs_5M_contigs_fna2faa_all_sos.m8
Processed S0PCL.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8/filtered_S0PCL.contigs_5M_contigs_fna2faa_all_sos.m8
Processed S2PCL.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8/filtered_S2PCL.contigs_5M_contigs_fna2faa_all_sos.m8
Processed S3WOOD.conti

加入 RPKG 信息（将各样本的 RPKG 值合并到过滤后的 m8 结果中）

In [3]:
import os
import pandas as pd

rpkm_type = 'contig'  # 'contig' or 'fna'
# Label that is only embedded in the output file names; it is not used for any
# filtering here and has to be kept in sync with the filtering step by hand.
filter_id_cutoff = "60"

# Fail early on a typo instead of silently falling into the 'fna' branch.
if rpkm_type not in ('contig', 'fna'):
    raise ValueError(f"rpkm_type must be 'contig' or 'fna', got {rpkm_type!r}")

# Input and output locations.
m8_directory = "/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8"
rpkg_directory = "/storage/jufengLab/luogaoyang/metagenome_project/DSR/PRKG/"+rpkm_type
out_dir = os.path.join(m8_directory, "with_rpkg_"+rpkm_type)
if not os.path.isdir(out_dir):
    print(f"{out_dir} not found!")
    print(f"makedir {out_dir}...")
# exist_ok avoids a crash if the directory appears between check and creation.
os.makedirs(out_dir, exist_ok=True)

# Column names applied to the filtered m8 tables.
# NOTE(review): assumes the filtered files have exactly these 16 columns in
# this order (i.e. the group table contributed only 'group') -- confirm.
m8_annotated_columns = ['id', 'sub_id', 'identity', 'alignLen', 'mismat', 'gapOpens', 'qStart', 'qEnd',
                        'sStart', 'sEnd', 'eval', 'bit', 'Entry', 'Gene', 'group', 'Sample']

# Number of trailing '_'-separated fields removed from each hit id to obtain
# the name looked up in the RPKG table: two in contig mode, one in fna mode.
id_suffix_fields = 2 if rpkm_type == 'contig' else 1

# Process every filtered .m8 file (sorted for a reproducible log order).
for filename in sorted(os.listdir(m8_directory)):
    if not filename.endswith(".m8"):
        continue

    m8_file = os.path.join(m8_directory, filename)

    # The sample name is the second '_'-separated field of the file name, cut
    # at its first '.', e.g. 'filtered_S0PCL.contigs_...' -> 'S0PCL'.
    name_fields = filename.split('_')
    if len(name_fields) < 2:
        print(f"Cannot extract a sample name from {filename}. Skipping...")
        continue
    sample_name = name_fields[1].split('.')[0]

    # Matching RPKG table for this sample.
    rpkg_file = os.path.join(rpkg_directory, f"{sample_name}_RPKG.csv")
    if not os.path.exists(rpkg_file):
        print(f"RPKG file not found for sample {sample_name}. Skipping...")
        continue

    rpkg_df = pd.read_csv(rpkg_file)

    # The RPKG table must provide both the name and the value column.
    if '#Name' not in rpkg_df.columns or 'RPKG' not in rpkg_df.columns:
        print(f"RPKG file {rpkg_file} is missing required columns. Skipping...")
        continue

    # Lookup table '#Name' -> RPKG (for a repeated name the last row wins).
    rpkg_dict = rpkg_df.set_index('#Name')['RPKG'].to_dict()

    # Load the filtered m8 table (it has a header row) and normalise its
    # column names.
    m8_df = pd.read_csv(m8_file, sep="\t", header=0)
    m8_df.columns = m8_annotated_columns

    # Strip the trailing id fields so the id matches the RPKG '#Name' key.
    m8_df['id_modified'] = (
        m8_df['id'].str.split('_').str[:-id_suffix_fields].str.join('_')
    )

    # Look up the RPKG value for every hit; unmatched ids become NaN.
    m8_df['RPKG'] = m8_df['id_modified'].map(rpkg_dict)

    # Report how many hits found no RPKG value.
    unmatched_count = int(m8_df['RPKG'].isnull().sum())
    if unmatched_count > 0:
        print(f"Warning: {unmatched_count} ids in {filename} did not match any RPKG values.")

    # Build the output name from the input name without its '.m8' suffix.
    # The inputs already start with 'filtered_', so the result starts with
    # 'filtered_filtered_'; the prefix is kept so existing outputs and any
    # downstream readers keep working.
    file_stem = filename[:-len(".m8")]
    output_filename = f"filtered_{file_stem}_rpkg_{filter_id_cutoff}.txt"
    output_file = os.path.join(out_dir, output_filename)

    # Write the table (original columns plus id_modified and RPKG).
    m8_df.to_csv(output_file, sep='\t', index=False)

    print(f"Processed {filename} and saved to {output_file}")

Processed filtered_S3PCL.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8/with_rpkg_contig/filtered_filtered_S3PCL.contigs_5M_contigs_fna2faa_all_sos_rpkg_60.txt
Processed filtered_S2PHA.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8/with_rpkg_contig/filtered_filtered_S2PHA.contigs_5M_contigs_fna2faa_all_sos_rpkg_60.txt
Processed filtered_S3PHA.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/ROS_gene_mmseq_contig_level_out/sustech_gene_out/filter_m8/with_rpkg_contig/filtered_filtered_S3PHA.contigs_5M_contigs_fna2faa_all_sos_rpkg_60.txt
Processed filtered_S2PP.contigs_5M_contigs_fna2faa_all_sos.m8 and saved to /storage/jufengLab/luogaoyang/metagenome_project/DSR/4_ann