## 处理物种信息

In [23]:
import pandas as pd
import os

# 读取物种分类文件
gtdb_file = '/storage/jufengLab/luogaoyang/metagenome_project/DSR/1_clean/bio_all_reads/GTDB_OUT/classify/gtdbtk.bac120.summary.tsv'  # 替换为实际文件路径
gtdb_file = '/storage/jufengLab/luogaoyang/metagenome_project/DSR/1_clean/nonbio_all_reads/GTDB_OUT/classify/gtdbtk.bac120.summary.tsv'  # 替换为实际文件路径
# gtdb_file = '/storage/jufengLab/luogaoyang/metagenome_project/DSR/1_clean/water_all_reads/GTDB_OUT/classify/gtdbtk.bac120.summary.tsv'  # 替换为实际文件路径
# gtdb_file = '/storage/jufengLab/luogaoyang/metagenome_project/DSR/1_clean/wood_all_reads/GTDB_OUT/classify/gtdbtk.bac120.summary.tsv'  # 替换为实际文件路径
gtdb_df = pd.read_csv(gtdb_file,sep="\t")

# 构建物种分类字典
species_dict = dict(zip(gtdb_df['user_genome'], gtdb_df['classification']))

## ARG处理

In [24]:
# DeepARG 输出文件夹路径
sample_type="bio"
sample_type="nonbio"
# sample_type="water"
# sample_type="wood"
deeparg_dir = os.path.join('/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation',sample_type+'_bin','deeparg_bin_out')

arg_metadata = []

# 遍历deeparg文件夹中的每个文件
for filename in os.listdir(deeparg_dir):
    if filename.endswith('_DeepARG.out.mapping.ARG'):
        bin_num = filename.split('_')[0]  # 提取bin编号
        bin_name = f'bin.{bin_num.split(".")[1]}'
        
        # 读取ARG文件
        arg_file = os.path.join(deeparg_dir, filename)
        if os.path.getsize(arg_file) > 0:  # 处理非空文件
            arg_df = pd.read_csv(arg_file, sep="\t",header=0)
            for _, row in arg_df.iterrows():
                entry_id = f"{bin_name}_{row['read_id']}"
                contig_name =bin_num+"_"+entry_id.split('_')[1] + "_" + entry_id.split('_')[2]
                arg_metadata.append({
                    'id': entry_id,
                    'bin_id':bin_name,
                    'orf_name':row['read_id'],
                    'species': species_dict.get(bin_name, 'Unknown'),
                    'ARG': row['#ARG'],
                    'predicted_ARG_class': row['predicted_ARG-class'],
                    'contig_name': contig_name
                })


# 将ARG注释信息转换为DataFrame
arg_df = pd.DataFrame(arg_metadata)

## 毒理因子注释

In [25]:
# VFDB metadata文件

vfdb_metadata_file = '/storage/jufengLab/luogaoyang/db/VFDB/CompRanking_Virulence_Summary.tsv'
vfdb_df = pd.read_csv(vfdb_metadata_file,sep="\t",index_col=0)
vfdb_df
# 构建VF字典
vf_dict = dict(zip(vfdb_df['fasID'], vfdb_df['VF_Name']))

# 毒力因子注释目录
vf_dir = os.path.join('/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation',sample_type+'_bin','VFDB_mmseq_out')  # 替换为实际路径

vf_metadata = []

# 遍历毒力因子的注释文件
for vf_file in os.listdir(vf_dir):
    if vf_file.endswith('.m8'):
        bin_num = vf_file.split('_')[0]
        vf_file_path = os.path.join(vf_dir, vf_file)
        print(vf_file_path)
         # 检查文件是否为空
        if os.path.getsize(vf_file_path) == 0:
            print(f"File {vf_file} is empty, skipping.")
            continue
        
        # 读取毒力因子注释文件
        try:
            df = pd.read_csv(vf_file_path, sep="\t", header=None)
        except pd.errors.EmptyDataError:
            print(f"File {vf_file} is empty or cannot be parsed, skipping.")
            continue
        
        df.columns = ['id', 'sub_id', 'identity', 'alignLen', 'mismat', 'gapOpens', 'qStart', 'qEnd', 'sStart', 'sEnd', 'eval', 'bit']
        
        # 过滤出符合条件的毒力因子注释
        filter_df = df[(df['identity'] > 0.6) & (df['alignLen'] > 40)]
        # 去重操作：对于每个 id，选择 identity 最大的行
        filter_df = filter_df.loc[filter_df.groupby('id')['identity'].idxmax()]
        
        for _, row in filter_df.iterrows():
            entry_id = f"{bin_num}_{row['id']}"
            contig_name =bin_num+"_"+entry_id.split('_')[1] + "_" + entry_id.split('_')[2]
            vf_metadata.append({
                'id': entry_id,
                'bin_id':bin_num,
                'orf_name':row['id'],
                'sub_id': row['sub_id'],
                'species': species_dict.get(bin_num, 'Unknown'),
                'VF_NAME': vf_dict.get(row['sub_id'], '-'),
                'contig_name': contig_name
            })

# 将毒力因子注释信息转换为DataFrame
vf_df = pd.DataFrame(vf_metadata)
vf_df

/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/VFDB_mmseq_out/bin.85_vf.m8
File bin.85_vf.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/VFDB_mmseq_out/bin.71_vf.m8
File bin.71_vf.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/VFDB_mmseq_out/bin.7_vf.m8
File bin.7_vf.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/VFDB_mmseq_out/bin.22_vf.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/VFDB_mmseq_out/bin.89_vf.m8
File bin.89_vf.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/VFDB_mmseq_out/bin.87_vf.m8
File bin.87_vf.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/VFDB_mmseq_out/bin.96_vf.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/

Unnamed: 0,id,bin_id,orf_name,sub_id,species,VF_NAME,contig_name
0,bin.22_k141_2000038_length_16138_cov_25.0723_8,bin.22,k141_2000038_length_16138_cov_25.0723_8,VFG014004(gb|WP_003148848),d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,bin.22_k141_2000038
1,bin.96_k141_7707864_length_9660_cov_12.2338_6,bin.96,k141_7707864_length_9660_cov_12.2338_6,VFG041187(gb|WP_012849267),d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,bin.96_k141_7707864
2,bin.92_k141_1962748_length_1612_cov_10.0843_2,bin.92,k141_1962748_length_1612_cov_10.0843_2,VFG011430(gb|WP_002963616),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.92_k141_1962748
3,bin.178_k141_817152_length_30588_cov_23.4263_19,bin.178,k141_817152_length_30588_cov_23.4263_19,VFG011430(gb|WP_002963616),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.178_k141_817152
4,bin.129_k141_5151809_length_283910_cov_32.2217...,bin.129,k141_5151809_length_283910_cov_32.2217_102,VFG011430(gb|WP_002963616),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.129_k141_5151809
5,bin.160_k141_7916819_length_12396_cov_25.3947_3,bin.160,k141_7916819_length_12396_cov_25.3947_3,VFG011430(gb|WP_002963616),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.160_k141_7916819
6,bin.38_k141_3482740_length_5717_cov_9.0000_1,bin.38,k141_3482740_length_5717_cov_9.0000_1,VFG011430(gb|WP_002963616),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.38_k141_3482740
7,bin.38_k141_987689_length_10950_cov_10.0000_7,bin.38,k141_987689_length_10950_cov_10.0000_7,VFG011429(gb|WP_002963985),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.38_k141_987689
8,bin.123_k141_1899457_length_23673_cov_13.6425_15,bin.123,k141_1899457_length_23673_cov_13.6425_15,VFG011430(gb|WP_002963616),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.123_k141_1899457
9,bin.45_k141_4774825_length_2608_cov_10.1536_2,bin.45,k141_4774825_length_2608_cov_10.1536_2,VFG011430(gb|WP_002963616),d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,LPS,bin.45_k141_4774825


## 塑料降解基因

In [26]:
# PlasticDB注释目录
plastic_dir = os.path.join('/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation',sample_type+'_bin','plasticDB_mmseq_out')  # 替换为实际路径

plastic_metadata = []

# 遍历塑料降解基因的注释文件
for plastic_file in os.listdir(plastic_dir):
    if plastic_file.endswith('.m8'):
        bin_num = plastic_file.split('_')[0]
        plastic_file_path = os.path.join(plastic_dir, plastic_file)
        print(plastic_file_path)
        
        # 检查文件是否为空
        if os.path.getsize(plastic_file_path) == 0:
            print(f"File {plastic_file} is empty, skipping.")
            continue
        
        # 读取塑料降解基因的注释文件
        try:
            df = pd.read_csv(plastic_file_path, sep="\t", header=None)
        except pd.errors.EmptyDataError:
            print(f"File {plastic_file} is empty or cannot be parsed, skipping.")
            continue
        
        # 指定列名
        df.columns = ['id', 'sub_id', 'identity', 'alignLen', 'mismat', 'gapOpens', 'qStart', 'qEnd', 'sStart', 'sEnd', 'eval', 'bit']
        
        # 过滤出符合条件的塑料降解基因注释：identity > 0.6 且 alignLen > 40
        filter_df = df[(df['identity'] > 0.5) & (df['alignLen'] > 40)]
        
        # 去重操作：对于每个 id，选择 identity 最大的行
        filter_df = filter_df.loc[filter_df.groupby('id')['identity'].idxmax()]
        
        # 处理剩下的行并构建结果
        for _, row in filter_df.iterrows():
            entry_id = f"{bin_num}_{row['id']}"
            contig_name = bin_num + "_" + entry_id.split('_')[1] + "_" + entry_id.split('_')[2]
            
            # 提取 sub_id 中的最后一个字段（塑料降解基因名称）
            plastic_gene_name = row['sub_id'].split('||')[-1]
            
            plastic_metadata.append({
                'id': entry_id,
                'bin_id': bin_num,
                'orf_name': row['id'],
                'sub_id': row['sub_id'],
                'species': species_dict.get(bin_num, 'Unknown'),
                'PLASTIC_GENE': plastic_gene_name,  # 使用提取的塑料降解基因名称
                'contig_name': contig_name
            })

# 将塑料降解基因注释信息转换为 DataFrame
plastic_df = pd.DataFrame(plastic_metadata)
plastic_df

/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/plasticDB_mmseq_out/bin.153_plsdb.m8
File bin.153_plsdb.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/plasticDB_mmseq_out/bin.90_plsdb.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/plasticDB_mmseq_out/bin.93_plsdb.m8
File bin.93_plsdb.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/plasticDB_mmseq_out/bin.116_plsdb.m8
File bin.116_plsdb.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/plasticDB_mmseq_out/bin.62_plsdb.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/plasticDB_mmseq_out/bin.175_plsdb.m8
File bin.175_plsdb.m8 is empty, skipping.
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/plasticDB_mmseq_out/bin.110_plsdb.m8
/storage/jufengLab/luogaoyang/metageno

Unnamed: 0,id,bin_id,orf_name,sub_id,species,PLASTIC_GENE,contig_name
0,bin.90_k141_2477271_length_4624_cov_11.0029_3,bin.90,k141_2477271_length_4624_cov_11.0029_3,00160||PHA_depolymerase||Cupriavidus_necator||...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,PHA_PHB,bin.90_k141_2477271
1,bin.90_k141_4893777_length_2696_cov_10.4325_2,bin.90,k141_4893777_length_2696_cov_10.4325_2,00161||PHA_depolymerase||Cupriavidus_necator||...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,PHA_PHB,bin.90_k141_4893777
2,bin.90_k141_9840046_length_4167_cov_14.0169_2,bin.90,k141_9840046_length_4167_cov_14.0169_2,00071||3HV_dehydrogenase||Paracoccus_denitrifi...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,P3HV_PHBV_PHA,bin.90_k141_9840046
3,bin.62_k141_8160052_length_3539_cov_9.0000_3,bin.62,k141_8160052_length_3539_cov_9.0000_3,00071||3HV_dehydrogenase||Paracoccus_denitrifi...,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,P3HV_PHBV_PHA,bin.62_k141_8160052
4,bin.110_k141_1989103_length_40976_cov_26.7137_20,bin.110,k141_1989103_length_40976_cov_26.7137_20,00164||PHA_depolymerase||Cupriavidus_necator||...,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,PHA_PHB,bin.110_k141_1989103
...,...,...,...,...,...,...,...
128,bin.36_k141_7487470_length_7688_cov_9.0000_3,bin.36,k141_7487470_length_7688_cov_9.0000_3,00071||3HV_dehydrogenase||Paracoccus_denitrifi...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,P3HV_PHBV_PHA,bin.36_k141_7487470
129,bin.36_k141_7744773_length_4289_cov_7.8566_2,bin.36,k141_7744773_length_4289_cov_7.8566_2,00161||PHA_depolymerase||Cupriavidus_necator||...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,PHA_PHB,bin.36_k141_7744773
130,bin.36_k141_9866550_length_2739_cov_13.0000_2,bin.36,k141_9866550_length_2739_cov_13.0000_2,00053||PHB_depolymerase||Diaphorobacter_sp.||P...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,PHB_PHBV_PHA,bin.36_k141_9866550
131,bin.96_k141_10050135_length_1922_cov_8.0000_2,bin.96,k141_10050135_length_1922_cov_8.0000_2,00087||Polyesterase||Pseudomonas_pseudoalcalig...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,PBAT,bin.96_k141_10050135


## 处理MobileOG

In [27]:
# MobileOG注释目录
mge_dir = os.path.join('/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation',sample_type+'_bin','mobileOG_out')  # 替换为实际路径

mge_metadata = []

# 遍历MGE的注释文件
for mge_file in os.listdir(mge_dir):
    if mge_file.endswith('.m8'):
        bin_num = mge_file.split('_')[0]
        mge_file_path = os.path.join(mge_dir, mge_file)
        print(mge_file_path)
        
        # 检查文件是否为空
        if os.path.getsize(mge_file_path) == 0:
            print(f"File {mge_file} is empty, skipping.")
            continue
        
        # 读取MGE注释文件
        try:
            df_MobileOG = pd.read_csv(mge_file_path, sep="\t", header=None)
        except pd.errors.EmptyDataError:
            print(f"File {mge_file} is empty or cannot be parsed, skipping.")
            continue
        
        # 指定列名
        df_MobileOG.columns = ['id', 'sub_id', 'identity', 'alignLen', 'mismat', 'gapOpens', 'qStart', 'qEnd', 'sStart', 'sEnd', 'eval', 'bit']
        
        # 处理sub_id
        df_MobileOG_tmp = df_MobileOG["sub_id"].str.split("|", expand=True)
        
        # 如果列数少于7，填充缺失列
        if df_MobileOG_tmp.shape[1] < 7:
            if df_MobileOG_tmp.shape[1] == 5:
                df_MobileOG_tmp[5] = "-"
            if df_MobileOG_tmp.shape[1] == 6:
                df_MobileOG_tmp[6] = "-"
        
        # 选择需要的列并重命名
        df_MobileOG_tmp = df_MobileOG_tmp[[0, 1, 3, 4, 5]]
        df_MobileOG_tmp.columns = ["mobileOG_ID", "Gene_Name", "Taxonomy", "Major_Category", "MGE_Database"]
        
        # 合并处理后的sub_id信息
        df_MobileOG_concat = pd.concat((df_MobileOG, df_MobileOG_tmp), axis=1)
        
        # 过滤出符合条件的MGE注释：identity > 0.6 且 alignLen > 40
        filter_df = df_MobileOG_concat[(df_MobileOG_concat['identity'] > 0.6) & (df_MobileOG_concat['alignLen'] > 40)]
        
        # 去重操作：对于每个 id，选择 identity 最大的行
        filter_df = filter_df.loc[filter_df.groupby('id')['identity'].idxmax()]
        
        # 处理剩下的行并构建结果
        for _, row in filter_df.iterrows():
            entry_id = f"{bin_num}_{row['id']}"
            contig_name = bin_num + "_" + entry_id.split('_')[1] + "_" + entry_id.split('_')[2]
            
            mge_metadata.append({
                'id': entry_id,
                'bin_id': bin_num,
                'orf_name': row['id'],
                'species': species_dict.get(bin_num, 'Unknown'),
                'mobileOG_ID': row['mobileOG_ID'],
                'MGE_Name': row['Gene_Name'],
                'MGE_class': row['Taxonomy'],
                'MGE_Major_Category': row['Major_Category'],
                'MGE_Database': row['MGE_Database'],
                'contig_name': contig_name
            })

# 将MGE注释信息转换为DataFrame
mge_df = pd.DataFrame(mge_metadata)
mge_df

/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.189_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.117_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.67_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.124_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.99_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.176_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.32_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/mobileOG_out/bin.57_mobileOG_mmseq2.m8
/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/no

Unnamed: 0,id,bin_id,orf_name,species,mobileOG_ID,MGE_Name,MGE_class,MGE_Major_Category,MGE_Database,contig_name
0,bin.189_k141_2766875_length_1634_cov_12.8399_2,bin.189,k141_2766875_length_1634_cov_12.8399_2,d__Bacteria;p__Desulfobacterota_B;c__Binatia;o...,mobileOG_000057348,S4062,integration/excision,,COMPASS,bin.189_k141_2766875
1,bin.189_k141_6474206_length_2126_cov_6.0000_2,bin.189,k141_6474206_length_2126_cov_6.0000_2,d__Bacteria;p__Desulfobacterota_B;c__Binatia;o...,mobileOG_000282639,recA,replication/recombination/repair,,Multiple,bin.189_k141_6474206
2,bin.189_k141_9635482_length_3266_cov_7.0000_1,bin.189,k141_9635482_length_3266_cov_7.0000_1,d__Bacteria;p__Desulfobacterota_B;c__Binatia;o...,mobileOG_000231079,uvrA,replication/recombination/repair,,GPD,bin.189_k141_9635482
3,bin.117_k141_10181993_length_10614_cov_22.9657_7,bin.117,k141_10181993_length_10614_cov_22.9657_7,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,mobileOG_000040226,trp1400A,integration/excision,,COMPASS,bin.117_k141_10181993
4,bin.117_k141_2504969_length_15247_cov_12.3442_6,bin.117,k141_2504969_length_15247_cov_12.3442_6,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,mobileOG_000740027,NA:Keyword,transfer,,Plasmid,bin.117_k141_2504969
...,...,...,...,...,...,...,...,...,...,...
2146,bin.88_k141_7190286_length_1952_cov_14.0000_2,bin.88,k141_7190286_length_1952_cov_14.0000_2,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,mobileOG_000367080,dut,transfer,regulation,Plasmid,bin.88_k141_7190286
2147,bin.88_k141_9080213_length_1556_cov_11.0000_2,bin.88,k141_9080213_length_1556_cov_11.0000_2,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,mobileOG_000340056,nth,replication/recombination/repair,,Plasmid,bin.88_k141_9080213
2148,bin.88_k141_9097861_length_3149_cov_12.0000_4,bin.88,k141_9097861_length_3149_cov_12.0000_4,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,mobileOG_000302947,stbC,stability/transfer/defense,,Plasmid,bin.88_k141_9097861
2149,bin.88_k141_9097861_length_3149_cov_12.0000_5,bin.88,k141_9097861_length_3149_cov_12.0000_5,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,mobileOG_000051360,vapC,stability/transfer/defense,,Multiple,bin.88_k141_9097861


## 合并所有

In [28]:
# 合并ARG与毒力因子注释
merged_df_arg_vf = pd.merge(arg_df, vf_df, on=['id', 'contig_name', 'bin_id', 'orf_name','species'], how='outer')

# 填充空值为'-'
merged_df_arg_vf.fillna('-', inplace=True)

# 再合并塑料降解基因注释
merged_df_arg_vf_plastic = pd.merge(merged_df_arg_vf, plastic_df, on=['id', 'contig_name', 'bin_id', 'orf_name','species'], how='outer')

# 填充空值为'-'
merged_df_arg_vf_plastic.fillna('-', inplace=True)

# 最后合并MGE注释
merged_df_all = pd.merge(merged_df_arg_vf_plastic, mge_df, on=['id', 'contig_name', 'bin_id', 'orf_name','species'], how='outer')

# 填充空值为'-'
merged_df_all.fillna('-', inplace=True)

# 保存最终的metadata表
output_dir=os.path.join('/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation',sample_type+'_bin','merge_result')
output_file = os.path.join(output_dir,'metadata_arg_vf_plastic_mge.csv')
print(output_file)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
merged_df_all.to_csv(output_file, index=False)

# # 打印结果以检查
# print(merged_df_all)


/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/nonbio_bin/merge_result/metadata_arg_vf_plastic_mge.csv


## 加入orf的起始位置

In [29]:
import os
import pandas as pd
import re

# 加载现有的DataFrame

input_file = output_file
merged_df_all = pd.read_csv(input_file)

# 查看merged_df_all的列名，检查是否有 orf_name
print("Columns in merged_df_all:", merged_df_all.columns)

# 提取 orf_name 中的 contig_id 和 编号（编号是最后一个下划线后面的部分）
merged_df_all['contig_id'] = merged_df_all['orf_name'].apply(lambda x: '_'.join(x.split('_')[:-1]))
merged_df_all['orf_number'] = merged_df_all['orf_name'].apply(lambda x: x.split('_')[-1])

# 添加 start、end、length 和 strand 位置信息
start_end_metadata = []

# GFF文件所在目录
gff_dir=os.path.join('/storage/jufengLab/luogaoyang/metagenome_project/DSR/1_clean',sample_type+'_all_reads','PREDICT_GENES')

# 遍历每个 bin 的 GFF 文件
for bin_id in merged_df_all['bin_id'].unique():
    # 提取 bin_id 中的数字部分，假设格式为 "bin.xx"
    bin_num = bin_id.split('.')[1]  # 提取 . 后面的数字部分

    # 构建 GFF 文件路径
    gff_file = os.path.join(gff_dir, f'bin.{bin_num}.gff')
    
    if not os.path.exists(gff_file):
        print(f"GFF file for bin {bin_num} not found, skipping.")
        continue

    # 读取 GFF 文件
    with open(gff_file, 'r') as f:
        for line in f:
            if not line.startswith('#'):  # 跳过注释行
                columns = line.strip().split('\t')
                if len(columns) > 8 and columns[2] == 'CDS':  # 只处理 CDS 行
                    contig_name = columns[0]  # 第一列是 contig_name
                    start = int(columns[3])  # 第四列是 start
                    end = int(columns[4])  # 第五列是 end
                    strand = columns[6]  # 第七列是方向，+ 或者 -
                    attributes = columns[8]  # 最后一列是 attributes

                    # 简化提取 ORF 的 ID，如 "ID=a_b"，直接从 columns[8] 中查找 ID
                    orf_id_match = re.search(r'ID=([^;]+)', attributes)  # 查找 ID= 后面的内容
                    if orf_id_match:
                        orf_id = orf_id_match.group(1)  # 获取 ID=a_b 的值
                        orf_number = orf_id.split('_')[-1]  # 提取 ORF 编号（b 部分）
                        
                        # 组合 contig_name 和 orf_number 以匹配 orf_name
                        orf_name_in_gff = f"{contig_name}_{orf_number}"
                        
                        # 记录 ORF 的位置信息
                        start_end_metadata.append({
                            'orf_name': orf_name_in_gff,  # 组合后的 orf_name 与 merged_df_all 中的相匹配
                            'start': start,               # start 位置信息
                            'end': end,                   # end 位置信息
                            'strand': strand              # 方向
                        })

# 将位置信息转换为 DataFrame
start_end_df = pd.DataFrame(start_end_metadata)

# 查看 start_end_df 的列名，检查是否正确提取
print("Columns in start_end_df:", start_end_df.columns)

# 去除前后空格
merged_df_all['orf_name'] = merged_df_all['orf_name'].str.strip()
start_end_df['orf_name'] = start_end_df['orf_name'].str.strip()

# 检查是否存在不匹配的 orf_name
unmatched_orf_names = merged_df_all[~merged_df_all['orf_name'].isin(start_end_df['orf_name'])]

print("Number of unmatched orf_name:", len(unmatched_orf_names))

# 合并位置信息到原始 DataFrame，按照 'orf_name' 匹配
merged_df_with_positions = pd.merge(merged_df_all, start_end_df, on='orf_name', how='left')

# 填充空值为 '-'
merged_df_with_positions.fillna('-', inplace=True)

# 删除临时的 'contig_id' 和 'orf_number' 列
merged_df_with_positions.drop(columns=['contig_id', 'orf_number'], inplace=True)

merged_df_with_positions

Columns in merged_df_all: Index(['id', 'bin_id', 'orf_name', 'species', 'ARG', 'predicted_ARG_class',
       'contig_name', 'sub_id_x', 'VF_NAME', 'sub_id_y', 'PLASTIC_GENE',
       'mobileOG_ID', 'MGE_Name', 'MGE_class', 'MGE_Major_Category',
       'MGE_Database'],
      dtype='object')
Columns in start_end_df: Index(['orf_name', 'start', 'end', 'strand'], dtype='object')
Number of unmatched orf_name: 0


Unnamed: 0,id,bin_id,orf_name,species,ARG,predicted_ARG_class,contig_name,sub_id_x,VF_NAME,sub_id_y,PLASTIC_GENE,mobileOG_ID,MGE_Name,MGE_class,MGE_Major_Category,MGE_Database,start,end,strand
0,bin.100_k141_10009953_length_3003_cov_10.0000_2,bin.100,k141_10009953_length_3003_cov_10.0000_2,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,-,bin.100_k141_10009953,-,-,-,-,mobileOG_000199360,hfq,phage,"infection,regulation",GPD,543,782,-
1,bin.100_k141_5044243_length_4663_cov_10.0000_5,bin.100,k141_5044243_length_4663_cov_10.0000_5,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,-,bin.100_k141_5044243,-,-,-,-,mobileOG_000334520,thyA,phage,"replication,infection,regulation",Plasmid,3888,4661,-
2,bin.100_k141_5181451_length_1829_cov_5.0000_2,bin.100,k141_5181451_length_1829_cov_5.0000_2,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,PMRF,peptide,bin.100_k141_5181451,-,-,-,-,-,-,-,-,-,814,1818,+
3,bin.100_k141_71747_length_11945_cov_10.0000_10,bin.100,k141_71747_length_11945_cov_10.0000_10,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,-,bin.100_k141_71747,-,-,-,-,mobileOG_000334176,radA,replication/recombination/repair,competence,Plasmid,8826,10190,+
4,bin.100_k141_8412174_length_1605_cov_8.0034_2,bin.100,k141_8412174_length_1605_cov_8.0034_2,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,GOLS,multidrug,bin.100_k141_8412174,-,-,-,-,-,-,-,-,-,251,694,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540,bin.9_k141_8293951_length_9442_cov_10.2556_2,bin.9,k141_8293951_length_9442_cov_10.2556_2,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,-,bin.9_k141_8293951,-,-,-,-,mobileOG_000025523,uvrB,replication/recombination/repair,-,Plasmid,1455,3509,+
2541,bin.9_k141_8872801_length_6441_cov_11.0000_5,bin.9,k141_8872801_length_6441_cov_11.0000_5,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,BACA,bacitracin,bin.9_k141_8872801,-,-,-,-,-,-,-,-,-,4450,5271,+
2542,bin.9_k141_9283942_length_2918_cov_14.6104_1,bin.9,k141_9283942_length_2918_cov_14.6104_1,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,-,bin.9_k141_9283942,-,-,-,-,mobileOG_000049871,groL,phage,"replication,chaperone",COMPASS,3,854,-
2543,bin.9_k141_9283942_length_2918_cov_14.6104_2,bin.9,k141_9283942_length_2918_cov_14.6104_2,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-,-,bin.9_k141_9283942,-,-,-,-,mobileOG_000279396,groS,phage,structural,Plasmid,895,1185,-


In [30]:
# 保存最终的 metadata 表，包含 start、end 和 strand 信息
output_file_with_positions = os.path.join(output_dir,"metadata_arg_vf_plastic_mge_with_positions.csv")
merged_df_with_positions.to_csv(output_file_with_positions, index=False)

# 打印部分结果以检查
print(merged_df_with_positions.head())

                                                id   bin_id  \
0  bin.100_k141_10009953_length_3003_cov_10.0000_2  bin.100   
1   bin.100_k141_5044243_length_4663_cov_10.0000_5  bin.100   
2    bin.100_k141_5181451_length_1829_cov_5.0000_2  bin.100   
3   bin.100_k141_71747_length_11945_cov_10.0000_10  bin.100   
4    bin.100_k141_8412174_length_1605_cov_8.0034_2  bin.100   

                                  orf_name  \
0  k141_10009953_length_3003_cov_10.0000_2   
1   k141_5044243_length_4663_cov_10.0000_5   
2    k141_5181451_length_1829_cov_5.0000_2   
3   k141_71747_length_11945_cov_10.0000_10   
4    k141_8412174_length_1605_cov_8.0034_2   

                                             species   ARG  \
0  d__Bacteria;p__Proteobacteria;c__Gammaproteoba...     -   
1  d__Bacteria;p__Proteobacteria;c__Gammaproteoba...     -   
2  d__Bacteria;p__Proteobacteria;c__Gammaproteoba...  PMRF   
3  d__Bacteria;p__Proteobacteria;c__Gammaproteoba...     -   
4  d__Bacteria;p__Proteobacteria;c

## 整理arg， sepcies，vf，arg class

In [31]:
import pandas as pd
import re

# 读取 CSV 文件
input_file = output_file_with_positions
merged_df = pd.read_csv(input_file)

# 添加一列是否为 plastic_gene，基于 PLASTIC_GENE 列的值，使用 0/1 表示
merged_df['plastic_gene'] = merged_df['PLASTIC_GENE'].apply(lambda x: 1 if x != '-' else 0)

# 提取 unique 的 ARG class，去掉 'unclassified'
arg_classes = merged_df['predicted_ARG_class'].unique()
arg_classes = [arg_class for arg_class in arg_classes if arg_class != 'unclassified']

# 为每个 ARG class 创建一列，判断每个 bin 是否含有该 class，使用 0/1
for arg_class in arg_classes:
    merged_df[arg_class] = (merged_df['predicted_ARG_class'] == arg_class).astype(int)

# 对每个 bin_id 进行汇总，按 bin_id 来统计每种 ARG class 是否存在
arg_class_summary = merged_df.groupby('bin_id')[list(arg_classes)].max().reset_index()

# 统计每个 bin_id 的 ARG richness （即 ARG 的数量）
arg_counts = merged_df[merged_df['ARG'] != '-'].groupby('bin_id').size().reset_index(name='arg_num')

# 判断每个 bin_id 是否有 VF 注释，使用 0/1 表示
vf_presence = merged_df[merged_df['VF_NAME'] != '-'].groupby('bin_id').size().reset_index(name='vf_count')
vf_presence['if_vf'] = (vf_presence['vf_count'] > 0).astype(int)

# 提取到 genus 级别的信息，从 species 列中提取 genus 信息
def extract_genus(species_str):
    match = re.search(r'g__([^;]+)', species_str)
    return match.group(1) if match else 'Unknown'

merged_df['genus'] = merged_df['species'].apply(extract_genus)

# 提取 genus 信息，按 bin_id 汇总
genus_summary = merged_df[['bin_id', 'genus']].drop_duplicates()

# 提取 plastic_gene 信息，按 bin_id 汇总
plastic_gene_summary = merged_df.groupby('bin_id')['plastic_gene'].max().reset_index()

# 合并 ARG class、ARG richness、VF、genus 和 plastic_gene 信息
bin_summary = pd.merge(arg_class_summary, arg_counts, on='bin_id', how='left')
bin_summary = pd.merge(bin_summary, vf_presence[['bin_id', 'if_vf']], on='bin_id', how='left')
bin_summary = pd.merge(bin_summary, genus_summary, on='bin_id', how='left')
bin_summary = pd.merge(bin_summary, plastic_gene_summary, on='bin_id', how='left')

# 填充缺失值为 0
bin_summary.fillna(0, inplace=True)
bin_summary.drop(columns=['-'], inplace=True)  # 如果有多余的列 '-'

# 保存为 CSV 文件
output_file=os.path.join(output_dir,sample_type+"_bin_summary_with_arg_vf_plastic.csv")
bin_summary.to_csv(output_file, index=False)

# 打印结果以检查
bin_summary

Unnamed: 0,bin_id,peptide,multidrug,fosmidomycin,bacitracin,MLS,aminoglycoside,sulfonamide,tetracycline,fluoroquinolone,beta-lactam,triclosan,rifamycin,fosfomycin,diaminopyrimidine,glycopeptide,arg_num,if_vf,genus,plastic_gene
0,bin.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,Unknown,0
1,bin.10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1.0,Novosphingobium,1
2,bin.100,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,3.0,0.0,FEB-7,0
3,bin.101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1.0,Novosphingobium,1
4,bin.102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,SG-bin7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,bin.95,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1.0,TMP-24,1
181,bin.96,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,8.0,0.0,Rubrivivax,1
182,bin.97,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,3.0,0.0,WLNW01,0
183,bin.98,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,PNKF01,0


## 统计难降解塑料

In [None]:
# 创建 DataFrame
df = pd.DataFrame(bin_summary)

# 1. 统计总的 bin 数量
total_bins = len(df)

# 2. 统计带有毒力因子 (if_vf == 1) 的 bin 数量
vf_bins = df[df['if_vf'] == 1.0]
vf_bin_count = len(vf_bins)
vf_bin_ratio = vf_bin_count / total_bins * 100  # 占总数的百分比

# 3. 统计在 vf bins 中，携带抗性基因 (arg_num != 0) 的 bin 数量
vf_with_arg_bins = vf_bins[vf_bins['arg_num'] != 0]
vf_with_arg_count = len(vf_with_arg_bins)
vf_with_arg_ratio = vf_with_arg_count / vf_bin_count * 100 if vf_bin_count > 0 else 0  # 占 vf bin 的百分比

# 4. 统计在 vf bins 中，携带抗性基因且同时携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_arg_and_degradation_bins = vf_with_arg_bins[vf_with_arg_bins['plastic_gene'] == 1]
vf_with_arg_and_degradation_count = len(vf_with_arg_and_degradation_bins)
vf_with_arg_and_degradation_ratio = vf_with_arg_and_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 5. 统计在 vf bins 中，携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_degradation_bins = vf_bins[vf_bins['plastic_gene'] == 1]
vf_with_degradation_count = len(vf_with_degradation_bins)
vf_with_degradation_ratio = vf_with_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 输出结果
print(f"Total number of bins: {total_bins}")
print(f"Number of vf bins (if_vf == 1): {vf_bin_count}")
print(f"Percentage of vf bins: {vf_bin_ratio:.2f}%")
print(f"Number of vf bins with ARGs (arg_num != 0): {vf_with_arg_count}")
print(f"Percentage of vf bins with ARGs: {vf_with_arg_ratio:.2f}%")
print(f"Number of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_count}")
print(f"Percentage of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_ratio:.2f}%")
print(f"Number of vf bins with degradation genes: {vf_with_degradation_count}")
print(f"Percentage of vf bins with degradation genes: {vf_with_degradation_ratio:.2f}%")

Total number of bins: 185
Number of vf bins (if_vf == 1): 32
Percentage of vf bins: 17.30%
Number of vf bins with ARGs (arg_num != 0): 9
Percentage of vf bins with ARGs: 28.12%
Number of vf bins with ARGs and degradation genes: 9
Percentage of vf bins with ARGs and degradation genes: 28.12%
Number of vf bins with degradation genes: 28
Percentage of vf bins with degradation genes: 87.50%


## 统计可降解塑料

In [37]:
bin_summary_bio=pd.read_csv("/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/bio_bin/merge_result/bin_summary_with_arg_vf_plastic.csv")

# 创建 DataFrame
df_bio = pd.DataFrame(bin_summary_bio)

# 1. 统计总的 bin 数量
total_bins = len(df_bio)

# 2. 统计带有毒力因子 (if_vf == 1) 的 bin 数量
vf_bins = df_bio[df_bio['if_vf'] == 1.0]
vf_bin_count = len(vf_bins)
vf_bin_ratio = vf_bin_count / total_bins * 100  # 占总数的百分比

# 3. 统计在 vf bins 中，携带抗性基因 (arg_num != 0) 的 bin 数量
vf_with_arg_bins = vf_bins[vf_bins['arg_num'] != 0]
vf_with_arg_count = len(vf_with_arg_bins)
vf_with_arg_ratio = vf_with_arg_count / vf_bin_count * 100 if vf_bin_count > 0 else 0  # 占 vf bin 的百分比

# 4. 统计在 vf bins 中，携带抗性基因且同时携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_arg_and_degradation_bins = vf_with_arg_bins[vf_with_arg_bins['plastic_gene'] == 1]
vf_with_arg_and_degradation_count = len(vf_with_arg_and_degradation_bins)
vf_with_arg_and_degradation_ratio = vf_with_arg_and_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 5. 统计在 vf bins 中，携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_degradation_bins = vf_bins[vf_bins['plastic_gene'] == 1]
vf_with_degradation_count = len(vf_with_degradation_bins)
vf_with_degradation_ratio = vf_with_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 输出结果
print(f"Total number of bins: {total_bins}")
print(f"Number of vf bins (if_vf == 1): {vf_bin_count}")
print(f"Percentage of vf bins: {vf_bin_ratio:.2f}%")
print(f"Number of vf bins with ARGs (arg_num != 0): {vf_with_arg_count}")
print(f"Percentage of vf bins with ARGs: {vf_with_arg_ratio:.2f}%")
print(f"Number of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_count}")
print(f"Percentage of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_ratio:.2f}%")
print(f"Number of vf bins with degradation genes: {vf_with_degradation_count}")
print(f"Percentage of vf bins with degradation genes: {vf_with_degradation_ratio:.2f}%")

Total number of bins: 129
Number of vf bins (if_vf == 1): 38
Percentage of vf bins: 29.46%
Number of vf bins with ARGs (arg_num != 0): 25
Percentage of vf bins with ARGs: 65.79%
Number of vf bins with ARGs and degradation genes: 24
Percentage of vf bins with ARGs and degradation genes: 63.16%
Number of vf bins with degradation genes: 35
Percentage of vf bins with degradation genes: 92.11%


## 统计水

In [12]:
bin_summary_water=pd.read_csv("/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/water_bin/merge_result/water_bin_summary_with_arg_vf_plastic.csv")

# 创建 DataFrame
df_bio = pd.DataFrame(bin_summary_water)

# 1. 统计总的 bin 数量
total_bins = len(df_bio)

# 2. 统计带有毒力因子 (if_vf == 1) 的 bin 数量
vf_bins = df_bio[df_bio['if_vf'] == 1.0]
vf_bin_count = len(vf_bins)
vf_bin_ratio = vf_bin_count / total_bins * 100  # 占总数的百分比

# 3. 统计在 vf bins 中，携带抗性基因 (arg_num != 0) 的 bin 数量
vf_with_arg_bins = vf_bins[vf_bins['arg_num'] != 0]
vf_with_arg_count = len(vf_with_arg_bins)
vf_with_arg_ratio = vf_with_arg_count / vf_bin_count * 100 if vf_bin_count > 0 else 0  # 占 vf bin 的百分比

# 4. 统计在 vf bins 中，携带抗性基因且同时携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_arg_and_degradation_bins = vf_with_arg_bins[vf_with_arg_bins['plastic_gene'] == 1]
vf_with_arg_and_degradation_count = len(vf_with_arg_and_degradation_bins)
vf_with_arg_and_degradation_ratio = vf_with_arg_and_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 5. 统计在 vf bins 中，携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_degradation_bins = vf_bins[vf_bins['plastic_gene'] == 1]
vf_with_degradation_count = len(vf_with_degradation_bins)
vf_with_degradation_ratio = vf_with_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 输出结果
print(f"Total number of bins: {total_bins}")
print(f"Number of vf bins (if_vf == 1): {vf_bin_count}")
print(f"Percentage of vf bins: {vf_bin_ratio:.2f}%")
print(f"Number of vf bins with ARGs (arg_num != 0): {vf_with_arg_count}")
print(f"Percentage of vf bins with ARGs: {vf_with_arg_ratio:.2f}%")
print(f"Number of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_count}")
print(f"Percentage of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_ratio:.2f}%")
print(f"Number of vf bins with degradation genes: {vf_with_degradation_count}")
print(f"Percentage of vf bins with degradation genes: {vf_with_degradation_ratio:.2f}%")

Total number of bins: 169
Number of vf bins (if_vf == 1): 27
Percentage of vf bins: 15.98%
Number of vf bins with ARGs (arg_num != 0): 20
Percentage of vf bins with ARGs: 74.07%
Number of vf bins with ARGs and degradation genes: 16
Percentage of vf bins with ARGs and degradation genes: 59.26%
Number of vf bins with degradation genes: 22
Percentage of vf bins with degradation genes: 81.48%


## 统计木头

In [22]:
bin_summary_wood=pd.read_csv("/storage/jufengLab/luogaoyang/metagenome_project/DSR/4_annotation/wood_bin/merge_result/wood_bin_summary_with_arg_vf_plastic.csv")

# 创建 DataFrame
df_bio = pd.DataFrame(bin_summary_wood)

# 1. 统计总的 bin 数量
total_bins = len(df_bio)

# 2. 统计带有毒力因子 (if_vf == 1) 的 bin 数量
vf_bins = df_bio[df_bio['if_vf'] == 1.0]
vf_bin_count = len(vf_bins)
vf_bin_ratio = vf_bin_count / total_bins * 100  # 占总数的百分比

# 3. 统计在 vf bins 中，携带抗性基因 (arg_num != 0) 的 bin 数量
vf_with_arg_bins = vf_bins[vf_bins['arg_num'] != 0]
vf_with_arg_count = len(vf_with_arg_bins)
vf_with_arg_ratio = vf_with_arg_count / vf_bin_count * 100 if vf_bin_count > 0 else 0  # 占 vf bin 的百分比

# 4. 统计在 vf bins 中，携带抗性基因且同时携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_arg_and_degradation_bins = vf_with_arg_bins[vf_with_arg_bins['plastic_gene'] == 1]
vf_with_arg_and_degradation_count = len(vf_with_arg_and_degradation_bins)
vf_with_arg_and_degradation_ratio = vf_with_arg_and_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 5. 统计在 vf bins 中，携带降解基因 (plastic_gene == 1) 的 bin 数量
vf_with_degradation_bins = vf_bins[vf_bins['plastic_gene'] == 1]
vf_with_degradation_count = len(vf_with_degradation_bins)
vf_with_degradation_ratio = vf_with_degradation_count / vf_bin_count * 100 if vf_bin_count > 0 else 0

# 输出结果
print(f"Total number of bins: {total_bins}")
print(f"Number of vf bins (if_vf == 1): {vf_bin_count}")
print(f"Percentage of vf bins: {vf_bin_ratio:.2f}%")
print(f"Number of vf bins with ARGs (arg_num != 0): {vf_with_arg_count}")
print(f"Percentage of vf bins with ARGs: {vf_with_arg_ratio:.2f}%")
print(f"Number of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_count}")
print(f"Percentage of vf bins with ARGs and degradation genes: {vf_with_arg_and_degradation_ratio:.2f}%")
print(f"Number of vf bins with degradation genes: {vf_with_degradation_count}")
print(f"Percentage of vf bins with degradation genes: {vf_with_degradation_ratio:.2f}%")

Total number of bins: 19
Number of vf bins (if_vf == 1): 1
Percentage of vf bins: 5.26%
Number of vf bins with ARGs (arg_num != 0): 1
Percentage of vf bins with ARGs: 100.00%
Number of vf bins with ARGs and degradation genes: 1
Percentage of vf bins with ARGs and degradation genes: 100.00%
Number of vf bins with degradation genes: 1
Percentage of vf bins with degradation genes: 100.00%
