# Count, per sample, how many ARG entries of each class are carried by each MGE type

In [None]:
import os
import pandas as pd
from collections import defaultdict

# Folder holding the per-sample CompRanking prediction tables.
folder_path = '/lomi_home/gaoyang/software/CompRanking/tmp_DSR/DSR/CompRanking_result'

# MGE carrier types that are counted, mapped to the suffix appended to the
# sample name to build the output column ("<sample>_<suffix>").
MGE_COLUMN_SUFFIX = {
    'plasmid': 'x',
    'phage': 'y',
    'unclassified': 'z',
    'IS': 'm',
    'IE': 'n',
}


def preprocess_arg_class(arg_class):
    """Reduce a raw ARG_class value to its leading class name.

    Returns None for the '-' placeholder and for any non-string value
    (e.g. the NaN pandas produces for an empty cell) so the caller can drop
    those rows. Otherwise returns the text before the first '/' and before
    the first ':'.
    """
    if not isinstance(arg_class, str) or arg_class == '-':
        return None
    return arg_class.split('/')[0].split(':')[0]


# Final result: ARG class -> {"<sample>_<suffix>": count}.
final_dict = defaultdict(lambda: defaultdict(int))

# os.listdir returns entries in arbitrary order; sort so the sample columns
# are created in a reproducible order from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.contigs_AMR_MOB_prediction.tsv'):
        continue
    filepath = os.path.join(folder_path, filename)

    # Sample name: the text between the first and second underscore of the
    # file name, cut at the first dot.
    # NOTE(review): assumes file names look like "<prefix>_<sample>.contigs_..." - confirm.
    sample_name = filename.split('_')[1].split(".")[0]

    # Per-file counters: ARG class -> {MGE type: count}, every type starting at 0.
    current_dict = defaultdict(lambda: dict.fromkeys(MGE_COLUMN_SUFFIX, 0))

    # Read the table in chunks so a large file never has to fit in memory at once.
    for chunk in pd.read_csv(filepath, sep='\t', chunksize=10000):
        # Keep only rows whose MGE prediction is one of the counted types.
        is_counted_mge = chunk['CompRanking_MGE_prediction'].isin(list(MGE_COLUMN_SUFFIX))
        # Explicit copy: the column assignment below must not write into a
        # view of `chunk` (avoids pandas' SettingWithCopyWarning).
        filtered_chunk = chunk.loc[is_counted_mge].copy()

        # Normalise ARG_class and drop rows without a usable class.
        filtered_chunk['ARG_class'] = filtered_chunk['ARG_class'].apply(preprocess_arg_class)
        filtered_chunk = filtered_chunk.dropna(subset=['ARG_class'])

        # Number of rows per (ARG class, MGE type) pair in this chunk.
        pair_counts = filtered_chunk.groupby(['ARG_class', 'CompRanking_MGE_prediction']).size()
        for (arg_class, mge_prediction), count in pair_counts.items():
            current_dict[arg_class][mge_prediction] += int(count)

    # Fold this file's counters into the final result. Every suffix is written
    # for every ARG class seen in the file, so zero counts still create a column.
    for arg_class, counts in current_dict.items():
        for mge_prediction, suffix in MGE_COLUMN_SUFFIX.items():
            final_dict[arg_class][f'{sample_name}_{suffix}'] += counts[mge_prediction]

In [None]:
# Build the result table: one row per ARG class, one column per
# "<sample>_<suffix>" counter. Combinations that were never recorded are
# filled with 0 and the whole table is cast to integers.
final_df = (
    pd.DataFrame(final_dict)
    .transpose()
    .fillna(value=0)
    .astype(int)
)
final_df

In [15]:
# Write the result table to disk as a tab-separated file (index included).
final_df.to_csv(
    path_or_buf='/lomi_home/gaoyang/software/CompRanking/tmp_DSR/DSR/CompRanking_result/ori_3_RGI_with_multidrug_adjust_cell_rpkg/MGE_carried_ARGs_type_count_final_result.tsv',
    sep='\t',
)