In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
import itertools

In [13]:
# Collect the per-run result files, one list per file type.
# Every pattern matches files exactly one directory below performance/.
# glob.glob returns paths in arbitrary (filesystem-dependent) order, so each
# list is sorted to make the row order of everything built from it reproducible.
match_performance_list = sorted(glob.glob('/root/apabenchmark/data/performance/*/*match_performance.tsv'))
pas_quantification_performance_list = sorted(glob.glob('/root/apabenchmark/data/performance/*/*pas_quantify_performance.tsv'))
de_apa_performance_list = sorted(glob.glob('/root/apabenchmark/data/performance/*/*de_apa_performance.tsv'))
match_te_list = sorted(glob.glob('/root/apabenchmark/data/performance/*/*match_te.csv'))
match_pas_list = sorted(glob.glob('/root/apabenchmark/data/performance/*/*match_pas.csv'))
te_gap_list = sorted(glob.glob('/root/apabenchmark/data/performance/*/*te_gap.tsv'))

In [107]:
# Read every per-run match-performance table (tab-separated) and stack them.
# The frames are collected first and concatenated once; calling pd.concat
# inside the loop copies all previously accumulated rows on every iteration.
match_performance_frames = [
    pd.read_csv(match_performance, sep='\t')
    for match_performance in tqdm(match_performance_list)
]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# input file was found.
match_performance_df = (
    pd.concat(match_performance_frames)
    if match_performance_frames
    else pd.DataFrame()
)

100%|██████████| 1512/1512 [00:02<00:00, 608.20it/s]


In [108]:
# Write the combined match-performance table as a tab-separated file,
# leaving out the index column.
match_performance_df.to_csv(
    "/root/apabenchmark/data/performance/match_performance.tsv",
    sep="\t",
    index=False,
)

In [109]:
# Read every per-run PAS-quantification performance table (tab-separated) and
# stack them. The frames are collected first and concatenated once; calling
# pd.concat inside the loop copies all previously accumulated rows on every
# iteration.
pas_quantification_performance_frames = [
    pd.read_csv(pas_quantification_performance, sep='\t')
    for pas_quantification_performance in tqdm(pas_quantification_performance_list)
]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# input file was found.
pas_quantification_performance_df = (
    pd.concat(pas_quantification_performance_frames)
    if pas_quantification_performance_frames
    else pd.DataFrame()
)

100%|██████████| 1512/1512 [00:02<00:00, 693.79it/s]


In [110]:
# Write the combined PAS-quantification performance table as a tab-separated
# file, leaving out the index column.
pas_quantification_performance_df.to_csv(
    "/root/apabenchmark/data/performance/pas_quantification_performance.tsv",
    sep="\t",
    index=False,
)

In [111]:
# Read every per-run DE-APA performance table (tab-separated) and stack them.
# The frames are collected first and concatenated once; calling pd.concat
# inside the loop copies all previously accumulated rows on every iteration,
# which dominates the runtime for large tables.
de_apa_performance_frames = [
    pd.read_csv(de_apa_performance, sep='\t')
    for de_apa_performance in tqdm(de_apa_performance_list)
]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# input file was found.
de_apa_performance_df = (
    pd.concat(de_apa_performance_frames)
    if de_apa_performance_frames
    else pd.DataFrame()
)

100%|██████████| 1512/1512 [01:11<00:00, 21.14it/s]


In [112]:
# Write the combined DE-APA performance table as a tab-separated file,
# leaving out the index column.
de_apa_performance_df.to_csv(
    "/root/apabenchmark/data/performance/de_apa_performance.tsv",
    sep="\t",
    index=False,
)

In [113]:
# Export the subset of DE-APA rows whose "filter_type_2" value ends with
# either "fc_0.5" or "0.2".
filter_type_2_text = de_apa_performance_df["filter_type_2"].str
de_apa_keep_mask = filter_type_2_text.endswith("fc_0.5") | filter_type_2_text.endswith("0.2")
de_apa_performance_df.loc[de_apa_keep_mask].to_csv(
    "/root/apabenchmark/data/performance/de_apa_performance_filtered.tsv",
    sep="\t",
    index=False,
)

In [114]:
# Build te_dict: PAS-group label -> list of per-file records, where each record
# carries the run's metadata and the set of entries in the file's "value" column.
te_dict = {}
for match_te in tqdm(match_te_list):
    te_path_parts = match_te.split('/')
    te_name_fields = te_path_parts[-1].split('_')

    # Index 5 of the split absolute path is the directory directly below
    # performance/ (given the glob pattern that produced match_te_list);
    # it is used as the tool name.
    tool = te_path_parts[5]
    # The remaining metadata is taken from the "_"-separated file name:
    # field 0 -> protocol, field 4 -> PAS group, fields 0-6 -> sample id.
    protocol = te_name_fields[0]
    pas_group = te_name_fields[4]
    sample = "_".join(te_name_fields[:7])

    match_te_set = set(pd.read_csv(match_te)["value"].to_list())

    te_record = {
        "tool": tool,
        "protocol": protocol,
        "pas_group": pas_group,
        "sample": sample,
        "te": match_te_set
    }
    te_dict.setdefault(pas_group, []).append(te_record)

100%|██████████| 1512/1512 [00:10<00:00, 146.44it/s]


In [115]:
# Build pas_dict: PAS-group label -> list of per-file records, where each record
# carries the run's metadata and the set of entries in the file's "value" column.
pas_dict = {}
for match_pas in tqdm(match_pas_list):
    pas_path_parts = match_pas.split('/')
    pas_name_fields = pas_path_parts[-1].split('_')

    # Index 5 of the split absolute path is the directory directly below
    # performance/ (given the glob pattern that produced match_pas_list);
    # it is used as the tool name.
    tool = pas_path_parts[5]
    # The remaining metadata is taken from the "_"-separated file name:
    # field 0 -> protocol, field 4 -> PAS group, fields 0-6 -> sample id.
    protocol = pas_name_fields[0]
    pas_group = pas_name_fields[4]
    sample = "_".join(pas_name_fields[:7])

    match_pas_set = set(pd.read_csv(match_pas)["value"].to_list())

    pas_dict.setdefault(pas_group, []).append({
        "tool": tool,
        "protocol": protocol,
        # The group label lives under "pas_group" (same record layout as
        # te_dict); "pas" is reserved for the set below. Using "pas" for both
        # would silently discard the label, because a repeated key in a dict
        # literal keeps only the last value.
        "pas_group": pas_group,
        "sample": sample,
        "pas": match_pas_set
    })

100%|██████████| 1512/1512 [00:40<00:00, 36.95it/s] 


In [116]:
# Pairwise Jaccard index between the "pas" sets of every two records that share
# a PAS group, written out with both orientations of each pair.
pas_jaccard_columns = [
    "tool_1", "tool_2",
    "protocol_1", "protocol_2",
    "sample_1", "sample_2",
    "pas_group", "jaccard",
]
pas_jaccard_list = []
for pas_group, pas_records in pas_dict.items():
    # Iterate the combinations lazily; there is no need to hold them in a list.
    for record_1, record_2 in itertools.combinations(pas_records, 2):
        pas_union = record_1["pas"] | record_2["pas"]
        pas_shared = record_1["pas"] & record_2["pas"]
        pas_jaccard_list.append({
            "tool_1": record_1["tool"],
            "tool_2": record_2["tool"],
            "protocol_1": record_1["protocol"],
            "protocol_2": record_2["protocol"],
            "sample_1": record_1["sample"],
            "sample_2": record_2["sample"],
            "pas_group": pas_group,
            # Two empty sets have no defined Jaccard index; record NaN instead
            # of failing with ZeroDivisionError.
            "jaccard": len(pas_shared) / len(pas_union) if pas_union else float("nan"),
        })

# Passing the columns explicitly keeps the frame well-formed (and the header
# present in the output file) even when no pair was produced.
pas_jaccard_df = pd.DataFrame(pas_jaccard_list, columns=pas_jaccard_columns)

# Mirror every pair (swap the *_1 and *_2 columns) so each comparison appears
# in both orientations. rename() maps all labels at once, so the swap is safe;
# the column selection restores the original column order.
pas_jaccard_swap = {
    "tool_1": "tool_2", "tool_2": "tool_1",
    "protocol_1": "protocol_2", "protocol_2": "protocol_1",
    "sample_1": "sample_2", "sample_2": "sample_1",
}
pas_jaccard_df_reversed = (
    pas_jaccard_df
    .rename(columns=pas_jaccard_swap)[pas_jaccard_columns]
    .drop_duplicates()
)
pas_jaccard_df = pd.concat([pas_jaccard_df, pas_jaccard_df_reversed])
pas_jaccard_df.to_csv('/root/apabenchmark/data/performance/pas_jaccard.tsv', sep='\t', index=False)

In [117]:
# Pairwise Jaccard index between the "te" sets of every two records that share
# a PAS group, written out with both orientations of each pair.
te_jaccard_columns = [
    "tool_1", "tool_2",
    "protocol_1", "protocol_2",
    "sample_1", "sample_2",
    "pas_group", "jaccard",
]
te_jaccard_list = []
# The group label and its records both come from this loop. Reading a
# `pas_group` variable left over from another cell here would process the same
# group on every iteration and mislabel all rows.
for pas_group, te_records in te_dict.items():
    # Iterate the combinations lazily; there is no need to hold them in a list.
    for record_1, record_2 in itertools.combinations(te_records, 2):
        te_union = record_1["te"] | record_2["te"]
        te_shared = record_1["te"] & record_2["te"]
        te_jaccard_list.append({
            "tool_1": record_1["tool"],
            "tool_2": record_2["tool"],
            "protocol_1": record_1["protocol"],
            "protocol_2": record_2["protocol"],
            "sample_1": record_1["sample"],
            "sample_2": record_2["sample"],
            "pas_group": pas_group,
            # Two empty sets have no defined Jaccard index; record NaN instead
            # of failing with ZeroDivisionError.
            "jaccard": len(te_shared) / len(te_union) if te_union else float("nan"),
        })

# Passing the columns explicitly keeps the frame well-formed (and the header
# present in the output file) even when no pair was produced.
te_jaccard_df = pd.DataFrame(te_jaccard_list, columns=te_jaccard_columns)

# Mirror every pair (swap the *_1 and *_2 columns) so each comparison appears
# in both orientations. rename() maps all labels at once, so the swap is safe;
# the column selection restores the original column order.
te_jaccard_swap = {
    "tool_1": "tool_2", "tool_2": "tool_1",
    "protocol_1": "protocol_2", "protocol_2": "protocol_1",
    "sample_1": "sample_2", "sample_2": "sample_1",
}
te_jaccard_df_reversed = (
    te_jaccard_df
    .rename(columns=te_jaccard_swap)[te_jaccard_columns]
    .drop_duplicates()
)
te_jaccard_df = pd.concat([te_jaccard_df, te_jaccard_df_reversed])
te_jaccard_df.to_csv('/root/apabenchmark/data/performance/te_jaccard.tsv', sep='\t', index=False)

In [19]:
# Load every te_gap table (tab-separated), tag its rows with the metadata
# encoded in the file path, and stack all tables into all_te_gap_df.
te_gap_df_list = []
for te_gap in tqdm(te_gap_list):
    gap_path_parts = te_gap.split('/')
    gap_name_fields = gap_path_parts[-1].split('_')

    # Index 5 of the split absolute path is the directory directly below
    # performance/ (given the glob pattern that produced te_gap_list).
    tool = gap_path_parts[5]
    # File-name fields: 0 -> protocol, 4 -> PAS group, 0-6 joined -> sample id.
    protocol = gap_name_fields[0]
    pas_group = gap_name_fields[4]
    sample = "_".join(gap_name_fields[:7])

    te_gap_df = pd.read_csv(te_gap, sep='\t').assign(
        tool=tool,
        protocol=protocol,
        pas_group=pas_group,
        sample=sample,
    )
    te_gap_df_list.append(te_gap_df)

all_te_gap_df = pd.concat(te_gap_df_list)

100%|██████████| 1512/1512 [00:33<00:00, 44.80it/s]


In [24]:
# Re-stack the per-file te_gap tables with a fresh 0..n-1 row index in place of
# the repeated per-file row numbers.
all_te_gap_df = pd.concat(te_gap_df_list, ignore_index=True)
# Disabled check, kept for reference:
# all_te_gap_df["match"] = all_te_gap_df["pas_num"] == all_te_gap_df["pd_pas_num"]

In [25]:
# Store the stacked te_gap table in Feather format.
# DataFrame.to_feather relies on pyarrow; the explicit import makes a missing
# installation fail here with a clear ImportError.
from pyarrow import feather

all_te_gap_df.to_feather(
    "/root/apabenchmark/data/performance/te_gap.feather"
)

In [17]:
# Also write the stacked te_gap table as a tab-separated file, leaving out the
# index column.
all_te_gap_df.to_csv(
    "/root/apabenchmark/data/performance/te_gap.tsv",
    sep="\t",
    index=False,
)