In [1]:
import os
import pandas as pd
from collections import defaultdict

In [9]:
# Root folder of the evaluation data: it is scanned for sub-directories, each
# expected to hold a "verified.gff" plus one GFF file per entry in METHODS.
# The machine-specific default can be overridden without editing the notebook
# by setting the EVALUATION_BASE_DIR environment variable.
BASE_DIR = os.environ.get(
    "EVALUATION_BASE_DIR",
    "/home/biolab-office-1/DATALAB/DEV/TOOL/evaluation",
)

# (display name, GFF file name) for every annotation method being compared.
METHODS = [
    ("NCBI", "ncbi.gff"),
    ("Prodigal", "prodigal.gff"),
    # ("TIS_Annotator", "our_tool.gff"),  # disabled for now
]

In [10]:
def parse_gff(file_path):
    """Collect the coordinates of every CDS feature in a GFF file.

    Parameters
    ----------
    file_path : str or path-like
        Path to a tab-separated GFF file.

    Returns
    -------
    set of tuple
        One ``(start, end, strand)`` tuple per distinct CDS record, where
        ``start`` and ``end`` are the integers parsed from columns 4 and 5 and
        ``strand`` is the raw text of column 7. Records sharing the same
        coordinates and strand collapse into a single entry.

    Raises
    ------
    ValueError
        If a CDS record has fewer than seven columns, or if its start/end
        columns are not integers.
    """
    cds_positions = set()

    with open(file_path, "r") as f:
        for line_number, line in enumerate(f, start=1):
            if line.startswith("#"):
                continue
            parts = line.strip().split("\t")
            # The feature type is column 3. Testing that column exactly
            # (rather than searching for "\tCDS\t" anywhere in the line)
            # keeps records whose *source* column happens to read "CDS"
            # from being counted as coding sequences.
            if len(parts) < 3 or parts[2] != "CDS":
                continue
            if len(parts) < 7:
                raise ValueError(
                    f"{file_path}:{line_number}: CDS record has only "
                    f"{len(parts)} columns; need at least 7"
                )
            start, end, strand = int(parts[3]), int(parts[4]), parts[6]
            cds_positions.add((start, end, strand))

    return cds_positions

In [11]:
# Build one summary record per genome folder under BASE_DIR. For each method
# in METHODS the record counts how many verified CDS the method reproduced
# exactly (same start, end and strand), how many it missed, and how many CDS
# the method reported in total.
comparison_results = []

for bacteria_folder in sorted(os.listdir(BASE_DIR)):
    bacteria_path = os.path.join(BASE_DIR, bacteria_folder)
    if not os.path.isdir(bacteria_path):
        continue

    verified_file = os.path.join(bacteria_path, "verified.gff")
    if not os.path.isfile(verified_file):
        # A directory without a reference annotation (for example
        # ".ipynb_checkpoints") is not a benchmark genome; skip it visibly
        # instead of aborting the whole run with FileNotFoundError.
        print(f"Skipping {bacteria_folder}: no verified.gff found")
        continue
    verified_cds = parse_gff(verified_file)

    bacteria_data = {"Bacteria": bacteria_folder, "Total Verified CDS": len(verified_cds)}

    for method_name, method in METHODS:
        # A missing method file inside a genuine genome folder is a real data
        # problem, so parse_gff is allowed to raise here.
        method_file = os.path.join(bacteria_path, method)
        method_cds = parse_gff(method_file)

        matched_cds = method_cds & verified_cds
        missed_cds = verified_cds - method_cds

        bacteria_data[f"{method_name}_Matched"] = len(matched_cds)
        bacteria_data[f"{method_name}_Missed"] = len(missed_cds)
        bacteria_data[f"{method_name}_Total Found"] = len(method_cds)

    comparison_results.append(bacteria_data)

In [12]:
# Turn the list of per-genome records into a table, one row per record.
df_results = pd.DataFrame(data=comparison_results)
# Show at most the first 100 rows as this cell's output.
df_results.head(n=100)

Unnamed: 0,Bacteria,Total Verified CDS,NCBI_Matched,NCBI_Missed,NCBI_Total Found,Prodigal_Matched,Prodigal_Missed,Prodigal_Total Found
0,Escherichia_coli_K_12_substr__MG1655_uid57779,769,769,0,4140,338,431,4347
1,Halobacterium_salinarum_R1_uid61571,530,529,1,2749,243,287,2851
2,Mycobacterium_tuberculosis_H37Rv_uid57777,701,701,0,3906,311,390,4204
3,Natronomonas_pharaonis_DSM_2160_uid58435,315,315,0,2820,169,146,2873
4,Roseobacter_denitrificans_Och114,526,504,22,4057,248,278,4120
