In [None]:
import pandas as pd
import os
def _fasta_field(value):
    """Return `value` as stripped text, or '' when it is missing (None/NaN)."""
    if pd.isna(value):
        return ''
    return str(value).strip()


def csv_to_fasta(csv_file, output_fasta):
    """Convert a CSV of protein records into a FASTA file.

    The CSV must provide 'Entry' and 'Sequence' columns; 'Protein names' is
    optional and, when present, is appended to the header after ' | '.

    Args:
        csv_file: Path of the CSV file to read.
        output_fasta: Path of the FASTA file to write. Its parent directory
            is created when it does not exist yet.

    Returns:
        None. Progress and failures are reported with print(); any exception
        is caught and printed as an [ERROR] line rather than propagated.
    """
    try:
        df = pd.read_csv(csv_file)

        missing_cols = {'Entry', 'Sequence'} - set(df.columns)
        if missing_cols:
            raise ValueError(f"missing required column(s): {sorted(missing_cols)}")

        # Create the directory the output actually goes to, not a fixed name.
        out_dir = os.path.dirname(output_fasta)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        if 'Protein names' in df.columns:
            names = df['Protein names']
        else:
            names = [None] * len(df)

        skipped = 0
        with open(output_fasta, 'w', encoding='utf-8') as f:
            for raw_entry, raw_name, raw_sequence in zip(df['Entry'], names, df['Sequence']):
                entry = _fasta_field(raw_entry)
                sequence = _fasta_field(raw_sequence)
                protein_name = _fasta_field(raw_name)

                # A missing cell must not become the text "nan": as a sequence
                # it would read as the residues N-A-N, so drop the record.
                if not entry or not sequence:
                    skipped += 1
                    continue

                header = f"{entry} | {protein_name}" if protein_name else entry
                # A FASTA header has to stay on a single line.
                header = header.replace('\n', ' ').replace('\r', ' ')
                f.write(f'>{header}\n{sequence}\n')

        if skipped:
            print(f"[WARN] Skipped {skipped} row(s) with missing Entry or Sequence in {csv_file}")
        print(f"[INFO] FASTA written to: {output_fasta}")
    except Exception as e:
        print(f"[ERROR] Failed to convert {csv_file} to FASTA: {e}")

# Convert each retrieval dataset (CSV) into its FASTA counterpart.
for dataset_csv, dataset_fasta in (
    ('datasets/arch_retrieval.csv', 'fasta_files/arch.fasta'),
    ('datasets/euk_retrieval.csv', 'fasta_files/euk.fasta'),
):
    csv_to_fasta(dataset_csv, dataset_fasta)


[INFO] FASTA written to: fasta_files/arch.fasta
[INFO] FASTA written to: fasta_files/euk.fasta


In [None]:
import pandas as pd
from collections import defaultdict
import os
import json

def generate_ground_truth_mapping(input_path, output_filename):
    """Write a JSON relevance file pairing each entry with its same-named peers.

    Every row of the CSV becomes a query: its relevant set is every other
    'Entry' that shares exactly the same 'Protein names' value. Rows without
    a protein name get an empty relevant set, and rows without an 'Entry'
    are dropped.

    Args:
        input_path: CSV file with 'Entry' and 'Protein names' columns.
        output_filename: Name of the JSON file, created in the same directory
            as `input_path`.

    Returns:
        None. The result is written to disk as a list of
        {'qid': ..., 'relevant_sqids': [...]} objects; read and schema
        problems are reported with print() and end the call early.
    """
    try:
        df = pd.read_csv(input_path)
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    if not {'Entry', 'Protein names'}.issubset(df.columns):
        print("Input file must contain 'Entry' and 'Protein names' columns.")
        return

    # A row without an accession cannot serve as a query or a hit, and a NaN
    # qid would be serialised as the non-standard JSON token NaN.
    df = df.dropna(subset=['Entry'])
    entries = df['Entry'].tolist()
    names = df['Protein names'].tolist()

    # Group accessions by protein name. Missing names are left out so that
    # unannotated proteins are never treated as relevant to each other.
    protein_to_entries = defaultdict(list)
    for entry_id, name in zip(entries, names):
        if not pd.isna(name):
            protein_to_entries[name].append(entry_id)

    relevance_data = []
    for qid, name in zip(entries, names):
        if pd.isna(name):
            relevant_ids = []
        else:
            relevant_ids = [
                entry_id for entry_id in protein_to_entries[name]
                if entry_id != qid
            ]
        relevance_data.append({
            'qid': qid,
            'relevant_sqids': relevant_ids
        })

    output_path = os.path.join(os.path.dirname(input_path), output_filename)
    with open(output_path, 'w') as f:
        json.dump(relevance_data, f, indent=2)
    print(f"Ground truth mapping saved to: {output_path}")

# Build one ground-truth relevance file per retrieval dataset.
for retrieval_csv, truth_json in (
    ("datasets/arch_retrieval.csv", "arch_ground_truth.json"),
    ("datasets/euk_retrieval.csv", "euk_ground_truth.json"),
):
    generate_ground_truth_mapping(retrieval_csv, truth_json)


Ground truth mapping saved to: datasets/arch_ground_truth.json
Ground truth mapping saved to: datasets/euk_ground_truth.json


In [None]:
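# Shell commands, not Python: run them in a terminal to build the BLAST protein databases.
# NOTE(review): csv_to_fasta above writes to fasta_files/, so these presumably have to be
# run from that directory or given the fasta_files/ prefix - confirm.
# makeblastdb -in arch.fasta -dbtype prot -out database/arch_db
# makeblastdb -in euk.fasta -dbtype prot -out database/euk_db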


SyntaxError: invalid syntax (1132907889.py, line 1)

In [None]:
import subprocess

def run_blast(query_fasta, db_name, output_file,
              evalue="1e-3", max_target_seqs=5, matrix=None):
    """Run a `blastp` search and write tabular hits to `output_file`.

    Each output line holds `qseqid sseqid evalue bitscore` (outfmt 6).

    Args:
        query_fasta: FASTA file with the query sequences.
        db_name: BLAST protein database to search.
        output_file: File that receives the tabular hits.
        evalue: E-value cut-off passed as `-evalue`.
        max_target_seqs: Value passed as `-max_target_seqs`.
        matrix: Optional scoring matrix name (e.g. "BLOSUM62") passed as
            `-matrix`; when None the flag is omitted.

    Returns:
        None. A non-zero exit status or any stderr output from blastp is
        reported with print() instead of being silently dropped.

    Raises:
        FileNotFoundError: If the `blastp` executable cannot be found.
    """
    cmd = [
        "blastp",
        "-query", query_fasta,
        "-db", db_name,
        "-out", output_file,
        "-outfmt", "6 qseqid sseqid evalue bitscore",
        "-evalue", str(evalue),
        "-max_target_seqs", str(max_target_seqs),
    ]
    if matrix is not None:
        cmd += ["-matrix", matrix]

    # Capture stderr so failures are reported through print() here rather
    # than being left on the child process's own stderr stream.
    result = subprocess.run(cmd, stderr=subprocess.PIPE, text=True)
    diagnostics = (result.stderr or "").strip()
    if result.returncode != 0:
        print(f"[ERROR] blastp exited with status {result.returncode} "
              f"for {query_fasta}: {diagnostics}")
    elif diagnostics:
        print(f"[WARN] blastp reported for {query_fasta}: {diagnostics}")

# Candidate substitution matrices.
# NOTE(review): this list is not passed to run_blast anywhere in the visible cells.
scoring_matrix = ["BLOSUM62", "BLOSUM80", "PAM30", "PAM70"]

# One blastp search per dataset, each against its own database.
# NOTE(review): csv_to_fasta wrote the FASTA files under fasta_files/, while the
# query paths here are bare file names - confirm the working directory.
for blast_query, blast_db, blast_out in (
    ("arch.fasta", "database/arch_db", "arch_blast_results.txt"),
    ("euk.fasta", "database/euk_db", "euk_blast_results.txt"),
):
    run_blast(blast_query, blast_db, blast_out)




In [52]:
from collections import defaultdict

def load_annotations(csv_file):
    """Read `csv_file` and map each 'Entry' value to its 'Protein names' value.

    When an 'Entry' occurs more than once, the last row wins.
    """
    table = pd.read_csv(csv_file)
    accessions = table['Entry']
    protein_names = table['Protein names']
    return {
        accession: protein_name
        for accession, protein_name in zip(accessions, protein_names)
    }

def load_blast_results(blast_file):
    """Collect, per query id, the subject ids listed in a tabular BLAST file.

    Only the first two whitespace-separated fields of each line are used
    (query id, subject id). Self-hits (query id == subject id) are dropped.
    Blank lines, lines with fewer than two fields and '#' comment lines are
    skipped instead of raising or being recorded as hits.

    Args:
        blast_file: Path of the tabular BLAST output file.

    Returns:
        A defaultdict(list) mapping each query id to its subject ids in file
        order. Repeated subject ids are kept as they appear.
    """
    results = defaultdict(list)
    with open(blast_file, 'r') as f:
        for line in f:
            fields = line.split()
            # Nothing to unpack on blank/short lines; '#' rows are comments.
            if len(fields) < 2 or fields[0].startswith('#'):
                continue
            qid, sid = fields[0], fields[1]
            if qid != sid:
                results[qid].append(sid)
    return results
