In [None]:
from pathlib import Path
import pandas as pd

# Input: headerless text file of CodeName values to select (read with pd.read_csv below).
code_name_txt = Path("./peptidePredictionTestingList.txt")
# Input: FASTA file parsed by fasta_to_dict into an {id: sequence} mapping.
fasta_path = Path("../data/fulldesign_2019-02-27_wGBKsw.fasta")
# Input: tab-separated metadata table; the CodeName, Peptide and FullName columns are used.
meta_path = Path("../data/input_data_20_4_10_all.tsv")
# Output: FASTA file written by the final cell (one record per selected peptide).
output_path = Path("../data/predictions_input_data.fasta")

In [None]:
# Read the one-column list of code names, trim surrounding whitespace, then
# discard missing entries and repeats so each code name appears once.
codes = (
    pd.read_csv(code_name_txt, header=None, names=["CodeName"], dtype=str)
    .assign(CodeName=lambda df: df["CodeName"].str.strip())
    .dropna()
    .drop_duplicates()
)

print(f"Loaded {codes.shape[0]} unique code names.")
codes.head()

In [None]:
# The metadata table is tab-separated; every column is kept as text.
meta = pd.read_table(meta_path, dtype=str, low_memory=False)

print(f"Loaded metadata with {meta.shape[0]} rows.")
meta.head()

In [None]:
# Keep only metadata rows whose CodeName is in the requested list, reduce the
# frame to the three columns used downstream, and trim whitespace from the
# two text fields.
needed = (
    meta.merge(codes, on="CodeName", how="inner")
    .loc[:, ["CodeName", "Peptide", "FullName"]]
    .assign(
        Peptide=lambda df: df["Peptide"].str.strip(),
        FullName=lambda df: df["FullName"].str.strip(),
    )
)

print(f"Matched peptides: {needed.shape[0]}")
needed.head()

In [None]:
def fasta_to_dict(fasta_path):
    """Parse a FASTA file into a ``{record id: sequence}`` dictionary.

    The record id is the first whitespace-delimited token after ``>`` on a
    header line. Sequence lines belonging to a record are concatenated with
    surrounding whitespace removed; blank lines are ignored. If the same id
    occurs more than once, the last record replaces the earlier ones.

    Parameters
    ----------
    fasta_path : pathlib.Path
        Path-like object exposing ``.open()``; the file is read as UTF-8
        (a leading byte-order mark is tolerated).

    Returns
    -------
    dict[str, str]
        Mapping from record id to its full sequence (``""`` for a header
        that has no sequence lines).

    Raises
    ------
    ValueError
        If sequence data appears before any header, or a header line
        contains no identifier.
    """
    seqs = {}
    current_id = None

    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise hide the ">" of the first header line.
    with fasta_path.open("r", encoding="utf-8-sig") as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue

            if line.startswith(">"):
                header_tokens = line[1:].split()
                if not header_tokens:
                    # A bare ">" has no id to key the record on.
                    raise ValueError(
                        f"FASTA format error: empty header on line {line_no}"
                    )
                current_id = header_tokens[0]
                # Re-initialising here means a repeated id keeps only its last record.
                seqs[current_id] = []

            else:
                if current_id is None:
                    raise ValueError(
                        "FASTA format error: sequence data before first header "
                        f"on line {line_no}"
                    )
                seqs[current_id].append(line)

    return {k: "".join(v) for k, v in seqs.items()}

In [None]:
# Parse the FASTA file into an {id: sequence} dictionary.
prot_map = fasta_to_dict(fasta_path)
print(f"Loaded {len(prot_map)} proteins.")

# Preview only the first few entries: echoing the whole mapping would dump
# every sequence into the cell output.
dict(list(prot_map.items())[:5])

In [None]:
def extract_id(fullname):
    """Return the ``ID=<token>`` identifier embedded in a FullName string.

    The token is the first whitespace-delimited word following the first
    ``ID=`` marker (and before any later ``ID=`` marker).

    Parameters
    ----------
    fullname : str or missing value
        Text expected to contain ``ID=<token>``. Non-string input (for
        example NaN from an empty table cell) is accepted.

    Returns
    -------
    str or None
        ``"ID=<token>"`` when a token is found; ``None`` when the input is
        not a string, has no ``ID=`` marker, or nothing follows the marker.
    """
    # Missing cells arrive as float NaN / None, which have no .split().
    if not isinstance(fullname, str):
        return None

    parts = fullname.split("ID=")
    if len(parts) < 2:
        return None

    tokens = parts[1].split()
    if not tokens:
        return None

    return f"ID={tokens[0]}"

# Derive the "ID=..." lookup key for every row from its FullName text.
needed["protein_id"] = needed["FullName"].map(extract_id)
needed.head()

In [None]:
# Look up each row's sequence in prot_map by its protein_id key.
needed["protein_seq"] = needed["protein_id"].map(prot_map)

# Count the rows for which the lookup produced no sequence.
missing = pd.isna(needed["protein_seq"]).sum()
print(f"Missing protein sequences for {missing} peptides.")

In [None]:
# Only rows with both a peptide and a protein sequence can form a valid
# record; a missing value would otherwise be written as the literal text "nan".
writable = needed.dropna(subset=["Peptide", "protein_seq"])
skipped = len(needed) - len(writable)

# Each record uses the peptide as the header and the protein sequence as the body.
with open(output_path, "w", encoding="utf-8") as fout:
    for peptide, protein_seq in zip(writable["Peptide"], writable["protein_seq"]):
        fout.write(f">{peptide}\n")
        fout.write(f"{protein_seq}\n")

if skipped:
    print(f"Skipped {skipped} peptides with a missing peptide or protein sequence.")
print(f"Wrote FASTA prediction data to {output_path}. Saved {len(writable)} records.")