In [10]:
import pandas as pd
import requests
from tqdm import tqdm
import os

# 📥 Load both source tables, reading only the two columns used below
url_pos = (
    "https://raw.githubusercontent.com/Jeffateth/AllergenPredict/main/positive.csv"
)
url_neg = (
    "https://raw.githubusercontent.com/Jeffateth/AllergenPredict/main/negative.csv"
)
use_columns = ["Epitope.1", "Epitope.11"]

# Rows from the positive file get label 1, rows from the negative file label 0.
positive_df = pd.read_csv(url_pos, usecols=use_columns).assign(label=1)
negative_df = pd.read_csv(url_neg, usecols=use_columns).assign(label=0)

# Stack the tables, give the columns working names, then shuffle with a
# fixed seed so the row order is reproducible between runs.
df = (
    pd.concat([positive_df, negative_df], ignore_index=True)
    .rename(columns={"Epitope.1": "sequence", "Epitope.11": "protein_url"})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# ✅ Keep rows whose sequence is a string built only from these 20 letters
valid_aa = set("ACDEFGHIKLMNPQRSTVWY")


def _is_valid_sequence(candidate):
    """Return True when *candidate* is a str using only letters in ``valid_aa``."""
    return isinstance(candidate, str) and set(candidate).issubset(valid_aa)


def _is_http_url(candidate):
    """Return True when *candidate* is a str that begins with "http"."""
    return isinstance(candidate, str) and candidate.startswith("http")


df = df[df["sequence"].apply(_is_valid_sequence)]

# 🧼 Discard rows whose URL is missing (non-string) or does not start with
# "http", then renumber the surviving rows from 0
df = df[df["protein_url"].apply(_is_http_url)].reset_index(drop=True)

# 🧬 Function to fetch protein sequence from UniProt


def fetch_protein_sequence(url, *, timeout=30):
    """Download the full protein sequence for a UniProt entry URL.

    The last path segment of *url* is taken as the UniProt accession, and the
    entry's FASTA record is requested from the UniProt REST endpoint. Header
    lines (those starting with ">") are dropped and the remaining lines are
    joined into a single string.

    Args:
        url: Entry URL whose final path component is the accession.
            Surrounding whitespace and trailing slashes are ignored.
        timeout: Seconds to wait on the server before giving up
            (keyword-only). Without a timeout a stalled connection would
            block the caller indefinitely.

    Returns:
        The sequence as one string, or None when *url* is not a usable
        string, the request fails, the server answers with a status other
        than 200, or the response contains no sequence lines.
    """
    if not isinstance(url, str):
        print(f"Error fetching {url}: expected a URL string")
        return None

    # Drop trailing slashes first so ".../P12345/" still yields "P12345"
    # instead of an empty accession.
    uniprot_id = url.strip().rstrip("/").split("/")[-1]
    if not uniprot_id:
        print(f"Error fetching {url}: no UniProt accession in URL")
        return None

    fasta_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"

    # Only the network call is guarded, so unrelated programming errors are
    # not mistaken for download failures.
    try:
        response = requests.get(fasta_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Error fetching {url}: HTTP {response.status_code}")
        return None

    lines = response.text.strip().splitlines()
    sequence = "".join(line.strip() for line in lines if not line.startswith(">"))

    # A 200 response without any residues is not a usable sequence.
    return sequence or None


# 📁 Resume from the checkpoint file when one exists, otherwise start fresh
temp_path = "temp_allergen_sequences.csv"


def _save_checkpoint(frame, path):
    """Write *frame* to *path* through a temporary file and a rename.

    Writing to a sibling file first means an interruption mid-write cannot
    leave a truncated checkpoint behind for the next run to choke on.
    """
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)


if os.path.exists(temp_path):
    df_existing = pd.read_csv(temp_path)
    if "full_protein_sequence" not in df_existing.columns:
        df_existing["full_protein_sequence"] = None
    rows_done = int(df_existing["full_protein_sequence"].notna().sum())
    print(
        f"🔁 Resuming from {temp_path}, "
        f"{rows_done} of {len(df_existing)} rows already processed."
    )
else:
    df_existing = df.copy()
    df_existing["full_protein_sequence"] = None

# A checkpoint whose sequence column is entirely empty is read back as
# float64; storing strings into such a column is something pandas 2.1+ warns
# about and later versions reject, so force object dtype up front.
df_existing["full_protein_sequence"] = df_existing["full_protein_sequence"].astype(
    object
)

# If several rows share a protein_url, download it only once. Only successful
# downloads are remembered, so failed URLs are still retried on later rows.
_rows_with_sequence = df_existing.dropna(subset=["full_protein_sequence"])
sequence_by_url = dict(
    zip(
        _rows_with_sequence["protein_url"],
        _rows_with_sequence["full_protein_sequence"],
    )
)

# ⏳ Process rows and save intermediate results
save_every = 10
rows_since_save = 0
for i in tqdm(df_existing.index):
    if not pd.isna(df_existing.at[i, "full_protein_sequence"]):
        continue  # already filled in by an earlier run

    url = df_existing.at[i, "protein_url"]
    seq = sequence_by_url.get(url)
    if seq is None:
        seq = fetch_protein_sequence(url)
        if seq:
            sequence_by_url[url] = seq
    df_existing.at[i, "full_protein_sequence"] = seq

    # 🧾 Print preview
    print(f"\n✅ Row {i}: {url}")
    print(f"   Sequence (first 60 aa): {seq[:60] if seq else 'None'}")

    # 💾 Save after every `save_every` newly processed rows. Counting work
    # done (not the row index) keeps the cadence steady on resumed runs.
    rows_since_save += 1
    if rows_since_save >= save_every:
        _save_checkpoint(df_existing, temp_path)
        rows_since_save = 0

# Flush whatever was processed after the last periodic checkpoint.
if rows_since_save:
    _save_checkpoint(df_existing, temp_path)

# ✅ Final save
df_existing.to_csv("allergen_data_with_full_sequences.csv", index=False)
print("\n🎉 Done! Final result saved.")

Output hidden; open in https://colab.research.google.com to view.