In [None]:
import requests
import pandas as pd

# --- Settings ---
# Source CSV handed to pd.read_csv; point this at a different file to change the input.
INPUT_FILE = "https://raw.githubusercontent.com/Jeffateth/AllergenPredict/main/allergen_data_with_full_sequences.csv"
# Name of the CSV column in which the sequence is stored.
SEQUENCE_COLUMN = "full_parent_protein_sequence"


def search_rcsb_by_sequence(sequence, timeout=30):
    """Return the PDB entry identifiers the RCSB sequence search reports for ``sequence``.

    Sends a POST request to the RCSB search endpoint with a ``sequence``-service
    query (e-value cutoff 0.1, identity cutoff 0.9, target
    ``pdb_protein_sequence``, return type ``entry``).

    Args:
        sequence: Amino acid sequence to search for.
        timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
        A list holding the ``identifier`` of every item in the response's
        ``result_set``. An empty list is returned when ``sequence`` is empty or
        not a string, on HTTP 204, on any other non-200 status, when the
        request itself fails, or when a 200 response body is not valid JSON.
    """
    # Nothing to search for: skip the network round trip entirely.
    if not isinstance(sequence, str) or not sequence.strip():
        return []

    url = "https://search.rcsb.org/rcsbsearch/v2/query"

    query = {
        "query": {
            "type": "terminal",
            "service": "sequence",
            "parameters": {
                "evalue_cutoff": 0.1,
                "identity_cutoff": 0.9,
                "target": "pdb_protein_sequence",
                "value": sequence,
            },
        },
        "return_type": "entry",
    }

    try:
        # `json=` serialises the payload and sets the JSON Content-Type header.
        # The timeout keeps one stalled connection from hanging a long batch.
        response = requests.post(url, json=query, timeout=timeout)
    except requests.RequestException as exc:
        # Best effort: report the failure and let the caller continue with the
        # remaining sequences instead of aborting the whole run.
        print(f"Request failed for sequence: {sequence} ({exc})")
        return []

    if response.status_code == 200:
        try:
            data = response.json()
        except ValueError:
            print(f"Invalid JSON in response for sequence: {sequence}")
            return []
        return [result["identifier"] for result in data.get("result_set", [])]

    if response.status_code == 204:
        return []  # No content (no matches found)

    print(f"Error {response.status_code} for sequence: {sequence}")
    return []


def main():
    """Look up PDB matches for every sufficiently long sequence and save them.

    Reads ``INPUT_FILE``, keeps the rows whose ``SEQUENCE_COLUMN`` value is at
    least 20 characters long, calls ``search_rcsb_by_sequence`` for each
    distinct sequence, and writes the kept rows plus a ``pdb_matches`` column
    to ``sequence_pdb_matches.csv``. Prints a message and returns early when
    ``SEQUENCE_COLUMN`` is not present in the input.
    """
    # Read the input CSV
    df = pd.read_csv(INPUT_FILE)

    # Check if the expected column exists
    if SEQUENCE_COLUMN not in df.columns:
        print(f"Column '{SEQUENCE_COLUMN}' not found in the input file.")
        return

    # Filter out short sequences. The explicit .copy() makes the subset an
    # independent frame, so adding a column below does not write into a slice
    # of `df` (which triggers pandas' SettingWithCopyWarning).
    long_enough = df[SEQUENCE_COLUMN].str.len() >= 20
    df_subset = df[long_enough].copy()

    # Query each distinct sequence once, in case several rows share a
    # sequence, then map the results back onto the rows.
    matches_by_sequence = {
        seq: search_rcsb_by_sequence(seq)
        for seq in df_subset[SEQUENCE_COLUMN].unique()
    }
    df_subset["pdb_matches"] = df_subset[SEQUENCE_COLUMN].map(matches_by_sequence)

    # Save results
    df_subset.to_csv("sequence_pdb_matches.csv", index=False)
    print("Results saved to sequence_pdb_matches.csv")


# Run the pipeline only when this code executes as the top-level program
# (__name__ == "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Results saved to sequence_pdb_matches.csv


In [None]:
# Reload the saved results to check what was written to disk.
df_matches = pd.read_csv("sequence_pdb_matches.csv")
# Bare last expression: the notebook renders a short, rich preview of the
# first rows instead of printing the whole frame as plain text.
df_matches.head()

                    epitope_sequence  \
0    FGGRAEWGTNTADNDDTDGNGHGTHTASTAA   
1               TEEEKNRLNFLKKISQRYQK   
2      TAIFQDTVRAEMTKVLAPAFKKELERNNQ   
3               RQRVEQEQEQEQDEYPYSQR   
4           PKHADADNILVIQQGQATVTVANG   
..                               ...   
155             TALKKAITAMSQAQKAAKPA   
156             ELFRQFYQLDAYPSGAWYYV   
157             KAKFETFKKEMKAKEAELAK   
158         ARQQWELQEDRRCQSQLERANLRP   
159             PYSPSQDPDRRDPYSPSPYD   

                                   protein_url  label  \
0        http://www.uniprot.org/uniprot/P9WEW4      0   
1        http://www.uniprot.org/uniprot/P02663      0   
2        http://www.uniprot.org/uniprot/P49273      1   
3        http://www.uniprot.org/uniprot/Q9SQH1      1   
4        http://www.uniprot.org/uniprot/B3IXL2      1   
..                                         ...    ...   
155      http://www.uniprot.org/uniprot/P22286      0   
156      http://www.uniprot.org/uniprot/P02662      1   
157  h