In [1]:
import requests
from requests.adapters import HTTPAdapter, Retry
import json
import re

### 1. Configure retry strategy to handle temporary errors

In [2]:
# Retry policy for transient server errors: at most 5 retries in total, with a
# back-off factor of 0.25 between attempts, triggered by HTTP 500/502/503/504.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
# One shared session, so every API call below goes through the same retry policy.
session = requests.Session()
# The retry policy is attached to https:// URLs only.
session.mount("https://", HTTPAdapter(max_retries=retries))

### 2. Define the batch size and the search URL used to retrieve the entries to be filtered

In [3]:
# Base URL for the UniProtKB search, generated with the Advanced search function
# of the UniProtKB website. URL-decoded, the query is:
#   ((existence:1) AND (length:[40 TO *]) AND (reviewed:true)
#    AND (taxonomy_id:2759) AND (fragment:false) AND (ft_signal_exp:*))
# i.e. reviewed, non-fragment entries of taxonomy 2759 (Eukaryota), at least
# 40 residues long, with protein-level existence evidence and an experimentally
# supported signal-peptide annotation.
# batch_size is the number of entries requested per page; it is injected into
# the URL so that the two values cannot drift apart.
batch_size = 500
url = (
    "https://rest.uniprot.org/uniprotkb/search?format=json"
    "&query=%28%28existence%3A1%29"
    "+AND+%28length%3A%5B40+TO+*%5D%29"
    "+AND+%28reviewed%3Atrue%29"
    "+AND+%28taxonomy_id%3A2759%29"
    "+AND+%28fragment%3Afalse%29"
    "+AND+%28ft_signal_exp%3A*%29%29"
    f"&size={batch_size}"
)

def get_next_link(headers):
    """Return the URL of the next result page, or None if there is none.

    Parameters
    ----------
    headers : mapping
        Response headers. Only the "Link" entry is inspected.

    Returns
    -------
    str or None
        The target of the link tagged rel="next", or None when the "Link"
        header is absent or contains no such link.
    """
    link_header = headers.get("Link")
    if not link_header:
        return None
    # search() instead of match(): the rel="next" link is not necessarily the
    # first one in the header. [^<>]+ keeps the capture inside a single <...>
    # pair, so a header with several links cannot be swallowed as one URL.
    found = re.search(r'<([^<>]+)>;\s*rel="next"', link_header)
    return found.group(1) if found else None

def get_batch(batch_url, timeout=60):
    """Iterate over the result pages of a search, one API call per page.

    This is a generator: each iteration fetches one page and yields it, then
    follows the "next" link found in the response headers. Iteration stops
    after the last page, i.e. when no next link is returned.

    Parameters
    ----------
    batch_url : str
        URL of the first page of the search.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``session.get``. Without it a stalled
        connection would block the notebook indefinitely.

    Yields
    ------
    (response, total)
        The response object of the page, and the value of its
        "x-total-results" header (the total number of entries in the search).

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises on an error status code;
    KeyError if the "x-total-results" header is missing.
    """
    while batch_url:
        # Run the API call
        response = session.get(batch_url, timeout=timeout)
        # Will raise an error if an error status code is obtained
        response.raise_for_status()
        # Total number of entries in the search, as reported by the server
        total = response.headers["x-total-results"]
        yield response, total
        # Link to the API call for the next page (None after the last page)
        batch_url = get_next_link(response.headers)





### 3. Define the filter and the fields to be written to the TSV file

In [4]:

# All search criteria can be specified in the search URL, except two: the minimum length of the
# signal peptide (it must end at position 14 or later) and the existence of the cleavage site.
# To only keep proteins whose signal peptide ends at position 14 or later and has an empty
# description (used here as the indication that the cleavage site exists), we define a filter function.
# It returns True if the entry passes the filter, False otherwise.

def filter_entry(entry):
    """Tell whether an entry has an acceptable signal-peptide annotation.

    An entry passes when at least one of its features is of type "Signal",
    has an integer end position >= 14, and has an empty description.

    Parameters
    ----------
    entry : dict
        One element of the "results" list of the JSON search response.

    Returns
    -------
    bool
        True if the entry passes the filter, False otherwise (including when
        the entry has no "features" key at all).
    """
    for feature in entry.get("features", ()):
        # Only signal-peptide features are relevant
        if feature["type"] != "Signal":
            continue
        end = feature["location"]["end"]["value"]
        # The end position may not be an integer (e.g. None); such features
        # cannot be compared with the threshold and are skipped.
        if isinstance(end, int) and end >= 14 and feature["description"] == "":
            return True
    return False

# Names of the two output files: a TSV table and a FASTA file.
output_file = "positive_dataset.tsv"
output_fasta_file="positive_dataset.fasta"

# We build the TSV output ourselves to keep full control over its format:
# the API call returns JSON, and the function below extracts and processes
# the specific fields we want from each JSON entry.
# Kingdoms that are reported as-is; anything else is reported as "Other".
organisms=["Metazoa","Fungi","Viridiplantae"]
def extract_fields(entry):
    """Extract the TSV fields of one entry.

    The signal peptide reported is the first feature of type "Signal" with an
    integer end position >= 14 and an empty description (the same conditions
    used by the filter).

    Parameters
    ----------
    entry : dict
        One element of the "results" list of the JSON search response.

    Returns
    -------
    tuple
        (accession, organism scientific name, kingdom, sequence length,
        signal-peptide start, signal-peptide end). The kingdom is the second
        element of the organism lineage when it is listed in ``organisms``,
        and "Other" in every other case (including a lineage with fewer than
        two elements).

    Raises
    ------
    ValueError
        If the entry has no qualifying signal-peptide feature.
    """
    sp_bounds = None
    for feature in entry["features"]:
        if feature["type"] != "Signal":
            continue
        end = feature["location"]["end"]["value"]
        # Skip non-integer end positions (e.g. None) instead of failing on the comparison
        if isinstance(end, int) and end >= 14 and feature["description"] == "":
            sp_bounds = (feature["location"]["start"]["value"], end)
            break
    if sp_bounds is None:
        # Explicit error instead of an UnboundLocalError further down
        raise ValueError(
            "entry %r has no qualifying signal-peptide feature" % entry.get("primaryAccession")
        )

    lineage = entry["organism"]["lineage"]
    # Check if the kingdom is Metazoa, Fungi, or Viridiplantae; if not, assign "Other"
    kingdom = lineage[1] if len(lineage) > 1 and lineage[1] in organisms else "Other"
    return (
        entry["primaryAccession"],
        entry["organism"]["scientificName"],
        kingdom,
        entry["sequence"]["length"],
        sp_bounds[0],
        sp_bounds[1],
    )

### 4. Define dataset creation: apply filters, extract fields, and save results in a TSV file and a FASTA file


In [5]:
def get_dataset(search_url, filter_function, extract_function, output_file_name,output_fasta_file_name):
    """Download a search, filter its entries and write them as TSV and FASTA.

    Parameters
    ----------
    search_url : str
        URL of the first page of the search; following pages are fetched
        through get_batch.
    filter_function : callable
        Called with one JSON entry; the entry is kept when it returns a
        true value.
    extract_function : callable
        Called with one kept JSON entry; must return the iterable of values
        written as one TSV row.
    output_file_name : str
        Path of the TSV file to write (overwritten if it exists).
    output_fasta_file_name : str
        Path of the FASTA file to write (overwritten if it exists).

    Side effects
    ------------
    Prints, for every page, the status code, the content type and the first
    200 characters of the body; then prints the number of entries seen and
    the number of entries kept.
    """
    filtered_json = []
    intestation=("EntryID","OrganismName","Kingdom","SequenceLength","SPStart","SPEnd")
    n_total, n_filtered = 0, 0
    # Run the API call in batches (the reported total is not needed here)
    for batch, _total in get_batch(search_url):
        # parse the JSON body of the response
        batch_json = json.loads(batch.text)
        print("Status code:", batch.status_code)
        print("Content-Type:", batch.headers.get("Content-Type"))
        print("Text preview:", batch.text[:200])
        # filter the entries
        for entry in batch_json["results"]:
            n_total += 1
            # Check if the entry passes the filter
            if filter_function(entry):
                n_filtered += 1
                # The whole entry is kept because the FASTA output needs the sequence
                filtered_json.append(entry)
    print(n_total, n_filtered)

    # TSV: header line followed by one row per kept entry.
    # The with-block closes the file; no explicit close is needed.
    with open(output_file_name, "w") as ofs:
        print(*intestation, sep="\t", file=ofs)
        for entry in filtered_json:
            # Use the extractor supplied by the caller, not a hard-wired global
            fields = extract_function(entry)
            print(*fields, sep="\t", file=ofs)

    # FASTA: accession number as header, then the sequence, for each kept entry
    with open(output_fasta_file_name,"w") as ofs_fasta:
        for entry in filtered_json:
            print(">"+entry["primaryAccession"],file=ofs_fasta)
            print(entry["sequence"]["value"],file=ofs_fasta)


In [6]:
get_dataset(url, filter_entry, extract_fields, output_file,output_fasta_file) # run the search and write both the TSV and the FASTA file

Status code: 200
Content-Type: application/json
Text preview: {"results":[{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"O00300","secondaryAccessions":["B2R9A8","O60236","Q53FX6","Q9UHP4"],"uniProtkbId":"TR11B_HUMAN","entryAudit":{"firstPubli
Status code: 200
Content-Type: application/json
Text preview: {"results":[{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"Q8NHP8","secondaryAccessions":["F5H5E2"],"uniProtkbId":"PLBL2_HUMAN","entryAudit":{"firstPublicDate":"2007-05-01","lastAn
Status code: 200
Content-Type: application/json
Text preview: {"results":[{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"P25031","secondaryAccessions":["Q64102","Q64231"],"uniProtkbId":"REG3B_RAT","entryAudit":{"firstPublicDate":"1992-05-01",
Status code: 200
Content-Type: application/json
Text preview: {"results":[{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"P84516","secondaryAccessions":["C5XIP7"],"uniProtkbId":"PER1_SORBI","e