In [1]:
from Bio import SeqIO, Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import requests
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")  # Required for Docker
chrome_options.add_argument("--disable-dev-shm-usage")  # Prevent resource issues
chrome_options.add_argument("--disable-gpu")  # Disable GPU (for headless environments)
chrome_options.add_argument("--remote-debugging-port=9222")  # Enable remote debugging

Entrez.email = "lukas.becker@hhu.de"

In [2]:
def query_interpro(sequence, email):
    """
    Submit a protein sequence to the InterProScan REST API and retrieve domain annotations.
    
    Args:
        sequence (str): Protein sequence in FASTA format.
        email (str): User email for API submission.
    
    Returns:
        list: A list of dictionaries with domain annotations.
    """
    url = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5/run/"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    payload = {
        "email": email,
        "sequence": sequence,
    }

    # Submit the job
    response = requests.post(url, data=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to submit to InterProScan: {response.text}")
    job_id = response.text
    print("[*] Your Job ID is: {}".format(job_id))
    # Check job status
    status_url = f"https://www.ebi.ac.uk/Tools/services/rest/iprscan5/status/{job_id}"
    result_url = f"https://www.ebi.ac.uk/Tools/services/rest/iprscan5/result/{job_id}/tsv"
    while True:
        status = requests.get(status_url).text
        if status == "FINISHED":
            break
        elif status in ["RUNNING", "PENDING","QUEUED"]:
            print("[*] ...")
            sleep(5)  # Wait and check again
        else:
            raise Exception(f"Job failed with status: {status}")

    # Retrieve results
    result = requests.get(result_url)
    if result.status_code != 200:
        raise Exception(f"Failed to retrieve results: {result.text}")
    
    pfam_domains = []
    domains = []
    for line in result.text.splitlines():
        parts = line.split("\t")
        if len(parts) > 5:  # Ensure we have enough columns
            if parts[4].startswith("PF"):
                pfam_domains.append({
                    "start": int(parts[6]),
                    "end": int(parts[7]),
                    "name": parts[4],
                })
            else:
                domains.append({
                    "start": int(parts[6]),
                    "end": int(parts[7]),
                    "name": parts[4],
                })                
    
    return result, pfam_domains, domains

In [40]:
def extract_pfam_short_names(url:str)->str:
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        
        sleep(5)
        
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        short_name_row = soup.find('td', string='Short name')
        
        if short_name_row:
            short_name = short_name_row.find_next_sibling('td').get_text(strip=True)
            print("[+] Extracted short name:", short_name)
            return short_name
        else:
            short_name = url.split("/")[-1]
            print("[-] Short name equals PFAM-ID: {}".format(short_name))
            return short_name
    except Exception as e:
        raise Exception("[-] ERROR with exception: {}".format(e))

In [49]:
def plot_domains(sequence, domains, name):
    """Visualize protein domains."""
    fig, ax = plt.subplots(figsize=(10, 2))
    
    protein_length = len(sequence)
    ax.plot([0, protein_length], [0.5, 0.5], color="black", lw=1.5)  # Protein backbone
    
    for domain in domains:
        start, end = int(domain["start"]), int(domain["end"])
        ax.add_patch(Rectangle((start, 0.25), end - start, 0.5, color="lightblue", edgecolor="black", lw=1))
        ax.text((start + end) / 2, 0.75, domain["name"], ha="center", va="bottom", fontsize=8)
    
    ax.set_xlim(0, protein_length)
    ax.set_ylim(0, 1)
    #ax.set_xlabel("Position in Protein Sequence")
    ax.set_yticks([])
    ax.set_title("{} Protein Domains".format(name))
    plt.tight_layout()
    savep="../results/figures/{}.svg".format(name)
    plt.savefig(savep)
    plt.close()

In [None]:
def fetch_protein_sequences(protein_ids, database="protein"):
    sequences = {}
    for protein_id in protein_ids:
        try:
            # Step 2: Fetch the record from the database
            with Entrez.efetch(db=database, id=protein_id, rettype="fasta", retmode="text") as handle:
                record = SeqIO.read(handle, "fasta")
                sequences[protein_id] = str(record.seq)
        except Exception as e:
            print(f"Error fetching {protein_id}: {e}")
    return sequences

In [8]:
protein_ids_curvibacter = {
    "WP_087493643":"LysC",
    "WP_087496890":"asd",
    "WP_087494180":"hom",
    "WP_087497334":"MetX",
    "WP_087493700":"MetC",
    "WP_087495570":"MetH",
    "WP_087494645":"MetE",
    "WP_087497000":"MetY",
    "WP_087494706":"MetZ",
    "WP_087495830":"MetW",
    "WP_087497263":"MetF",
}

In [11]:
protein_ids_ecoli = {
    "NP_418448":"LysC",
    "NP_417891":"asd",
    "NP_417481":"MetC",
    "NP_418443":"MetH",
    "NP_418273":"MetE"
}

In [12]:
protein_ids_pseudomonas = {
    "NP_252425":"hom",
    "NP_249081":"MetX",
    "NP_251797":"MetZ",
    "NP_253712":"MetY",
    "WP_003084530.1":"MetW",
    
}

In [52]:
title_dict = {
    "WP_087493643":"LysC_curvi",
    "WP_087496890":"asd_curvi",
    "WP_087494180":"hom_curvi",
    "WP_087497334":"MetX_curvi",
    "WP_087493700":"MetC_curvi",
    "WP_087495570":"MetH_curvi",
    "WP_087494645":"MetE_curvi",
    "WP_087497000":"MetY_curvi",
    "WP_087494706":"MetZ_curvi",
    "WP_087495830":"MetW_curvi",
    "WP_087497263":"MetF_curvi",
    
    "NP_418448":"LysC_coli",
    "NP_417891":"asd_coli",
    "NP_417481":"MetC_coli",
    "NP_418443":"MetH_coli",
    "NP_418273":"MetE_coli",
    
    "NP_252425":"hom_pseudo",
    "NP_249081":"MetX_pseudo",
    "NP_251797":"MetZ_pseudo",
    "NP_253712":"MetY_pseudo",
    "WP_003084530.1":"MetW_pseudo",
}

In [9]:
protein_ids = list(protein_ids_curvibacter.keys())
protein_sequences = fetch_protein_sequences(protein_ids)
with open("../results/processed_data/methionine_synthesis_proteins_curvibacter.faa","w") as protein_file:
    for protein_id, sequence in protein_sequences.items():
        #print(f">{protein_id}\n{sequence}")
        protein_file.write(f">{protein_id}\n{sequence}\n")

In [13]:
protein_ids = list(protein_ids_ecoli.keys())
protein_sequences = fetch_protein_sequences(protein_ids)
with open("../results/processed_data/methionine_synthesis_proteins_ecoli.faa","w") as protein_file:
    for protein_id, sequence in protein_sequences.items():
        #print(f">{protein_id}\n{sequence}")
        protein_file.write(f">{protein_id}\n{sequence}\n")

In [14]:
protein_ids = list(protein_ids_pseudomonas.keys())
protein_sequences = fetch_protein_sequences(protein_ids)
with open("../results/processed_data/methionine_synthesis_proteins_pseudomonas.faa","w") as protein_file:
    for protein_id, sequence in protein_sequences.items():
        #print(f">{protein_id}\n{sequence}")
        protein_file.write(f">{protein_id}\n{sequence}\n")

In [15]:
protein_sequences

{'NP_252425': 'MKPVKVGICGLGTVGGGTFNVLERNAEEIARRAGRGIEVAQIAARRPNPKCDTGATPITADIFDVACNPEIDVVVELIGGYTLAHELVLKAIENGKHVVTANKALIAVHGNEIFAKAREKGVIVAFEAAVAGGIPVIKAIREGLSANRINWLAGIINGTGNFILSEMREKGRTFPDVLAEAQALGYAEADPTFDVEGIDAAHKLTILASIAFGIPLQFDKAYTEGISKLTSADVNYADALGYRIKHLGVARRTESGFELRVHPTLIPSDRLIANVNGVMNAVMVNGDAVGSTLYYGAGAGMEPTASSVVADLVDVVRAMTSDPENRVPHLAFQPDALSDHPILPIEACESAYYLRIQAKDHPGVLAQVATILSERGINIESIMQKEAEEQDGLVPMILVTHRVIEQRINDAIAALEALEGVSGPVVRIRVEQLN',
 'NP_249081': 'MPTVFPDDSVGLVSPQTLHFNEPLELTSGKSLAEYDLVIETYGELNATQSNAVLICHALSGHHHAAGYHSVDERKPGWWDSCIGPGKPIDTRKFFVVALNNLGGCNGSSGPASINPATGKVYGADFPMVTVEDWVHSQARLADRLGIRQWAAVVGGSLGGMQALQWTISYPERVRHCLCIASAPKLSAQNIAFNEVARQAILSDPEFLGGYFQEQGVIPKRGLKLARMVGHITYLSDDAMGAKFGRVLKTEKLNYDLHSVEFQVESYLRYQGEEFSTRFDANTYLLMTKALDYFDPAAAHGDDLVRTLEGVEADFCLMSFTTDWRFSPARSREIVDALIAAKKNVSYLEIDAPQGHDAFLMPIPRYLQAFSGYMNRISV',
 'NP_251797': 'MTQDWDAGRLDSDLEGAAFDTLAVRAGQRRTPEGEHGEALFTTSSYVFRTAADAAARFAGEVPGNVYSRYTNPTVRTFEERIAALEGAEQAVATASGMSAILALVMSLCSSGDHVLVSRSVFGSTISLFDKYFKRF

In [20]:
files = ["../results/processed_data/methionine_synthesis_proteins_curvibacter.faa",
         "../results/processed_data/methionine_synthesis_proteins_ecoli.faa",
        "../results/processed_data/methionine_synthesis_proteins_pseudomonas.faa"]

protein_sequences = {}
for file in files:
    
    with open(file,"r") as inputfile:
        for line in inputfile.readlines():
            if line.startswith(">"):
                header=line.split(">")[-1].strip()
                protein_sequences[header] = ""
            else:
                protein_sequences[header] += line.strip()

In [22]:
protein_domains = []
for protein in protein_sequences.keys():
    result, pfam_domains, domains = query_interpro(protein_sequences[protein],"lukas.becker@hhu.de")
    protein_domains.append([protein, pfam_domains, domains])

[*] Your Job ID is: iprscan5-R20241211-145731-0332-47706203-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID is: iprscan5-R20241211-145825-0562-13070711-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID is: iprscan5-R20241211-145925-0703-58956792-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID is: iprscan5-R20241211-150019-0818-88089678-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID is: iprscan5-R20241211-150112-0913-17393573-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID is: iprscan5-R20241211-150211-0705-45082756-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID is: iprscan5-R20241211-150304-0531-42933888-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID is: iprscan5-R20241211-150404-0794-79829618-p1m
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] ...
[*] Your Job ID 

In [41]:
with open("../results/processed_data/full_domains.txt","w") as domainfile:
    for protein in protein_domains:
        domainfile.write(">"+protein[0]+"\n")
        for pfam_domain in protein[1]:
            domainfile.write("PFAM;start:{}\tend:{}\tname:{}\n".format(
                pfam_domain['start'],pfam_domain['end'],pfam_domain['name']))
            
        for other_domain in protein[2]:
            domainfile.write("OTHER;start:{}\tend:{}\tname:{}\n".format(
                other_domain['start'],other_domain['end'],other_domain['name']))

In [2]:
data_dict = {}
with open("../results/processed_data/full_domains.txt","r") as domainfile:
    for line in domainfile.readlines():
        if line.startswith(">"):
            protein = line.split(">")[1].strip()
            if protein not in list(data_dict.keys()):
                data_dict[protein] = [[],[]]
        elif line.startswith("PFAM"):
            line = line.split("PFAM;")[-1]
            line = line.split("\t")
            start = line[0].split(":")[1]
            end = line[1].split(":")[1]
            name = line[2].split(":")[1].strip()
            
            pfam_entry = {"start":start,"end":end,"name":name}
            data_dict[protein][0].append(pfam_entry)
        elif line.startswith("OTHER"):
            line = line.split("OTHER;")[-1]
            line = line.split("\t")
            start = line[0].split(":")[1]
            end = line[1].split(":")[1]
            name = line[2].split(":")[1].strip()
            
            domain_entry = {"start":start,"end":end,"name":name}
            data_dict[protein][1].append(domain_entry)
        else:
            print(line)

In [3]:
data_dict

{'WP_087493643': [[{'start': '403', 'end': '461', 'name': 'PF22468'},
   {'start': '9', 'end': '299', 'name': 'PF00696'}],
  [{'start': '331', 'end': '397', 'name': 'G3DSA'},
   {'start': '401', 'end': '462', 'name': 'SSF55021'},
   {'start': '323', 'end': '407', 'name': 'SSF55021'},
   {'start': '9', 'end': '314', 'name': 'G3DSA'},
   {'start': '324', 'end': '461', 'name': 'G3DSA'},
   {'start': '11', 'end': '313', 'name': 'SSF53633'},
   {'start': '9', 'end': '462', 'name': 'PTHR21499'}]],
 'WP_087496890': [[{'start': '147', 'end': '360', 'name': 'PF02774'},
   {'start': '5', 'end': '122', 'name': 'PF01118'}],
  [{'start': '2', 'end': '374', 'name': 'MF_02121'},
   {'start': '5', 'end': '363', 'name': 'G3DSA'},
   {'start': '4', 'end': '123', 'name': 'SM00859'},
   {'start': '268', 'end': '282', 'name': 'PS01103'},
   {'start': '5', 'end': '148', 'name': 'SSF51735'},
   {'start': '135', 'end': '361', 'name': 'SSF55347'},
   {'start': '2', 'end': '376', 'name': 'PIRSF000148'},
   {'st

In [46]:
for entry in data_dict.keys():
    print("[*] Working with: {}".format(entry))
    
    for pfam_entry in data_dict[entry][0]:
        
        pfam_id = pfam_entry["name"]
        url = "https://www.ebi.ac.uk/interpro/entry/pfam/{}/".format(pfam_id)
        pfam_name = extract_pfam_short_names(url)
        pfam_entry["name"] = pfam_name
    print("[*] DONE")

[*] Working with: WP_087493643
[+] Extracted short name: ACT_9
[+] Extracted short name: AA_kinase
[*] DONE
[*] Working with: WP_087496890
[+] Extracted short name: Semialdhyde_dhC
[+] Extracted short name: Semialdhyde_dh
[*] DONE
[*] Working with: WP_087494180
[+] Extracted short name: ACT
[+] Extracted short name: NAD_binding_3
[+] Extracted short name: Homoserine_dh
[*] DONE
[*] Working with: WP_087497334
[+] Extracted short name: Abhydrolase_1
[*] DONE
[*] Working with: WP_087493700
[+] Extracted short name: Cys_Met_Meta_PP
[*] DONE
[*] Working with: WP_087495570
[+] Extracted short name: B12-binding_2
[+] Extracted short name: Pterin_bind
[+] Extracted short name: B12-binding
[+] Extracted short name: Met_synt_B12
[*] DONE
[*] Working with: WP_087494645
[+] Extracted short name: Meth_synt_2
[*] DONE
[*] Working with: WP_087497000
[+] Extracted short name: Cys_Met_Meta_PP
[*] DONE
[*] Working with: WP_087494706
[+] Extracted short name: Cys_Met_Meta_PP
[*] DONE
[*] Working with: WP

In [53]:
for entry in data_dict.keys():
    print("[*] Plotting: {}".format(entry))
    plot_domains(protein_sequences[entry], data_dict[entry][0], title_dict[entry])
    print("[+] DONE ...")

[*] Plotting: WP_087493643
[+] DONE ...
[*] Plotting: WP_087496890
[+] DONE ...
[*] Plotting: WP_087494180
[+] DONE ...
[*] Plotting: WP_087497334


  ax.add_patch(Rectangle((start, 0.25), end - start, 0.5, color="lightblue", edgecolor="black", lw=1))


[+] DONE ...
[*] Plotting: WP_087493700
[+] DONE ...
[*] Plotting: WP_087495570
[+] DONE ...
[*] Plotting: WP_087494645
[+] DONE ...
[*] Plotting: WP_087497000
[+] DONE ...
[*] Plotting: WP_087494706
[+] DONE ...
[*] Plotting: WP_087495830
[+] DONE ...
[*] Plotting: WP_087497263
[+] DONE ...
[*] Plotting: NP_418448
[+] DONE ...
[*] Plotting: NP_417891
[+] DONE ...
[*] Plotting: NP_417481
[+] DONE ...
[*] Plotting: NP_418443
[+] DONE ...
[*] Plotting: NP_418273
[+] DONE ...
[*] Plotting: NP_252425
[+] DONE ...
[*] Plotting: NP_249081
[+] DONE ...
[*] Plotting: NP_251797
[+] DONE ...
[*] Plotting: NP_253712
[+] DONE ...
[*] Plotting: WP_003084530.1
[+] DONE ...


In [4]:
data_dict

{'WP_087493643': [[{'start': '403', 'end': '461', 'name': 'PF22468'},
   {'start': '9', 'end': '299', 'name': 'PF00696'}],
  [{'start': '331', 'end': '397', 'name': 'G3DSA'},
   {'start': '401', 'end': '462', 'name': 'SSF55021'},
   {'start': '323', 'end': '407', 'name': 'SSF55021'},
   {'start': '9', 'end': '314', 'name': 'G3DSA'},
   {'start': '324', 'end': '461', 'name': 'G3DSA'},
   {'start': '11', 'end': '313', 'name': 'SSF53633'},
   {'start': '9', 'end': '462', 'name': 'PTHR21499'}]],
 'WP_087496890': [[{'start': '147', 'end': '360', 'name': 'PF02774'},
   {'start': '5', 'end': '122', 'name': 'PF01118'}],
  [{'start': '2', 'end': '374', 'name': 'MF_02121'},
   {'start': '5', 'end': '363', 'name': 'G3DSA'},
   {'start': '4', 'end': '123', 'name': 'SM00859'},
   {'start': '268', 'end': '282', 'name': 'PS01103'},
   {'start': '5', 'end': '148', 'name': 'SSF51735'},
   {'start': '135', 'end': '361', 'name': 'SSF55347'},
   {'start': '2', 'end': '376', 'name': 'PIRSF000148'},
   {'st

In [None]:
def plot_metH(sequence, domains, name):
    """Visualize protein domains."""
    fig, ax = plt.subplots(figsize=(10, 2))
    
    protein_length = len(sequence)
    ax.plot([0, protein_length], [0.5, 0.5], color="black", lw=1.5)  # Protein backbone
    
    for domain in domains:
        start, end = int(domain["start"]), int(domain["end"])
        ax.add_patch(Rectangle((start, 0.25), end - start, 0.5, color="lightblue", edgecolor="black", lw=1))
        ax.text((start + end) / 2, 0.75, domain["name"], ha="center", va="bottom", fontsize=8)
    
    ax.set_xlim(0, protein_length)
    ax.set_ylim(0, 1)
    #ax.set_xlabel("Position in Protein Sequence")
    ax.set_yticks([])
    ax.set_title("{} Protein Domains".format(name))
    plt.tight_layout()
    savep="../results/figures/{}.svg".format(name)
    plt.savefig(savep)
    plt.close()