In [21]:
import requests

In [22]:
import re

In [29]:
file_path = r"C:\Users\Jofu2\Desktop\rosalind_mprt_5.txt"

In [30]:
results = {}

In [31]:
# Function to extract the protein sequence from UniProt database
def sequence_extraction(uniprotID):
    proteinurl = "http://uniprot.org/uniprot/" + uniprotID + ".txt"
    file_contents = requests.get(proteinurl)

    if file_contents.status_code == 200:  # http code to say that the webpage is ok 
        lines = file_contents.text.split('\n')
        sequence = ""  # variable sequence is empty but I can append data to it 
        in_sequence = False

        for line in lines:  # this is the for loop to search the .txt file for "SQ" going through each line of the text file
            if line.startswith("SQ"):
                in_sequence = True
                continue  # continue used in loops to skip the rest of the code and move to the next iteration.

            if in_sequence and line.strip():
                line = line.replace(" ", "").replace("/", "")  # replaces any spaces or / with nothing
                sequence += line  # Concatenate the sequence lines to the variable sequence 

        if sequence:
            return sequence
        else:
            return "No sequence found for the provided UniProt ID."
    else:
        return "UNIPROT ERROR: Invalid UniProt ID or resource not found."

In [32]:
# Function to search for the N-glycosylation motif
def n_glycosylation_search(protein_sequence):
    n_glycosylation = re.finditer(r"N(?=([^P][ST][^P]))", protein_sequence)  # Corrected regex pattern
    positions = [match.start() + 1 for match in n_glycosylation]  # +1 to convert from counting from 0 to counting from 1
    return positions

In [34]:
# Opening the file and processing each UniProt ID
#This last because sequence extraction and n glycosylation search are within it 
with open(file_path, 'r') as file:
    for line in file:
        uniprotID = line.strip()
        protein_sequence = sequence_extraction(uniprotID)
        if protein_sequence:
            glycosylation_sites = n_glycosylation_search(protein_sequence)  # searches and saves the locations
            results[uniprotID] = glycosylation_sites

In [35]:
# Printing the results
for uniprotID, sites in results.items():
    print(f"UniProt ID: {uniprotID}, N-glycosylation site: {sites}")

UniProt ID: Q05557, N-glycosylation site: []
UniProt ID: Q9LHF1, N-glycosylation site: [3, 4, 60, 94, 106, 289, 340]
UniProt ID: P13838_LEUK_RAT, N-glycosylation site: []
UniProt ID: P02725_GLP_PIG, N-glycosylation site: []
UniProt ID: P47002, N-glycosylation site: [35, 552, 608]
UniProt ID: Q8ZRE7, N-glycosylation site: []
UniProt ID: Q81QB7, N-glycosylation site: [27]
UniProt ID: P81447_MPP3_CAPHI, N-glycosylation site: []
UniProt ID: B4R8K2, N-glycosylation site: []
UniProt ID: P07987_GUX2_TRIRE, N-glycosylation site: []
UniProt ID: Q4JAS3, N-glycosylation site: []
UniProt ID: Q9D9T0, N-glycosylation site: [154]
UniProt ID: Q1LI56, N-glycosylation site: []
