In [None]:
# default_exp uniprot_integration

In [1]:
#export
import re

import numpy as np
import pandas as pd

# Integration of Uniprot Data

This notebook allows you to download UniProt data and save it in CSV format.  

The preprocessed downloaded data will include information about:  
- __the preprocessing events__ known for proteins, such as signal peptide, transit peptide, propeptide, chain, peptide;
- information on all __post-translational modifications__ available in UniProt, such as modified residues (Phosphorylation, Methylation, Acetylation, etc.), Lipidation, Glycosylation, etc.;
- information on __sequence similarities__ with other proteins and __the domain(s)__ present in a protein, such as domain, repeat, region, motif, etc.;
- information on __the secondary and tertiary structure__ of proteins, such as turn, beta strand, helix.

### Specify the path to the flat text file downloaded from the UniProt database

1. Go to the UniProt website (https://www.uniprot.org/uniprot/), select the organism you need in the "Popular organisms" section and click on it.
2. Click the "Download" button and select "Text" format.
3. Select the "Compressed" radio button and click "Go".
4. Unzip the downloaded file and specify a path to this file.

In [2]:
# Path to the unzipped flat text file downloaded from UniProt; adjust it to point at your local copy.
path_downloaded_uniprot = '../testdata/uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_.txt'

### Define helper functions to preprocess Uniprot data

In [4]:
#export
def extract_note(string, splitted=False):
    """
    Extract the text of every ``/note="..."`` qualifier found in a string using a regular expression.

    Args:
        string (str): Text to search for ``/note="`` qualifiers.
        splitted (bool): If False (default), a note must be closed by a double quote inside
            ``string`` and only the (non-empty) text between the quotes is captured.
            If True, no closing quote is required and everything after ``/note="`` up to the
            end of the line is captured (including a closing quote, if one happens to be present).

    Returns:
        list of str: The captured note texts in order of occurrence; an empty list if nothing matches.
    """
    if splitted:
        # the note is not closed within this string: take the rest of the line
        regex = r"\/note=\"(?P<note>.*)"
    else:
        # non-greedy so that the capture stops at the first closing quote
        regex = r"\/note=\"(?P<note>.+?)\""
    return re.findall(regex, string)

def extract_note_end(string, has_mark=True):
    """
    Extract the text that follows an ``FT`` token (and the whitespace after it) in a string.

    Args:
        string (str): Text to search; presumably a continuation line of a note that was
            split over several ``FT`` lines - verify against the caller.
        has_mark (bool): If True (default), the text must be terminated by a double quote and
            everything up to the last double quote on the line is captured (the quote itself
            is excluded). If False, everything up to the end of the line is captured.

    Returns:
        list of str: The captured texts in order of occurrence; an empty list if nothing matches.
    """
    if has_mark:
        # greedy: capture up to the LAST double quote on the line
        regex = r"FT\s+(?P<note>.*)\""
    else:
        regex = r"FT\s+(?P<note>.*)"
    return re.findall(regex, string)

In [6]:
#export
def resolve_unclear_position(value):
    """
    Turn a start/end position token that may carry uncertainty markers into an integer.

    - A bare '?' (e.g. the end of "1..?" or the start of "?..345") is returned as -1,
      so that such positions can be filtered out later.
    - A '?' prefix (e.g. "?327" from "31..?327", "?31" from "?31..327") is dropped.
    - '<' and '>' signs (e.g. "<1" from "<1..106", ">115" from "22..>115") are dropped.

    The remaining text is converted with int().
    """
    unknown_position = '?'
    if value == unknown_position:
        return -1
    # deletion table: strips every '?', '>' and '<' character from the token
    uncertainty_marks = str.maketrans('', '', '?><')
    return int(value.translate(uncertainty_marks))

In [5]:
#export
def extract_positions(posit_string):
    """
    Split a feature position string into its isoform id, start position and end position.

    Handled layouts: "start..end", "isoform:start..end", "isoform:position" and "position".

    Returns:
        tuple: (isoform, start, end) where isoform is a str ('' when no isoform prefix is
        present), start is an int, and end is an int or np.nan when the string holds
        a single position only.
    """
    if '..' in posit_string:
        start_token, end_token = posit_string.split('..')
    else:
        # a single position: the whole string is the start, there is no end
        start_token, end_token = posit_string, None

    isoform = ''
    if ':' in posit_string:
        # the isoform id, when present, prefixes the start position
        isoform, start_token = start_token.split(':')

    start = resolve_unclear_position(start_token)
    end = np.nan if end_token is None else resolve_unclear_position(end_token)
    return isoform, start, end

# Export notebook to script

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# nbdev export step: notebook2script() is expected to write the cells marked `#export`
# to the library module - confirm against the nbdev version in use.
from nbdev.export import *
notebook2script()