In [None]:
# default_exp uniprot_integration

# UniProt data formatting

In [None]:
#export
import re

import pandas as pd
import numpy as np

This notebook contains functions to import a UniProt annotation file and to format it as a pandas dataframe for further usage in alphamap.

The preprocessed uniprot annotation includes information about:  
- __the known preprocessing events__ for proteins, such as signal peptide, transit peptide, propeptide, chain, peptide;
- information on all __post-translational modifications__ available in UniProt, such as modified residues (Phosphorylation, Methylation, Acetylation, etc.), Lipidation, Glycosylation, etc.;
- information on __sequence similarities__ with other proteins and __the domain(s)__ present in a protein, such as domain, repeat, region, motif, etc.;
- information on __the secondary and tertiary structure__ of proteins, such as turn, beta strand, helix.

## Instructions on how to download a UniProt annotation file

1. Go to the UniProt website (https://www.uniprot.org/uniprot/), select the organism of interest in the "Popular organisms" section and click on it.
2. Click the "Download" button and select "Text" format.
3. Select the "Compressed" radio button and click "Go".
4. Unzip the downloaded file and specify the path to this file.

## Helper functions

In [None]:
#export
def extract_note(string: str, splitted:bool = False):
    """
    Pull the content of a '/note="..."' qualifier out of a Uniprot annotation line.

    Args:
        string (str): Uniprot annotation string.
        splitted (bool, optional): If 'True', the note is expected to continue on the next line, so everything
            after the opening quotation mark is captured. If 'False', only a note that is opened and closed
            on this line is captured. Default is 'False'.
    Returns:
        list: All captured note strings (without quotation marks); empty if nothing matched.
    """
    # a complete note needs both quotation marks, a split note only has the opening one
    pattern = r"\/note=\"(?P<note>.*)" if splitted else r"\/note=\"(?P<note>.+?)\""
    return re.findall(pattern, string)

def extract_note_end(string: str, has_mark:bool = True):
    """
    Pull the text of a continuation line of a multi-line Uniprot note using a regular expression.

    Args:
        string (str): Uniprot annotation string (an 'FT' line that continues a note).
        has_mark (bool, optional): If 'True', the line is expected to close the note and the text up to the
            last quotation mark is captured. If 'False', the whole remainder of the line is captured.
            Default is 'True'.
    Returns:
        list: All captured text fragments; empty if nothing matched.
    """
    # the closing quotation mark is excluded from the captured text when it is expected
    pattern = r"FT\s+(?P<note>.*)\"" if has_mark else r"FT\s+(?P<note>.*)"
    return re.findall(pattern, string)

In [None]:
#hide
# write tests for extract_note and extract_note_end functions
def test_extract_note_not_splitted():
    """A note opened and closed on the same line is returned without its quotation marks."""
    annotation_line = 'FT                   /note="Missing (in isoform 2)"'
    notes = extract_note(annotation_line)
    assert notes[0] == "Missing (in isoform 2)"
    
def test_extract_note_splitted():
    """With splitted=True the text after the opening quotation mark is returned for an unfinished note."""
    annotation_line = 'FT                   /note="MAAALFVLLGF -> MKQSD'
    notes = extract_note(annotation_line, splitted=True)
    assert notes[0] == "MAAALFVLLGF -> MKQSD"
    
def test_extract_note_end_finished():
    """The last line of a multi-line note is returned without the closing quotation mark."""
    annotation_line = 'FT                   ASPQER (in isoform 4)"'
    fragments = extract_note_end(annotation_line)
    assert fragments[0] == "ASPQER (in isoform 4)"
    
def test_extract_note_end_not_finished():
    """A middle line of a multi-line note (no closing quotation mark) is returned in full."""
    annotation_line = 'FT                   ASPQER (in isoform 4)'
    fragments = extract_note_end(annotation_line, has_mark=False)
    assert fragments[0] == "ASPQER (in isoform 4)"
    
# run the extract_note / extract_note_end tests defined in this cell;
# a failing assertion raises an AssertionError when the cell is executed
test_extract_note_not_splitted()
test_extract_note_splitted()
test_extract_note_end_finished()
test_extract_note_end_not_finished()

In [None]:
#export
def resolve_unclear_position(value: str):
    """
    Turn an uncertain Uniprot sequence position into a number.

    A completely unknown position ('?') is mapped to -1 so that it can be filtered out later.
    For partially known positions the uncertainty markers are dropped and the remaining number is kept:
        - "31..?327" or "?31..327" -> the '?' is removed
        - "<1..106" or "22..>115"  -> the '<' or '>' is removed

    Args:
        value (str): Unclear sequence position from uniprot.
    Returns:
        int or float: -1 for a fully unknown position, otherwise the position as float.
    """
    if value == '?':
        return -1
    cleaned = value
    # strip every uncertainty marker, wherever it occurs in the string
    for marker in ('?', '>', '<'):
        cleaned = cleaned.replace(marker, '')
    return float(cleaned)

def extract_positions(posit_string: str):
    """
    Split a Uniprot position string into isoform accession, start position and end position.

    Handled layouts are 'start', 'start..end', 'isoform:start' and 'isoform:start..end'.

    Args:
        posit_string (str): Uniprot position string.
    Returns:
        [str, float, float]: str: Uniprot isoform accession ('' if none is given), float: start position,
            float: end position (np.nan if the string contains no '..' range).
    """
    # separate the range into its two halves; a single position has no end
    if '..' in posit_string:
        start_token, end_token = posit_string.split('..')
    else:
        start_token, end_token = posit_string, None

    # an isoform accession, if present, prefixes the start position
    isoform = ''
    if ':' in posit_string:
        isoform, start_token = start_token.split(':')

    # convert the textual positions into numbers, resolving '?', '<' and '>' markers
    start = resolve_unclear_position(start_token)
    end = np.nan if end_token is None else resolve_unclear_position(end_token)
    return isoform, start, end

In [None]:
#hide
# write tests for extract_positions and resolve_unclear_position functions
def test_extract_positions():
    """A plain 'start..end' range gives an empty isoform and both positions."""
    parsed = extract_positions('34..65')
    np.testing.assert_equal(['', 34, 65], list(parsed))

def test_extract_positions_with_isoform():
    """An 'isoform:start..end' string gives the isoform accession and both positions."""
    parsed = extract_positions('P35613-2:195..199')
    np.testing.assert_equal(['P35613-2', 195, 199], list(parsed))

def test_extract_positions_start_with_isoform():
    """An 'isoform:start' string gives the isoform accession, the start and a missing (NaN) end."""
    parsed = extract_positions('Q9C0I9-2:256')
    np.testing.assert_equal(['Q9C0I9-2', 256, np.nan], list(parsed))
    
def test_extract_positions_start():
    """A single numeric position gives an empty isoform, the start and a missing (NaN) end."""
    parsed = extract_positions('256')
    np.testing.assert_equal(['', 256, np.nan], list(parsed))

def test_resolve_unclear_position_unknown():
    """A fully unknown position ('?') has to be resolved to -1."""
    message = f"For unknown position resolve_unclear_position function returns wrong output instead of -1."
    resolved = resolve_unclear_position('?')
    assert resolved == -1, message
    
def test_resolve_unclear_position_unclear():
    """Positions carrying a '>', '<' or '?' marker have to be resolved to the bare number."""
    message = f"For unclear position resolve_unclear_position function returns wrong output."
    expected_by_input = {'>117': 117, '<1': 1, '?327': 327, '?10': 10}
    for raw_position, expected in expected_by_input.items():
        assert expected == resolve_unclear_position(raw_position), message

# run the extract_positions / resolve_unclear_position tests defined in this cell;
# a failing assertion raises an AssertionError when the cell is executed
test_extract_positions()
test_extract_positions_with_isoform()
test_extract_positions_start_with_isoform()
test_extract_positions_start()
test_resolve_unclear_position_unknown() 
test_resolve_unclear_position_unclear()

## Uniprot preprocessing function

In [None]:
#export
def preprocess_uniprot(path_to_file: str):
    """
    Preprocess a Uniprot flat text file into a dataframe with one row per annotated feature.

    Only 'AC' (accession) and 'FT' (feature table) lines are evaluated. The returned dataframe contains:
        - protein_id(str): the first accession of the first 'AC' line of the entry
        - feature(category): the feature key, e.g. 'HELIX' or 'MOD_RES'
        - isoform_id(str): isoform accession given in the position string, '' if absent
        - start(float)
        - end(float): NaN if the feature has a single position
        - note information(str): content of the '/note' qualifier, '' if the feature has no note

    Notes that are wrapped over several lines are joined with a single space.
    Features that have no qualifier lines at all are kept with an empty note.
    Features with an unknown start or end position (reported as -1 by extract_positions) are removed.

    Args:
        path_to_file (str): Path to a .txt annotation file directly downloaded from uniprot.
    Returns:
        pd.DataFrame: Dataframe with formatted uniprot annotations for alphamap.

    """
    all_data = []
    with open(path_to_file) as f:

        # True while we are inside a note that is wrapped over several lines
        is_splitted = False
        # True while the current feature has been read but not yet written to all_data
        new_instance = False
        combined_note = []
        line_type = ''

        for line in f:

            if line.startswith(('AC', 'FT')):
                if is_splitted:
                    # in case when the note information is splitted into several lines
                    if line.rstrip().endswith('"'):
                        # if it's the final part of the note
                        combined_note.extend(extract_note_end(line))
                        all_data.append([protein_id, feature, isoform, start, end, " ".join(combined_note)])
                        is_splitted = False
                        new_instance = False
                    else:
                        # if it's the middle part of the note
                        combined_note.extend(extract_note_end(line, has_mark=False))
                elif line.startswith('AC'):
                    # contains the protein_id information
                    if new_instance:
                        # the last feature of the previous entry had no qualifier lines;
                        # write it before the protein_id is replaced
                        all_data.append([protein_id, feature, isoform, start, end, ''])
                        new_instance = False
                    if line_type != 'AC':
                        # to prevent a situation when the protein has several AC lines with different names
                        # in this case we are taking the first name in the first line
                        protein_id = line.split()[1].replace(';', '')
                    line_type = 'AC'
                elif line.startswith('FT'):
                    line_type = 'FT'
                    # contains all modifications/preprocessing events/etc., their positions, notes
                    data = line.split()
                    if data[1].isupper() and not data[1].startswith('ECO'):
                        # a new feature starts here (upper-case key that is not a wrapped evidence code)
                        if new_instance:
                            # the previous feature had no qualifier lines; write it before it is overwritten
                            all_data.append([protein_id, feature, isoform, start, end, ''])
                        feature = data[1]
                        isoform, start, end = extract_positions(data[2])
                        new_instance = True
                    else:
                        if data[1].startswith('/note'):
                            note = extract_note(line)
                            if note:
                                # if note was created > it contains just one line and can be already added to the data
                                all_data.append([protein_id, feature, isoform, start, end, note[0]])
                                new_instance = False
                            else:
                                # if note is empty > it's splitted into several lines and we create combined_note
                                combined_note = extract_note(line, splitted=True)
                                is_splitted = True
                        else:
                            if new_instance:
                                # in case when we don't have any note but need to add other information about instance
                                all_data.append([protein_id, feature, isoform, start, end, ''])
                                new_instance = False

        if new_instance:
            # the very last feature of the file had no qualifier lines
            all_data.append([protein_id, feature, isoform, start, end, ''])

    # create a dataframe for preprocessed data
    uniprot_df = pd.DataFrame(all_data, columns=['protein_id', 'feature', 'isoform_id', 'start', 'end', 'note'])
    # change the dtypes of the columns
    uniprot_df.feature = uniprot_df.feature.astype('category')
    # to filter the instances that don't have a defined start/end position(start=-1 or end=-1)
    uniprot_df = uniprot_df[(uniprot_df.start != -1) & (uniprot_df.end != -1)].reset_index(drop=True)

    return uniprot_df

In [None]:
#hide
# for testing of the function a text file for P11532 protein was downloaded from the Uniprot
# NOTE: the path is relative, so it is resolved against the current working directory
path_to_test_file = '../testdata/P11532_test_file.txt'

def test_preprocess_uniprot():
    """
    Check preprocess_uniprot on the P11532 test file.

    Covered cases: shape and dtype of the returned dataframe, a feature without a note,
    a feature with a single-line note and no end position, a note wrapped over several lines,
    and an entry whose accessions are spread over several 'AC' lines.
    """
    
    test_df = preprocess_uniprot(path_to_test_file)
    np.testing.assert_equal((167, 6), test_df.shape, err_msg = 'The shape of the returned file is incorrect.')
    assert test_df.feature.dtype == 'category', 'The type of the feature column is not a category.'

    # to check the cases when protein had no note but had feature, start and end, f.e.
    # FT   HELIX           14..31
    # FT                   /evidence="ECO:0000244|PDB:1DXX"
    np.testing.assert_array_equal(['P11532', 'HELIX', '', 14.0, 31.0, ''], 
                                  test_df[(test_df.feature == 'HELIX') & (test_df.start == 14)].values.tolist()[0],
                                 err_msg = "The output for the protein that doesn't have a note but has \
                                 feature information, a start and an end position is incorrect.")

    # to check the cases when protein had a note written in one line and doesn't have end, f.e.
    # FT   MOD_RES         3500
    # FT                   /note="Phosphoserine"
    np.testing.assert_array_equal(['P11532', 'MOD_RES', '', 3500.0, np.nan, 'Phosphoserine'],
                                  test_df[(test_df.feature == 'MOD_RES') & (test_df.start == 3500)].values.tolist()[0],
                                 err_msg = "The output for the protein that has the note written in one line \
                                 and doesn't have an end position for the feature is incorrect.")

    # to check the cases when protein had a note split into several line, f.e.
    # FT   VARIANT         3340
    # FT                   /note="C -> Y (in DMD; results in highly reduced protein
    # FT                   levels and expression at the sarcolemma)"
    assert 'C -> Y (in DMD; results in highly reduced protein levels and expression at the sarcolemma)' == \
    test_df[(test_df.feature == 'VARIANT') & (test_df.start == 3340)]['note'].values[0], \
    "The output for the protein that has a note split into several lines is incorrect."

    # to check the cases when protein had protein_ids written in several line, f.e.
    # AC   P11532; A1L0U9; E7EQR9; E7EQS5; E7ESB2; E9PDN1; E9PDN5; F5GZY3; F8VX32;
    # AC   Q02295; Q14169; Q14170; Q5JYU0; Q6NSJ9; Q7KZ48; Q8N754; Q9UCW3; Q9UCW4;
    assert 1 == test_df.protein_id.nunique(), "A preprocess_uniprot function returns a non-unique protein_id."
    assert 'P11532' == test_df.protein_id.unique()[0], 'A preprocess_uniprot function returns a wrong protein_id.'

# run the test when the cell is executed; it reads the file referenced by path_to_test_file
test_preprocess_uniprot()

## UniProt feature dictionary

The following is a dictionary that maps feature names to the feature entries in the processed uniprot annotation file.

In [None]:
#export
# Maps a human-readable feature name (key) to the feature key (value) as it appears
# in the 'feature' column of the dataframe returned by preprocess_uniprot.
uniprot_feature_dict = {
    'Chain': 'CHAIN',
    'Initiator methionine': 'INIT_MET',
    'Peptide': 'PEPTIDE',
    'Propeptide': 'PROPEP',
    'Signal peptide': 'SIGNAL',
    'Transit peptide': 'TRANSIT',
    'Cross-link': 'CROSSLNK',
    'Disulfide bond': 'DISULFID',
    'Glycosylation': 'CARBOHYD',
    'Lipidation': 'LIPID',
    'Modified residue': 'MOD_RES',
    'Coiled coil': 'COILED',
    'Compositional bias': 'COMPBIAS',
    'Domain': 'DOMAIN',
    'Motif': 'MOTIF',
    'Region': 'REGION',
    'Repeat': 'REPEAT',
    'Zinc finger': 'ZN_FING',
    'Intramembrane': 'INTRAMEM',
    'Topological domain': 'TOPO_DOM',
    'Transmembrane': 'TRANSMEM',
    'Beta strand': 'STRAND',
    'Helix': 'HELIX',
    'Turn': 'TURN',
    'Active site': 'ACT_SITE',
    'Binding site': 'BINDING',
    'Calcium binding': 'CA_BIND',
    'DNA binding': 'DNA_BIND',
    'Metal binding': 'METAL',
    'Nucleotide binding': 'NP_BIND',
    'Site': 'SITE',
    'Non-standard residue': 'NON_STD',
    'Non-adjacent residues': 'NON_CONS',
    'Non-terminal residue': 'NON_TER',
    'Natural variant': 'VARIANT',
    'Sequence conflict': 'CONFLICT',
    'Alternative sequence': 'VAR_SEQ',
    'Sequence uncertainty': 'UNSURE',
    'Secondary structure': 'STRUCTURE',
    'Mutagenesis': 'MUTAGEN'
}

In [None]:
#hide

###### Export notebook to script ###### 

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
from nbdev.export import *
# export the notebook to a script (nbdev); presumably only cells marked with '#export' are written - TODO confirm
notebook2script()