In [None]:
# default_exp uniprot_integration

In [18]:
#export
import re

import pandas as pd
import numpy as np

# Integration of Uniprot Data

This notebook allows to download Uniprot data and save them in a csv format.  

The preprocessed downloaded data will include information about:  
- __the preprocessing events__ known for proteins, such as signal peptide, transit peptide, propeptide, chain, peptide;
- information on all available in Uniprot __post translational modificatios__, like modified residues (Phosphorylation, Methylation, Acetylation, etc.), Lipidation, Glycosylation, etc.;
- information on __sequence similarities__ with other proteins and __the domain(s)__ present in a protein, such as domain, repear, region, motif, etc.;
- information on __the secondary and tertiary structure__ of proteins, such as turn, beta strand, helix.

### Specify a path to the downloaded from Uniprot DB flat txt file

1. Go to the Uniprot website(https://www.uniprot.org/uniprot/), select a needed organism in the "Popular organisms" section and click on it.
2. Click the "Download" button and select "Text" format.
3. Select the "Compressed" radio button and click "Go".
4. Unzip the downloaded file and specify a path to this file.

In [19]:
path_downloaded_uniprot = '../testdata/uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_.txt'

### Define helper functions to preprocess Uniprot data

In [20]:
#export
def extract_note(string, splitted=False):
    """
    Function to extract information about note of the protein from Uniprot using regular expression
    """
    if not splitted:
        regex = r"\/note=\"(?P<note>.+?)\""
    else:
        regex = r"\/note=\"(?P<note>.*)"
    result = re.findall(regex, string)
    return result

def extract_note_end(string, has_mark=True):
    if has_mark:
        regex = r"FT\s+(?P<note>.*)\""
    else:
        regex = r"FT\s+(?P<note>.*)"
    result = re.findall(regex, string)
    return result

In [21]:
#export
def resolve_unclear_position(value):
    """
    Replace unclear position of the start/end of the modification defined as '?' with -1 and if it's defined as '?N' 
    or ">N" - by removing the '?'/'>'/'<' signs
    """
    # if it's "1..?" or "?..345" for start or end -> remove -1 that we can filter later
    # if it's "31..?327" or "?31..327" -> remove the question mark
    # if it's "<1..106" or "22..>115" -> remove the "<" or ">" signs
    if value == '?':
        return -1
    value = value.replace('?', '').replace('>', '').replace('<', '')
    return int(value)

In [22]:
#export
def extract_positions(posit_string):
    """
    Extract isoform_id(str) and start/end positions(int, int/float) of any feature key from the string 
    """
    isoform = ''
    start = end = np.nan
    if '..' in posit_string:
        start, end = posit_string.split('..')
    if ':' in posit_string:
        if isinstance(start, str):
            isoform, start = start.split(':')
        else:
            isoform, start = posit_string.split(':')
    # in the case when we have only one numeric value as a posit_string
    if isinstance(start, float):
        start = posit_string
    # change the type of start and end into int/float(np.nan)
    if isinstance(start, str):
        start = resolve_unclear_position(start)
    if isinstance(end, str):
        end = resolve_unclear_position(end)
    return isoform, start, end

In [30]:
#export
def preprocess_uniprot(path_to_file):
    """
    A complex complete function to preprocess Uniprot data from specifying the path to the flat text file 
    to the returning a dataframe containing information about:
        - protein_id(str)
        - feature(category)
        - isoform_od(str)
        - start(int)
        - end(int)
        - note information(str)
    """
    all_data = []
    with open(path_to_file) as f:
        
        is_splitted = False
        new_instance = False
        combined_note = []

        for line in f:

            if line.startswith(('AC', 'FT')):
                if is_splitted:
                    # in case when the note information is splitted into several lines
                    if line.rstrip().endswith('"'):
                        # if it's the final part of the note
                        combined_note.extend(extract_note_end(line))
                        all_data.append([protein_id, feature, isoform, start, end, " ".join(combined_note)])
                        is_splitted = False
                        new_instance = False
                    else:
                        # if it's the middle part of the note
                        combined_note.extend(extract_note_end(line, has_mark=False))
                elif line.startswith('AC'):
                    # contains the protein_id information
                    protein_id = line.split()[1].replace(';', '')
                elif line.startswith('FT'):
                    # contains all modifications/preprocessing events/etc., their positions, notes
                    data = line.split()
                    if data[1].isupper() and not data[1].startswith('ECO'):
                            feature = data[1]
                            isoform, start, end = extract_positions(data[2])
                            new_instance = True
                    else:
                        if data[1].startswith('/note'):
                            note = extract_note(line)
                            if note:
                                # if note was created > it contains just one line and can be already added to the data 
                                all_data.append([protein_id, feature, isoform, start, end, note[0]])
                                new_instance = False
                            else:
                                # if note is empty > it's splitted into several lines and we create combined_note
                                combined_note = extract_note(line, splitted=True)
                                is_splitted = True
                        else:
                            if new_instance:
                                # in case when we don't have any note but need to add other information about instance
                                all_data.append([protein_id, feature, isoform, start, end, ''])
                                new_instance = False
    
    # create a dataframe for preprocessed data    
    uniprot_df = pd.DataFrame(all_data, columns=['protein_id', 'feature', 'isoform_id', 'start', 'end', 'note'])
    # change the dtypes of the columns
    uniprot_df.feature = uniprot_df.feature.astype('category')
    uniprot_df.end = uniprot_df.end.astype('Int64')
    # to filter the instances that don't have a defined start/end position(start=-1 or end=-1)
    uniprot_df = uniprot_df[(uniprot_df.start != -1) & (uniprot_df.end != -1)]
    
    return uniprot_df

In [31]:
uniprot_df = preprocess_uniprot(path_downloaded_uniprot)

In [33]:
uniprot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1320214 entries, 0 to 1320213
Data columns (total 6 columns):
protein_id    1320214 non-null object
feature       1320214 non-null object
isoform_id    1320214 non-null object
start         1320214 non-null int64
end           908582 non-null float64
note          1320214 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 60.4+ MB


In [34]:
# uniprot_df.to_csv('../testdata/preprocessed_uniprot_data.csv', index=False)

# Export notebook to script

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
from nbdev.export import *
notebook2script()