In [None]:
# default_exp preprocessing

In [None]:
#hide
import re
import warnings

import numpy as np
import pandas as pd

## Generate input data for testing

In [None]:
#hide

# Raw input fixture: one row per peptide, with every candidate protein id
# joined by ';' in `all_protein_ids`. The last two rows carry the "NONSEQ"
# peptide and the "Nonsense" protein id that the position tests expect
# warnings for.
test_df = pd.DataFrame(
    data=[
        ("A0A024R161;A0A087WT10;A0A087WTH1", "PEPT[Phospho (STY)]IDER", "PEPTIDER"),
        ("A0A024R161;A0A087WT10", "SEQ[GlyGly (K)]UENCE[GlyGly (K)]R", "SEQUENCER"),
        ("A0A087WTH5", "VIEWER", "VIEWER"),
        ("A0A087WTH5", "NONSEQ", "NONSEQ"),
        ("Nonsense", "NONSENSE", "NONSENSE"),
    ],
    columns=['all_protein_ids', 'modified_sequence', 'naked_sequence'],
)

In [None]:
#hide

# Expected result of expanding `test_df`: one row per (protein id, peptide)
# pair, in input order, with a fresh 0..n-1 index.
test_df_expanded = pd.DataFrame(
    data=[
        ("A0A024R161", "PEPT[Phospho (STY)]IDER", "PEPTIDER"),
        ("A0A087WT10", "PEPT[Phospho (STY)]IDER", "PEPTIDER"),
        ("A0A087WTH1", "PEPT[Phospho (STY)]IDER", "PEPTIDER"),
        ("A0A024R161", "SEQ[GlyGly (K)]UENCE[GlyGly (K)]R", "SEQUENCER"),
        ("A0A087WT10", "SEQ[GlyGly (K)]UENCE[GlyGly (K)]R", "SEQUENCER"),
        ("A0A087WTH5", "VIEWER", "VIEWER"),
        ("A0A087WTH5", "NONSEQ", "NONSEQ"),
        ("Nonsense", "NONSENSE", "NONSENSE"),
    ],
    columns=['unique_protein_id', 'modified_sequence', 'naked_sequence'],
)

In [None]:
#hide

# Expected result of annotating the expanded fixture with peptide positions:
# only the six rows that can be located remain, each with integer
# `start` / `end` columns.
test_df_expanded_peptide_position = pd.DataFrame(
    data=[
        ("A0A024R161", "PEPT[Phospho (STY)]IDER", "PEPTIDER", 3, 10),
        ("A0A087WT10", "PEPT[Phospho (STY)]IDER", "PEPTIDER", 28, 35),
        ("A0A087WTH1", "PEPT[Phospho (STY)]IDER", "PEPTIDER", 107, 114),
        ("A0A024R161", "SEQ[GlyGly (K)]UENCE[GlyGly (K)]R", "SEQUENCER", 95, 103),
        ("A0A087WT10", "SEQ[GlyGly (K)]UENCE[GlyGly (K)]R", "SEQUENCER", 150, 158),
        ("A0A087WTH5", "VIEWER", "VIEWER", 1, 6),
    ],
    columns=['unique_protein_id', 'modified_sequence', 'naked_sequence', 'start', 'end'],
)

In [None]:
#hide

# Expected final output fixture: the positioned peptides plus, per row, the
# list of modification sites (`PTMsites`) and the matching modification
# labels (`PTMtypes`). The list-valued columns are written out row by row so
# that no two rows share the same list object.
test_df_modifications = pd.DataFrame(data={
    'unique_protein_id': ["A0A024R161", "A0A087WT10", "A0A087WTH1",
                          "A0A024R161", "A0A087WT10",
                          "A0A087WTH5"],
    'modified_sequence': (["PEPT[Phospho (STY)]IDER"] * 3
                          + ["SEQ[GlyGly (K)]UENCE[GlyGly (K)]R"] * 2
                          + ["VIEWER"]),
    'naked_sequence': ["PEPTIDER"] * 3 + ["SEQUENCER"] * 2 + ["VIEWER"],
    'start': [3, 28, 107, 95, 150, 1],
    'end': [10, 35, 114, 103, 158, 6],
    'PTMsites': [[3], [3], [3], [2, 7], [2, 7], []],
    'PTMtypes': [["[Phospho (STY)]"],
                 ["[Phospho (STY)]"],
                 ["[Phospho (STY)]"],
                 ["[GlyGly (K)]", "[GlyGly (K)]"],
                 ["[GlyGly (K)]", "[GlyGly (K)]"],
                 []],
})


**TODO:** Consider writing a dedicated FASTA import function to remove the dependency on pyteomics.

In [None]:
from pyteomics import fasta
# FASTA fixture used by the position tests in this notebook. The path is
# relative, so this cell only works when the kernel's working directory
# contains `testdata/` (presumably the notebook's own folder — TODO confirm).
test_fasta = fasta.IndexedUniProt("testdata/test.fasta")

## Expand shared protein ids to unique

In [None]:
#hide 

def expandProteinIds(df):
    """
    Expand rows with several ';'-separated protein ids into one row per id.

    Args:
        df (pd.DataFrame): Must contain the columns 'all_protein_ids'
            (';'-joined protein ids), 'modified_sequence' and
            'naked_sequence'. The input frame is not modified.

    Returns:
        pd.DataFrame: Columns 'unique_protein_id', 'modified_sequence' and
        'naked_sequence' with a fresh 0..n-1 index; rows keep the input
        order, and ids keep their order within each row. An empty input
        yields an empty frame that still has these three columns.
    """
    out_columns = ['unique_protein_id', 'modified_sequence', 'naked_sequence']
    # Split into a separate Series instead of writing back into the frame,
    # so no defensive copy of the input is needed.
    id_lists = df['all_protein_ids'].str.split(';')
    rows = [(protein_id, modified, naked)
            for ids, modified, naked in zip(id_lists,
                                            df['modified_sequence'],
                                            df['naked_sequence'])
            for protein_id in ids]
    # Passing `columns=` to the constructor (rather than assigning them
    # afterwards) keeps the schema intact when `rows` is empty.
    return pd.DataFrame(rows, columns=out_columns)

def test_expandProteinIds():
    """Expanding the raw fixture must reproduce the expanded fixture exactly."""
    expanded = expandProteinIds(test_df)
    pd.testing.assert_frame_equal(expanded, test_df_expanded)


test_expandProteinIds()

## Annotate peptides with start and end position

In [None]:
#hide 

def fastaError(prot,fasta):
    """
    Report whether looking up `prot` in `fasta` fails.

    Args:
        prot: Key to look up.
        fasta: Any object supporting `fasta[prot]`.

    Returns:
        bool: True if `fasta[prot]` raises any `Exception`, False otherwise.
    """
    try:
        fasta[prot]
    except Exception:
        lookup_failed = True
    else:
        lookup_failed = False
    return lookup_failed

def pepPositionHelper(seq,prot,fasta):
    """
    Locate a peptide inside a protein sequence taken from a fasta lookup.

    Args:
        seq (str): Peptide sequence. It is matched literally (as a plain
            substring), so characters with a special meaning in regular
            expressions are not interpreted.
        prot: Key used to look up the protein via `fasta[prot]`.
        fasta: Object supporting `fasta[prot]`; the returned entry must
            expose the protein sequence as `.sequence`.

    Returns:
        tuple: `(start, end)` of the first occurrence of `seq`, as zero-based
        indices with `end` inclusive. `(np.nan, np.nan)` is returned, and a
        warning is emitted, when the protein cannot be looked up or the
        peptide does not occur in the protein sequence.
    """
    # Single lookup; any exception raised by the lookup is treated as
    # "protein not available", mirroring the broad check in `fastaError`.
    try:
        entry = fasta[prot]
    except Exception:
        warnings.warn(f'No matching entry for {prot} in the selected fasta.')
        # `np.nan` instead of `np.NaN`: the latter alias no longer exists in NumPy 2.
        return np.nan, np.nan

    start = entry.sequence.find(seq)
    if start == -1:
        # Message text kept verbatim because the notebook's tests match on it.
        warnings.warn(f'Peptide sequence {seq} could not be mached to {prot} in the selected fasta.')
        return np.nan, np.nan
    # Inclusive end index of the matched peptide.
    return start, start + len(seq) - 1

def test_pepPositionHelper():
    """A peptide known to be in the test fasta is found at the expected span."""
    observed = pepPositionHelper("PEPTIDER","A0A024R161",test_fasta)
    np.testing.assert_equal(list(observed), [3,10])


test_pepPositionHelper()

In [None]:
#hide 

import warnings

def getPeptidePosition(df, fasta):
    """
    Annotate every peptide with its start and end position in its protein.

    Args:
        df (pd.DataFrame): Must contain the columns 'naked_sequence' and
            'unique_protein_id'. The input frame is not modified.
        fasta: Protein lookup passed through to `pepPositionHelper`.

    Returns:
        pd.DataFrame: Copy of `df` with integer 'start' and 'end' columns.
        Rows containing a missing value in any column are dropped; this
        includes peptides for which `pepPositionHelper` returned NaN
        positions (it emits a warning for each of those). An empty input
        yields an empty frame with the two extra columns.
    """
    res = df.copy(deep=True)
    # Plain iteration instead of a row-wise `DataFrame.apply(..., result_type='expand')`:
    # the latter does not produce two columns for an empty frame, which made
    # the column assignment fail.
    positions = [pepPositionHelper(seq, prot, fasta)
                 for seq, prot in zip(res['naked_sequence'], res['unique_protein_id'])]
    res['start'] = [start for start, _ in positions]
    res['end'] = [end for _, end in positions]

    # Positions can only become integers once the NaN rows are gone.
    # (The former `res.isnull().any(1)` bookkeeping was unused and passed
    # `axis` positionally, which pandas 2 rejects.)
    return res.dropna().astype({'start': int, 'end': int})

def test_getPeptidePosition():
    """
    Unlocatable peptides and unknown proteins are dropped with one warning each.

    Only `UserWarning`s (the default category of `warnings.warn`) are
    counted, so unrelated warnings raised by libraries during the call
    (e.g. deprecation notices) cannot change the count or the order.
    """
    with warnings.catch_warnings(record=True) as w:
        # Record every warning regardless of the filters active in the kernel.
        warnings.simplefilter("always")
        res = getPeptidePosition(test_df_expanded, test_fasta)
    messages = [str(item.message) for item in w if issubclass(item.category, UserWarning)]
    assert len(messages) == 2
    assert "Peptide sequence NONSEQ could not be mached" in messages[0]
    assert "No matching entry for Nonsense" in messages[1]
    pd.testing.assert_frame_equal(res,test_df_expanded_peptide_position)


test_getPeptidePosition()

## Grep all modifications

In [None]:
#hide 

def getPTMsites(peptide, modification_reg):
    """
    Compute the position of every modification relative to the bare peptide.

    Args:
        peptide (str): Peptide string with modification annotations embedded.
        modification_reg: Regular expression matching one modification
            annotation.

    Returns:
        list[int]: For each match, in order of appearance, the match start
        minus the total length of all earlier matches, minus one. For
        "PEPT[Phospho]IDE" with the pattern r'\\[.*?\\]' this gives [3].
    """
    sites = []
    annotation_chars_seen = 0  # total length of the annotations already passed
    for match in re.finditer(modification_reg, peptide):
        sites.append(match.start() - annotation_chars_seen - 1)
        annotation_chars_seen += match.end() - match.start()
    return sites

def test_getPTMsites():
    """Two bracketed modifications map to their positions in the bare peptide."""
    annotated_peptide = "PEPT[Phospho]IDE[GlyGly (K)]R"
    sites = getPTMsites(annotated_peptide, modification_reg=r'\[.*?\]')
    np.testing.assert_equal(sites, [3,6])


test_getPTMsites()

In [None]:
#hide

def getModifications(df, mod_reg):
    """
    Add the modification sites and modification labels of every peptide.

    Args:
        df (pd.DataFrame): Must contain the column 'modified_sequence'.
            The input frame is not modified.
        mod_reg: Regular expression matching one modification annotation
            inside 'modified_sequence'.

    Returns:
        pd.DataFrame: Copy of `df` with two extra list-valued columns:
        'PTMsites' (result of `getPTMsites` for each row) and 'PTMtypes'
        (all substrings matched by `mod_reg`). An empty input yields an
        empty frame with both columns present.
    """
    res = df.copy(deep=True)
    sequences = res['modified_sequence']
    # Build the columns from plain lists rather than a row-wise
    # `DataFrame.apply`: on an empty frame `apply` hands back a DataFrame,
    # which cannot be assigned to a single column. `dtype=object` keeps each
    # cell a Python list.
    res['PTMsites'] = pd.Series(
        [getPTMsites(seq, modification_reg=mod_reg) for seq in sequences],
        index=res.index, dtype=object)
    res['PTMtypes'] = pd.Series(
        [re.findall(mod_reg, seq) for seq in sequences],
        index=res.index, dtype=object)
    return res

def test_getModifications():
    """Annotating the positioned fixture must reproduce the modifications fixture."""
    annotated = getModifications(test_df_expanded_peptide_position, mod_reg=r'\[.*?\]')
    pd.testing.assert_frame_equal(annotated, test_df_modifications)


test_getModifications()

## Preprocessing wrapper

In [None]:
#export

def formatInputData(df, fasta, modification_exp):
    """
    Run the full preprocessing pipeline on a raw peptide table.

    The steps are, in order: `expandProteinIds`, `getPeptidePosition` and
    `getModifications`.

    Args:
        df (pd.DataFrame): Raw input table; it is deep-copied before use.
        fasta: Protein lookup forwarded to `getPeptidePosition`.
        modification_exp: Regular expression forwarded to
            `getModifications` as `mod_reg`.

    Returns:
        pd.DataFrame: The output of `getModifications`.
    """
    # NOTE(review): this is the only cell flagged `#export`, while the three
    # helpers it calls live in `#hide` cells — confirm that the exported
    # module actually contains them.
    expanded = expandProteinIds(df.copy(deep=True))
    positioned = getPeptidePosition(expanded, fasta=fasta)
    return getModifications(positioned, mod_reg=modification_exp)

In [None]:
#hide

def test_formatInputData():
    """
    The full pipeline turns the raw fixture into the modifications fixture.

    Only `UserWarning`s (the default category of `warnings.warn`) are
    counted, so unrelated warnings raised by libraries during the call
    (e.g. deprecation notices) cannot change the count or the order.
    """
    with warnings.catch_warnings(record=True) as w:
        # Record every warning regardless of the filters active in the kernel.
        warnings.simplefilter("always")
        res = formatInputData(df=test_df, fasta = test_fasta, modification_exp = r'\[.*?\]')
    messages = [str(item.message) for item in w if issubclass(item.category, UserWarning)]
    assert len(messages) == 2
    assert "Peptide sequence NONSEQ could not be mached" in messages[0]
    assert "No matching entry for Nonsense" in messages[1]
    pd.testing.assert_frame_equal(res, test_df_modifications)


test_formatInputData()

## Export notebook to script

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
from nbdev.export import *
# nbdev export step; judging by the recorded output it converts every
# notebook in the project, not just this one (presumably writing the cells
# flagged `#export` to the library module — TODO confirm against nbdev docs).
notebook2script()

Converted Preprocessing.ipynb.
Converted index.ipynb.
