In [None]:
# default_exp importing

# This notebook provides a function to import peptide-level data from Spectronaut

The preprocessed data is stored in a pandas DataFrame with the following columns:
* all_protein_ids: all UniProt IDs the peptide maps to, separated by ';'
* modified_sequence: the peptide sequence with all modifications included in square brackets
* naked_sequence: the naked peptide sequence, i.e. the modified sequence with all bracketed modifications removed

In [None]:
#export
import pandas as pd
import re

def import_spectronaut_data(file):
    """
    Import peptide-level data from a Spectronaut export.

    Reads a comma-separated table, keeps the protein accession and modified
    sequence columns, and derives two cleaned sequence columns from them.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``. The
            table must contain the columns
            "PEP.AllOccurringProteinAccessions" and "EG.ModifiedSequence".

    Returns:
        pandas.DataFrame: De-duplicated rows with a fresh 0-based index and
        the columns
        * "all_protein_ids": the accessions exactly as found in the file,
        * "modified_sequence": "EG.ModifiedSequence" with every underscore
          removed,
        * "naked_sequence": the modified sequence with every "[...]" group
          removed.
        A table without data rows yields an empty frame with these columns.

    Raises:
        KeyError: If one of the two required columns is missing.
    """
    data = pd.read_csv(file, sep=',')
    data_sub = data[["PEP.AllOccurringProteinAccessions", "EG.ModifiedSequence"]]

    # Column-wise string methods replace the former row-wise apply: they are
    # faster, work on a table with zero rows, and let missing values pass
    # through as NaN instead of failing inside re.sub.
    # Literal replacement: drop every underscore from the sequence.
    modified_sequence = data_sub["EG.ModifiedSequence"].str.replace('_', '', regex=False)
    # Non-greedy match so each bracketed group is removed on its own.
    naked_sequence = modified_sequence.str.replace(r'\[.*?\]', '', regex=True)

    input_data = pd.DataFrame({
        "all_protein_ids": data_sub["PEP.AllOccurringProteinAccessions"],
        "modified_sequence": modified_sequence,
        "naked_sequence": naked_sequence,
    })
    return input_data.drop_duplicates().reset_index(drop=True)

In [None]:
#hide

def test_import_spectronaut_data():
    """Compare the importer's output for the sample input with the stored reference table."""
    imported = import_spectronaut_data("../testdata/test_spectronaut_input.csv")
    expected = pd.read_csv('../testdata/test_spectronaut_imported.csv', sep=',')
    # Raises AssertionError if the two frames differ.
    pd.testing.assert_frame_equal(left=imported, right=expected)

# Run the check as soon as this cell executes.
test_import_spectronaut_data()

## Export notebook to script

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
from nbdev.export import *
# Convert the project's notebooks to scripts (see the "Converted ..." output);
# notebook2script presumably comes from the nbdev.export star import above.
notebook2script()

Converted Importing.ipynb.
Converted Preprocessing.ipynb.
Converted Uniprot_integration.ipynb.
Converted index.ipynb.
