In [None]:
# default_exp importing

# This notebook provides functions to import peptide-level data from Spectronaut or MaxQuant

The preprocessed data is stored in a pandas dataframe with the following columns:
* all_protein_ids: all UniProt IDs the peptide maps to, separated by ';'
* modified_sequence: the peptide sequence with all modifications included in square brackets
* naked_sequence: the naked peptide sequence

It is possible to restrict the import to one or more specific samples. A single sample can be provided as a character string. Multiple samples can be provided as a list of character strings. Each raw MS filename should match the corresponding entries in the "R.FileName" column (Spectronaut) or the "Raw file" column (MaxQuant).

## Import Spectronaut data

In [None]:
#export
import pandas as pd
import re

def import_spectronaut_data(file, sample=None):
    """
    Function to import peptide level data from Spectronaut returning a dataframe containing information about:
        - all_protein_ids (str)
        - modified_sequence (str)
        - naked_sequence (str)

    Args:
        file: Path or file-like object handed to ``pd.read_csv``. The delimiter is
            sniffed by pandas (``sep=None``). The columns
            "PEP.AllOccurringProteinAccessions", "EG.ModifiedSequence" and
            "R.FileName" are read.
        sample (str, list, tuple or set, optional): One sample name or a collection
            of sample names to keep, matched against "R.FileName". ``None`` or an
            empty value keeps all samples.

    Returns:
        pd.DataFrame: Unique rows without missing values, with a fresh 0..n-1 index
        and the columns all_protein_ids, modified_sequence (underscores removed)
        and naked_sequence (square-bracketed parts removed as well).

    Raises:
        TypeError: If ``sample`` is given but is not a str, list, tuple or set.
    """
    spectronaut_columns = ["PEP.AllOccurringProteinAccessions","EG.ModifiedSequence","R.FileName"]
    data = pd.read_csv(file, sep=None, engine='python', usecols=spectronaut_columns)

    if sample:
        if isinstance(sample, str):
            selected_samples = [sample]
        elif isinstance(sample, (list, tuple, set)):
            selected_samples = list(sample)
        else:
            raise TypeError(
                "sample must be a string or a list of strings, "
                f"not {type(sample).__name__}."
            )
        data = data[data["R.FileName"].isin(selected_samples)]

    # Drop incomplete rows BEFORE the regex step: a missing sequence is read as
    # NaN (a float) and re.sub would raise on it.
    data_sub = data[["PEP.AllOccurringProteinAccessions","EG.ModifiedSequence"]].dropna()

    # Series.map (instead of a row-wise DataFrame.apply) also works on an empty
    # selection, e.g. when no row matches the requested sample.
    # get modified sequence
    modified_sequence = data_sub["EG.ModifiedSequence"].map(
        lambda seq: re.sub('_', '', seq)
    )
    # get naked sequence
    naked_sequence = modified_sequence.map(
        lambda seq: re.sub(r'\[.*?\]', '', seq)
    )

    input_data = pd.DataFrame({
        "all_protein_ids": data_sub["PEP.AllOccurringProteinAccessions"],
        "modified_sequence": modified_sequence,
        "naked_sequence": naked_sequence,
    })
    input_data = input_data.drop_duplicates().reset_index(drop=True)
    return input_data

In [None]:
#hide

def test_import_spectronaut_data():
    """
    Check import_spectronaut_data against the Spectronaut files in ../testdata:
    the csv and tsv inputs must yield the same frame as the stored reference,
    and sample filtering must yield the expected number of rows.
    """
    csv_path = "../testdata/test_spectronaut_input.csv"

    # complete input, no sample filter
    full_csv = import_spectronaut_data(csv_path)
    assert len(full_csv) == 40
    full_tsv = import_spectronaut_data("../testdata/test_spectronaut_input.tsv")
    pd.testing.assert_frame_equal(full_csv, full_tsv)
    reference = pd.read_csv('../testdata/test_spectronaut_imported.csv', sep=',')
    pd.testing.assert_frame_equal(full_csv, reference)

    # single samples first, then a list of samples
    expected_rows = [
        ("raw_01", 40),
        ("raw_02", 20),
        (["raw_01","raw_02"], 40),
    ]
    for selection, n_rows in expected_rows:
        subset = import_spectronaut_data(csv_path, sample=selection)
        assert len(subset) == n_rows

# Execute the Spectronaut import test as soon as this cell runs.
test_import_spectronaut_data()

## Import MaxQuant data

In [None]:
#export
import pandas as pd
import re

def import_maxquant_data(file, sample=None):
    """
    Function to import peptide level data from MaxQuant returning a dataframe containing information about:
        - all_protein_ids (str)
        - modified_sequence (str)
        - naked_sequence (str)

    Args:
        file: Path or file-like object handed to ``pd.read_csv``; it is parsed as a
            tab-separated table. The columns "Proteins", "Modified sequence" and
            "Raw file" are read.
        sample (str, list, tuple or set, optional): One sample name or a collection
            of sample names to keep, matched against "Raw file". ``None`` or an
            empty value keeps all samples.

    Returns:
        pd.DataFrame: Unique rows without missing values, with a fresh 0..n-1 index
        and the columns all_protein_ids, modified_sequence (underscores removed,
        nested "(name (site))" annotations rewritten to "[name (site)]") and
        naked_sequence (square-bracketed parts removed as well).

    Raises:
        TypeError: If ``sample`` is given but is not a str, list, tuple or set.
    """
    mq_columns = ["Proteins","Modified sequence","Raw file"]
    data = pd.read_csv(file, sep='\t', usecols=mq_columns)

    if sample:
        if isinstance(sample, str):
            selected_samples = [sample]
        elif isinstance(sample, (list, tuple, set)):
            selected_samples = list(sample)
        else:
            raise TypeError(
                "sample must be a string or a list of strings, "
                f"not {type(sample).__name__}."
            )
        data = data[data["Raw file"].isin(selected_samples)]

    # Drop incomplete rows BEFORE the regex steps: a missing value is read as
    # NaN (a float) and re.sub would raise on it.
    data_sub = data[["Proteins","Modified sequence"]].dropna()

    def _to_bracket_notation(seq):
        # get modified sequence
        seq = re.sub('_', '', seq)
        # replace outer () with []
        return re.sub(r'\((.*?\(.*?\))\)', r'[\1]', seq)

    # Series.map (instead of a row-wise DataFrame.apply) also works on an empty
    # selection, e.g. when no row matches the requested sample.
    modified_sequence = data_sub["Modified sequence"].map(_to_bracket_notation)
    # get naked sequence
    naked_sequence = modified_sequence.map(
        lambda seq: re.sub(r'\[.*?\]', '', seq)
    )

    input_data = pd.DataFrame({
        "all_protein_ids": data_sub["Proteins"],
        "modified_sequence": modified_sequence,
        "naked_sequence": naked_sequence,
    })
    input_data = input_data.drop_duplicates().reset_index(drop=True)
    return input_data

In [None]:
#hide

def test_import_maxquant_data():
    """
    Check import_maxquant_data against the MaxQuant files in ../testdata:
    the unfiltered import must equal the stored reference frame, and sample
    filtering must yield the expected number of rows.
    """
    input_path = "../testdata/test_maxquant_input.txt"

    # complete input, no sample filter
    imported = import_maxquant_data(input_path)
    reference = pd.read_csv('../testdata/test_maxquant_imported.csv', sep=',')
    pd.testing.assert_frame_equal(imported, reference)

    # single samples first, then a list of samples
    expected_rows = [
        ("raw_1", 85),
        ("raw_2", 77),
        (["raw_1", "raw_2"], 136),
    ]
    for selection, n_rows in expected_rows:
        subset = import_maxquant_data(input_path, sample=selection)
        assert len(subset) == n_rows

# Execute the MaxQuant import test as soon as this cell runs.
test_import_maxquant_data()

## Aggregated import function

In [None]:
#export
import pandas as pd
import re
from io import StringIO

def import_data(file, sample=None, verbose=True, dashboard=False):
    """
    Function to import peptide level data. Depending on available columns in the provided file, 
    the function calls import_maxquant_data or import_spectronaut_data, finally returning a 
    dataframe containing information about:
        - all_protein_ids (str)
        - modified_sequence (str)
        - naked_sequence (str)

    Args:
        file: Input to import. With ``dashboard=False`` it is handed to
            ``pd.read_csv`` twice (once here to read the header, once by the
            selected importer). With ``dashboard=True`` it is decoded as UTF-8
            bytes via ``str(file, "utf-8")``.
        sample (str or list of str, optional): Forwarded unchanged to the
            selected importer.
        verbose (bool): If True, print which input format was detected.
        dashboard (bool): If True, treat ``file`` as UTF-8 encoded content
            instead of something ``pd.read_csv`` can open directly.

    Returns:
        pd.DataFrame: The frame returned by the selected importer.

    Raises:
        TypeError: If the header contains neither the MaxQuant nor the
            Spectronaut column set.
    """
    if dashboard:
        # Decode once; the header check and the importer each need their own
        # buffer because reading consumes a StringIO.
        decoded = str(file, "utf-8")
        header_source = StringIO(decoded)
        input_info = StringIO(decoded)
        # Keep the error message short instead of echoing the whole upload.
        source_label = "the uploaded file"
    else:
        header_source = file
        input_info = file
        source_label = file

    # nrows=0 reads only the header; the delimiter is sniffed by pandas.
    uploaded_data_columns = set(
        pd.read_csv(header_source, nrows=0, sep=None, engine='python').columns
    )
    maxquant_columns = {"Proteins","Modified sequence","Raw file"}
    spectronaut_columns = {"PEP.AllOccurringProteinAccessions","EG.ModifiedSequence","R.FileName"}

    # MaxQuant is checked first, so it wins if a file carries both column sets.
    if maxquant_columns.issubset(uploaded_data_columns):
        if verbose:
            print("Import MaxQuant input")
        data = import_maxquant_data(input_info, sample=sample)
    elif spectronaut_columns.issubset(uploaded_data_columns):
        if verbose:
            print("Import Spectronaut input")
        data = import_spectronaut_data(input_info, sample=sample)
    else:
        raise TypeError(f'Input data format for {source_label} not known.')
    return data

In [None]:
#hide
import sys

def test_import_data():
    """
    Check import_data on the files in ../testdata: MaxQuant and Spectronaut
    inputs must match their stored reference frames, sample filtering is
    forwarded, and a file with unknown columns raises a TypeError with the
    expected message.
    """
    # MaxQuant input
    maxquant_result = import_data("../testdata/test_maxquant_input.txt", verbose=False)
    maxquant_reference = pd.read_csv('../testdata/test_maxquant_imported.csv', sep=',')
    pd.testing.assert_frame_equal(maxquant_result, maxquant_reference)

    # Spectronaut input, comma- and tab-separated
    spectronaut_csv_path = "../testdata/test_spectronaut_input.csv"
    spectronaut_csv = import_data(spectronaut_csv_path, verbose=False)
    spectronaut_tsv = import_data("../testdata/test_spectronaut_input.tsv", verbose=False)
    pd.testing.assert_frame_equal(spectronaut_csv, spectronaut_tsv)
    spectronaut_reference = pd.read_csv('../testdata/test_spectronaut_imported.csv', sep=',')
    pd.testing.assert_frame_equal(spectronaut_csv, spectronaut_reference)

    # sample filter is passed through to the importer
    spectronaut_subset = import_data(spectronaut_csv_path, sample="raw_01", verbose=False)
    assert len(spectronaut_subset) == 40

    # unknown column layout: keep the raised error so its message can be compared
    try:
        outcome = import_data("../testdata/test_uniprot_df.csv")
    except TypeError as err:
        outcome = err
    assert str(outcome) == "Input data format for ../testdata/test_uniprot_df.csv not known."

# Execute the aggregated import test as soon as this cell runs.
test_import_data()

## Export notebook to script

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
from nbdev.export import *
# Convert the project's notebooks to scripts; the "Converted ..." lines below
# are the output of this call.
notebook2script()

Converted Importing.ipynb.
Converted Preprocessing.ipynb.
Converted SequencePlot.ipynb.
Converted Uniprot_integration.ipynb.
Converted index.ipynb.
Converted proteolytic_cleavage.ipynb.
