In [None]:
# default_exp importing

# This notebook provides functions to import peptide-level data from Spectronaut or MaxQuant

The preprocessed data is stored in a pandas DataFrame with the following columns:
* all_protein_ids: all UniProt IDs the peptide maps to, separated by ';'
* modified_sequence: the peptide sequence with all modifications included in square brackets
* naked_sequence: the naked peptide sequence

## Import Spectronaut data

In [None]:
#export
import pandas as pd
import re

def import_spectronaut_data(file, sample=None):
    """
    Import peptide-level data from a Spectronaut export.

    The comma-separated table is reduced to its
    "PEP.AllOccurringProteinAccessions" and "EG.ModifiedSequence" columns,
    optionally restricted to one sample, and returned as a de-duplicated frame.

    Args:
        file: Path or buffer handed directly to ``pd.read_csv``.
        sample (str, optional): Sample name. Only rows whose
            "<sample>.EG.Qvalue" entry is not the string 'Filtered' are kept.
            A falsy value (the default ``None``) keeps every row.

    Returns:
        pd.DataFrame: Unique rows with the columns "all_protein_ids",
        "modified_sequence" (underscores removed) and "naked_sequence"
        (square-bracket annotations removed as well), indexed from 0.

    Raises:
        NotImplementedError: If ``sample`` is a list.
        TypeError: If ``sample`` is truthy but neither a str nor a list.
        KeyError: If a required column is missing from the table.
    """
    id_col = "PEP.AllOccurringProteinAccessions"
    seq_col = "EG.ModifiedSequence"

    data = pd.read_csv(file, sep=',')
    if sample:
        if isinstance(sample, list):
            raise NotImplementedError("Import not available for sample lists at this moment.")
        if not isinstance(sample, str):
            # Previously this fell through and failed later with an
            # UnboundLocalError; report the real problem instead.
            raise TypeError(
                f"sample must be None, a str or a list, not {type(sample).__name__}."
            )
        qval_col = sample + ".EG.Qvalue"
        not_filtered = data[qval_col] != 'Filtered'
        data_sub = data.loc[not_filtered, [id_col, seq_col]]
    else:
        data_sub = data[[id_col, seq_col]]

    # Vectorised string operations also work when no rows are left, whereas a
    # row-wise DataFrame.apply returns a frame (not a Series) for empty input.
    modified = data_sub[seq_col].str.replace('_', '', regex=False)
    naked = modified.str.replace(r'\[.*?\]', '', regex=True)

    input_data = pd.DataFrame({
        "all_protein_ids": data_sub[id_col],
        "modified_sequence": modified,
        "naked_sequence": naked,
    })
    return input_data.drop_duplicates().reset_index(drop=True)

In [None]:
#hide

def test_import_spectronaut_data():
    """Exercise the Spectronaut importer against the bundled test export."""
    source_csv = "../testdata/test_spectronaut_input.csv"

    # Whole file: the result has to equal the stored reference table.
    imported = import_spectronaut_data(source_csv)
    reference = pd.read_csv('../testdata/test_spectronaut_imported.csv', sep=',')
    pd.testing.assert_frame_equal(imported, reference)

    # One sample at a time: check the number of peptides that survive.
    for raw_name, n_expected in (("proteome_1.raw", 2), ("proteome_14.raw", 3)):
        per_sample = import_spectronaut_data(source_csv, sample=raw_name)
        assert per_sample.shape[0] == n_expected

    # A list of samples is rejected with a fixed message.
    try:
        outcome = import_spectronaut_data(source_csv,
                                          sample=["proteome_1.raw", "proteome_14.raw"])
    except NotImplementedError as err:
        outcome = err
    assert str(outcome) == "Import not available for sample lists at this moment."

test_import_spectronaut_data()

## Import MaxQuant data

In [None]:
#export
import pandas as pd
import re

def import_maxquant_data(file, sample=None):
    """
    Import peptide-level data from a MaxQuant export.

    The tab-separated table is reduced to its "Proteins" and
    "Modified sequence" columns and returned as a de-duplicated frame.

    Args:
        file: Path or buffer handed directly to ``pd.read_csv``.
        sample: Must be falsy (default ``None``); selecting samples is not
            implemented for MaxQuant input.

    Returns:
        pd.DataFrame: Unique rows with the columns "all_protein_ids",
        "modified_sequence" (underscores removed, nested-parenthesis
        annotations such as "(Oxidation (M))" rewritten to "[Oxidation (M)]")
        and "naked_sequence" (square-bracket annotations removed as well),
        indexed from 0.

    Raises:
        NotImplementedError: If ``sample`` is a list or a str.
        TypeError: If ``sample`` is truthy but neither a str nor a list.
        KeyError: If a required column is missing from the table.
    """
    data = pd.read_csv(file, sep='\t')

    if sample:
        if isinstance(sample, list):
            raise NotImplementedError("Import not available for sample lists at this moment.")
        if isinstance(sample, str):
            # A per-sample filter would select rows by the "Experiment"
            # column; that is not available yet.
            raise NotImplementedError("Import not available for single samples at this moment.")
        # Previously this fell through and failed later with an
        # UnboundLocalError; report the real problem instead.
        raise TypeError(
            f"sample must be None, a str or a list, not {type(sample).__name__}."
        )

    data_sub = data[["Proteins", "Modified sequence"]]

    # Vectorised string operations also work for a table without rows, whereas
    # a row-wise DataFrame.apply returns a frame (not a Series) for empty input.
    modified = data_sub["Modified sequence"].str.replace('_', '', regex=False)
    # Replace the outer () of an annotation that itself contains (...) with [].
    modified = modified.str.replace(r'\((.*?\(.*?\))\)', r'[\1]', regex=True)
    naked = modified.str.replace(r'\[.*?\]', '', regex=True)

    input_data = pd.DataFrame({
        "all_protein_ids": data_sub["Proteins"],
        "modified_sequence": modified,
        "naked_sequence": naked,
    })
    return input_data.drop_duplicates().reset_index(drop=True)

In [None]:
#hide

def test_import_maxquant_data():
    """Compare the MaxQuant importer's output with the stored reference table."""
    pd.testing.assert_frame_equal(
        import_maxquant_data("../testdata/test_maxquant_input.txt"),
        pd.read_csv('../testdata/test_maxquant_imported.csv', sep=','),
    )

test_import_maxquant_data()

## Aggregated import function

In [None]:
#export
import pandas as pd
import re

def import_data(file, sample = None, verbose=True):
    """
    Detect whether a file is a MaxQuant or a Spectronaut export and import it.

    Only the header row is read for detection, once as tab-separated and once
    as comma-separated; whichever parse yields more columns is used. The file
    is then handed to ``import_maxquant_data`` or ``import_spectronaut_data``.

    Args:
        file: Path of the export. It is read several times, so pass a path
            rather than an already-open stream.
        sample: Forwarded to the Spectronaut importer. For MaxQuant input it
            is not used and a warning is issued if it is set.
        verbose (bool): Print which format was detected.

    Returns:
        pd.DataFrame: The frame produced by the selected importer.

    Raises:
        TypeError: If the header matches neither known format.
    """
    import warnings

    # No index_col here: turning the first column into the index would hide it
    # from `.columns` and break detection when a required column comes first.
    tab_cols = pd.read_csv(file, nrows=0, sep='\t').columns
    csv_cols = pd.read_csv(file, nrows=0, sep=',').columns
    cols = set(csv_cols if len(csv_cols) > len(tab_cols) else tab_cols)

    if {"Proteins", "Modified sequence"}.issubset(cols):
        if verbose:
            print("Import MaxQuant input")
        if sample:
            # Make the dropped argument visible instead of silently returning
            # data for all samples.
            warnings.warn(
                "The 'sample' argument is ignored for MaxQuant input: "
                "per-sample import is not implemented for MaxQuant data.",
                stacklevel=2,
            )
        data = import_maxquant_data(file)
    elif {"PEP.AllOccurringProteinAccessions", "EG.ModifiedSequence"}.issubset(cols):
        if verbose:
            print("Import Spectronaut input")
        data = import_spectronaut_data(file, sample = sample)
    else:
        raise TypeError(f'Input data format for {file} not known.')
    return data

In [None]:
#hide
import sys

def test_import_data():
    """Check format detection and dispatch of import_data on the test files."""
    spectronaut_csv = "../testdata/test_spectronaut_input.csv"

    # MaxQuant export is recognised and matches its reference table.
    imported_mq = import_data("../testdata/test_maxquant_input.txt", verbose=False)
    reference_mq = pd.read_csv('../testdata/test_maxquant_imported.csv', sep=',')
    pd.testing.assert_frame_equal(imported_mq, reference_mq)

    # Spectronaut export is recognised and matches its reference table.
    imported_sn = import_data(spectronaut_csv, verbose=False)
    reference_sn = pd.read_csv('../testdata/test_spectronaut_imported.csv', sep=',')
    pd.testing.assert_frame_equal(imported_sn, reference_sn)

    # The sample argument reaches the Spectronaut importer.
    imported_sn_single = import_data(spectronaut_csv,
                                     sample = "proteome_1.raw",
                                     verbose=False)
    assert imported_sn_single.shape[0] == 2

    # A table of another kind is rejected with a fixed message.
    try:
        outcome = import_data("../testdata/test_uniprot_df.csv")
    except TypeError as err:
        outcome = err
    assert str(outcome) == "Input data format for ../testdata/test_uniprot_df.csv not known."


test_import_data()

## Export notebook to script

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# Run nbdev's notebook-to-script conversion; it reports each converted notebook.
from nbdev.export import *
notebook2script()

Converted Importing.ipynb.
Converted Preprocessing.ipynb.
Converted SequencePlot.ipynb.
Converted Uniprot_integration.ipynb.
Converted index.ipynb.
