# Export

Export AlphaPept data in a format that matches the MaxQuant output files.

In [None]:
# default_exp export

## MaxQuant output files

Read MaxQuant folder and return the files
```
import os
sample_path = 'F:/TESTDATA/DEFAULT/THERMO/combined/txt/'
mq_ouput_files = os.listdir(sample_path)
```

### To implement in AlphaPept:

```
* Type of match needs to be implemented
* evidence[['MS/MS m/z','m/z']]
MS/MS m/z: The m/z used for fragmentation (not necessarily the monoisotopic
m/z).
    m/z The recalibrated mass-over-charge value of the precursor ion.
    
* Include rt_length directly in feature finding..

* try to extract more properties from the features

mq_dict_evidence['Number of data points'] =
mq_dict_evidence['Number of scans'] =
mq_dict_evidence['Number of isotopic peaks'] =


This also needs to be done:

'MS/MS count'
'MS/MS scan number'
'Delta Score' (to the second best identified peptide..)
```


In [None]:
#export

# File names as listed from a MaxQuant `combined/txt` output folder.
# NOTE: the variable keeps its original spelling ("ouput") because it is exported.
mq_ouput_files = [
    'allPeptides.txt',
    'evidence.txt',
    'matchedFeatures.txt',
    'modificationSpecificPeptides.txt',
    'ms3Scans.txt',
    'msms.txt',
    'msmsScans.txt',
    'mzRange.txt',
    'Oxidation (M)Sites.txt',
    'parameters.txt',
    'peptides.txt',
    'proteinGroups.txt',
    'summary.txt',
    'tables.pdf',
]

## Sequence Notation

Dictionary used to convert the AlphaPept sequence notation to the MaxQuant notation.

In [None]:
#export

# TODO: Acetyl (Protein N-term) cannot be easily converted as we don't know the protein end.
# Maps a modified AlphaPept token to its MaxQuant modification name.
# A value of None means that no MaxQuant label is written for that token
# (ap_to_mq_sequence skips entries whose translation is None).
mod_translation = {
    'oxM': 'Oxidation (M)',
    'cC': None,
    'aM': None,
}

In [None]:
#export 

from alphapept.fasta import parse

def remove_mods(sequence):
    """
    Return `sequence` with every non-uppercase character removed.

    Only the elements for which `.isupper()` is true are kept, in their
    original order, and joined into a single string.
    """
    kept = []
    for symbol in sequence:
        if symbol.isupper():
            kept.append(symbol)
    return ''.join(kept)

def ap_to_mq_sequence(sequence, mod_translation):
    """
    Convert an AlphaPept sequence string to the MaxQuant notation.

    Args:
        sequence: AlphaPept sequence string. Non-uppercase characters are
            stripped by `remove_mods` to obtain the naked sequence.
        mod_translation: Mapping from a modified AlphaPept token (e.g. 'oxM')
            to the MaxQuant modification name, or to None when nothing
            should be written for that token.

    Returns:
        A tuple `(naked_sequence, n_AA, modifications_, mq_sequence)`:
        the sequence without modifications, its length, the modification
        summary ('Unmodified', or names joined by ', ' with a leading count
        when a modification occurs more than once) and the modified sequence
        enclosed in underscores.

    Raises:
        KeyError: if a token longer than one character is missing from
            `mod_translation` (for a plain dict).
    """
    naked_sequence = remove_mods(sequence)
    # NOTE(review): assumes `parse` yields one token per residue, so that token
    # i lines up with naked_sequence[i] -- confirm against alphapept.fasta.parse.
    tokens = parse(sequence)

    # MaxQuant modified sequences are wrapped in a leading and trailing underscore.
    pieces = ['_']
    mod_counts = {}

    for position, token in enumerate(tokens):
        pieces.append(naked_sequence[position])

        # Single-character tokens are unmodified residues.
        if len(token) == 1:
            continue

        mq_name = mod_translation[token]
        if mq_name is None:
            continue

        mod_counts[mq_name] = mod_counts.get(mq_name, 0) + 1
        pieces.append(f"({mq_name})")

    pieces.append('_')
    mq_sequence = ''.join(pieces)

    if not mod_counts:
        modifications_ = 'Unmodified'
    else:
        modifications_ = ''
        for mq_name, occurrences in mod_counts.items():
            # A count is only written when the modification occurs more than once.
            prefix = '' if occurrences == 1 else str(occurrences) + ' '
            joiner = ', ' if modifications_ else ''
            modifications_ += joiner + prefix + mq_name

    return naked_sequence, len(naked_sequence), modifications_, mq_sequence


## evidence.txt

Apparently, the columns for evidence.txt are dynamic. As an example, when including Oxidation of M as modification, the following columns will be added:

```
'Oxidation (M)',
'Oxidation (M) Probabilities',
'Oxidation (M) Score Diffs',
'Oxidation (M) site IDs'
```

Example code to load and print the columns

```
import pandas as pd
evidence = pd.read_csv('F:/TESTDATA/DEFAULT/THERMO_IRT_MANY_MOD/combined/txt/evidence.txt', sep = '\t')
for _ in evidence.columns:
    print(f"mq_dict_evidence['{_}'] =")
```


In [None]:
#export
import os
import numpy as np

def prepare_ap_results(ref_ap):
    """
    Add the helper columns needed to map AlphaPept results to MaxQuant tables.

    The DataFrame is modified in place and also returned.

    Args:
        ref_ap: DataFrame with AlphaPept results. The columns 'filename',
            'decoy', 'protein_group' and 'sequence' are read; 'type' is
            created with the string 'None' if it is missing.
            NOTE(review): 'decoy' is used directly as a boolean mask, so it
            is expected to be a boolean column -- confirm against the caller.

    Returns:
        The same DataFrame with the added columns 'mq_rawfile', 'reverse',
        'undefined', 'contaminant', 'id', 'naked_sequence', 'n_AA',
        'mq_modifications' and 'mq_sequence'.
    """
    if 'type' not in ref_ap.columns:
        ref_ap['type'] = 'None'

    # Strip the directory and the last extension, then a trailing '.ms_data'
    # (8 characters) if one is left over.
    basenames = ref_ap['filename'].apply(lambda x: os.path.splitext(os.path.split(x)[1])[0])
    ref_ap['mq_rawfile'] = basenames.apply(lambda x: x[:-8] if x.endswith('.ms_data') else x)

    n_rows = len(ref_ap)

    # Marker columns hold either NaN or '+'. They are created with object dtype
    # so that writing '+' does not rely on implicit float -> object upcasting.
    ref_ap['reverse'] = np.full(n_rows, np.nan, dtype=object)
    ref_ap.loc[ref_ap['decoy'], 'reverse'] = '+'

    # Placeholder column for every MaxQuant field that has no AlphaPept counterpart yet.
    ref_ap['undefined'] = np.nan

    ref_ap['contaminant'] = np.full(n_rows, np.nan, dtype=object)
    # na=False keeps rows with a missing protein group from producing a NaN
    # mask entry, which would make the boolean indexing below fail.
    is_contaminant = ref_ap['protein_group'].str.contains('CON__', regex=False, na=False)
    ref_ap.loc[is_contaminant, 'contaminant'] = '+'

    ref_ap['id'] = ref_ap.index

    # Collect the conversion results in a list first: unpacking zip(*...)
    # directly raises on an empty frame because there is nothing to unpack.
    converted = [ap_to_mq_sequence(seq, mod_translation) for seq in ref_ap['sequence']]
    target_columns = ('naked_sequence', 'n_AA', 'mq_modifications', 'mq_sequence')
    for position, column in enumerate(target_columns):
        ref_ap[column] = [row[position] for row in converted]

    return ref_ap

In [None]:
# Maps each MaxQuant evidence.txt column name to the AlphaPept results column
# that provides its values. 'undefined' refers to the all-NaN placeholder
# column created by prepare_ap_results.
mq_dict_evidence = {
    'Sequence': 'naked_sequence',
    'Length': 'n_AA',
    'Modifications': 'mq_modifications',
    'Modified sequence': 'mq_sequence',
    'Missed cleavages': 'n_missed',
    # It is not entirely clear what the difference between Leading Proteins and Proteins is
    'Proteins': 'protein_group',
    'Leading proteins': 'protein_group',
    'Leading razor protein': 'protein',
    'Type': 'type',
    'Raw file': 'mq_rawfile',
    'MS/MS m/z': 'undefined',
    'Charge': 'charge',
    'm/z': 'mz',
    'Mass': 'mass',
    'Uncalibrated - Calibrated m/z [ppm]': 'undefined',
    'Uncalibrated - Calibrated m/z [Da]': 'undefined',
    'Mass error [ppm]': 'undefined',
    'Mass error [Da]': 'undefined',
    'Uncalibrated mass error [ppm]': 'undefined',
    'Uncalibrated mass error [Da]': 'undefined',
    'Max intensity m/z 0': 'undefined',
    'Retention time': 'rt',
    'Retention length': 'undefined',
    'Calibrated retention time': 'undefined',
    'Calibrated retention time start': 'undefined',
    'Calibrated retention time finish': 'undefined',
    'Retention time calibration': 'undefined',
    'Match time difference': 'undefined',
    'Match m/z difference': 'undefined',
    'Match q-value': 'undefined',
    'Match score': 'undefined',
    'Number of data points': 'undefined',
    'Number of scans': 'undefined',
    'Number of isotopic peaks': 'n_isotopes',
    'PIF': 'undefined',
    'Fraction of total spectrum': 'matched_int_ratio',
    'Base peak fraction': 'undefined',
    'PEP': 'undefined',
    'MS/MS count': 'undefined',
    'MS/MS scan number': 'undefined',
    'Score': 'score',
    'Delta score': 'undefined',
    'Combinatorics': 'undefined',
    'Intensity': 'int_sum',
    'Reverse': 'reverse',
    'Potential contaminant': 'contaminant',
    'id': 'id',
    'Protein group IDs': 'undefined',
    'Peptide ID': 'undefined',
    'Mod. peptide ID': 'undefined',
    'MS/MS IDs': 'undefined',
    'Best MS/MS': 'undefined',
    'Taxonomy IDs': 'undefined',
}

```
import pandas as pd

ref_ap = pd.read_csv('E:/test_temp/results.csv')
ref_ap = prepare_ap_results(ref_ap)
mq_evidence = pd.DataFrame.from_dict({k: ref_ap[mq_dict_evidence[k]] for k in mq_dict_evidence.keys()})
mq_evidence.to_csv('mq_evidence.txt', sep = '\t', index=None)
```

## ProteinGroups

```
#export 
mq_dict_proteinGroups = {}

mq_dict_proteinGroups['Protein IDs'] =
mq_dict_proteinGroups['Majority protein IDs'] =
mq_dict_proteinGroups['Peptide counts (all)'] =
mq_dict_proteinGroups['Peptide counts (razor+unique)'] =
mq_dict_proteinGroups['Peptide counts (unique)'] =
mq_dict_proteinGroups['Fasta headers'] =
mq_dict_proteinGroups['Number of proteins'] =
mq_dict_proteinGroups['Peptides'] =
mq_dict_proteinGroups['Razor + unique peptides'] =
mq_dict_proteinGroups['Unique peptides'] =
mq_dict_proteinGroups['Sequence coverage [%]'] =
mq_dict_proteinGroups['Unique + razor sequence coverage [%]'] =
mq_dict_proteinGroups['Unique sequence coverage [%]'] =
mq_dict_proteinGroups['Mol. weight [kDa]'] =
mq_dict_proteinGroups['Sequence length'] =
mq_dict_proteinGroups['Sequence lengths'] =
mq_dict_proteinGroups['Q-value'] =
mq_dict_proteinGroups['Score'] =
mq_dict_proteinGroups['Intensity'] =
mq_dict_proteinGroups['MS/MS count'] =
mq_dict_proteinGroups['Only identified by site'] =
mq_dict_proteinGroups['Reverse'] =
mq_dict_proteinGroups['Potential contaminant'] =
mq_dict_proteinGroups['id'] =
mq_dict_proteinGroups['Peptide IDs'] =
mq_dict_proteinGroups['Peptide is razor'] =
mq_dict_proteinGroups['Mod. peptide IDs'] =
mq_dict_proteinGroups['Evidence IDs'] =
mq_dict_proteinGroups['MS/MS IDs'] =
mq_dict_proteinGroups['Best MS/MS'] =
mq_dict_proteinGroups['Oxidation (M) site IDs'] =
mq_dict_proteinGroups['Oxidation (M) site positions'] =
mq_dict_proteinGroups['Taxonomy IDs'] =

```

In [None]:
#hide
# Import only the entry point instead of `*` so the notebook namespace is not
# flooded with every public name of nbdev.export.
from nbdev.export import notebook2script

# Convert the notebooks of this project into their Python modules.
notebook2script()

Converted 00_settings.ipynb.
Converted 01_chem.ipynb.
Converted 02_io.ipynb.
Converted 03_fasta.ipynb.
Converted 04_feature_finding.ipynb.
Converted 05_search.ipynb.
Converted 06_score.ipynb.
Converted 07_recalibration.ipynb.
Converted 08_quantification.ipynb.
Converted 09_matching.ipynb.
Converted 10_constants.ipynb.
Converted 11_interface.ipynb.
Converted 12_speed.ipynb.
Converted 13_export.ipynb.
Converted index.ipynb.
