### Combining NAAF-corrected *de novo* and PeaksDB  peptides for Trocas 7 incubation samples

The dataset:

    16 treatments: 4 stations, 2 timepoints (Time 0 an Time 24 hrs), 2 size fractions (GFF; GF75):
    
    Stations: 
    
        - Macapa South (MS) South stem, upriver
        - Macapa North (MN) North stem, upriver
        - Chaves (CV) South stem, downriver
        - Baylique (BY) North stem, downriver


    Proteomics samples from 2 trips to UWPR (July 2020 on the QE; April 2021 on the Fusion)
    There were at least triplicate samples for each treatment
    Many in April 2021 injected twice

Starting with:

    Peaks de novo results of PTM-optimized sequencing, NAAF corrected per sample
    PeaksDB de novo-assisted results from PTM-optimized database searches, NAAF corrected per sample
    
    Multiple samples per treatment

Goal:

    CSVs with combined de novo and PeaksDB for each sample, normalized to by NAAF
    
Using:

    - pandas
    - matplotlib
    - numpy

In [1]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [2]:
cd /home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations/

/home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations


### 1. Baylique, timepoint 0, size fraction 0.3-0.7 um (denoted as BY_T00_GF)
### T7 samples #s: 101 (102C), 102A, 102B
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/102_BY_T00_GF_PDB
    Trocas-incubations/processed/PeaksDB/102_BY_T00_GF_DN

In [3]:
# for each of the 4 incubation samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >50% ALC and PeaksDB peptides <1% FDR

peaks50_102a = pd.read_csv("processed/PeaksDN/102_BY_T00_GF_DN/102A_BY_T00_GF_DN50_naaf_totals.csv")
peaks50_102b = pd.read_csv("processed/PeaksDN/102_BY_T00_GF_DN/102B_BY_T00_GF_DN50_naaf_totals.csv")
peaks50_102c = pd.read_csv("processed/PeaksDN/102_BY_T00_GF_DN/102C_BY_T00_GF_DN50_naaf_totals.csv")
peaksdb_102a = pd.read_csv("processed/PeaksDB/102_BY_T00_GF_PDB/102A_BY_T00_GF_PDB_naaf_totals.csv")
peaksdb_102b = pd.read_csv("processed/PeaksDB/102_BY_T00_GF_PDB/102B_BY_T00_GF_PDB_naaf_totals.csv")
peaksdb_102c = pd.read_csv("processed/PeaksDB/102_BY_T00_GF_PDB/102C_BY_T00_GF_PDB_naaf_totals.csv")

frames = [peaks50_102a, peaks50_102b, peaks50_102c, peaksdb_102a, peaksdb_102b, peaksdb_102c]
index = ['peaks50_102a', 'peaks50_102b', 'peaks50_102c', 'peaksdb_102a', 'peaksdb_102b', 'peaksdb_102c']

# concatenate dataframes
tot_102 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_102 = tot_102.loc[:, ~tot_102.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks50_102a', 'peaks50_102b', 'peaks50_101', 'peaksdb_102a', 'peaksdb_102b', 'peaksdb_101']
tot_102.insert(loc=0, column='data source', value=names)
tot_102.set_index('data source')

# sum the AAs and PTMN

index = ['102']

data = {'NAAF': tot_102['NAAF'].sum(),
        'A-NAAF': tot_102['A'].sum(),
        'C-NAAF': tot_102['C'].sum(),
        'D-NAAF': tot_102['D'].sum(),
        'E-NAAF': tot_102['E'].sum(),
        'F-NAAF': tot_102['F'].sum(),
        'G-NAAF': tot_102['G'].sum(),
        'H-NAAF': tot_102['H'].sum(),
        'I-NAAF': tot_102['I'].sum(),
        'K-NAAF': tot_102['K'].sum(),
        'L-NAAF': tot_102['L'].sum(),
        'M-NAAF': tot_102['M'].sum(),
        'N-NAAF': tot_102['N'].sum(),
        'P-NAAF': tot_102['P'].sum(),
        'Q-NAAF': tot_102['Q'].sum(),
        'R-NAAF': tot_102['R'].sum(),
        'S-NAAF': tot_102['S'].sum(),
        'T-NAAF': tot_102['T'].sum(),
        'V-NAAF': tot_102['V'].sum(),
        'W-NAAF': tot_102['W'].sum(),
        'Y-NAAF': tot_102['Y'].sum(),
        'C-carb-NAAF': tot_102['c-carb'].sum(),
        'M-oxid-NAAF': tot_102['m-oxid'].sum(),
        'N-deam-NAAF': tot_102['n-deam'].sum(),
        'Q-deam-NAAF': tot_102['q-deam'].sum()
       }

sum_102 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_102['AA-NAAF'] = sum_102['A-NAAF'] + sum_102['C-NAAF'] + sum_102['D-NAAF'] + sum_102['E-NAAF'] + \
                     sum_102['F-NAAF'] + sum_102['G-NAAF'] + sum_102['H-NAAF'] + sum_102['I-NAAF'] + \
                     sum_102['K-NAAF'] + sum_102['L-NAAF'] + sum_102['M-NAAF'] + sum_102['N-NAAF'] + \
                     sum_102['P-NAAF'] + sum_102['Q-NAAF'] + sum_102['R-NAAF'] + sum_102['S-NAAF'] + \
                     sum_102['T-NAAF'] + sum_102['V-NAAF'] + sum_102['W-NAAF'] + sum_102['Y-NAAF'] 

sum_102['AA-modifiable-NAAF'] = sum_102['M-NAAF'] + sum_102['N-NAAF'] + sum_102['Q-NAAF']
                                

sum_102['Cys. w/ carb.'] = sum_102['C-carb-NAAF'] / sum_102['C-NAAF'] #1
sum_102['Met. w/ oxid.'] = sum_102['M-oxid-NAAF'] / sum_102['M-NAAF'] #2
sum_102['Asn. w/ deam.'] = sum_102['N-deam-NAAF'] / sum_102['N-NAAF'] #3
sum_102['Glu. w/ deam.'] = sum_102['Q-deam-NAAF'] / sum_102['Q-NAAF'] #3


sum_102['Overall modified'] = 0.001*((sum_102['Met. w/ oxid.']*sum_102['M-NAAF']) + \
                                     (sum_102['Asn. w/ deam.']*sum_102['N-NAAF']) + \
                                     (sum_102['Glu. w/ deam.']*sum_102['Q-NAAF']) / (sum_102['AA-modifiable-NAAF'])) 
                                                            

# write to a csv T7 combined data directory in /processed
sum_102.to_csv("processed/NAAF-sums/NAAF-sum-102.csv")

sum_102.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,M-oxid-NAAF,N-deam-NAAF,Q-deam-NAAF,AA-NAAF,AA-modifiable-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Glu. w/ deam.,Overall modified
102,776.54907,679.129108,29.621302,120.813874,121.02527,39.080522,120.676864,73.338591,151.320725,54.436557,...,10.65605,26.753558,1.492499,7216.122499,317.865399,1.0,0.208913,0.136432,0.021091,0.037414


### 2. Baylique, timepoint 0, size fraction>0.7 um (denoted as BY_T00_GD)
### T7 samples #s: 201(201B), 202
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/202_BY_T00_GD_PDB
    Trocas-incubations/processed/PeaksDB/202_BY_T00_GD_DN

In [5]:
# for each of the 4 incubation samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >50% ALC and PeaksDB peptides <1% FDR

peaks50_202a = pd.read_csv("processed/PeaksDN/202_BY_T00_GD_DN/202_BY_T00_GD_DN50_naaf_totals.csv")
peaks50_202b = pd.read_csv("processed/PeaksDN/202_BY_T00_GD_DN/202B_BY_T00_GD_DN50_naaf_totals.csv")
peaksdb_202a = pd.read_csv("processed/PeaksDB/202_BY_T00_GD_PDB/202_BY_T00_GD_PDB_naaf_totals.csv")
peaksdb_202b = pd.read_csv("processed/PeaksDB/202_BY_T00_GD_PDB/202B_BY_T00_GD_PDB_naaf_totals.csv")

frames = [peaks50_202a, peaks50_202b, peaksdb_202a, peaksdb_202b]
index = ['peaks50_202a', 'peaks50_202b', 'peaksdb_202a', 'peaksdb_202b']

# concatenate dataframes
tot_202 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_202 = tot_202.loc[:, ~tot_202.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks50_202', 'peaks50_201', 'peaksdb_202', 'peaksdb_201']
tot_202.insert(loc=0, column='data source', value=names)
tot_202.set_index('data source')

# sum the AAs and PTMN

index = ['202']

data = {'NAAF': tot_202['NAAF'].sum(),
        'A-NAAF': tot_202['A'].sum(),
        'C-NAAF': tot_202['C'].sum(),
        'D-NAAF': tot_202['D'].sum(),
        'E-NAAF': tot_202['E'].sum(),
        'F-NAAF': tot_202['F'].sum(),
        'G-NAAF': tot_202['G'].sum(),
        'H-NAAF': tot_202['H'].sum(),
        'I-NAAF': tot_202['I'].sum(),
        'K-NAAF': tot_202['K'].sum(),
        'L-NAAF': tot_202['L'].sum(),
        'M-NAAF': tot_202['M'].sum(),
        'N-NAAF': tot_202['N'].sum(),
        'P-NAAF': tot_202['P'].sum(),
        'Q-NAAF': tot_202['Q'].sum(),
        'R-NAAF': tot_202['R'].sum(),
        'S-NAAF': tot_202['S'].sum(),
        'T-NAAF': tot_202['T'].sum(),
        'V-NAAF': tot_202['V'].sum(),
        'W-NAAF': tot_202['W'].sum(),
        'Y-NAAF': tot_202['Y'].sum(),
        'C-carb-NAAF': tot_202['c-carb'].sum(),
        'M-oxid-NAAF': tot_202['m-oxid'].sum(),
        'N-deam-NAAF': tot_202['n-deam'].sum(),
        'Q-deam-NAAF': tot_202['q-deam'].sum()
       }

sum_202 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_202['AA-NAAF'] = sum_202['A-NAAF'] + sum_202['C-NAAF'] + sum_202['D-NAAF'] + sum_202['E-NAAF'] + \
                     sum_202['F-NAAF'] + sum_202['G-NAAF'] + sum_202['H-NAAF'] + sum_202['I-NAAF'] + \
                     sum_202['K-NAAF'] + sum_202['L-NAAF'] + sum_202['M-NAAF'] + sum_202['N-NAAF'] + \
                     sum_202['P-NAAF'] + sum_202['Q-NAAF'] + sum_202['R-NAAF'] + sum_202['S-NAAF'] + \
                     sum_202['T-NAAF'] + sum_202['V-NAAF'] + sum_202['W-NAAF'] + sum_202['Y-NAAF'] 

sum_202['AA-modifiable-NAAF'] = sum_202['M-NAAF'] + sum_202['N-NAAF'] + sum_202['Q-NAAF']
                                

sum_202['Cys. w/ carb.'] = sum_202['C-carb-NAAF'] / sum_202['C-NAAF'] #1
sum_202['Met. w/ oxid.'] = sum_202['M-oxid-NAAF'] / sum_202['M-NAAF'] #2
sum_202['Asn. w/ deam.'] = sum_202['N-deam-NAAF'] / sum_202['N-NAAF'] #3
sum_202['Glu. w/ deam.'] = sum_202['Q-deam-NAAF'] / sum_202['Q-NAAF'] #3


sum_202['Overall modified'] = 0.001*((sum_202['Met. w/ oxid.']*sum_202['M-NAAF']) + \
                                     (sum_202['Asn. w/ deam.']*sum_202['N-NAAF']) + \
                                     (sum_202['Glu. w/ deam.']*sum_202['Q-NAAF']) / (sum_202['AA-modifiable-NAAF'])) 
                                                            

# write to a csv T7 combined data directory in /processed
sum_202.to_csv("processed/NAAF-sums/NAAF-sum-202.csv")

sum_202.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,M-oxid-NAAF,N-deam-NAAF,Q-deam-NAAF,AA-NAAF,AA-modifiable-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Glu. w/ deam.,Overall modified
202,452.320941,332.502927,5.142524,6.393829,11.586165,10.819902,18.385602,16.750413,8.166758,10.170907,...,1.210399,2.086354,0.046621,3948.973304,158.923223,1.0,0.126643,0.015021,0.004451,0.003297


### 3. Baylique, timepoint 24 hrs, size fraction 0.3-0.7 um (denoted as BY_T24_GF)
### T7 samples #s: 302A, 302B, 303 (302C)
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/302_BY_T24_GF_PDB
    Trocas-incubations/processed/PeaksDB/302_BY_T24_GF_DN

In [6]:
# for each of the 4 incubation samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >50% ALC and PeaksDB peptides <1% FDR

peaks50_302a = pd.read_csv("processed/PeaksDN/302_BY_T24_GF_DN/302A_BY_T24_GF_DN50_naaf_totals.csv")
peaks50_302b = pd.read_csv("processed/PeaksDN/302_BY_T24_GF_DN/302B_BY_T24_GF_DN50_naaf_totals.csv")
peaks50_302c = pd.read_csv("processed/PeaksDN/302_BY_T24_GF_DN/302C_BY_T24_GF_DN50_naaf_totals.csv")
peaksdb_302a = pd.read_csv("processed/PeaksDB/302_BY_T24_GF_PDB/302A_BY_T24_GF_PDB_naaf_totals.csv")
peaksdb_302b = pd.read_csv("processed/PeaksDB/302_BY_T24_GF_PDB/302B_BY_T24_GF_PDB_naaf_totals.csv")
peaksdb_302c = pd.read_csv("processed/PeaksDB/302_BY_T24_GF_PDB/302C_BY_T24_GF_PDB_naaf_totals.csv")

frames = [peaks50_302a, peaks50_302b, peaks50_302c, peaksdb_302a, peaksdb_302b, peaksdb_302c]
index = ['peaks50_302a', 'peaks50_302b', 'peaks50_302c', 'peaksdb_302a', 'peaksdb_302b', 'peaksdb_302bc']

# concatenate dataframes
tot_302 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_302 = tot_302.loc[:, ~tot_302.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks50_302a', 'peaks50_302b', 'peaks50_303', 'peaksdb_302a', 'peaksdb_302b', 'peaksdb_303']
tot_302.insert(loc=0, column='data source', value=names)
tot_302.set_index('data source')

# sum the AAs and PTMN

index = ['302']

data = {'NAAF': tot_302['NAAF'].sum(),
        'A-NAAF': tot_302['A'].sum(),
        'C-NAAF': tot_302['C'].sum(),
        'D-NAAF': tot_302['D'].sum(),
        'E-NAAF': tot_302['E'].sum(),
        'F-NAAF': tot_302['F'].sum(),
        'G-NAAF': tot_302['G'].sum(),
        'H-NAAF': tot_302['H'].sum(),
        'I-NAAF': tot_302['I'].sum(),
        'K-NAAF': tot_302['K'].sum(),
        'L-NAAF': tot_302['L'].sum(),
        'M-NAAF': tot_302['M'].sum(),
        'N-NAAF': tot_302['N'].sum(),
        'P-NAAF': tot_302['P'].sum(),
        'Q-NAAF': tot_302['Q'].sum(),
        'R-NAAF': tot_302['R'].sum(),
        'S-NAAF': tot_302['S'].sum(),
        'T-NAAF': tot_302['T'].sum(),
        'V-NAAF': tot_302['V'].sum(),
        'W-NAAF': tot_302['W'].sum(),
        'Y-NAAF': tot_302['Y'].sum(),
        'C-carb-NAAF': tot_302['c-carb'].sum(),
        'M-oxid-NAAF': tot_302['m-oxid'].sum(),
        'N-deam-NAAF': tot_302['n-deam'].sum(),
        'Q-deam-NAAF': tot_302['q-deam'].sum()
       }

sum_302 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_302['AA-NAAF'] = sum_302['A-NAAF'] + sum_302['C-NAAF'] + sum_302['D-NAAF'] + sum_302['E-NAAF'] + \
                     sum_302['F-NAAF'] + sum_302['G-NAAF'] + sum_302['H-NAAF'] + sum_302['I-NAAF'] + \
                     sum_302['K-NAAF'] + sum_302['L-NAAF'] + sum_302['M-NAAF'] + sum_302['N-NAAF'] + \
                     sum_302['P-NAAF'] + sum_302['Q-NAAF'] + sum_302['R-NAAF'] + sum_302['S-NAAF'] + \
                     sum_302['T-NAAF'] + sum_302['V-NAAF'] + sum_302['W-NAAF'] + sum_302['Y-NAAF'] 

sum_302['AA-modifiable-NAAF'] = sum_302['M-NAAF'] + sum_302['N-NAAF'] + sum_302['Q-NAAF']
                                

sum_302['Cys. w/ carb.'] = sum_302['C-carb-NAAF'] / sum_302['C-NAAF'] #1
sum_302['Met. w/ oxid.'] = sum_302['M-oxid-NAAF'] / sum_302['M-NAAF'] #2
sum_302['Asn. w/ deam.'] = sum_302['N-deam-NAAF'] / sum_302['N-NAAF'] #3
sum_302['Glu. w/ deam.'] = sum_302['Q-deam-NAAF'] / sum_302['Q-NAAF'] #3


sum_302['Overall modified'] = 0.001*((sum_302['Met. w/ oxid.']*sum_302['M-NAAF']) + \
                                     (sum_302['Asn. w/ deam.']*sum_302['N-NAAF']) + \
                                     (sum_302['Glu. w/ deam.']*sum_302['Q-NAAF']) / (sum_302['AA-modifiable-NAAF'])) 
                                                            

# write to a csv T7 combined data directory in /processed
sum_302.to_csv("processed/NAAF-sums/NAAF-sum-302.csv")

sum_302.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,M-oxid-NAAF,N-deam-NAAF,Q-deam-NAAF,AA-NAAF,AA-modifiable-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Glu. w/ deam.,Overall modified
302,1047.021188,700.331414,55.956885,13.922006,44.691765,80.047053,115.866902,151.850248,33.22997,76.91549,...,11.654601,15.779159,0.974927,9131.652922,295.497616,1.0,0.135295,0.084352,0.043734,0.027437


### 4. Baylique, timepoint 24 hrs, size fraction >0.7 um (denoted as BY_T24_GD)
### T7 samples #s: 402A, 402B
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/402_BY_T24_GD_PDB
    Trocas-incubations/processed/PeaksDB/402_BY_T24_GD_DN

In [7]:
# for each of the 4 incubation samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >50% ALC and PeaksDB peptides <1% FDR

peaks50_402a = pd.read_csv("processed/PeaksDN/402_BY_T24_GD_DN/402A_BY_T24_GD_DN50_naaf_totals.csv")
peaks50_402b = pd.read_csv("processed/PeaksDN/402_BY_T24_GD_DN/402B_BY_T24_GD_DN50_naaf_totals.csv")
peaksdb_402a = pd.read_csv("processed/PeaksDB/402_BY_T24_GD_PDB/402A_BY_T24_GD_PDB_naaf_totals.csv")
peaksdb_402b = pd.read_csv("processed/PeaksDB/402_BY_T24_GD_PDB/402B_BY_T24_GD_PDB_naaf_totals.csv")


frames = [peaks50_402a, peaks50_402b, peaksdb_402a, peaksdb_402b, ]
index = ['peaks50_402a', 'peaks50_402b', 'peaksdb_402a', 'peaksdb_402b']

# concatenate dataframes
tot_402 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_402 = tot_402.loc[:, ~tot_402.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks50_402a', 'peaks50_402b', 'peaksdb_402a', 'peaksdb_402b']
tot_402.insert(loc=0, column='data source', value=names)
tot_402.set_index('data source')

# sum the AAs and PTMN

index = ['402']

data = {'NAAF': tot_402['NAAF'].sum(),
        'A-NAAF': tot_402['A'].sum(),
        'C-NAAF': tot_402['C'].sum(),
        'D-NAAF': tot_402['D'].sum(),
        'E-NAAF': tot_402['E'].sum(),
        'F-NAAF': tot_402['F'].sum(),
        'G-NAAF': tot_402['G'].sum(),
        'H-NAAF': tot_402['H'].sum(),
        'I-NAAF': tot_402['I'].sum(),
        'K-NAAF': tot_402['K'].sum(),
        'L-NAAF': tot_402['L'].sum(),
        'M-NAAF': tot_402['M'].sum(),
        'N-NAAF': tot_402['N'].sum(),
        'P-NAAF': tot_402['P'].sum(),
        'Q-NAAF': tot_402['Q'].sum(),
        'R-NAAF': tot_402['R'].sum(),
        'S-NAAF': tot_402['S'].sum(),
        'T-NAAF': tot_402['T'].sum(),
        'V-NAAF': tot_402['V'].sum(),
        'W-NAAF': tot_402['W'].sum(),
        'Y-NAAF': tot_402['Y'].sum(),
        'C-carb-NAAF': tot_402['c-carb'].sum(),
        'M-oxid-NAAF': tot_402['m-oxid'].sum(),
        'N-deam-NAAF': tot_402['n-deam'].sum(),
        'Q-deam-NAAF': tot_402['q-deam'].sum()
       }

sum_402 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_402['AA-NAAF'] = sum_402['A-NAAF'] + sum_402['C-NAAF'] + sum_402['D-NAAF'] + sum_402['E-NAAF'] + \
                     sum_402['F-NAAF'] + sum_402['G-NAAF'] + sum_402['H-NAAF'] + sum_402['I-NAAF'] + \
                     sum_402['K-NAAF'] + sum_402['L-NAAF'] + sum_402['M-NAAF'] + sum_402['N-NAAF'] + \
                     sum_402['P-NAAF'] + sum_402['Q-NAAF'] + sum_402['R-NAAF'] + sum_402['S-NAAF'] + \
                     sum_402['T-NAAF'] + sum_402['V-NAAF'] + sum_402['W-NAAF'] + sum_402['Y-NAAF'] 

sum_402['AA-modifiable-NAAF'] = sum_402['M-NAAF'] + sum_402['N-NAAF'] + sum_402['Q-NAAF']
                                

sum_402['Cys. w/ carb.'] = sum_402['C-carb-NAAF'] / sum_402['C-NAAF'] #1
sum_402['Met. w/ oxid.'] = sum_402['M-oxid-NAAF'] / sum_402['M-NAAF'] #2
sum_402['Asn. w/ deam.'] = sum_402['N-deam-NAAF'] / sum_402['N-NAAF'] #3
sum_402['Glu. w/ deam.'] = sum_402['Q-deam-NAAF'] / sum_402['Q-NAAF'] #3


sum_402['Overall modified'] = 0.001*((sum_402['Met. w/ oxid.']*sum_402['M-NAAF']) + \
                                     (sum_402['Asn. w/ deam.']*sum_402['N-NAAF']) + \
                                     (sum_402['Glu. w/ deam.']*sum_402['Q-NAAF']) / (sum_402['AA-modifiable-NAAF'])) 
                                                            

# write to a csv T7 combined data directory in /processed
sum_402.to_csv("processed/NAAF-sums/NAAF-sum-402.csv")

sum_402.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,M-oxid-NAAF,N-deam-NAAF,Q-deam-NAAF,AA-NAAF,AA-modifiable-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Glu. w/ deam.,Overall modified
402,415.545251,300.417182,13.533242,11.46992,7.620073,12.238552,30.932102,32.121162,19.315776,15.480615,...,6.404374,0.999458,0.326285,3538.157285,104.224035,1.0,0.267715,0.013544,0.050123,0.007407
