### Combining NAAF-corrected *de novo* and PeaksDB  peptides for each _T. weisflogii_ rot sample:

Starting with:

    Peaks de novo results of PTM-optimized sequencing, NAAF corrected per sample
    PeaksDB de novo-assisted results from PTM-optimized database searches, NAAF corrected per sample

Goal:

    CSVs with combined de novo and PeaksDB for each sample, normalized to Waters Hi3 peptides (6 unique E. coli heat shock protein-dervied peptides). 
    
Using:

    - pandas
    - matplotlib
    - numpy

In [1]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [14]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 322 T0 digested

peaks80_322 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_322_T0_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_322 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_322_T0_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_322, peaksdb_322]
index = ['peaks80_322', 'peaksdb_322']

# concatenate dataframes
tot_322 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_322 = tot_322.loc[:, ~tot_322.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_322', 'peaksdb_322']
tot_322.insert(loc=0, column='data source', value=names)
tot_322.set_index('data source')

# sum the AAs and PTMs

index = ['322']

data = {'NAAF': tot_322['NAAF'].sum(),
        'A-NAAF': tot_322['A'].sum(),
        'C-NAAF': tot_322['C'].sum(),
        'D-NAAF': tot_322['D'].sum(),
        'E-NAAF': tot_322['E'].sum(),
        'F-NAAF': tot_322['F'].sum(),
        'G-NAAF': tot_322['G'].sum(),
        'H-NAAF': tot_322['H'].sum(),
        'I-NAAF': tot_322['I'].sum(),
        'K-NAAF': tot_322['K'].sum(),
        'L-NAAF': tot_322['L'].sum(),
        'M-NAAF': tot_322['M'].sum(),
        'N-NAAF': tot_322['N'].sum(),
        'P-NAAF': tot_322['P'].sum(),
        'Q-NAAF': tot_322['Q'].sum(),
        'R-NAAF': tot_322['R'].sum(),
        'S-NAAF': tot_322['S'].sum(),
        'T-NAAF': tot_322['T'].sum(),
        'V-NAAF': tot_322['V'].sum(),
        'W-NAAF': tot_322['W'].sum(),
        'Y-NAAF': tot_322['Y'].sum(),
        'C-carb-NAAF': tot_322['c-carb'].sum(),
        'M-oxid-NAAF': tot_322['m-oxid'].sum(),
        'K-oxid-NAAF': tot_322['k-oxid'].sum(),
        'P-oxid-NAAF': tot_322['p-oxid'].sum(),
        'R-oxid-NAAF': tot_322['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_322['y-oxid'].sum(),
        'N-deam-NAAF': tot_322['n-deam'].sum(),
        'K-meth-NAAF': tot_322['k-meth'].sum(),
        'R-meth-NAAF': tot_322['r-meth'].sum(),
        'Q-pyro-NAAF': tot_322['q-pyro'].sum(),
        'K-acet-NAAF': tot_322['k-acet'].sum()
       }

sum_322 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_322['AA-NAAF'] = sum_322['A-NAAF'] + sum_322['C-NAAF'] + sum_322['D-NAAF'] + sum_322['E-NAAF'] + \
                     sum_322['F-NAAF'] + sum_322['G-NAAF'] + sum_322['H-NAAF'] + sum_322['I-NAAF'] + \
                     sum_322['K-NAAF'] + sum_322['L-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                     sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + sum_322['S-NAAF'] + \
                     sum_322['T-NAAF'] + sum_322['V-NAAF'] + sum_322['W-NAAF'] + sum_322['Y-NAAF'] 

sum_322['AA-modifiable-NAAF'] = sum_322['K-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                                sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + \
                                sum_322['Y-NAAF']

sum_322['Cys. w/ carb.'] = sum_322['C-carb-NAAF'] / sum_322['C-NAAF'] #1
sum_322['Met. w/ oxid.'] = sum_322['M-oxid-NAAF'] / sum_322['M-NAAF'] #2
sum_322['Lys. w/ oxid.'] = sum_322['K-oxid-NAAF'] / sum_322['K-NAAF'] #3
sum_322['Pro. w/ oxid.'] = sum_322['P-oxid-NAAF'] / sum_322['P-NAAF'] #4
sum_322['Arg. w/ oxid.'] = sum_322['R-oxid-NAAF'] / sum_322['R-NAAF'] #5
sum_322['Tyr. w/ oxid.'] = sum_322['Y-oxid-NAAF'] / sum_322['Y-NAAF'] #6
sum_322['Asn. w/ deam.'] = sum_322['N-deam-NAAF'] / sum_322['N-NAAF'] #7
sum_322['Lys. w/ meth.'] = sum_322['K-meth-NAAF'] / sum_322['K-NAAF'] #8
sum_322['Arg. w/ meth.'] = sum_322['R-meth-NAAF'] / sum_322['R-NAAF'] #9
sum_322['Glu. w/ pyro.'] = sum_322['Q-pyro-NAAF'] / sum_322['Q-NAAF'] #10
sum_322['Lys. w/ acet.'] = sum_322['K-acet-NAAF'] / sum_322['K-NAAF'] #11

sum_322['Overall modified'] = 0.001*((sum_322['Met. w/ oxid.']*sum_322['M-NAAF']) + \
                                     (sum_322['Lys. w/ oxid.']*sum_322['K-NAAF']) + \
                                     (sum_322['Pro. w/ oxid.']*sum_322['P-NAAF']) + \
                                     (sum_322['Arg. w/ oxid.']*sum_322['R-NAAF']) + \
                                     (sum_322['Tyr. w/ oxid.']*sum_322['Y-NAAF']) + \
                                     (sum_322['Asn. w/ deam.']*sum_322['N-NAAF']) + \
                                     (sum_322['Lys. w/ meth.']*sum_322['K-NAAF']) + \
                                     (sum_322['Arg. w/ meth.']*sum_322['R-NAAF']) + \
                                     (sum_322['Glu. w/ pyro.']*sum_322['Q-NAAF']) + \
                                     (sum_322['Lys. w/ acet.']*sum_322['K-NAAF']) / (sum_322['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_322.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-322.csv")

sum_322.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
322,6614.771917,4340.423046,311.083912,2009.170186,3196.675259,1420.698938,3489.1633,473.968822,539.0,2696.30193,...,0.13206,0.138221,0.06312,0.244747,0.289083,0.115292,0.01549,0.007556,0.103685,3.481942


In [16]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 323 T2 digested

peaks80_323 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_323_T2_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_323 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_323_T2_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_323, peaksdb_323]
index = ['peaks80_323', 'peaksdb_323']

# concatenate dataframes
tot_323 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_323 = tot_323.loc[:, ~tot_323.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_323', 'peaksdb_323']
tot_323.insert(loc=0, column='data source', value=names)
tot_323.set_index('data source')

# sum the AAs and PTMs

index = ['323']

data = {'NAAF': tot_323['NAAF'].sum(),
        'A-NAAF': tot_323['A'].sum(),
        'C-NAAF': tot_323['C'].sum(),
        'D-NAAF': tot_323['D'].sum(),
        'E-NAAF': tot_323['E'].sum(),
        'F-NAAF': tot_323['F'].sum(),
        'G-NAAF': tot_323['G'].sum(),
        'H-NAAF': tot_323['H'].sum(),
        'I-NAAF': tot_323['I'].sum(),
        'K-NAAF': tot_323['K'].sum(),
        'L-NAAF': tot_323['L'].sum(),
        'M-NAAF': tot_323['M'].sum(),
        'N-NAAF': tot_323['N'].sum(),
        'P-NAAF': tot_323['P'].sum(),
        'Q-NAAF': tot_323['Q'].sum(),
        'R-NAAF': tot_323['R'].sum(),
        'S-NAAF': tot_323['S'].sum(),
        'T-NAAF': tot_323['T'].sum(),
        'V-NAAF': tot_323['V'].sum(),
        'W-NAAF': tot_323['W'].sum(),
        'Y-NAAF': tot_323['Y'].sum(),
        'C-carb-NAAF': tot_323['c-carb'].sum(),
        'M-oxid-NAAF': tot_323['m-oxid'].sum(),
        'K-oxid-NAAF': tot_323['k-oxid'].sum(),
        'P-oxid-NAAF': tot_323['p-oxid'].sum(),
        'R-oxid-NAAF': tot_323['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_323['y-oxid'].sum(),
        'N-deam-NAAF': tot_323['n-deam'].sum(),
        'K-meth-NAAF': tot_323['k-meth'].sum(),
        'R-meth-NAAF': tot_323['r-meth'].sum(),
        'Q-pyro-NAAF': tot_323['q-pyro'].sum(),
        'K-acet-NAAF': tot_323['k-acet'].sum()
       }

sum_323 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_323['AA-NAAF'] = sum_323['A-NAAF'] + sum_323['C-NAAF'] + sum_323['D-NAAF'] + sum_323['E-NAAF'] + \
                     sum_323['F-NAAF'] + sum_323['G-NAAF'] + sum_323['H-NAAF'] + sum_323['I-NAAF'] + \
                     sum_323['K-NAAF'] + sum_323['L-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                     sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + sum_323['S-NAAF'] + \
                     sum_323['T-NAAF'] + sum_323['V-NAAF'] + sum_323['W-NAAF'] + sum_323['Y-NAAF'] 

sum_323['AA-modifiable-NAAF'] = sum_323['K-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                                sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + \
                                sum_323['Y-NAAF']

sum_323['Cys. w/ carb.'] = sum_323['C-carb-NAAF'] / sum_323['C-NAAF'] #1
sum_323['Met. w/ oxid.'] = sum_323['M-oxid-NAAF'] / sum_323['M-NAAF'] #2
sum_323['Lys. w/ oxid.'] = sum_323['K-oxid-NAAF'] / sum_323['K-NAAF'] #3
sum_323['Pro. w/ oxid.'] = sum_323['P-oxid-NAAF'] / sum_323['P-NAAF'] #4
sum_323['Arg. w/ oxid.'] = sum_323['R-oxid-NAAF'] / sum_323['R-NAAF'] #5
sum_323['Tyr. w/ oxid.'] = sum_323['Y-oxid-NAAF'] / sum_323['Y-NAAF'] #6
sum_323['Asn. w/ deam.'] = sum_323['N-deam-NAAF'] / sum_323['N-NAAF'] #7
sum_323['Lys. w/ meth.'] = sum_323['K-meth-NAAF'] / sum_323['K-NAAF'] #8
sum_323['Arg. w/ meth.'] = sum_323['R-meth-NAAF'] / sum_323['R-NAAF'] #9
sum_323['Glu. w/ pyro.'] = sum_323['Q-pyro-NAAF'] / sum_323['Q-NAAF'] #10
sum_323['Lys. w/ acet.'] = sum_323['K-acet-NAAF'] / sum_323['K-NAAF'] #11

sum_323['Overall modified'] = 0.001*((sum_323['Met. w/ oxid.']*sum_323['M-NAAF']) + \
                                     (sum_323['Lys. w/ oxid.']*sum_323['K-NAAF']) + \
                                     (sum_323['Pro. w/ oxid.']*sum_323['P-NAAF']) + \
                                     (sum_323['Arg. w/ oxid.']*sum_323['R-NAAF']) + \
                                     (sum_323['Tyr. w/ oxid.']*sum_323['Y-NAAF']) + \
                                     (sum_323['Asn. w/ deam.']*sum_323['N-NAAF']) + \
                                     (sum_323['Lys. w/ meth.']*sum_323['K-NAAF']) + \
                                     (sum_323['Arg. w/ meth.']*sum_323['R-NAAF']) + \
                                     (sum_323['Glu. w/ pyro.']*sum_323['Q-NAAF']) + \
                                     (sum_323['Lys. w/ acet.']*sum_323['K-NAAF']) / (sum_323['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_323.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-323.csv")

sum_323.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
323,7133.308355,3901.218088,734.20847,2567.271117,5371.900556,1294.088984,3279.846385,523.078359,645.0,4489.593151,...,0.103688,0.179799,0.099164,0.137063,0.299277,0.077951,0.033178,0.170234,0.138694,4.276548


In [17]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 324 T5 digested

peaks80_324 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_324_T5_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_324 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_324_T5_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_324, peaksdb_324]
index = ['peaks80_324', 'peaksdb_324']

# concatenate dataframes
tot_324 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_324 = tot_324.loc[:, ~tot_324.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_324', 'peaksdb_324']
tot_324.insert(loc=0, column='data source', value=names)
tot_324.set_index('data source')

# sum the AAs and PTMs

index = ['324']

data = {'NAAF': tot_324['NAAF'].sum(),
        'A-NAAF': tot_324['A'].sum(),
        'C-NAAF': tot_324['C'].sum(),
        'D-NAAF': tot_324['D'].sum(),
        'E-NAAF': tot_324['E'].sum(),
        'F-NAAF': tot_324['F'].sum(),
        'G-NAAF': tot_324['G'].sum(),
        'H-NAAF': tot_324['H'].sum(),
        'I-NAAF': tot_324['I'].sum(),
        'K-NAAF': tot_324['K'].sum(),
        'L-NAAF': tot_324['L'].sum(),
        'M-NAAF': tot_324['M'].sum(),
        'N-NAAF': tot_324['N'].sum(),
        'P-NAAF': tot_324['P'].sum(),
        'Q-NAAF': tot_324['Q'].sum(),
        'R-NAAF': tot_324['R'].sum(),
        'S-NAAF': tot_324['S'].sum(),
        'T-NAAF': tot_324['T'].sum(),
        'V-NAAF': tot_324['V'].sum(),
        'W-NAAF': tot_324['W'].sum(),
        'Y-NAAF': tot_324['Y'].sum(),
        'C-carb-NAAF': tot_324['c-carb'].sum(),
        'M-oxid-NAAF': tot_324['m-oxid'].sum(),
        'K-oxid-NAAF': tot_324['k-oxid'].sum(),
        'P-oxid-NAAF': tot_324['p-oxid'].sum(),
        'R-oxid-NAAF': tot_324['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_324['y-oxid'].sum(),
        'N-deam-NAAF': tot_324['n-deam'].sum(),
        'K-meth-NAAF': tot_324['k-meth'].sum(),
        'R-meth-NAAF': tot_324['r-meth'].sum(),
        'Q-pyro-NAAF': tot_324['q-pyro'].sum(),
        'K-acet-NAAF': tot_324['k-acet'].sum()
       }

sum_324 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_324['AA-NAAF'] = sum_324['A-NAAF'] + sum_324['C-NAAF'] + sum_324['D-NAAF'] + sum_324['E-NAAF'] + \
                     sum_324['F-NAAF'] + sum_324['G-NAAF'] + sum_324['H-NAAF'] + sum_324['I-NAAF'] + \
                     sum_324['K-NAAF'] + sum_324['L-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                     sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + sum_324['S-NAAF'] + \
                     sum_324['T-NAAF'] + sum_324['V-NAAF'] + sum_324['W-NAAF'] + sum_324['Y-NAAF'] 

sum_324['AA-modifiable-NAAF'] = sum_324['K-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                                sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + \
                                sum_324['Y-NAAF']

sum_324['Cys. w/ carb.'] = sum_324['C-carb-NAAF'] / sum_324['C-NAAF'] #1
sum_324['Met. w/ oxid.'] = sum_324['M-oxid-NAAF'] / sum_324['M-NAAF'] #2
sum_324['Lys. w/ oxid.'] = sum_324['K-oxid-NAAF'] / sum_324['K-NAAF'] #3
sum_324['Pro. w/ oxid.'] = sum_324['P-oxid-NAAF'] / sum_324['P-NAAF'] #4
sum_324['Arg. w/ oxid.'] = sum_324['R-oxid-NAAF'] / sum_324['R-NAAF'] #5
sum_324['Tyr. w/ oxid.'] = sum_324['Y-oxid-NAAF'] / sum_324['Y-NAAF'] #6
sum_324['Asn. w/ deam.'] = sum_324['N-deam-NAAF'] / sum_324['N-NAAF'] #7
sum_324['Lys. w/ meth.'] = sum_324['K-meth-NAAF'] / sum_324['K-NAAF'] #8
sum_324['Arg. w/ meth.'] = sum_324['R-meth-NAAF'] / sum_324['R-NAAF'] #9
sum_324['Glu. w/ pyro.'] = sum_324['Q-pyro-NAAF'] / sum_324['Q-NAAF'] #10
sum_324['Lys. w/ acet.'] = sum_324['K-acet-NAAF'] / sum_324['K-NAAF'] #11

sum_324['Overall modified'] = 0.001*((sum_324['Met. w/ oxid.']*sum_324['M-NAAF']) + \
                                     (sum_324['Lys. w/ oxid.']*sum_324['K-NAAF']) + \
                                     (sum_324['Pro. w/ oxid.']*sum_324['P-NAAF']) + \
                                     (sum_324['Arg. w/ oxid.']*sum_324['R-NAAF']) + \
                                     (sum_324['Tyr. w/ oxid.']*sum_324['Y-NAAF']) + \
                                     (sum_324['Asn. w/ deam.']*sum_324['N-NAAF']) + \
                                     (sum_324['Lys. w/ meth.']*sum_324['K-NAAF']) + \
                                     (sum_324['Arg. w/ meth.']*sum_324['R-NAAF']) + \
                                     (sum_324['Glu. w/ pyro.']*sum_324['Q-NAAF']) + \
                                     (sum_324['Lys. w/ acet.']*sum_324['K-NAAF']) / (sum_324['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-324.csv")

sum_324.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
324,5084.325647,3067.657714,245.676576,2076.479266,3396.731355,1805.013124,3674.757071,329.540057,377.0,2650.5925,...,0.096419,0.140023,0.136939,0.312653,0.459223,0.042123,0.03054,0.029137,0.115444,3.990935


In [18]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 325 T12 digested

peaks80_325 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_325_T12_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_325 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_325_T12_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_325, peaksdb_325]
index = ['peaks80_325', 'peaksdb_325']

# concatenate dataframes
tot_325 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_325 = tot_325.loc[:, ~tot_325.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_325', 'peaksdb_325']
tot_325.insert(loc=0, column='data source', value=names)
tot_325.set_index('data source')

# sum the AAs and PTMs

index = ['325']

data = {'NAAF': tot_325['NAAF'].sum(),
        'A-NAAF': tot_325['A'].sum(),
        'C-NAAF': tot_325['C'].sum(),
        'D-NAAF': tot_325['D'].sum(),
        'E-NAAF': tot_325['E'].sum(),
        'F-NAAF': tot_325['F'].sum(),
        'G-NAAF': tot_325['G'].sum(),
        'H-NAAF': tot_325['H'].sum(),
        'I-NAAF': tot_325['I'].sum(),
        'K-NAAF': tot_325['K'].sum(),
        'L-NAAF': tot_325['L'].sum(),
        'M-NAAF': tot_325['M'].sum(),
        'N-NAAF': tot_325['N'].sum(),
        'P-NAAF': tot_325['P'].sum(),
        'Q-NAAF': tot_325['Q'].sum(),
        'R-NAAF': tot_325['R'].sum(),
        'S-NAAF': tot_325['S'].sum(),
        'T-NAAF': tot_325['T'].sum(),
        'V-NAAF': tot_325['V'].sum(),
        'W-NAAF': tot_325['W'].sum(),
        'Y-NAAF': tot_325['Y'].sum(),
        'C-carb-NAAF': tot_325['c-carb'].sum(),
        'M-oxid-NAAF': tot_325['m-oxid'].sum(),
        'K-oxid-NAAF': tot_325['k-oxid'].sum(),
        'P-oxid-NAAF': tot_325['p-oxid'].sum(),
        'R-oxid-NAAF': tot_325['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_325['y-oxid'].sum(),
        'N-deam-NAAF': tot_325['n-deam'].sum(),
        'K-meth-NAAF': tot_325['k-meth'].sum(),
        'R-meth-NAAF': tot_325['r-meth'].sum(),
        'Q-pyro-NAAF': tot_325['q-pyro'].sum(),
        'K-acet-NAAF': tot_325['k-acet'].sum()
       }

sum_325 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_325['AA-NAAF'] = sum_325['A-NAAF'] + sum_325['C-NAAF'] + sum_325['D-NAAF'] + sum_325['E-NAAF'] + \
                     sum_325['F-NAAF'] + sum_325['G-NAAF'] + sum_325['H-NAAF'] + sum_325['I-NAAF'] + \
                     sum_325['K-NAAF'] + sum_325['L-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                     sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + sum_325['S-NAAF'] + \
                     sum_325['T-NAAF'] + sum_325['V-NAAF'] + sum_325['W-NAAF'] + sum_325['Y-NAAF'] 

sum_325['AA-modifiable-NAAF'] = sum_325['K-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                                sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + \
                                sum_325['Y-NAAF']

sum_325['Cys. w/ carb.'] = sum_325['C-carb-NAAF'] / sum_325['C-NAAF'] #1
sum_325['Met. w/ oxid.'] = sum_325['M-oxid-NAAF'] / sum_325['M-NAAF'] #2
sum_325['Lys. w/ oxid.'] = sum_325['K-oxid-NAAF'] / sum_325['K-NAAF'] #3
sum_325['Pro. w/ oxid.'] = sum_325['P-oxid-NAAF'] / sum_325['P-NAAF'] #4
sum_325['Arg. w/ oxid.'] = sum_325['R-oxid-NAAF'] / sum_325['R-NAAF'] #5
sum_325['Tyr. w/ oxid.'] = sum_325['Y-oxid-NAAF'] / sum_325['Y-NAAF'] #6
sum_325['Asn. w/ deam.'] = sum_325['N-deam-NAAF'] / sum_325['N-NAAF'] #7
sum_325['Lys. w/ meth.'] = sum_325['K-meth-NAAF'] / sum_325['K-NAAF'] #8
sum_325['Arg. w/ meth.'] = sum_325['R-meth-NAAF'] / sum_325['R-NAAF'] #9
sum_325['Glu. w/ pyro.'] = sum_325['Q-pyro-NAAF'] / sum_325['Q-NAAF'] #10
sum_325['Lys. w/ acet.'] = sum_325['K-acet-NAAF'] / sum_325['K-NAAF'] #11

sum_325['Overall modified'] = 0.001*((sum_325['Met. w/ oxid.']*sum_325['M-NAAF']) + \
                                     (sum_325['Lys. w/ oxid.']*sum_325['K-NAAF']) + \
                                     (sum_325['Pro. w/ oxid.']*sum_325['P-NAAF']) + \
                                     (sum_325['Arg. w/ oxid.']*sum_325['R-NAAF']) + \
                                     (sum_325['Tyr. w/ oxid.']*sum_325['Y-NAAF']) + \
                                     (sum_325['Asn. w/ deam.']*sum_325['N-NAAF']) + \
                                     (sum_325['Lys. w/ meth.']*sum_325['K-NAAF']) + \
                                     (sum_325['Arg. w/ meth.']*sum_325['R-NAAF']) + \
                                     (sum_325['Glu. w/ pyro.']*sum_325['Q-NAAF']) + \
                                     (sum_325['Lys. w/ acet.']*sum_325['K-NAAF']) / (sum_325['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_325.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-325.csv")

sum_325.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
325,6536.522107,3892.988447,224.761328,3485.922204,4084.357848,2568.155267,4241.527648,286.423803,217.0,3446.020033,...,0.080798,0.150277,0.135308,0.213088,0.363498,0.218365,0.040676,0.058193,0.142754,5.170757
