### Combining NAAF-corrected *de novo* and PeaksDB  peptides for each _T. weisflogii_ rot sample:

Starting with:

    Peaks de novo results of PTM-optimized sequencing, NAAF corrected per sample
    PeaksDB de novo-assisted results from PTM-optimized database searches, NAAF corrected per sample

Goal:

    CSVs with combined de novo and PeaksDB for each sample, normalized to Waters Hi3 peptides (6 unique E. coli heat shock protein-dervied peptides). 
    
Using:

    - pandas
    - matplotlib
    - numpy

In [2]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [2]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 322 T0 digested

peaks80_322 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_322_T0_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_322 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_322_T0_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_322, peaksdb_322]
index = ['peaks80_322', 'peaksdb_322']

# concatenate dataframes
tot_322 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_322 = tot_322.loc[:, ~tot_322.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_322', 'peaksdb_322']
tot_322.insert(loc=0, column='data source', value=names)
tot_322.set_index('data source')

# sum the AAs and PTMs

index = ['322']

data = {'NAAF': tot_322['NAAF'].sum(),
        'A-NAAF': tot_322['A'].sum(),
        'C-NAAF': tot_322['C'].sum(),
        'D-NAAF': tot_322['D'].sum(),
        'E-NAAF': tot_322['E'].sum(),
        'F-NAAF': tot_322['F'].sum(),
        'G-NAAF': tot_322['G'].sum(),
        'H-NAAF': tot_322['H'].sum(),
        'I-NAAF': tot_322['I'].sum(),
        'K-NAAF': tot_322['K'].sum(),
        'L-NAAF': tot_322['L'].sum(),
        'M-NAAF': tot_322['M'].sum(),
        'N-NAAF': tot_322['N'].sum(),
        'P-NAAF': tot_322['P'].sum(),
        'Q-NAAF': tot_322['Q'].sum(),
        'R-NAAF': tot_322['R'].sum(),
        'S-NAAF': tot_322['S'].sum(),
        'T-NAAF': tot_322['T'].sum(),
        'V-NAAF': tot_322['V'].sum(),
        'W-NAAF': tot_322['W'].sum(),
        'Y-NAAF': tot_322['Y'].sum(),
        'C-carb-NAAF': tot_322['c-carb'].sum(),
        'M-oxid-NAAF': tot_322['m-oxid'].sum(),
        'K-oxid-NAAF': tot_322['k-oxid'].sum(),
        'P-oxid-NAAF': tot_322['p-oxid'].sum(),
        'R-oxid-NAAF': tot_322['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_322['y-oxid'].sum(),
        'N-deam-NAAF': tot_322['n-deam'].sum(),
        'K-meth-NAAF': tot_322['k-meth'].sum(),
        'R-meth-NAAF': tot_322['r-meth'].sum(),
        'Q-pyro-NAAF': tot_322['q-pyro'].sum(),
        'K-acet-NAAF': tot_322['k-acet'].sum()
       }

sum_322 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_322['AA-NAAF'] = sum_322['A-NAAF'] + sum_322['C-NAAF'] + sum_322['D-NAAF'] + sum_322['E-NAAF'] + \
                     sum_322['F-NAAF'] + sum_322['G-NAAF'] + sum_322['H-NAAF'] + sum_322['I-NAAF'] + \
                     sum_322['K-NAAF'] + sum_322['L-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                     sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + sum_322['S-NAAF'] + \
                     sum_322['T-NAAF'] + sum_322['V-NAAF'] + sum_322['W-NAAF'] + sum_322['Y-NAAF'] 

sum_322['AA-modifiable-NAAF'] = sum_322['K-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                                sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + \
                                sum_322['Y-NAAF']

sum_322['Cys. w/ carb.'] = sum_322['C-carb-NAAF'] / sum_322['C-NAAF'] #1
sum_322['Met. w/ oxid.'] = sum_322['M-oxid-NAAF'] / sum_322['M-NAAF'] #2
sum_322['Lys. w/ oxid.'] = sum_322['K-oxid-NAAF'] / sum_322['K-NAAF'] #3
sum_322['Pro. w/ oxid.'] = sum_322['P-oxid-NAAF'] / sum_322['P-NAAF'] #4
sum_322['Arg. w/ oxid.'] = sum_322['R-oxid-NAAF'] / sum_322['R-NAAF'] #5
sum_322['Tyr. w/ oxid.'] = sum_322['Y-oxid-NAAF'] / sum_322['Y-NAAF'] #6
sum_322['Asn. w/ deam.'] = sum_322['N-deam-NAAF'] / sum_322['N-NAAF'] #7
sum_322['Lys. w/ meth.'] = sum_322['K-meth-NAAF'] / sum_322['K-NAAF'] #8
sum_322['Arg. w/ meth.'] = sum_322['R-meth-NAAF'] / sum_322['R-NAAF'] #9
sum_322['Glu. w/ pyro.'] = sum_322['Q-pyro-NAAF'] / sum_322['Q-NAAF'] #10
sum_322['Lys. w/ acet.'] = sum_322['K-acet-NAAF'] / sum_322['K-NAAF'] #11

sum_322['Overall modified'] = 0.001*((sum_322['Met. w/ oxid.']*sum_322['M-NAAF']) + \
                                     (sum_322['Lys. w/ oxid.']*sum_322['K-NAAF']) + \
                                     (sum_322['Pro. w/ oxid.']*sum_322['P-NAAF']) + \
                                     (sum_322['Arg. w/ oxid.']*sum_322['R-NAAF']) + \
                                     (sum_322['Tyr. w/ oxid.']*sum_322['Y-NAAF']) + \
                                     (sum_322['Asn. w/ deam.']*sum_322['N-NAAF']) + \
                                     (sum_322['Lys. w/ meth.']*sum_322['K-NAAF']) + \
                                     (sum_322['Arg. w/ meth.']*sum_322['R-NAAF']) + \
                                     (sum_322['Glu. w/ pyro.']*sum_322['Q-NAAF']) + \
                                     (sum_322['Lys. w/ acet.']*sum_322['K-NAAF']) / (sum_322['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_322.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-322.csv")

sum_322.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
322,6614.771917,4483.273061,222.045981,1872.609403,3238.356483,1296.870742,3203.344206,493.114222,620.656959,2464.677611,...,0.14447,0.140332,0.060979,0.304422,0.265141,0.126126,0.014965,0.008627,0.113429,3.481942


In [3]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 323 T2 digested

peaks80_323 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_323_T2_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_323 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_323_T2_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_323, peaksdb_323]
index = ['peaks80_323', 'peaksdb_323']

# concatenate dataframes
tot_323 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_323 = tot_323.loc[:, ~tot_323.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_323', 'peaksdb_323']
tot_323.insert(loc=0, column='data source', value=names)
tot_323.set_index('data source')

# sum the AAs and PTMs

index = ['323']

data = {'NAAF': tot_323['NAAF'].sum(),
        'A-NAAF': tot_323['A'].sum(),
        'C-NAAF': tot_323['C'].sum(),
        'D-NAAF': tot_323['D'].sum(),
        'E-NAAF': tot_323['E'].sum(),
        'F-NAAF': tot_323['F'].sum(),
        'G-NAAF': tot_323['G'].sum(),
        'H-NAAF': tot_323['H'].sum(),
        'I-NAAF': tot_323['I'].sum(),
        'K-NAAF': tot_323['K'].sum(),
        'L-NAAF': tot_323['L'].sum(),
        'M-NAAF': tot_323['M'].sum(),
        'N-NAAF': tot_323['N'].sum(),
        'P-NAAF': tot_323['P'].sum(),
        'Q-NAAF': tot_323['Q'].sum(),
        'R-NAAF': tot_323['R'].sum(),
        'S-NAAF': tot_323['S'].sum(),
        'T-NAAF': tot_323['T'].sum(),
        'V-NAAF': tot_323['V'].sum(),
        'W-NAAF': tot_323['W'].sum(),
        'Y-NAAF': tot_323['Y'].sum(),
        'C-carb-NAAF': tot_323['c-carb'].sum(),
        'M-oxid-NAAF': tot_323['m-oxid'].sum(),
        'K-oxid-NAAF': tot_323['k-oxid'].sum(),
        'P-oxid-NAAF': tot_323['p-oxid'].sum(),
        'R-oxid-NAAF': tot_323['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_323['y-oxid'].sum(),
        'N-deam-NAAF': tot_323['n-deam'].sum(),
        'K-meth-NAAF': tot_323['k-meth'].sum(),
        'R-meth-NAAF': tot_323['r-meth'].sum(),
        'Q-pyro-NAAF': tot_323['q-pyro'].sum(),
        'K-acet-NAAF': tot_323['k-acet'].sum()
       }

sum_323 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_323['AA-NAAF'] = sum_323['A-NAAF'] + sum_323['C-NAAF'] + sum_323['D-NAAF'] + sum_323['E-NAAF'] + \
                     sum_323['F-NAAF'] + sum_323['G-NAAF'] + sum_323['H-NAAF'] + sum_323['I-NAAF'] + \
                     sum_323['K-NAAF'] + sum_323['L-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                     sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + sum_323['S-NAAF'] + \
                     sum_323['T-NAAF'] + sum_323['V-NAAF'] + sum_323['W-NAAF'] + sum_323['Y-NAAF'] 

sum_323['AA-modifiable-NAAF'] = sum_323['K-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                                sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + \
                                sum_323['Y-NAAF']

sum_323['Cys. w/ carb.'] = sum_323['C-carb-NAAF'] / sum_323['C-NAAF'] #1
sum_323['Met. w/ oxid.'] = sum_323['M-oxid-NAAF'] / sum_323['M-NAAF'] #2
sum_323['Lys. w/ oxid.'] = sum_323['K-oxid-NAAF'] / sum_323['K-NAAF'] #3
sum_323['Pro. w/ oxid.'] = sum_323['P-oxid-NAAF'] / sum_323['P-NAAF'] #4
sum_323['Arg. w/ oxid.'] = sum_323['R-oxid-NAAF'] / sum_323['R-NAAF'] #5
sum_323['Tyr. w/ oxid.'] = sum_323['Y-oxid-NAAF'] / sum_323['Y-NAAF'] #6
sum_323['Asn. w/ deam.'] = sum_323['N-deam-NAAF'] / sum_323['N-NAAF'] #7
sum_323['Lys. w/ meth.'] = sum_323['K-meth-NAAF'] / sum_323['K-NAAF'] #8
sum_323['Arg. w/ meth.'] = sum_323['R-meth-NAAF'] / sum_323['R-NAAF'] #9
sum_323['Glu. w/ pyro.'] = sum_323['Q-pyro-NAAF'] / sum_323['Q-NAAF'] #10
sum_323['Lys. w/ acet.'] = sum_323['K-acet-NAAF'] / sum_323['K-NAAF'] #11

sum_323['Overall modified'] = 0.001*((sum_323['Met. w/ oxid.']*sum_323['M-NAAF']) + \
                                     (sum_323['Lys. w/ oxid.']*sum_323['K-NAAF']) + \
                                     (sum_323['Pro. w/ oxid.']*sum_323['P-NAAF']) + \
                                     (sum_323['Arg. w/ oxid.']*sum_323['R-NAAF']) + \
                                     (sum_323['Tyr. w/ oxid.']*sum_323['Y-NAAF']) + \
                                     (sum_323['Asn. w/ deam.']*sum_323['N-NAAF']) + \
                                     (sum_323['Lys. w/ meth.']*sum_323['K-NAAF']) + \
                                     (sum_323['Arg. w/ meth.']*sum_323['R-NAAF']) + \
                                     (sum_323['Glu. w/ pyro.']*sum_323['Q-NAAF']) + \
                                     (sum_323['Lys. w/ acet.']*sum_323['K-NAAF']) / (sum_323['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_323.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-323.csv")

sum_323.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
323,7133.308355,3881.416898,716.713925,2424.339348,5453.444338,1180.023355,3016.917688,516.18734,676.119865,4373.144017,...,0.106449,0.180364,0.096542,0.158877,0.286193,0.080027,0.0323,0.164729,0.142387,4.276548


In [4]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 324 T5 digested

peaks80_324 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_324_T5_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_324 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_324_T5_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_324, peaksdb_324]
index = ['peaks80_324', 'peaksdb_324']

# concatenate dataframes
tot_324 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_324 = tot_324.loc[:, ~tot_324.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_324', 'peaksdb_324']
tot_324.insert(loc=0, column='data source', value=names)
tot_324.set_index('data source')

# sum the AAs and PTMs

index = ['324']

data = {'NAAF': tot_324['NAAF'].sum(),
        'A-NAAF': tot_324['A'].sum(),
        'C-NAAF': tot_324['C'].sum(),
        'D-NAAF': tot_324['D'].sum(),
        'E-NAAF': tot_324['E'].sum(),
        'F-NAAF': tot_324['F'].sum(),
        'G-NAAF': tot_324['G'].sum(),
        'H-NAAF': tot_324['H'].sum(),
        'I-NAAF': tot_324['I'].sum(),
        'K-NAAF': tot_324['K'].sum(),
        'L-NAAF': tot_324['L'].sum(),
        'M-NAAF': tot_324['M'].sum(),
        'N-NAAF': tot_324['N'].sum(),
        'P-NAAF': tot_324['P'].sum(),
        'Q-NAAF': tot_324['Q'].sum(),
        'R-NAAF': tot_324['R'].sum(),
        'S-NAAF': tot_324['S'].sum(),
        'T-NAAF': tot_324['T'].sum(),
        'V-NAAF': tot_324['V'].sum(),
        'W-NAAF': tot_324['W'].sum(),
        'Y-NAAF': tot_324['Y'].sum(),
        'C-carb-NAAF': tot_324['c-carb'].sum(),
        'M-oxid-NAAF': tot_324['m-oxid'].sum(),
        'K-oxid-NAAF': tot_324['k-oxid'].sum(),
        'P-oxid-NAAF': tot_324['p-oxid'].sum(),
        'R-oxid-NAAF': tot_324['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_324['y-oxid'].sum(),
        'N-deam-NAAF': tot_324['n-deam'].sum(),
        'K-meth-NAAF': tot_324['k-meth'].sum(),
        'R-meth-NAAF': tot_324['r-meth'].sum(),
        'Q-pyro-NAAF': tot_324['q-pyro'].sum(),
        'K-acet-NAAF': tot_324['k-acet'].sum()
       }

sum_324 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_324['AA-NAAF'] = sum_324['A-NAAF'] + sum_324['C-NAAF'] + sum_324['D-NAAF'] + sum_324['E-NAAF'] + \
                     sum_324['F-NAAF'] + sum_324['G-NAAF'] + sum_324['H-NAAF'] + sum_324['I-NAAF'] + \
                     sum_324['K-NAAF'] + sum_324['L-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                     sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + sum_324['S-NAAF'] + \
                     sum_324['T-NAAF'] + sum_324['V-NAAF'] + sum_324['W-NAAF'] + sum_324['Y-NAAF'] 

sum_324['AA-modifiable-NAAF'] = sum_324['K-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                                sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + \
                                sum_324['Y-NAAF']

sum_324['Cys. w/ carb.'] = sum_324['C-carb-NAAF'] / sum_324['C-NAAF'] #1
sum_324['Met. w/ oxid.'] = sum_324['M-oxid-NAAF'] / sum_324['M-NAAF'] #2
sum_324['Lys. w/ oxid.'] = sum_324['K-oxid-NAAF'] / sum_324['K-NAAF'] #3
sum_324['Pro. w/ oxid.'] = sum_324['P-oxid-NAAF'] / sum_324['P-NAAF'] #4
sum_324['Arg. w/ oxid.'] = sum_324['R-oxid-NAAF'] / sum_324['R-NAAF'] #5
sum_324['Tyr. w/ oxid.'] = sum_324['Y-oxid-NAAF'] / sum_324['Y-NAAF'] #6
sum_324['Asn. w/ deam.'] = sum_324['N-deam-NAAF'] / sum_324['N-NAAF'] #7
sum_324['Lys. w/ meth.'] = sum_324['K-meth-NAAF'] / sum_324['K-NAAF'] #8
sum_324['Arg. w/ meth.'] = sum_324['R-meth-NAAF'] / sum_324['R-NAAF'] #9
sum_324['Glu. w/ pyro.'] = sum_324['Q-pyro-NAAF'] / sum_324['Q-NAAF'] #10
sum_324['Lys. w/ acet.'] = sum_324['K-acet-NAAF'] / sum_324['K-NAAF'] #11

sum_324['Overall modified'] = 0.001*((sum_324['Met. w/ oxid.']*sum_324['M-NAAF']) + \
                                     (sum_324['Lys. w/ oxid.']*sum_324['K-NAAF']) + \
                                     (sum_324['Pro. w/ oxid.']*sum_324['P-NAAF']) + \
                                     (sum_324['Arg. w/ oxid.']*sum_324['R-NAAF']) + \
                                     (sum_324['Tyr. w/ oxid.']*sum_324['Y-NAAF']) + \
                                     (sum_324['Asn. w/ deam.']*sum_324['N-NAAF']) + \
                                     (sum_324['Lys. w/ meth.']*sum_324['K-NAAF']) + \
                                     (sum_324['Arg. w/ meth.']*sum_324['R-NAAF']) + \
                                     (sum_324['Glu. w/ pyro.']*sum_324['Q-NAAF']) + \
                                     (sum_324['Lys. w/ acet.']*sum_324['K-NAAF']) / (sum_324['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-324.csv")

sum_324.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
324,5084.325647,3056.788644,219.868175,2189.512665,3601.122169,1774.418803,3483.013518,362.410636,515.555154,2586.820736,...,0.098796,0.145497,0.134163,0.326853,0.450363,0.043161,0.029921,0.030896,0.11829,3.990935


In [5]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 325 T12 digested

peaks80_325 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_325_T12_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_325 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_325_T12_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_325, peaksdb_325]
index = ['peaks80_325', 'peaksdb_325']

# concatenate dataframes
tot_325 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_325 = tot_325.loc[:, ~tot_325.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_325', 'peaksdb_325']
tot_325.insert(loc=0, column='data source', value=names)
tot_325.set_index('data source')

# sum the AAs and PTMs

index = ['325']

data = {'NAAF': tot_325['NAAF'].sum(),
        'A-NAAF': tot_325['A'].sum(),
        'C-NAAF': tot_325['C'].sum(),
        'D-NAAF': tot_325['D'].sum(),
        'E-NAAF': tot_325['E'].sum(),
        'F-NAAF': tot_325['F'].sum(),
        'G-NAAF': tot_325['G'].sum(),
        'H-NAAF': tot_325['H'].sum(),
        'I-NAAF': tot_325['I'].sum(),
        'K-NAAF': tot_325['K'].sum(),
        'L-NAAF': tot_325['L'].sum(),
        'M-NAAF': tot_325['M'].sum(),
        'N-NAAF': tot_325['N'].sum(),
        'P-NAAF': tot_325['P'].sum(),
        'Q-NAAF': tot_325['Q'].sum(),
        'R-NAAF': tot_325['R'].sum(),
        'S-NAAF': tot_325['S'].sum(),
        'T-NAAF': tot_325['T'].sum(),
        'V-NAAF': tot_325['V'].sum(),
        'W-NAAF': tot_325['W'].sum(),
        'Y-NAAF': tot_325['Y'].sum(),
        'C-carb-NAAF': tot_325['c-carb'].sum(),
        'M-oxid-NAAF': tot_325['m-oxid'].sum(),
        'K-oxid-NAAF': tot_325['k-oxid'].sum(),
        'P-oxid-NAAF': tot_325['p-oxid'].sum(),
        'R-oxid-NAAF': tot_325['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_325['y-oxid'].sum(),
        'N-deam-NAAF': tot_325['n-deam'].sum(),
        'K-meth-NAAF': tot_325['k-meth'].sum(),
        'R-meth-NAAF': tot_325['r-meth'].sum(),
        'Q-pyro-NAAF': tot_325['q-pyro'].sum(),
        'K-acet-NAAF': tot_325['k-acet'].sum()
       }

sum_325 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_325['AA-NAAF'] = sum_325['A-NAAF'] + sum_325['C-NAAF'] + sum_325['D-NAAF'] + sum_325['E-NAAF'] + \
                     sum_325['F-NAAF'] + sum_325['G-NAAF'] + sum_325['H-NAAF'] + sum_325['I-NAAF'] + \
                     sum_325['K-NAAF'] + sum_325['L-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                     sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + sum_325['S-NAAF'] + \
                     sum_325['T-NAAF'] + sum_325['V-NAAF'] + sum_325['W-NAAF'] + sum_325['Y-NAAF'] 

sum_325['AA-modifiable-NAAF'] = sum_325['K-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                                sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + \
                                sum_325['Y-NAAF']

sum_325['Cys. w/ carb.'] = sum_325['C-carb-NAAF'] / sum_325['C-NAAF'] #1
sum_325['Met. w/ oxid.'] = sum_325['M-oxid-NAAF'] / sum_325['M-NAAF'] #2
sum_325['Lys. w/ oxid.'] = sum_325['K-oxid-NAAF'] / sum_325['K-NAAF'] #3
sum_325['Pro. w/ oxid.'] = sum_325['P-oxid-NAAF'] / sum_325['P-NAAF'] #4
sum_325['Arg. w/ oxid.'] = sum_325['R-oxid-NAAF'] / sum_325['R-NAAF'] #5
sum_325['Tyr. w/ oxid.'] = sum_325['Y-oxid-NAAF'] / sum_325['Y-NAAF'] #6
sum_325['Asn. w/ deam.'] = sum_325['N-deam-NAAF'] / sum_325['N-NAAF'] #7
sum_325['Lys. w/ meth.'] = sum_325['K-meth-NAAF'] / sum_325['K-NAAF'] #8
sum_325['Arg. w/ meth.'] = sum_325['R-meth-NAAF'] / sum_325['R-NAAF'] #9
sum_325['Glu. w/ pyro.'] = sum_325['Q-pyro-NAAF'] / sum_325['Q-NAAF'] #10
sum_325['Lys. w/ acet.'] = sum_325['K-acet-NAAF'] / sum_325['K-NAAF'] #11

sum_325['Overall modified'] = 0.001*((sum_325['Met. w/ oxid.']*sum_325['M-NAAF']) + \
                                     (sum_325['Lys. w/ oxid.']*sum_325['K-NAAF']) + \
                                     (sum_325['Pro. w/ oxid.']*sum_325['P-NAAF']) + \
                                     (sum_325['Arg. w/ oxid.']*sum_325['R-NAAF']) + \
                                     (sum_325['Tyr. w/ oxid.']*sum_325['Y-NAAF']) + \
                                     (sum_325['Asn. w/ deam.']*sum_325['N-NAAF']) + \
                                     (sum_325['Lys. w/ meth.']*sum_325['K-NAAF']) + \
                                     (sum_325['Arg. w/ meth.']*sum_325['R-NAAF']) + \
                                     (sum_325['Glu. w/ pyro.']*sum_325['Q-NAAF']) + \
                                     (sum_325['Lys. w/ acet.']*sum_325['K-NAAF']) / (sum_325['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_325.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-325.csv")

sum_325.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
325,6536.522107,3887.913626,208.235777,3510.888104,4197.227523,2514.903533,4182.793845,289.667273,271.810998,3400.09906,...,0.08189,0.153991,0.133268,0.226007,0.351315,0.221314,0.040063,0.060431,0.144682,5.170757


In [6]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 329 T0 digested

peaks80_329 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_329_T0_undigested_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_329 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_329_T0_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_329, peaksdb_329]
index = ['peaks80_329', 'peaksdb_329']

# concatenate dataframes
tot_329 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_329 = tot_329.loc[:, ~tot_329.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_329', 'peaksdb_329']
tot_329.insert(loc=0, column='data source', value=names)
tot_329.set_index('data source')

# sum the AAs and PTMs

index = ['329']

data = {'NAAF': tot_329['NAAF'].sum(),
        'A-NAAF': tot_329['A'].sum(),
        'C-NAAF': tot_329['C'].sum(),
        'D-NAAF': tot_329['D'].sum(),
        'E-NAAF': tot_329['E'].sum(),
        'F-NAAF': tot_329['F'].sum(),
        'G-NAAF': tot_329['G'].sum(),
        'H-NAAF': tot_329['H'].sum(),
        'I-NAAF': tot_329['I'].sum(),
        'K-NAAF': tot_329['K'].sum(),
        'L-NAAF': tot_329['L'].sum(),
        'M-NAAF': tot_329['M'].sum(),
        'N-NAAF': tot_329['N'].sum(),
        'P-NAAF': tot_329['P'].sum(),
        'Q-NAAF': tot_329['Q'].sum(),
        'R-NAAF': tot_329['R'].sum(),
        'S-NAAF': tot_329['S'].sum(),
        'T-NAAF': tot_329['T'].sum(),
        'V-NAAF': tot_329['V'].sum(),
        'W-NAAF': tot_329['W'].sum(),
        'Y-NAAF': tot_329['Y'].sum(),
        'C-carb-NAAF': tot_329['c-carb'].sum(),
        'M-oxid-NAAF': tot_329['m-oxid'].sum(),
        'K-oxid-NAAF': tot_329['k-oxid'].sum(),
        'P-oxid-NAAF': tot_329['p-oxid'].sum(),
        'R-oxid-NAAF': tot_329['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_329['y-oxid'].sum(),
        'N-deam-NAAF': tot_329['n-deam'].sum(),
        'K-meth-NAAF': tot_329['k-meth'].sum(),
        'R-meth-NAAF': tot_329['r-meth'].sum(),
        'Q-pyro-NAAF': tot_329['q-pyro'].sum(),
        'K-acet-NAAF': tot_329['k-acet'].sum()
       }

sum_329 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_329['AA-NAAF'] = sum_329['A-NAAF'] + sum_329['C-NAAF'] + sum_329['D-NAAF'] + sum_329['E-NAAF'] + \
                     sum_329['F-NAAF'] + sum_329['G-NAAF'] + sum_329['H-NAAF'] + sum_329['I-NAAF'] + \
                     sum_329['K-NAAF'] + sum_329['L-NAAF'] + sum_329['M-NAAF'] + sum_329['N-NAAF'] + \
                     sum_329['P-NAAF'] + sum_329['Q-NAAF'] + sum_329['R-NAAF'] + sum_329['S-NAAF'] + \
                     sum_329['T-NAAF'] + sum_329['V-NAAF'] + sum_329['W-NAAF'] + sum_329['Y-NAAF'] 

sum_329['AA-modifiable-NAAF'] = sum_329['K-NAAF'] + sum_329['M-NAAF'] + sum_329['N-NAAF'] + \
                                sum_329['P-NAAF'] + sum_329['Q-NAAF'] + sum_329['R-NAAF'] + \
                                sum_329['Y-NAAF']

sum_329['Cys. w/ carb.'] = sum_329['C-carb-NAAF'] / sum_329['C-NAAF'] #1
sum_329['Met. w/ oxid.'] = sum_329['M-oxid-NAAF'] / sum_329['M-NAAF'] #2
sum_329['Lys. w/ oxid.'] = sum_329['K-oxid-NAAF'] / sum_329['K-NAAF'] #3
sum_329['Pro. w/ oxid.'] = sum_329['P-oxid-NAAF'] / sum_329['P-NAAF'] #4
sum_329['Arg. w/ oxid.'] = sum_329['R-oxid-NAAF'] / sum_329['R-NAAF'] #5
sum_329['Tyr. w/ oxid.'] = sum_329['Y-oxid-NAAF'] / sum_329['Y-NAAF'] #6
sum_329['Asn. w/ deam.'] = sum_329['N-deam-NAAF'] / sum_329['N-NAAF'] #7
sum_329['Lys. w/ meth.'] = sum_329['K-meth-NAAF'] / sum_329['K-NAAF'] #8
sum_329['Arg. w/ meth.'] = sum_329['R-meth-NAAF'] / sum_329['R-NAAF'] #9
sum_329['Glu. w/ pyro.'] = sum_329['Q-pyro-NAAF'] / sum_329['Q-NAAF'] #10
sum_329['Lys. w/ acet.'] = sum_329['K-acet-NAAF'] / sum_329['K-NAAF'] #11

sum_329['Overall modified'] = 0.001*((sum_329['Met. w/ oxid.']*sum_329['M-NAAF']) + \
                                     (sum_329['Lys. w/ oxid.']*sum_329['K-NAAF']) + \
                                     (sum_329['Pro. w/ oxid.']*sum_329['P-NAAF']) + \
                                     (sum_329['Arg. w/ oxid.']*sum_329['R-NAAF']) + \
                                     (sum_329['Tyr. w/ oxid.']*sum_329['Y-NAAF']) + \
                                     (sum_329['Asn. w/ deam.']*sum_329['N-NAAF']) + \
                                     (sum_329['Lys. w/ meth.']*sum_329['K-NAAF']) + \
                                     (sum_329['Arg. w/ meth.']*sum_329['R-NAAF']) + \
                                     (sum_329['Glu. w/ pyro.']*sum_329['Q-NAAF']) + \
                                     (sum_329['Lys. w/ acet.']*sum_329['K-NAAF']) / (sum_329['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_329.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-329.csv")

sum_329.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
329,2234.39741,1123.188318,63.150623,1623.563531,1590.882011,873.970296,2359.117168,685.668095,237.602283,819.104048,...,0.056681,0.186296,0.093682,0.182296,0.594858,0.118455,0.077425,0.024906,0.052905,1.974838


In [7]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 330 T2 digested

peaks80_330 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_330_T2_undigested_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_330 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_330_T2_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_330, peaksdb_330]
index = ['peaks80_330', 'peaksdb_330']

# concatenate dataframes
tot_330 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_330 = tot_330.loc[:, ~tot_330.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_330', 'peaksdb_330']
tot_330.insert(loc=0, column='data source', value=names)
tot_330.set_index('data source')

# sum the AAs and PTMs

index = ['330']

data = {'NAAF': tot_330['NAAF'].sum(),
        'A-NAAF': tot_330['A'].sum(),
        'C-NAAF': tot_330['C'].sum(),
        'D-NAAF': tot_330['D'].sum(),
        'E-NAAF': tot_330['E'].sum(),
        'F-NAAF': tot_330['F'].sum(),
        'G-NAAF': tot_330['G'].sum(),
        'H-NAAF': tot_330['H'].sum(),
        'I-NAAF': tot_330['I'].sum(),
        'K-NAAF': tot_330['K'].sum(),
        'L-NAAF': tot_330['L'].sum(),
        'M-NAAF': tot_330['M'].sum(),
        'N-NAAF': tot_330['N'].sum(),
        'P-NAAF': tot_330['P'].sum(),
        'Q-NAAF': tot_330['Q'].sum(),
        'R-NAAF': tot_330['R'].sum(),
        'S-NAAF': tot_330['S'].sum(),
        'T-NAAF': tot_330['T'].sum(),
        'V-NAAF': tot_330['V'].sum(),
        'W-NAAF': tot_330['W'].sum(),
        'Y-NAAF': tot_330['Y'].sum(),
        'C-carb-NAAF': tot_330['c-carb'].sum(),
        'M-oxid-NAAF': tot_330['m-oxid'].sum(),
        'K-oxid-NAAF': tot_330['k-oxid'].sum(),
        'P-oxid-NAAF': tot_330['p-oxid'].sum(),
        'R-oxid-NAAF': tot_330['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_330['y-oxid'].sum(),
        'N-deam-NAAF': tot_330['n-deam'].sum(),
        'K-meth-NAAF': tot_330['k-meth'].sum(),
        'R-meth-NAAF': tot_330['r-meth'].sum(),
        'Q-pyro-NAAF': tot_330['q-pyro'].sum(),
        'K-acet-NAAF': tot_330['k-acet'].sum()
       }

sum_330 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_330['AA-NAAF'] = sum_330['A-NAAF'] + sum_330['C-NAAF'] + sum_330['D-NAAF'] + sum_330['E-NAAF'] + \
                     sum_330['F-NAAF'] + sum_330['G-NAAF'] + sum_330['H-NAAF'] + sum_330['I-NAAF'] + \
                     sum_330['K-NAAF'] + sum_330['L-NAAF'] + sum_330['M-NAAF'] + sum_330['N-NAAF'] + \
                     sum_330['P-NAAF'] + sum_330['Q-NAAF'] + sum_330['R-NAAF'] + sum_330['S-NAAF'] + \
                     sum_330['T-NAAF'] + sum_330['V-NAAF'] + sum_330['W-NAAF'] + sum_330['Y-NAAF'] 

sum_330['AA-modifiable-NAAF'] = sum_330['K-NAAF'] + sum_330['M-NAAF'] + sum_330['N-NAAF'] + \
                                sum_330['P-NAAF'] + sum_330['Q-NAAF'] + sum_330['R-NAAF'] + \
                                sum_330['Y-NAAF']

sum_330['Cys. w/ carb.'] = sum_330['C-carb-NAAF'] / sum_330['C-NAAF'] #1
sum_330['Met. w/ oxid.'] = sum_330['M-oxid-NAAF'] / sum_330['M-NAAF'] #2
sum_330['Lys. w/ oxid.'] = sum_330['K-oxid-NAAF'] / sum_330['K-NAAF'] #3
sum_330['Pro. w/ oxid.'] = sum_330['P-oxid-NAAF'] / sum_330['P-NAAF'] #4
sum_330['Arg. w/ oxid.'] = sum_330['R-oxid-NAAF'] / sum_330['R-NAAF'] #5
sum_330['Tyr. w/ oxid.'] = sum_330['Y-oxid-NAAF'] / sum_330['Y-NAAF'] #6
sum_330['Asn. w/ deam.'] = sum_330['N-deam-NAAF'] / sum_330['N-NAAF'] #7
sum_330['Lys. w/ meth.'] = sum_330['K-meth-NAAF'] / sum_330['K-NAAF'] #8
sum_330['Arg. w/ meth.'] = sum_330['R-meth-NAAF'] / sum_330['R-NAAF'] #9
sum_330['Glu. w/ pyro.'] = sum_330['Q-pyro-NAAF'] / sum_330['Q-NAAF'] #10
sum_330['Lys. w/ acet.'] = sum_330['K-acet-NAAF'] / sum_330['K-NAAF'] #11

sum_330['Overall modified'] = 0.001*((sum_330['Met. w/ oxid.']*sum_330['M-NAAF']) + \
                                     (sum_330['Lys. w/ oxid.']*sum_330['K-NAAF']) + \
                                     (sum_330['Pro. w/ oxid.']*sum_330['P-NAAF']) + \
                                     (sum_330['Arg. w/ oxid.']*sum_330['R-NAAF']) + \
                                     (sum_330['Tyr. w/ oxid.']*sum_330['Y-NAAF']) + \
                                     (sum_330['Asn. w/ deam.']*sum_330['N-NAAF']) + \
                                     (sum_330['Lys. w/ meth.']*sum_330['K-NAAF']) + \
                                     (sum_330['Arg. w/ meth.']*sum_330['R-NAAF']) + \
                                     (sum_330['Glu. w/ pyro.']*sum_330['Q-NAAF']) + \
                                     (sum_330['Lys. w/ acet.']*sum_330['K-NAAF']) / (sum_330['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_330.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-330.csv")

sum_330.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
330,6252.706883,3329.100337,74.515359,3490.610652,6801.840911,1721.391465,4316.290718,904.005889,1034.571071,3416.158147,...,0.091989,0.169389,0.061901,0.14588,0.263626,0.131154,0.105596,0.156316,0.102325,4.343973


In [8]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 331 T5 digested

peaks80_331 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_331_T5_undigested_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_331 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_331_T5_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_331, peaksdb_331]
index = ['peaks80_331', 'peaksdb_331']

# concatenate dataframes
tot_331 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_331 = tot_331.loc[:, ~tot_331.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_331', 'peaksdb_331']
tot_331.insert(loc=0, column='data source', value=names)
tot_331.set_index('data source')

# sum the AAs and PTMs

index = ['331']

data = {'NAAF': tot_331['NAAF'].sum(),
        'A-NAAF': tot_331['A'].sum(),
        'C-NAAF': tot_331['C'].sum(),
        'D-NAAF': tot_331['D'].sum(),
        'E-NAAF': tot_331['E'].sum(),
        'F-NAAF': tot_331['F'].sum(),
        'G-NAAF': tot_331['G'].sum(),
        'H-NAAF': tot_331['H'].sum(),
        'I-NAAF': tot_331['I'].sum(),
        'K-NAAF': tot_331['K'].sum(),
        'L-NAAF': tot_331['L'].sum(),
        'M-NAAF': tot_331['M'].sum(),
        'N-NAAF': tot_331['N'].sum(),
        'P-NAAF': tot_331['P'].sum(),
        'Q-NAAF': tot_331['Q'].sum(),
        'R-NAAF': tot_331['R'].sum(),
        'S-NAAF': tot_331['S'].sum(),
        'T-NAAF': tot_331['T'].sum(),
        'V-NAAF': tot_331['V'].sum(),
        'W-NAAF': tot_331['W'].sum(),
        'Y-NAAF': tot_331['Y'].sum(),
        'C-carb-NAAF': tot_331['c-carb'].sum(),
        'M-oxid-NAAF': tot_331['m-oxid'].sum(),
        'K-oxid-NAAF': tot_331['k-oxid'].sum(),
        'P-oxid-NAAF': tot_331['p-oxid'].sum(),
        'R-oxid-NAAF': tot_331['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_331['y-oxid'].sum(),
        'N-deam-NAAF': tot_331['n-deam'].sum(),
        'K-meth-NAAF': tot_331['k-meth'].sum(),
        'R-meth-NAAF': tot_331['r-meth'].sum(),
        'Q-pyro-NAAF': tot_331['q-pyro'].sum(),
        'K-acet-NAAF': tot_331['k-acet'].sum()
       }

sum_331 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_331['AA-NAAF'] = sum_331['A-NAAF'] + sum_331['C-NAAF'] + sum_331['D-NAAF'] + sum_331['E-NAAF'] + \
                     sum_331['F-NAAF'] + sum_331['G-NAAF'] + sum_331['H-NAAF'] + sum_331['I-NAAF'] + \
                     sum_331['K-NAAF'] + sum_331['L-NAAF'] + sum_331['M-NAAF'] + sum_331['N-NAAF'] + \
                     sum_331['P-NAAF'] + sum_331['Q-NAAF'] + sum_331['R-NAAF'] + sum_331['S-NAAF'] + \
                     sum_331['T-NAAF'] + sum_331['V-NAAF'] + sum_331['W-NAAF'] + sum_331['Y-NAAF'] 

sum_331['AA-modifiable-NAAF'] = sum_331['K-NAAF'] + sum_331['M-NAAF'] + sum_331['N-NAAF'] + \
                                sum_331['P-NAAF'] + sum_331['Q-NAAF'] + sum_331['R-NAAF'] + \
                                sum_331['Y-NAAF']

sum_331['Cys. w/ carb.'] = sum_331['C-carb-NAAF'] / sum_331['C-NAAF'] #1
sum_331['Met. w/ oxid.'] = sum_331['M-oxid-NAAF'] / sum_331['M-NAAF'] #2
sum_331['Lys. w/ oxid.'] = sum_331['K-oxid-NAAF'] / sum_331['K-NAAF'] #3
sum_331['Pro. w/ oxid.'] = sum_331['P-oxid-NAAF'] / sum_331['P-NAAF'] #4
sum_331['Arg. w/ oxid.'] = sum_331['R-oxid-NAAF'] / sum_331['R-NAAF'] #5
sum_331['Tyr. w/ oxid.'] = sum_331['Y-oxid-NAAF'] / sum_331['Y-NAAF'] #6
sum_331['Asn. w/ deam.'] = sum_331['N-deam-NAAF'] / sum_331['N-NAAF'] #7
sum_331['Lys. w/ meth.'] = sum_331['K-meth-NAAF'] / sum_331['K-NAAF'] #8
sum_331['Arg. w/ meth.'] = sum_331['R-meth-NAAF'] / sum_331['R-NAAF'] #9
sum_331['Glu. w/ pyro.'] = sum_331['Q-pyro-NAAF'] / sum_331['Q-NAAF'] #10
sum_331['Lys. w/ acet.'] = sum_331['K-acet-NAAF'] / sum_331['K-NAAF'] #11

sum_331['Overall modified'] = 0.001*((sum_331['Met. w/ oxid.']*sum_331['M-NAAF']) + \
                                     (sum_331['Lys. w/ oxid.']*sum_331['K-NAAF']) + \
                                     (sum_331['Pro. w/ oxid.']*sum_331['P-NAAF']) + \
                                     (sum_331['Arg. w/ oxid.']*sum_331['R-NAAF']) + \
                                     (sum_331['Tyr. w/ oxid.']*sum_331['Y-NAAF']) + \
                                     (sum_331['Asn. w/ deam.']*sum_331['N-NAAF']) + \
                                     (sum_331['Lys. w/ meth.']*sum_331['K-NAAF']) + \
                                     (sum_331['Arg. w/ meth.']*sum_331['R-NAAF']) + \
                                     (sum_331['Glu. w/ pyro.']*sum_331['Q-NAAF']) + \
                                     (sum_331['Lys. w/ acet.']*sum_331['K-NAAF']) / (sum_331['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_331.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-331.csv")

sum_331.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
331,2223.087138,553.83124,50.766914,2192.412993,1202.405554,648.249814,2252.599354,1064.718245,268.182024,748.146481,...,0.03878,0.064466,0.068603,0.030288,0.82784,0.092393,0.224072,0.004706,0.102695,1.878637


In [9]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >80% ALC and PeaksDB peptides <1% FDR

# 332 T12 digested

peaks80_332 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_332_T12_undigested_combine_PTMopt_DN80_NAAF_totals.csv")
peaksdb_332 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_332_T12_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

frames = [peaks80_332, peaksdb_332]
index = ['peaks80_332', 'peaksdb_332']

# concatenate dataframes
tot_332 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_332 = tot_332.loc[:, ~tot_332.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks80_332', 'peaksdb_332']
tot_332.insert(loc=0, column='data source', value=names)
tot_332.set_index('data source')

# sum the AAs and PTMs

index = ['332']

data = {'NAAF': tot_332['NAAF'].sum(),
        'A-NAAF': tot_332['A'].sum(),
        'C-NAAF': tot_332['C'].sum(),
        'D-NAAF': tot_332['D'].sum(),
        'E-NAAF': tot_332['E'].sum(),
        'F-NAAF': tot_332['F'].sum(),
        'G-NAAF': tot_332['G'].sum(),
        'H-NAAF': tot_332['H'].sum(),
        'I-NAAF': tot_332['I'].sum(),
        'K-NAAF': tot_332['K'].sum(),
        'L-NAAF': tot_332['L'].sum(),
        'M-NAAF': tot_332['M'].sum(),
        'N-NAAF': tot_332['N'].sum(),
        'P-NAAF': tot_332['P'].sum(),
        'Q-NAAF': tot_332['Q'].sum(),
        'R-NAAF': tot_332['R'].sum(),
        'S-NAAF': tot_332['S'].sum(),
        'T-NAAF': tot_332['T'].sum(),
        'V-NAAF': tot_332['V'].sum(),
        'W-NAAF': tot_332['W'].sum(),
        'Y-NAAF': tot_332['Y'].sum(),
        'C-carb-NAAF': tot_332['c-carb'].sum(),
        'M-oxid-NAAF': tot_332['m-oxid'].sum(),
        'K-oxid-NAAF': tot_332['k-oxid'].sum(),
        'P-oxid-NAAF': tot_332['p-oxid'].sum(),
        'R-oxid-NAAF': tot_332['r-oxid'].sum(),
        'Y-oxid-NAAF': tot_332['y-oxid'].sum(),
        'N-deam-NAAF': tot_332['n-deam'].sum(),
        'K-meth-NAAF': tot_332['k-meth'].sum(),
        'R-meth-NAAF': tot_332['r-meth'].sum(),
        'Q-pyro-NAAF': tot_332['q-pyro'].sum(),
        'K-acet-NAAF': tot_332['k-acet'].sum()
       }

sum_332 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_332['AA-NAAF'] = sum_332['A-NAAF'] + sum_332['C-NAAF'] + sum_332['D-NAAF'] + sum_332['E-NAAF'] + \
                     sum_332['F-NAAF'] + sum_332['G-NAAF'] + sum_332['H-NAAF'] + sum_332['I-NAAF'] + \
                     sum_332['K-NAAF'] + sum_332['L-NAAF'] + sum_332['M-NAAF'] + sum_332['N-NAAF'] + \
                     sum_332['P-NAAF'] + sum_332['Q-NAAF'] + sum_332['R-NAAF'] + sum_332['S-NAAF'] + \
                     sum_332['T-NAAF'] + sum_332['V-NAAF'] + sum_332['W-NAAF'] + sum_332['Y-NAAF'] 

sum_332['AA-modifiable-NAAF'] = sum_332['K-NAAF'] + sum_332['M-NAAF'] + sum_332['N-NAAF'] + \
                                sum_332['P-NAAF'] + sum_332['Q-NAAF'] + sum_332['R-NAAF'] + \
                                sum_332['Y-NAAF']

sum_332['Cys. w/ carb.'] = sum_332['C-carb-NAAF'] / sum_332['C-NAAF'] #1
sum_332['Met. w/ oxid.'] = sum_332['M-oxid-NAAF'] / sum_332['M-NAAF'] #2
sum_332['Lys. w/ oxid.'] = sum_332['K-oxid-NAAF'] / sum_332['K-NAAF'] #3
sum_332['Pro. w/ oxid.'] = sum_332['P-oxid-NAAF'] / sum_332['P-NAAF'] #4
sum_332['Arg. w/ oxid.'] = sum_332['R-oxid-NAAF'] / sum_332['R-NAAF'] #5
sum_332['Tyr. w/ oxid.'] = sum_332['Y-oxid-NAAF'] / sum_332['Y-NAAF'] #6
sum_332['Asn. w/ deam.'] = sum_332['N-deam-NAAF'] / sum_332['N-NAAF'] #7
sum_332['Lys. w/ meth.'] = sum_332['K-meth-NAAF'] / sum_332['K-NAAF'] #8
sum_332['Arg. w/ meth.'] = sum_332['R-meth-NAAF'] / sum_332['R-NAAF'] #9
sum_332['Glu. w/ pyro.'] = sum_332['Q-pyro-NAAF'] / sum_332['Q-NAAF'] #10
sum_332['Lys. w/ acet.'] = sum_332['K-acet-NAAF'] / sum_332['K-NAAF'] #11

sum_332['Overall modified'] = 0.001*((sum_332['Met. w/ oxid.']*sum_332['M-NAAF']) + \
                                     (sum_332['Lys. w/ oxid.']*sum_332['K-NAAF']) + \
                                     (sum_332['Pro. w/ oxid.']*sum_332['P-NAAF']) + \
                                     (sum_332['Arg. w/ oxid.']*sum_332['R-NAAF']) + \
                                     (sum_332['Tyr. w/ oxid.']*sum_332['Y-NAAF']) + \
                                     (sum_332['Asn. w/ deam.']*sum_332['N-NAAF']) + \
                                     (sum_332['Lys. w/ meth.']*sum_332['K-NAAF']) + \
                                     (sum_332['Arg. w/ meth.']*sum_332['R-NAAF']) + \
                                     (sum_332['Glu. w/ pyro.']*sum_332['Q-NAAF']) + \
                                     (sum_332['Lys. w/ acet.']*sum_332['K-NAAF']) / (sum_332['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_332.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-sum-332.csv")

sum_332.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Lys. w/ oxid.,Pro. w/ oxid.,Arg. w/ oxid.,Tyr. w/ oxid.,Asn. w/ deam.,Lys. w/ meth.,Arg. w/ meth.,Glu. w/ pyro.,Lys. w/ acet.,Overall modified
332,2317.399254,912.726454,19.133144,1698.018143,1012.152056,460.563316,2436.328784,958.019426,89.789919,1291.527737,...,0.116658,0.115927,0.101001,0.033437,0.737515,0.119172,0.167419,0.01108,0.125581,2.05964


## NAAF calculations for just PeaksDB peptides (only algal)

In [10]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 322 T0 digested

pdb322 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_322_T0_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb322 = pdb322.loc[:, ~pdb322.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['322']

data = {'NAAF': pdb322['NAAF'].sum(),
        'A-NAAF': pdb322['A'].sum(),
        'C-NAAF': pdb322['C'].sum(),
        'D-NAAF': pdb322['D'].sum(),
        'E-NAAF': pdb322['E'].sum(),
        'F-NAAF': pdb322['F'].sum(),
        'G-NAAF': pdb322['G'].sum(),
        'H-NAAF': pdb322['H'].sum(),
        'I-NAAF': pdb322['I'].sum(),
        'K-NAAF': pdb322['K'].sum(),
        'L-NAAF': pdb322['L'].sum(),
        'M-NAAF': pdb322['M'].sum(),
        'N-NAAF': pdb322['N'].sum(),
        'P-NAAF': pdb322['P'].sum(),
        'Q-NAAF': pdb322['Q'].sum(),
        'R-NAAF': pdb322['R'].sum(),
        'S-NAAF': pdb322['S'].sum(),
        'T-NAAF': pdb322['T'].sum(),
        'V-NAAF': pdb322['V'].sum(),
        'W-NAAF': pdb322['W'].sum(),
        'Y-NAAF': pdb322['Y'].sum(),
        'C-carb-NAAF': pdb322['c-carb'].sum(),
        'M-oxid-NAAF': pdb322['m-oxid'].sum(),
        'K-oxid-NAAF': pdb322['k-oxid'].sum(),
        'P-oxid-NAAF': pdb322['p-oxid'].sum(),
        'R-oxid-NAAF': pdb322['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb322['y-oxid'].sum(),
        'N-deam-NAAF': pdb322['n-deam'].sum(),
        'K-meth-NAAF': pdb322['k-meth'].sum(),
        'R-meth-NAAF': pdb322['r-meth'].sum(),
        'Q-pyro-NAAF': pdb322['q-pyro'].sum(),
        'K-acet-NAAF': pdb322['k-acet'].sum()
       }

sum_322 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_322['AA-NAAF'] = sum_322['A-NAAF'] + sum_322['C-NAAF'] + sum_322['D-NAAF'] + sum_322['E-NAAF'] + \
                     sum_322['F-NAAF'] + sum_322['G-NAAF'] + sum_322['H-NAAF'] + sum_322['I-NAAF'] + \
                     sum_322['K-NAAF'] + sum_322['L-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                     sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + sum_322['S-NAAF'] + \
                     sum_322['T-NAAF'] + sum_322['V-NAAF'] + sum_322['W-NAAF'] + sum_322['Y-NAAF'] 

sum_322['AA-modifiable-NAAF'] = sum_322['K-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                                sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + \
                                sum_322['Y-NAAF']

sum_322['Cys. w/ carb.'] = sum_322['C-carb-NAAF'] / sum_322['C-NAAF'] #1
sum_322['Met. w/ oxid.'] = sum_322['M-oxid-NAAF'] / sum_322['M-NAAF'] #2
sum_322['Lys. w/ oxid.'] = sum_322['K-oxid-NAAF'] / sum_322['K-NAAF'] #3
sum_322['Pro. w/ oxid.'] = sum_322['P-oxid-NAAF'] / sum_322['P-NAAF'] #4
sum_322['Arg. w/ oxid.'] = sum_322['R-oxid-NAAF'] / sum_322['R-NAAF'] #5
sum_322['Tyr. w/ oxid.'] = sum_322['Y-oxid-NAAF'] / sum_322['Y-NAAF'] #6
sum_322['Asn. w/ deam.'] = sum_322['N-deam-NAAF'] / sum_322['N-NAAF'] #7
sum_322['Lys. w/ meth.'] = sum_322['K-meth-NAAF'] / sum_322['K-NAAF'] #8
sum_322['Arg. w/ meth.'] = sum_322['R-meth-NAAF'] / sum_322['R-NAAF'] #9
sum_322['Glu. w/ pyro.'] = sum_322['Q-pyro-NAAF'] / sum_322['Q-NAAF'] #10
sum_322['Lys. w/ acet.'] = sum_322['K-acet-NAAF'] / sum_322['K-NAAF'] #11

sum_322['Overall modified'] = 0.01*((sum_322['Met. w/ oxid.']*sum_322['M-NAAF']) + \
                                     (sum_322['Lys. w/ oxid.']*sum_322['K-NAAF']) + \
                                     (sum_322['Pro. w/ oxid.']*sum_322['P-NAAF']) + \
                                     (sum_322['Arg. w/ oxid.']*sum_322['R-NAAF']) + \
                                     (sum_322['Tyr. w/ oxid.']*sum_322['Y-NAAF']) + \
                                     (sum_322['Asn. w/ deam.']*sum_322['N-NAAF']) + \
                                     (sum_322['Lys. w/ meth.']*sum_322['K-NAAF']) + \
                                     (sum_322['Arg. w/ meth.']*sum_322['R-NAAF']) + \
                                     (sum_322['Glu. w/ pyro.']*sum_322['Q-NAAF']) + \
                                     (sum_322['Lys. w/ acet.']*sum_322['K-NAAF']) / (sum_322['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_322.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-322.csv")

pdb322.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,851.851461,733.850015,22.962069,477.439217,666.681224,183.171804,693.180906,151.1454,620.656959,170.375682,...,8.9e-05,0.005893,0.000648,0.001996,0.001846,0.005538,0.000174,0.002328,0.0,0.001534


In [12]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 323 T0 digested

pdb323 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_323_T2_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb323 = pdb323.loc[:, ~pdb323.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['323']

data = {'NAAF': pdb323['NAAF'].sum(),
        'A-NAAF': pdb323['A'].sum(),
        'C-NAAF': pdb323['C'].sum(),
        'D-NAAF': pdb323['D'].sum(),
        'E-NAAF': pdb323['E'].sum(),
        'F-NAAF': pdb323['F'].sum(),
        'G-NAAF': pdb323['G'].sum(),
        'H-NAAF': pdb323['H'].sum(),
        'I-NAAF': pdb323['I'].sum(),
        'K-NAAF': pdb323['K'].sum(),
        'L-NAAF': pdb323['L'].sum(),
        'M-NAAF': pdb323['M'].sum(),
        'N-NAAF': pdb323['N'].sum(),
        'P-NAAF': pdb323['P'].sum(),
        'Q-NAAF': pdb323['Q'].sum(),
        'R-NAAF': pdb323['R'].sum(),
        'S-NAAF': pdb323['S'].sum(),
        'T-NAAF': pdb323['T'].sum(),
        'V-NAAF': pdb323['V'].sum(),
        'W-NAAF': pdb323['W'].sum(),
        'Y-NAAF': pdb323['Y'].sum(),
        'C-carb-NAAF': pdb323['c-carb'].sum(),
        'M-oxid-NAAF': pdb323['m-oxid'].sum(),
        'K-oxid-NAAF': pdb323['k-oxid'].sum(),
        'P-oxid-NAAF': pdb323['p-oxid'].sum(),
        'R-oxid-NAAF': pdb323['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb323['y-oxid'].sum(),
        'N-deam-NAAF': pdb323['n-deam'].sum(),
        'K-meth-NAAF': pdb323['k-meth'].sum(),
        'R-meth-NAAF': pdb323['r-meth'].sum(),
        'Q-pyro-NAAF': pdb323['q-pyro'].sum(),
        'K-acet-NAAF': pdb323['k-acet'].sum()
       }

sum_323 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_323['AA-NAAF'] = sum_323['A-NAAF'] + sum_323['C-NAAF'] + sum_323['D-NAAF'] + sum_323['E-NAAF'] + \
                     sum_323['F-NAAF'] + sum_323['G-NAAF'] + sum_323['H-NAAF'] + sum_323['I-NAAF'] + \
                     sum_323['K-NAAF'] + sum_323['L-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                     sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + sum_323['S-NAAF'] + \
                     sum_323['T-NAAF'] + sum_323['V-NAAF'] + sum_323['W-NAAF'] + sum_323['Y-NAAF'] 

sum_323['AA-modifiable-NAAF'] = sum_323['K-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                                sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + \
                                sum_323['Y-NAAF']

sum_323['Cys. w/ carb.'] = sum_323['C-carb-NAAF'] / sum_323['C-NAAF'] #1
sum_323['Met. w/ oxid.'] = sum_323['M-oxid-NAAF'] / sum_323['M-NAAF'] #2
sum_323['Lys. w/ oxid.'] = sum_323['K-oxid-NAAF'] / sum_323['K-NAAF'] #3
sum_323['Pro. w/ oxid.'] = sum_323['P-oxid-NAAF'] / sum_323['P-NAAF'] #4
sum_323['Arg. w/ oxid.'] = sum_323['R-oxid-NAAF'] / sum_323['R-NAAF'] #5
sum_323['Tyr. w/ oxid.'] = sum_323['Y-oxid-NAAF'] / sum_323['Y-NAAF'] #6
sum_323['Asn. w/ deam.'] = sum_323['N-deam-NAAF'] / sum_323['N-NAAF'] #7
sum_323['Lys. w/ meth.'] = sum_323['K-meth-NAAF'] / sum_323['K-NAAF'] #8
sum_323['Arg. w/ meth.'] = sum_323['R-meth-NAAF'] / sum_323['R-NAAF'] #9
sum_323['Glu. w/ pyro.'] = sum_323['Q-pyro-NAAF'] / sum_323['Q-NAAF'] #10
sum_323['Lys. w/ acet.'] = sum_323['K-acet-NAAF'] / sum_323['K-NAAF'] #11

sum_323['Overall modified'] = 0.01*((sum_323['Met. w/ oxid.']*sum_323['M-NAAF']) + \
                                     (sum_323['Lys. w/ oxid.']*sum_323['K-NAAF']) + \
                                     (sum_323['Pro. w/ oxid.']*sum_323['P-NAAF']) + \
                                     (sum_323['Arg. w/ oxid.']*sum_323['R-NAAF']) + \
                                     (sum_323['Tyr. w/ oxid.']*sum_323['Y-NAAF']) + \
                                     (sum_323['Asn. w/ deam.']*sum_323['N-NAAF']) + \
                                     (sum_323['Lys. w/ meth.']*sum_323['K-NAAF']) + \
                                     (sum_323['Arg. w/ meth.']*sum_323['R-NAAF']) + \
                                     (sum_323['Glu. w/ pyro.']*sum_323['Q-NAAF']) + \
                                     (sum_323['Lys. w/ acet.']*sum_323['K-NAAF']) / (sum_323['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_323.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-323.csv")

pdb323.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,1131.317421,908.198809,128.505455,787.068231,1229.543781,210.934371,915.071303,87.108981,676.119865,523.550866,...,0.003028,0.054775,0.025303,0.002159,0.003015,0.003397,0.000586,0.098085,0.0,0.006229


In [13]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 324 T0 digested

pdb324 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_324_T5_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb324 = pdb324.loc[:, ~pdb324.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['324']

data = {'NAAF': pdb324['NAAF'].sum(),
        'A-NAAF': pdb324['A'].sum(),
        'C-NAAF': pdb324['C'].sum(),
        'D-NAAF': pdb324['D'].sum(),
        'E-NAAF': pdb324['E'].sum(),
        'F-NAAF': pdb324['F'].sum(),
        'G-NAAF': pdb324['G'].sum(),
        'H-NAAF': pdb324['H'].sum(),
        'I-NAAF': pdb324['I'].sum(),
        'K-NAAF': pdb324['K'].sum(),
        'L-NAAF': pdb324['L'].sum(),
        'M-NAAF': pdb324['M'].sum(),
        'N-NAAF': pdb324['N'].sum(),
        'P-NAAF': pdb324['P'].sum(),
        'Q-NAAF': pdb324['Q'].sum(),
        'R-NAAF': pdb324['R'].sum(),
        'S-NAAF': pdb324['S'].sum(),
        'T-NAAF': pdb324['T'].sum(),
        'V-NAAF': pdb324['V'].sum(),
        'W-NAAF': pdb324['W'].sum(),
        'Y-NAAF': pdb324['Y'].sum(),
        'C-carb-NAAF': pdb324['c-carb'].sum(),
        'M-oxid-NAAF': pdb324['m-oxid'].sum(),
        'K-oxid-NAAF': pdb324['k-oxid'].sum(),
        'P-oxid-NAAF': pdb324['p-oxid'].sum(),
        'R-oxid-NAAF': pdb324['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb324['y-oxid'].sum(),
        'N-deam-NAAF': pdb324['n-deam'].sum(),
        'K-meth-NAAF': pdb324['k-meth'].sum(),
        'R-meth-NAAF': pdb324['r-meth'].sum(),
        'Q-pyro-NAAF': pdb324['q-pyro'].sum(),
        'K-acet-NAAF': pdb324['k-acet'].sum()
       }

sum_324 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_324['AA-NAAF'] = sum_324['A-NAAF'] + sum_324['C-NAAF'] + sum_324['D-NAAF'] + sum_324['E-NAAF'] + \
                     sum_324['F-NAAF'] + sum_324['G-NAAF'] + sum_324['H-NAAF'] + sum_324['I-NAAF'] + \
                     sum_324['K-NAAF'] + sum_324['L-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                     sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + sum_324['S-NAAF'] + \
                     sum_324['T-NAAF'] + sum_324['V-NAAF'] + sum_324['W-NAAF'] + sum_324['Y-NAAF'] 

sum_324['AA-modifiable-NAAF'] = sum_324['K-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                                sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + \
                                sum_324['Y-NAAF']

sum_324['Cys. w/ carb.'] = sum_324['C-carb-NAAF'] / sum_324['C-NAAF'] #1
sum_324['Met. w/ oxid.'] = sum_324['M-oxid-NAAF'] / sum_324['M-NAAF'] #2
sum_324['Lys. w/ oxid.'] = sum_324['K-oxid-NAAF'] / sum_324['K-NAAF'] #3
sum_324['Pro. w/ oxid.'] = sum_324['P-oxid-NAAF'] / sum_324['P-NAAF'] #4
sum_324['Arg. w/ oxid.'] = sum_324['R-oxid-NAAF'] / sum_324['R-NAAF'] #5
sum_324['Tyr. w/ oxid.'] = sum_324['Y-oxid-NAAF'] / sum_324['Y-NAAF'] #6
sum_324['Asn. w/ deam.'] = sum_324['N-deam-NAAF'] / sum_324['N-NAAF'] #7
sum_324['Lys. w/ meth.'] = sum_324['K-meth-NAAF'] / sum_324['K-NAAF'] #8
sum_324['Arg. w/ meth.'] = sum_324['R-meth-NAAF'] / sum_324['R-NAAF'] #9
sum_324['Glu. w/ pyro.'] = sum_324['Q-pyro-NAAF'] / sum_324['Q-NAAF'] #10
sum_324['Lys. w/ acet.'] = sum_324['K-acet-NAAF'] / sum_324['K-NAAF'] #11

sum_324['Overall modified'] = 0.01*((sum_324['Met. w/ oxid.']*sum_324['M-NAAF']) + \
                                     (sum_324['Lys. w/ oxid.']*sum_324['K-NAAF']) + \
                                     (sum_324['Pro. w/ oxid.']*sum_324['P-NAAF']) + \
                                     (sum_324['Arg. w/ oxid.']*sum_324['R-NAAF']) + \
                                     (sum_324['Tyr. w/ oxid.']*sum_324['Y-NAAF']) + \
                                     (sum_324['Asn. w/ deam.']*sum_324['N-NAAF']) + \
                                     (sum_324['Lys. w/ meth.']*sum_324['K-NAAF']) + \
                                     (sum_324['Arg. w/ meth.']*sum_324['R-NAAF']) + \
                                     (sum_324['Glu. w/ pyro.']*sum_324['Q-NAAF']) + \
                                     (sum_324['Lys. w/ acet.']*sum_324['K-NAAF']) / (sum_324['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-324.csv")

pdb324.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,584.281766,376.13093,13.1916,539.033398,577.390814,243.405679,557.256446,126.870579,515.555154,195.228236,...,0.00035,0.000814,4.2e-05,0.001352,0.003747,0.002527,0.0,9.2e-05,0.000342,0.000664


In [14]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 325 T0 digested

pdb325 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_325_T12_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb325 = pdb325.loc[:, ~pdb325.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['325']

data = {'NAAF': pdb325['NAAF'].sum(),
        'A-NAAF': pdb325['A'].sum(),
        'C-NAAF': pdb325['C'].sum(),
        'D-NAAF': pdb325['D'].sum(),
        'E-NAAF': pdb325['E'].sum(),
        'F-NAAF': pdb325['F'].sum(),
        'G-NAAF': pdb325['G'].sum(),
        'H-NAAF': pdb325['H'].sum(),
        'I-NAAF': pdb325['I'].sum(),
        'K-NAAF': pdb325['K'].sum(),
        'L-NAAF': pdb325['L'].sum(),
        'M-NAAF': pdb325['M'].sum(),
        'N-NAAF': pdb325['N'].sum(),
        'P-NAAF': pdb325['P'].sum(),
        'Q-NAAF': pdb325['Q'].sum(),
        'R-NAAF': pdb325['R'].sum(),
        'S-NAAF': pdb325['S'].sum(),
        'T-NAAF': pdb325['T'].sum(),
        'V-NAAF': pdb325['V'].sum(),
        'W-NAAF': pdb325['W'].sum(),
        'Y-NAAF': pdb325['Y'].sum(),
        'C-carb-NAAF': pdb325['c-carb'].sum(),
        'M-oxid-NAAF': pdb325['m-oxid'].sum(),
        'K-oxid-NAAF': pdb325['k-oxid'].sum(),
        'P-oxid-NAAF': pdb325['p-oxid'].sum(),
        'R-oxid-NAAF': pdb325['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb325['y-oxid'].sum(),
        'N-deam-NAAF': pdb325['n-deam'].sum(),
        'K-meth-NAAF': pdb325['k-meth'].sum(),
        'R-meth-NAAF': pdb325['r-meth'].sum(),
        'Q-pyro-NAAF': pdb325['q-pyro'].sum(),
        'K-acet-NAAF': pdb325['k-acet'].sum()
       }

sum_325 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_325['AA-NAAF'] = sum_325['A-NAAF'] + sum_325['C-NAAF'] + sum_325['D-NAAF'] + sum_325['E-NAAF'] + \
                     sum_325['F-NAAF'] + sum_325['G-NAAF'] + sum_325['H-NAAF'] + sum_325['I-NAAF'] + \
                     sum_325['K-NAAF'] + sum_325['L-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                     sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + sum_325['S-NAAF'] + \
                     sum_325['T-NAAF'] + sum_325['V-NAAF'] + sum_325['W-NAAF'] + sum_325['Y-NAAF'] 

sum_325['AA-modifiable-NAAF'] = sum_325['K-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                                sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + \
                                sum_325['Y-NAAF']

sum_325['Cys. w/ carb.'] = sum_325['C-carb-NAAF'] / sum_325['C-NAAF'] #1
sum_325['Met. w/ oxid.'] = sum_325['M-oxid-NAAF'] / sum_325['M-NAAF'] #2
sum_325['Lys. w/ oxid.'] = sum_325['K-oxid-NAAF'] / sum_325['K-NAAF'] #3
sum_325['Pro. w/ oxid.'] = sum_325['P-oxid-NAAF'] / sum_325['P-NAAF'] #4
sum_325['Arg. w/ oxid.'] = sum_325['R-oxid-NAAF'] / sum_325['R-NAAF'] #5
sum_325['Tyr. w/ oxid.'] = sum_325['Y-oxid-NAAF'] / sum_325['Y-NAAF'] #6
sum_325['Asn. w/ deam.'] = sum_325['N-deam-NAAF'] / sum_325['N-NAAF'] #7
sum_325['Lys. w/ meth.'] = sum_325['K-meth-NAAF'] / sum_325['K-NAAF'] #8
sum_325['Arg. w/ meth.'] = sum_325['R-meth-NAAF'] / sum_325['R-NAAF'] #9
sum_325['Glu. w/ pyro.'] = sum_325['Q-pyro-NAAF'] / sum_325['Q-NAAF'] #10
sum_325['Lys. w/ acet.'] = sum_325['K-acet-NAAF'] / sum_325['K-NAAF'] #11

sum_325['Overall modified'] = 0.01*((sum_325['Met. w/ oxid.']*sum_325['M-NAAF']) + \
                                     (sum_325['Lys. w/ oxid.']*sum_325['K-NAAF']) + \
                                     (sum_325['Pro. w/ oxid.']*sum_325['P-NAAF']) + \
                                     (sum_325['Arg. w/ oxid.']*sum_325['R-NAAF']) + \
                                     (sum_325['Tyr. w/ oxid.']*sum_325['Y-NAAF']) + \
                                     (sum_325['Asn. w/ deam.']*sum_325['N-NAAF']) + \
                                     (sum_325['Lys. w/ meth.']*sum_325['K-NAAF']) + \
                                     (sum_325['Arg. w/ meth.']*sum_325['R-NAAF']) + \
                                     (sum_325['Glu. w/ pyro.']*sum_325['Q-NAAF']) + \
                                     (sum_325['Lys. w/ acet.']*sum_325['K-NAAF']) / (sum_325['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_325.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-325.csv")

pdb325.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,334.69276,233.92518,6.47445,272.965901,336.869675,113.748266,365.266197,65.24347,271.810998,94.079027,...,0.001074,2.7e-05,0.0,0.001662,0.003491,0.0,0.0,9.6e-05,0.001017,0.000204


In [17]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 329 T0 digested

pdb329 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_329_T0_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb329 = pdb329.loc[:, ~pdb329.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['329']

data = {'NAAF': pdb329['NAAF'].sum(),
        'A-NAAF': pdb329['A'].sum(),
        'C-NAAF': pdb329['C'].sum(),
        'D-NAAF': pdb329['D'].sum(),
        'E-NAAF': pdb329['E'].sum(),
        'F-NAAF': pdb329['F'].sum(),
        'G-NAAF': pdb329['G'].sum(),
        'H-NAAF': pdb329['H'].sum(),
        'I-NAAF': pdb329['I'].sum(),
        'K-NAAF': pdb329['K'].sum(),
        'L-NAAF': pdb329['L'].sum(),
        'M-NAAF': pdb329['M'].sum(),
        'N-NAAF': pdb329['N'].sum(),
        'P-NAAF': pdb329['P'].sum(),
        'Q-NAAF': pdb329['Q'].sum(),
        'R-NAAF': pdb329['R'].sum(),
        'S-NAAF': pdb329['S'].sum(),
        'T-NAAF': pdb329['T'].sum(),
        'V-NAAF': pdb329['V'].sum(),
        'W-NAAF': pdb329['W'].sum(),
        'Y-NAAF': pdb329['Y'].sum(),
        'C-carb-NAAF': pdb329['c-carb'].sum(),
        'M-oxid-NAAF': pdb329['m-oxid'].sum(),
        'K-oxid-NAAF': pdb329['k-oxid'].sum(),
        'P-oxid-NAAF': pdb329['p-oxid'].sum(),
        'R-oxid-NAAF': pdb329['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb329['y-oxid'].sum(),
        'N-deam-NAAF': pdb329['n-deam'].sum(),
        'K-meth-NAAF': pdb329['k-meth'].sum(),
        'R-meth-NAAF': pdb329['r-meth'].sum(),
        'Q-pyro-NAAF': pdb329['q-pyro'].sum(),
        'K-acet-NAAF': pdb329['k-acet'].sum()
       }

sum_329 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_329['AA-NAAF'] = sum_329['A-NAAF'] + sum_329['C-NAAF'] + sum_329['D-NAAF'] + sum_329['E-NAAF'] + \
                     sum_329['F-NAAF'] + sum_329['G-NAAF'] + sum_329['H-NAAF'] + sum_329['I-NAAF'] + \
                     sum_329['K-NAAF'] + sum_329['L-NAAF'] + sum_329['M-NAAF'] + sum_329['N-NAAF'] + \
                     sum_329['P-NAAF'] + sum_329['Q-NAAF'] + sum_329['R-NAAF'] + sum_329['S-NAAF'] + \
                     sum_329['T-NAAF'] + sum_329['V-NAAF'] + sum_329['W-NAAF'] + sum_329['Y-NAAF'] 

sum_329['AA-modifiable-NAAF'] = sum_329['K-NAAF'] + sum_329['M-NAAF'] + sum_329['N-NAAF'] + \
                                sum_329['P-NAAF'] + sum_329['Q-NAAF'] + sum_329['R-NAAF'] + \
                                sum_329['Y-NAAF']

sum_329['Cys. w/ carb.'] = sum_329['C-carb-NAAF'] / sum_329['C-NAAF'] #1
sum_329['Met. w/ oxid.'] = sum_329['M-oxid-NAAF'] / sum_329['M-NAAF'] #2
sum_329['Lys. w/ oxid.'] = sum_329['K-oxid-NAAF'] / sum_329['K-NAAF'] #3
sum_329['Pro. w/ oxid.'] = sum_329['P-oxid-NAAF'] / sum_329['P-NAAF'] #4
sum_329['Arg. w/ oxid.'] = sum_329['R-oxid-NAAF'] / sum_329['R-NAAF'] #5
sum_329['Tyr. w/ oxid.'] = sum_329['Y-oxid-NAAF'] / sum_329['Y-NAAF'] #6
sum_329['Asn. w/ deam.'] = sum_329['N-deam-NAAF'] / sum_329['N-NAAF'] #7
sum_329['Lys. w/ meth.'] = sum_329['K-meth-NAAF'] / sum_329['K-NAAF'] #8
sum_329['Arg. w/ meth.'] = sum_329['R-meth-NAAF'] / sum_329['R-NAAF'] #9
sum_329['Glu. w/ pyro.'] = sum_329['Q-pyro-NAAF'] / sum_329['Q-NAAF'] #10
sum_329['Lys. w/ acet.'] = sum_329['K-acet-NAAF'] / sum_329['K-NAAF'] #11

sum_329['Overall modified'] = 0.01*((sum_329['Met. w/ oxid.']*sum_329['M-NAAF']) + \
                                     (sum_329['Lys. w/ oxid.']*sum_329['K-NAAF']) + \
                                     (sum_329['Pro. w/ oxid.']*sum_329['P-NAAF']) + \
                                     (sum_329['Arg. w/ oxid.']*sum_329['R-NAAF']) + \
                                     (sum_329['Tyr. w/ oxid.']*sum_329['Y-NAAF']) + \
                                     (sum_329['Asn. w/ deam.']*sum_329['N-NAAF']) + \
                                     (sum_329['Lys. w/ meth.']*sum_329['K-NAAF']) + \
                                     (sum_329['Arg. w/ meth.']*sum_329['R-NAAF']) + \
                                     (sum_329['Glu. w/ pyro.']*sum_329['Q-NAAF']) + \
                                     (sum_329['Lys. w/ acet.']*sum_329['K-NAAF']) / (sum_329['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_329.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-329.csv")

pdb329.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,238.206235,154.716472,0.0,309.460845,188.45147,76.992082,398.145433,96.195838,237.602283,62.717156,...,0.0,0.012373,0.006394,0.000227,0.001677,0.001426,0.0,0.014502,0.0,0.000421


In [18]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 330 T0 digested

pdb330 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_330_T2_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb330 = pdb330.loc[:, ~pdb330.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['330']

data = {'NAAF': pdb330['NAAF'].sum(),
        'A-NAAF': pdb330['A'].sum(),
        'C-NAAF': pdb330['C'].sum(),
        'D-NAAF': pdb330['D'].sum(),
        'E-NAAF': pdb330['E'].sum(),
        'F-NAAF': pdb330['F'].sum(),
        'G-NAAF': pdb330['G'].sum(),
        'H-NAAF': pdb330['H'].sum(),
        'I-NAAF': pdb330['I'].sum(),
        'K-NAAF': pdb330['K'].sum(),
        'L-NAAF': pdb330['L'].sum(),
        'M-NAAF': pdb330['M'].sum(),
        'N-NAAF': pdb330['N'].sum(),
        'P-NAAF': pdb330['P'].sum(),
        'Q-NAAF': pdb330['Q'].sum(),
        'R-NAAF': pdb330['R'].sum(),
        'S-NAAF': pdb330['S'].sum(),
        'T-NAAF': pdb330['T'].sum(),
        'V-NAAF': pdb330['V'].sum(),
        'W-NAAF': pdb330['W'].sum(),
        'Y-NAAF': pdb330['Y'].sum(),
        'C-carb-NAAF': pdb330['c-carb'].sum(),
        'M-oxid-NAAF': pdb330['m-oxid'].sum(),
        'K-oxid-NAAF': pdb330['k-oxid'].sum(),
        'P-oxid-NAAF': pdb330['p-oxid'].sum(),
        'R-oxid-NAAF': pdb330['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb330['y-oxid'].sum(),
        'N-deam-NAAF': pdb330['n-deam'].sum(),
        'K-meth-NAAF': pdb330['k-meth'].sum(),
        'R-meth-NAAF': pdb330['r-meth'].sum(),
        'Q-pyro-NAAF': pdb330['q-pyro'].sum(),
        'K-acet-NAAF': pdb330['k-acet'].sum()
       }

sum_330 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_330['AA-NAAF'] = sum_330['A-NAAF'] + sum_330['C-NAAF'] + sum_330['D-NAAF'] + sum_330['E-NAAF'] + \
                     sum_330['F-NAAF'] + sum_330['G-NAAF'] + sum_330['H-NAAF'] + sum_330['I-NAAF'] + \
                     sum_330['K-NAAF'] + sum_330['L-NAAF'] + sum_330['M-NAAF'] + sum_330['N-NAAF'] + \
                     sum_330['P-NAAF'] + sum_330['Q-NAAF'] + sum_330['R-NAAF'] + sum_330['S-NAAF'] + \
                     sum_330['T-NAAF'] + sum_330['V-NAAF'] + sum_330['W-NAAF'] + sum_330['Y-NAAF'] 

sum_330['AA-modifiable-NAAF'] = sum_330['K-NAAF'] + sum_330['M-NAAF'] + sum_330['N-NAAF'] + \
                                sum_330['P-NAAF'] + sum_330['Q-NAAF'] + sum_330['R-NAAF'] + \
                                sum_330['Y-NAAF']

sum_330['Cys. w/ carb.'] = sum_330['C-carb-NAAF'] / sum_330['C-NAAF'] #1
sum_330['Met. w/ oxid.'] = sum_330['M-oxid-NAAF'] / sum_330['M-NAAF'] #2
sum_330['Lys. w/ oxid.'] = sum_330['K-oxid-NAAF'] / sum_330['K-NAAF'] #3
sum_330['Pro. w/ oxid.'] = sum_330['P-oxid-NAAF'] / sum_330['P-NAAF'] #4
sum_330['Arg. w/ oxid.'] = sum_330['R-oxid-NAAF'] / sum_330['R-NAAF'] #5
sum_330['Tyr. w/ oxid.'] = sum_330['Y-oxid-NAAF'] / sum_330['Y-NAAF'] #6
sum_330['Asn. w/ deam.'] = sum_330['N-deam-NAAF'] / sum_330['N-NAAF'] #7
sum_330['Lys. w/ meth.'] = sum_330['K-meth-NAAF'] / sum_330['K-NAAF'] #8
sum_330['Arg. w/ meth.'] = sum_330['R-meth-NAAF'] / sum_330['R-NAAF'] #9
sum_330['Glu. w/ pyro.'] = sum_330['Q-pyro-NAAF'] / sum_330['Q-NAAF'] #10
sum_330['Lys. w/ acet.'] = sum_330['K-acet-NAAF'] / sum_330['K-NAAF'] #11

sum_330['Overall modified'] = 0.01*((sum_330['Met. w/ oxid.']*sum_330['M-NAAF']) + \
                                     (sum_330['Lys. w/ oxid.']*sum_330['K-NAAF']) + \
                                     (sum_330['Pro. w/ oxid.']*sum_330['P-NAAF']) + \
                                     (sum_330['Arg. w/ oxid.']*sum_330['R-NAAF']) + \
                                     (sum_330['Tyr. w/ oxid.']*sum_330['Y-NAAF']) + \
                                     (sum_330['Asn. w/ deam.']*sum_330['N-NAAF']) + \
                                     (sum_330['Lys. w/ meth.']*sum_330['K-NAAF']) + \
                                     (sum_330['Arg. w/ meth.']*sum_330['R-NAAF']) + \
                                     (sum_330['Glu. w/ pyro.']*sum_330['Q-NAAF']) + \
                                     (sum_330['Lys. w/ acet.']*sum_330['K-NAAF']) / (sum_330['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_330.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-330.csv")

pdb330.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,2313.313155,1343.07377,0.0,1872.380644,2886.464249,792.960391,2129.644478,400.210267,1034.571071,1092.041198,...,0.006309,0.030498,0.047883,0.004658,0.000889,0.011883,0.001193,0.140985,0.001035,0.004271


In [20]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 331 T0 digested

pdb331 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_331_T5_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb331 = pdb331.loc[:, ~pdb331.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['331']

data = {'NAAF': pdb331['NAAF'].sum(),
        'A-NAAF': pdb331['A'].sum(),
        'C-NAAF': pdb331['C'].sum(),
        'D-NAAF': pdb331['D'].sum(),
        'E-NAAF': pdb331['E'].sum(),
        'F-NAAF': pdb331['F'].sum(),
        'G-NAAF': pdb331['G'].sum(),
        'H-NAAF': pdb331['H'].sum(),
        'I-NAAF': pdb331['I'].sum(),
        'K-NAAF': pdb331['K'].sum(),
        'L-NAAF': pdb331['L'].sum(),
        'M-NAAF': pdb331['M'].sum(),
        'N-NAAF': pdb331['N'].sum(),
        'P-NAAF': pdb331['P'].sum(),
        'Q-NAAF': pdb331['Q'].sum(),
        'R-NAAF': pdb331['R'].sum(),
        'S-NAAF': pdb331['S'].sum(),
        'T-NAAF': pdb331['T'].sum(),
        'V-NAAF': pdb331['V'].sum(),
        'W-NAAF': pdb331['W'].sum(),
        'Y-NAAF': pdb331['Y'].sum(),
        'C-carb-NAAF': pdb331['c-carb'].sum(),
        'M-oxid-NAAF': pdb331['m-oxid'].sum(),
        'K-oxid-NAAF': pdb331['k-oxid'].sum(),
        'P-oxid-NAAF': pdb331['p-oxid'].sum(),
        'R-oxid-NAAF': pdb331['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb331['y-oxid'].sum(),
        'N-deam-NAAF': pdb331['n-deam'].sum(),
        'K-meth-NAAF': pdb331['k-meth'].sum(),
        'R-meth-NAAF': pdb331['r-meth'].sum(),
        'Q-pyro-NAAF': pdb331['q-pyro'].sum(),
        'K-acet-NAAF': pdb331['k-acet'].sum()
       }

sum_331 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_331['AA-NAAF'] = sum_331['A-NAAF'] + sum_331['C-NAAF'] + sum_331['D-NAAF'] + sum_331['E-NAAF'] + \
                     sum_331['F-NAAF'] + sum_331['G-NAAF'] + sum_331['H-NAAF'] + sum_331['I-NAAF'] + \
                     sum_331['K-NAAF'] + sum_331['L-NAAF'] + sum_331['M-NAAF'] + sum_331['N-NAAF'] + \
                     sum_331['P-NAAF'] + sum_331['Q-NAAF'] + sum_331['R-NAAF'] + sum_331['S-NAAF'] + \
                     sum_331['T-NAAF'] + sum_331['V-NAAF'] + sum_331['W-NAAF'] + sum_331['Y-NAAF'] 

sum_331['AA-modifiable-NAAF'] = sum_331['K-NAAF'] + sum_331['M-NAAF'] + sum_331['N-NAAF'] + \
                                sum_331['P-NAAF'] + sum_331['Q-NAAF'] + sum_331['R-NAAF'] + \
                                sum_331['Y-NAAF']

sum_331['Cys. w/ carb.'] = sum_331['C-carb-NAAF'] / sum_331['C-NAAF'] #1
sum_331['Met. w/ oxid.'] = sum_331['M-oxid-NAAF'] / sum_331['M-NAAF'] #2
sum_331['Lys. w/ oxid.'] = sum_331['K-oxid-NAAF'] / sum_331['K-NAAF'] #3
sum_331['Pro. w/ oxid.'] = sum_331['P-oxid-NAAF'] / sum_331['P-NAAF'] #4
sum_331['Arg. w/ oxid.'] = sum_331['R-oxid-NAAF'] / sum_331['R-NAAF'] #5
sum_331['Tyr. w/ oxid.'] = sum_331['Y-oxid-NAAF'] / sum_331['Y-NAAF'] #6
sum_331['Asn. w/ deam.'] = sum_331['N-deam-NAAF'] / sum_331['N-NAAF'] #7
sum_331['Lys. w/ meth.'] = sum_331['K-meth-NAAF'] / sum_331['K-NAAF'] #8
sum_331['Arg. w/ meth.'] = sum_331['R-meth-NAAF'] / sum_331['R-NAAF'] #9
sum_331['Glu. w/ pyro.'] = sum_331['Q-pyro-NAAF'] / sum_331['Q-NAAF'] #10
sum_331['Lys. w/ acet.'] = sum_331['K-acet-NAAF'] / sum_331['K-NAAF'] #11

sum_331['Overall modified'] = 0.01*((sum_331['Met. w/ oxid.']*sum_331['M-NAAF']) + \
                                     (sum_331['Lys. w/ oxid.']*sum_331['K-NAAF']) + \
                                     (sum_331['Pro. w/ oxid.']*sum_331['P-NAAF']) + \
                                     (sum_331['Arg. w/ oxid.']*sum_331['R-NAAF']) + \
                                     (sum_331['Tyr. w/ oxid.']*sum_331['Y-NAAF']) + \
                                     (sum_331['Asn. w/ deam.']*sum_331['N-NAAF']) + \
                                     (sum_331['Lys. w/ meth.']*sum_331['K-NAAF']) + \
                                     (sum_331['Arg. w/ meth.']*sum_331['R-NAAF']) + \
                                     (sum_331['Glu. w/ pyro.']*sum_331['Q-NAAF']) + \
                                     (sum_331['Lys. w/ acet.']*sum_331['K-NAAF']) / (sum_331['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_331.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-331.csv")

pdb331.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,229.477809,85.254523,0.006299,418.088294,165.118479,71.621912,275.375368,107.876616,268.182024,61.329375,...,0.0,0.000529,0.005436,0.001053,0.001951,0.0,0.0,0.001541,0.0,0.000183


In [21]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made PeaksDB (de novo assisted database search) notebooks
# bringing NAAF corrected AA and PTM totals from PeaksDB peptides <1% FDR  

# 332 T0 digested

pdb332 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_332_T12_undigested_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

# there's a useless column in there
pdb332 = pdb332.loc[:, ~pdb332.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['332']

data = {'NAAF': pdb332['NAAF'].sum(),
        'A-NAAF': pdb332['A'].sum(),
        'C-NAAF': pdb332['C'].sum(),
        'D-NAAF': pdb332['D'].sum(),
        'E-NAAF': pdb332['E'].sum(),
        'F-NAAF': pdb332['F'].sum(),
        'G-NAAF': pdb332['G'].sum(),
        'H-NAAF': pdb332['H'].sum(),
        'I-NAAF': pdb332['I'].sum(),
        'K-NAAF': pdb332['K'].sum(),
        'L-NAAF': pdb332['L'].sum(),
        'M-NAAF': pdb332['M'].sum(),
        'N-NAAF': pdb332['N'].sum(),
        'P-NAAF': pdb332['P'].sum(),
        'Q-NAAF': pdb332['Q'].sum(),
        'R-NAAF': pdb332['R'].sum(),
        'S-NAAF': pdb332['S'].sum(),
        'T-NAAF': pdb332['T'].sum(),
        'V-NAAF': pdb332['V'].sum(),
        'W-NAAF': pdb332['W'].sum(),
        'Y-NAAF': pdb332['Y'].sum(),
        'C-carb-NAAF': pdb332['c-carb'].sum(),
        'M-oxid-NAAF': pdb332['m-oxid'].sum(),
        'K-oxid-NAAF': pdb332['k-oxid'].sum(),
        'P-oxid-NAAF': pdb332['p-oxid'].sum(),
        'R-oxid-NAAF': pdb332['r-oxid'].sum(),
        'Y-oxid-NAAF': pdb332['y-oxid'].sum(),
        'N-deam-NAAF': pdb332['n-deam'].sum(),
        'K-meth-NAAF': pdb332['k-meth'].sum(),
        'R-meth-NAAF': pdb332['r-meth'].sum(),
        'Q-pyro-NAAF': pdb332['q-pyro'].sum(),
        'K-acet-NAAF': pdb332['k-acet'].sum()
       }

sum_332 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_332['AA-NAAF'] = sum_332['A-NAAF'] + sum_332['C-NAAF'] + sum_332['D-NAAF'] + sum_332['E-NAAF'] + \
                     sum_332['F-NAAF'] + sum_332['G-NAAF'] + sum_332['H-NAAF'] + sum_332['I-NAAF'] + \
                     sum_332['K-NAAF'] + sum_332['L-NAAF'] + sum_332['M-NAAF'] + sum_332['N-NAAF'] + \
                     sum_332['P-NAAF'] + sum_332['Q-NAAF'] + sum_332['R-NAAF'] + sum_332['S-NAAF'] + \
                     sum_332['T-NAAF'] + sum_332['V-NAAF'] + sum_332['W-NAAF'] + sum_332['Y-NAAF'] 

sum_332['AA-modifiable-NAAF'] = sum_332['K-NAAF'] + sum_332['M-NAAF'] + sum_332['N-NAAF'] + \
                                sum_332['P-NAAF'] + sum_332['Q-NAAF'] + sum_332['R-NAAF'] + \
                                sum_332['Y-NAAF']

sum_332['Cys. w/ carb.'] = sum_332['C-carb-NAAF'] / sum_332['C-NAAF'] #1
sum_332['Met. w/ oxid.'] = sum_332['M-oxid-NAAF'] / sum_332['M-NAAF'] #2
sum_332['Lys. w/ oxid.'] = sum_332['K-oxid-NAAF'] / sum_332['K-NAAF'] #3
sum_332['Pro. w/ oxid.'] = sum_332['P-oxid-NAAF'] / sum_332['P-NAAF'] #4
sum_332['Arg. w/ oxid.'] = sum_332['R-oxid-NAAF'] / sum_332['R-NAAF'] #5
sum_332['Tyr. w/ oxid.'] = sum_332['Y-oxid-NAAF'] / sum_332['Y-NAAF'] #6
sum_332['Asn. w/ deam.'] = sum_332['N-deam-NAAF'] / sum_332['N-NAAF'] #7
sum_332['Lys. w/ meth.'] = sum_332['K-meth-NAAF'] / sum_332['K-NAAF'] #8
sum_332['Arg. w/ meth.'] = sum_332['R-meth-NAAF'] / sum_332['R-NAAF'] #9
sum_332['Glu. w/ pyro.'] = sum_332['Q-pyro-NAAF'] / sum_332['Q-NAAF'] #10
sum_332['Lys. w/ acet.'] = sum_332['K-acet-NAAF'] / sum_332['K-NAAF'] #11

sum_332['Overall modified'] = 0.01*((sum_332['Met. w/ oxid.']*sum_332['M-NAAF']) + \
                                     (sum_332['Lys. w/ oxid.']*sum_332['K-NAAF']) + \
                                     (sum_332['Pro. w/ oxid.']*sum_332['P-NAAF']) + \
                                     (sum_332['Arg. w/ oxid.']*sum_332['R-NAAF']) + \
                                     (sum_332['Tyr. w/ oxid.']*sum_332['Y-NAAF']) + \
                                     (sum_332['Asn. w/ deam.']*sum_332['N-NAAF']) + \
                                     (sum_332['Lys. w/ meth.']*sum_332['K-NAAF']) + \
                                     (sum_332['Arg. w/ meth.']*sum_332['R-NAAF']) + \
                                     (sum_332['Glu. w/ pyro.']*sum_332['Q-NAAF']) + \
                                     (sum_332['Lys. w/ acet.']*sum_332['K-NAAF']) / (sum_332['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_332.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DB-332.csv")

pdb332.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,52.669367,8.058982,0.0,76.042215,16.6856,3.624799,85.589143,44.473753,89.789919,5.085563,...,0.005949,0.0,0.0,0.0,0.0,0.005949,0.0,0.001947,0.0,3.5e-05


## NAAF calculations for de novo peptides (mainly bacterial)

In [3]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 322 T0 digested

dn322 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_322_T0_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn322 = dn322.loc[:, ~dn322.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['322']

data = {'NAAF': dn322['NAAF'].sum(),
        'A-NAAF': dn322['A'].sum(),
        'C-NAAF': dn322['C'].sum(),
        'D-NAAF': dn322['D'].sum(),
        'E-NAAF': dn322['E'].sum(),
        'F-NAAF': dn322['F'].sum(),
        'G-NAAF': dn322['G'].sum(),
        'H-NAAF': dn322['H'].sum(),
        'I-NAAF': dn322['I'].sum(),
        'K-NAAF': dn322['K'].sum(),
        'L-NAAF': dn322['L'].sum(),
        'M-NAAF': dn322['M'].sum(),
        'N-NAAF': dn322['N'].sum(),
        'P-NAAF': dn322['P'].sum(),
        'Q-NAAF': dn322['Q'].sum(),
        'R-NAAF': dn322['R'].sum(),
        'S-NAAF': dn322['S'].sum(),
        'T-NAAF': dn322['T'].sum(),
        'V-NAAF': dn322['V'].sum(),
        'W-NAAF': dn322['W'].sum(),
        'Y-NAAF': dn322['Y'].sum(),
        'C-carb-NAAF': dn322['c-carb'].sum(),
        'M-oxid-NAAF': dn322['m-oxid'].sum(),
        'K-oxid-NAAF': dn322['k-oxid'].sum(),
        'P-oxid-NAAF': dn322['p-oxid'].sum(),
        'R-oxid-NAAF': dn322['r-oxid'].sum(),
        'Y-oxid-NAAF': dn322['y-oxid'].sum(),
        'N-deam-NAAF': dn322['n-deam'].sum(),
        'K-meth-NAAF': dn322['k-meth'].sum(),
        'R-meth-NAAF': dn322['r-meth'].sum(),
        'Q-pyro-NAAF': dn322['q-pyro'].sum(),
        'K-acet-NAAF': dn322['k-acet'].sum()
       }

sum_322 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_322['AA-NAAF'] = sum_322['A-NAAF'] + sum_322['C-NAAF'] + sum_322['D-NAAF'] + sum_322['E-NAAF'] + \
                     sum_322['F-NAAF'] + sum_322['G-NAAF'] + sum_322['H-NAAF'] + sum_322['I-NAAF'] + \
                     sum_322['K-NAAF'] + sum_322['L-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                     sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + sum_322['S-NAAF'] + \
                     sum_322['T-NAAF'] + sum_322['V-NAAF'] + sum_322['W-NAAF'] + sum_322['Y-NAAF'] 

sum_322['AA-modifiable-NAAF'] = sum_322['K-NAAF'] + sum_322['M-NAAF'] + sum_322['N-NAAF'] + \
                                sum_322['P-NAAF'] + sum_322['Q-NAAF'] + sum_322['R-NAAF'] + \
                                sum_322['Y-NAAF']

sum_322['Cys. w/ carb.'] = sum_322['C-carb-NAAF'] / sum_322['C-NAAF'] #1
sum_322['Met. w/ oxid.'] = sum_322['M-oxid-NAAF'] / sum_322['M-NAAF'] #2
sum_322['Lys. w/ oxid.'] = sum_322['K-oxid-NAAF'] / sum_322['K-NAAF'] #3
sum_322['Pro. w/ oxid.'] = sum_322['P-oxid-NAAF'] / sum_322['P-NAAF'] #4
sum_322['Arg. w/ oxid.'] = sum_322['R-oxid-NAAF'] / sum_322['R-NAAF'] #5
sum_322['Tyr. w/ oxid.'] = sum_322['Y-oxid-NAAF'] / sum_322['Y-NAAF'] #6
sum_322['Asn. w/ deam.'] = sum_322['N-deam-NAAF'] / sum_322['N-NAAF'] #7
sum_322['Lys. w/ meth.'] = sum_322['K-meth-NAAF'] / sum_322['K-NAAF'] #8
sum_322['Arg. w/ meth.'] = sum_322['R-meth-NAAF'] / sum_322['R-NAAF'] #9
sum_322['Glu. w/ pyro.'] = sum_322['Q-pyro-NAAF'] / sum_322['Q-NAAF'] #10
sum_322['Lys. w/ acet.'] = sum_322['K-acet-NAAF'] / sum_322['K-NAAF'] #11

sum_322['Overall modified'] = 0.01*((sum_322['Met. w/ oxid.']*sum_322['M-NAAF']) + \
                                     (sum_322['Lys. w/ oxid.']*sum_322['K-NAAF']) + \
                                     (sum_322['Pro. w/ oxid.']*sum_322['P-NAAF']) + \
                                     (sum_322['Arg. w/ oxid.']*sum_322['R-NAAF']) + \
                                     (sum_322['Tyr. w/ oxid.']*sum_322['Y-NAAF']) + \
                                     (sum_322['Asn. w/ deam.']*sum_322['N-NAAF']) + \
                                     (sum_322['Lys. w/ meth.']*sum_322['K-NAAF']) + \
                                     (sum_322['Arg. w/ meth.']*sum_322['R-NAAF']) + \
                                     (sum_322['Glu. w/ pyro.']*sum_322['Q-NAAF']) + \
                                     (sum_322['Lys. w/ acet.']*sum_322['K-NAAF']) / (sum_322['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_322.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-322.csv")

dn322.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,5762.920456,3749.423046,199.083912,1395.170186,2571.675259,1113.698938,2510.1633,341.968822,0.0,2294.30193,...,0.155192,0.154111,0.068426,0.333813,0.323019,0.135081,0.016791,0.010005,0.121853,0.024648


In [4]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 323 T2 digested

dn323 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_323_T2_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn323 = dn323.loc[:, ~dn323.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['323']

data = {'NAAF': dn323['NAAF'].sum(),
        'A-NAAF': dn323['A'].sum(),
        'C-NAAF': dn323['C'].sum(),
        'D-NAAF': dn323['D'].sum(),
        'E-NAAF': dn323['E'].sum(),
        'F-NAAF': dn323['F'].sum(),
        'G-NAAF': dn323['G'].sum(),
        'H-NAAF': dn323['H'].sum(),
        'I-NAAF': dn323['I'].sum(),
        'K-NAAF': dn323['K'].sum(),
        'L-NAAF': dn323['L'].sum(),
        'M-NAAF': dn323['M'].sum(),
        'N-NAAF': dn323['N'].sum(),
        'P-NAAF': dn323['P'].sum(),
        'Q-NAAF': dn323['Q'].sum(),
        'R-NAAF': dn323['R'].sum(),
        'S-NAAF': dn323['S'].sum(),
        'T-NAAF': dn323['T'].sum(),
        'V-NAAF': dn323['V'].sum(),
        'W-NAAF': dn323['W'].sum(),
        'Y-NAAF': dn323['Y'].sum(),
        'C-carb-NAAF': dn323['c-carb'].sum(),
        'M-oxid-NAAF': dn323['m-oxid'].sum(),
        'K-oxid-NAAF': dn323['k-oxid'].sum(),
        'P-oxid-NAAF': dn323['p-oxid'].sum(),
        'R-oxid-NAAF': dn323['r-oxid'].sum(),
        'Y-oxid-NAAF': dn323['y-oxid'].sum(),
        'N-deam-NAAF': dn323['n-deam'].sum(),
        'K-meth-NAAF': dn323['k-meth'].sum(),
        'R-meth-NAAF': dn323['r-meth'].sum(),
        'Q-pyro-NAAF': dn323['q-pyro'].sum(),
        'K-acet-NAAF': dn323['k-acet'].sum()
       }

sum_323 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_323['AA-NAAF'] = sum_323['A-NAAF'] + sum_323['C-NAAF'] + sum_323['D-NAAF'] + sum_323['E-NAAF'] + \
                     sum_323['F-NAAF'] + sum_323['G-NAAF'] + sum_323['H-NAAF'] + sum_323['I-NAAF'] + \
                     sum_323['K-NAAF'] + sum_323['L-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                     sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + sum_323['S-NAAF'] + \
                     sum_323['T-NAAF'] + sum_323['V-NAAF'] + sum_323['W-NAAF'] + sum_323['Y-NAAF'] 

sum_323['AA-modifiable-NAAF'] = sum_323['K-NAAF'] + sum_323['M-NAAF'] + sum_323['N-NAAF'] + \
                                sum_323['P-NAAF'] + sum_323['Q-NAAF'] + sum_323['R-NAAF'] + \
                                sum_323['Y-NAAF']

sum_323['Cys. w/ carb.'] = sum_323['C-carb-NAAF'] / sum_323['C-NAAF'] #1
sum_323['Met. w/ oxid.'] = sum_323['M-oxid-NAAF'] / sum_323['M-NAAF'] #2
sum_323['Lys. w/ oxid.'] = sum_323['K-oxid-NAAF'] / sum_323['K-NAAF'] #3
sum_323['Pro. w/ oxid.'] = sum_323['P-oxid-NAAF'] / sum_323['P-NAAF'] #4
sum_323['Arg. w/ oxid.'] = sum_323['R-oxid-NAAF'] / sum_323['R-NAAF'] #5
sum_323['Tyr. w/ oxid.'] = sum_323['Y-oxid-NAAF'] / sum_323['Y-NAAF'] #6
sum_323['Asn. w/ deam.'] = sum_323['N-deam-NAAF'] / sum_323['N-NAAF'] #7
sum_323['Lys. w/ meth.'] = sum_323['K-meth-NAAF'] / sum_323['K-NAAF'] #8
sum_323['Arg. w/ meth.'] = sum_323['R-meth-NAAF'] / sum_323['R-NAAF'] #9
sum_323['Glu. w/ pyro.'] = sum_323['Q-pyro-NAAF'] / sum_323['Q-NAAF'] #10
sum_323['Lys. w/ acet.'] = sum_323['K-acet-NAAF'] / sum_323['K-NAAF'] #11

sum_323['Overall modified'] = 0.01*((sum_323['Met. w/ oxid.']*sum_323['M-NAAF']) + \
                                     (sum_323['Lys. w/ oxid.']*sum_323['K-NAAF']) + \
                                     (sum_323['Pro. w/ oxid.']*sum_323['P-NAAF']) + \
                                     (sum_323['Arg. w/ oxid.']*sum_323['R-NAAF']) + \
                                     (sum_323['Tyr. w/ oxid.']*sum_323['Y-NAAF']) + \
                                     (sum_323['Asn. w/ deam.']*sum_323['N-NAAF']) + \
                                     (sum_323['Lys. w/ meth.']*sum_323['K-NAAF']) + \
                                     (sum_323['Arg. w/ meth.']*sum_323['R-NAAF']) + \
                                     (sum_323['Glu. w/ pyro.']*sum_323['Q-NAAF']) + \
                                     (sum_323['Lys. w/ acet.']*sum_323['K-NAAF']) / (sum_323['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_323.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-323.csv")

dn323.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,6001.990934,2973.218088,588.20847,1637.271117,4223.900556,969.088984,2101.846385,429.078359,0.0,3849.593151,...,0.120515,0.199046,0.106455,0.180812,0.354271,0.090449,0.036714,0.188677,0.161752,0.044957


In [5]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 324 T5 digested

dn324 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_324_T5_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn324 = dn324.loc[:, ~dn324.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['324']

data = {'NAAF': dn324['NAAF'].sum(),
        'A-NAAF': dn324['A'].sum(),
        'C-NAAF': dn324['C'].sum(),
        'D-NAAF': dn324['D'].sum(),
        'E-NAAF': dn324['E'].sum(),
        'F-NAAF': dn324['F'].sum(),
        'G-NAAF': dn324['G'].sum(),
        'H-NAAF': dn324['H'].sum(),
        'I-NAAF': dn324['I'].sum(),
        'K-NAAF': dn324['K'].sum(),
        'L-NAAF': dn324['L'].sum(),
        'M-NAAF': dn324['M'].sum(),
        'N-NAAF': dn324['N'].sum(),
        'P-NAAF': dn324['P'].sum(),
        'Q-NAAF': dn324['Q'].sum(),
        'R-NAAF': dn324['R'].sum(),
        'S-NAAF': dn324['S'].sum(),
        'T-NAAF': dn324['T'].sum(),
        'V-NAAF': dn324['V'].sum(),
        'W-NAAF': dn324['W'].sum(),
        'Y-NAAF': dn324['Y'].sum(),
        'C-carb-NAAF': dn324['c-carb'].sum(),
        'M-oxid-NAAF': dn324['m-oxid'].sum(),
        'K-oxid-NAAF': dn324['k-oxid'].sum(),
        'P-oxid-NAAF': dn324['p-oxid'].sum(),
        'R-oxid-NAAF': dn324['r-oxid'].sum(),
        'Y-oxid-NAAF': dn324['y-oxid'].sum(),
        'N-deam-NAAF': dn324['n-deam'].sum(),
        'K-meth-NAAF': dn324['k-meth'].sum(),
        'R-meth-NAAF': dn324['r-meth'].sum(),
        'Q-pyro-NAAF': dn324['q-pyro'].sum(),
        'K-acet-NAAF': dn324['k-acet'].sum()
       }

sum_324 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_324['AA-NAAF'] = sum_324['A-NAAF'] + sum_324['C-NAAF'] + sum_324['D-NAAF'] + sum_324['E-NAAF'] + \
                     sum_324['F-NAAF'] + sum_324['G-NAAF'] + sum_324['H-NAAF'] + sum_324['I-NAAF'] + \
                     sum_324['K-NAAF'] + sum_324['L-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                     sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + sum_324['S-NAAF'] + \
                     sum_324['T-NAAF'] + sum_324['V-NAAF'] + sum_324['W-NAAF'] + sum_324['Y-NAAF'] 

sum_324['AA-modifiable-NAAF'] = sum_324['K-NAAF'] + sum_324['M-NAAF'] + sum_324['N-NAAF'] + \
                                sum_324['P-NAAF'] + sum_324['Q-NAAF'] + sum_324['R-NAAF'] + \
                                sum_324['Y-NAAF']

sum_324['Cys. w/ carb.'] = sum_324['C-carb-NAAF'] / sum_324['C-NAAF'] #1
sum_324['Met. w/ oxid.'] = sum_324['M-oxid-NAAF'] / sum_324['M-NAAF'] #2
sum_324['Lys. w/ oxid.'] = sum_324['K-oxid-NAAF'] / sum_324['K-NAAF'] #3
sum_324['Pro. w/ oxid.'] = sum_324['P-oxid-NAAF'] / sum_324['P-NAAF'] #4
sum_324['Arg. w/ oxid.'] = sum_324['R-oxid-NAAF'] / sum_324['R-NAAF'] #5
sum_324['Tyr. w/ oxid.'] = sum_324['Y-oxid-NAAF'] / sum_324['Y-NAAF'] #6
sum_324['Asn. w/ deam.'] = sum_324['N-deam-NAAF'] / sum_324['N-NAAF'] #7
sum_324['Lys. w/ meth.'] = sum_324['K-meth-NAAF'] / sum_324['K-NAAF'] #8
sum_324['Arg. w/ meth.'] = sum_324['R-meth-NAAF'] / sum_324['R-NAAF'] #9
sum_324['Glu. w/ pyro.'] = sum_324['Q-pyro-NAAF'] / sum_324['Q-NAAF'] #10
sum_324['Lys. w/ acet.'] = sum_324['K-acet-NAAF'] / sum_324['K-NAAF'] #11

sum_324['Overall modified'] = 0.01*((sum_324['Met. w/ oxid.']*sum_324['M-NAAF']) + \
                                     (sum_324['Lys. w/ oxid.']*sum_324['K-NAAF']) + \
                                     (sum_324['Pro. w/ oxid.']*sum_324['P-NAAF']) + \
                                     (sum_324['Arg. w/ oxid.']*sum_324['R-NAAF']) + \
                                     (sum_324['Tyr. w/ oxid.']*sum_324['Y-NAAF']) + \
                                     (sum_324['Asn. w/ deam.']*sum_324['N-NAAF']) + \
                                     (sum_324['Lys. w/ meth.']*sum_324['K-NAAF']) + \
                                     (sum_324['Arg. w/ meth.']*sum_324['R-NAAF']) + \
                                     (sum_324['Glu. w/ pyro.']*sum_324['Q-NAAF']) + \
                                     (sum_324['Lys. w/ acet.']*sum_324['K-NAAF']) / (sum_324['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-324.csv")

dn324.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,4500.043882,2680.657714,206.676576,1650.479266,3023.731355,1531.013124,2925.757071,235.540057,0.0,2391.5925,...,0.106833,0.15544,0.147078,0.35457,0.497422,0.046478,0.032802,0.036876,0.127919,0.020497


In [6]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 325 T12 digested

dn325 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_325_T12_trypsin_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn325 = dn325.loc[:, ~dn325.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['325']

data = {'NAAF': dn325['NAAF'].sum(),
        'A-NAAF': dn325['A'].sum(),
        'C-NAAF': dn325['C'].sum(),
        'D-NAAF': dn325['D'].sum(),
        'E-NAAF': dn325['E'].sum(),
        'F-NAAF': dn325['F'].sum(),
        'G-NAAF': dn325['G'].sum(),
        'H-NAAF': dn325['H'].sum(),
        'I-NAAF': dn325['I'].sum(),
        'K-NAAF': dn325['K'].sum(),
        'L-NAAF': dn325['L'].sum(),
        'M-NAAF': dn325['M'].sum(),
        'N-NAAF': dn325['N'].sum(),
        'P-NAAF': dn325['P'].sum(),
        'Q-NAAF': dn325['Q'].sum(),
        'R-NAAF': dn325['R'].sum(),
        'S-NAAF': dn325['S'].sum(),
        'T-NAAF': dn325['T'].sum(),
        'V-NAAF': dn325['V'].sum(),
        'W-NAAF': dn325['W'].sum(),
        'Y-NAAF': dn325['Y'].sum(),
        'C-carb-NAAF': dn325['c-carb'].sum(),
        'M-oxid-NAAF': dn325['m-oxid'].sum(),
        'K-oxid-NAAF': dn325['k-oxid'].sum(),
        'P-oxid-NAAF': dn325['p-oxid'].sum(),
        'R-oxid-NAAF': dn325['r-oxid'].sum(),
        'Y-oxid-NAAF': dn325['y-oxid'].sum(),
        'N-deam-NAAF': dn325['n-deam'].sum(),
        'K-meth-NAAF': dn325['k-meth'].sum(),
        'R-meth-NAAF': dn325['r-meth'].sum(),
        'Q-pyro-NAAF': dn325['q-pyro'].sum(),
        'K-acet-NAAF': dn325['k-acet'].sum()
       }

sum_325 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_325['AA-NAAF'] = sum_325['A-NAAF'] + sum_325['C-NAAF'] + sum_325['D-NAAF'] + sum_325['E-NAAF'] + \
                     sum_325['F-NAAF'] + sum_325['G-NAAF'] + sum_325['H-NAAF'] + sum_325['I-NAAF'] + \
                     sum_325['K-NAAF'] + sum_325['L-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                     sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + sum_325['S-NAAF'] + \
                     sum_325['T-NAAF'] + sum_325['V-NAAF'] + sum_325['W-NAAF'] + sum_325['Y-NAAF'] 

sum_325['AA-modifiable-NAAF'] = sum_325['K-NAAF'] + sum_325['M-NAAF'] + sum_325['N-NAAF'] + \
                                sum_325['P-NAAF'] + sum_325['Q-NAAF'] + sum_325['R-NAAF'] + \
                                sum_325['Y-NAAF']

sum_325['Cys. w/ carb.'] = sum_325['C-carb-NAAF'] / sum_325['C-NAAF'] #1
sum_325['Met. w/ oxid.'] = sum_325['M-oxid-NAAF'] / sum_325['M-NAAF'] #2
sum_325['Lys. w/ oxid.'] = sum_325['K-oxid-NAAF'] / sum_325['K-NAAF'] #3
sum_325['Pro. w/ oxid.'] = sum_325['P-oxid-NAAF'] / sum_325['P-NAAF'] #4
sum_325['Arg. w/ oxid.'] = sum_325['R-oxid-NAAF'] / sum_325['R-NAAF'] #5
sum_325['Tyr. w/ oxid.'] = sum_325['Y-oxid-NAAF'] / sum_325['Y-NAAF'] #6
sum_325['Asn. w/ deam.'] = sum_325['N-deam-NAAF'] / sum_325['N-NAAF'] #7
sum_325['Lys. w/ meth.'] = sum_325['K-meth-NAAF'] / sum_325['K-NAAF'] #8
sum_325['Arg. w/ meth.'] = sum_325['R-meth-NAAF'] / sum_325['R-NAAF'] #9
sum_325['Glu. w/ pyro.'] = sum_325['Q-pyro-NAAF'] / sum_325['Q-NAAF'] #10
sum_325['Lys. w/ acet.'] = sum_325['K-acet-NAAF'] / sum_325['K-NAAF'] #11

sum_325['Overall modified'] = 0.01*((sum_325['Met. w/ oxid.']*sum_325['M-NAAF']) + \
                                     (sum_325['Lys. w/ oxid.']*sum_325['K-NAAF']) + \
                                     (sum_325['Pro. w/ oxid.']*sum_325['P-NAAF']) + \
                                     (sum_325['Arg. w/ oxid.']*sum_325['R-NAAF']) + \
                                     (sum_325['Tyr. w/ oxid.']*sum_325['Y-NAAF']) + \
                                     (sum_325['Asn. w/ deam.']*sum_325['N-NAAF']) + \
                                     (sum_325['Lys. w/ meth.']*sum_325['K-NAAF']) + \
                                     (sum_325['Arg. w/ meth.']*sum_325['R-NAAF']) + \
                                     (sum_325['Glu. w/ pyro.']*sum_325['Q-NAAF']) + \
                                     (sum_325['Lys. w/ acet.']*sum_325['K-NAAF']) / (sum_325['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_325.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-325.csv")

dn325.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,6201.829347,3653.988447,201.761328,3237.922204,3860.357848,2401.155267,3817.527648,224.423803,0.0,3306.020033,...,0.084189,0.159079,0.139122,0.237646,0.380027,0.227612,0.041823,0.065111,0.14877,0.025466


In [7]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 329 T0 undigested

dn329 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_329_T0_undigested_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn329 = dn329.loc[:, ~dn329.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['329']

data = {'NAAF': dn329['NAAF'].sum(),
        'A-NAAF': dn329['A'].sum(),
        'C-NAAF': dn329['C'].sum(),
        'D-NAAF': dn329['D'].sum(),
        'E-NAAF': dn329['E'].sum(),
        'F-NAAF': dn329['F'].sum(),
        'G-NAAF': dn329['G'].sum(),
        'H-NAAF': dn329['H'].sum(),
        'I-NAAF': dn329['I'].sum(),
        'K-NAAF': dn329['K'].sum(),
        'L-NAAF': dn329['L'].sum(),
        'M-NAAF': dn329['M'].sum(),
        'N-NAAF': dn329['N'].sum(),
        'P-NAAF': dn329['P'].sum(),
        'Q-NAAF': dn329['Q'].sum(),
        'R-NAAF': dn329['R'].sum(),
        'S-NAAF': dn329['S'].sum(),
        'T-NAAF': dn329['T'].sum(),
        'V-NAAF': dn329['V'].sum(),
        'W-NAAF': dn329['W'].sum(),
        'Y-NAAF': dn329['Y'].sum(),
        'C-carb-NAAF': dn329['c-carb'].sum(),
        'M-oxid-NAAF': dn329['m-oxid'].sum(),
        'K-oxid-NAAF': dn329['k-oxid'].sum(),
        'P-oxid-NAAF': dn329['p-oxid'].sum(),
        'R-oxid-NAAF': dn329['r-oxid'].sum(),
        'Y-oxid-NAAF': dn329['y-oxid'].sum(),
        'N-deam-NAAF': dn329['n-deam'].sum(),
        'K-meth-NAAF': dn329['k-meth'].sum(),
        'R-meth-NAAF': dn329['r-meth'].sum(),
        'Q-pyro-NAAF': dn329['q-pyro'].sum(),
        'K-acet-NAAF': dn329['k-acet'].sum()
       }

sum_329 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_329['AA-NAAF'] = sum_329['A-NAAF'] + sum_329['C-NAAF'] + sum_329['D-NAAF'] + sum_329['E-NAAF'] + \
                     sum_329['F-NAAF'] + sum_329['G-NAAF'] + sum_329['H-NAAF'] + sum_329['I-NAAF'] + \
                     sum_329['K-NAAF'] + sum_329['L-NAAF'] + sum_329['M-NAAF'] + sum_329['N-NAAF'] + \
                     sum_329['P-NAAF'] + sum_329['Q-NAAF'] + sum_329['R-NAAF'] + sum_329['S-NAAF'] + \
                     sum_329['T-NAAF'] + sum_329['V-NAAF'] + sum_329['W-NAAF'] + sum_329['Y-NAAF'] 

sum_329['AA-modifiable-NAAF'] = sum_329['K-NAAF'] + sum_329['M-NAAF'] + sum_329['N-NAAF'] + \
                                sum_329['P-NAAF'] + sum_329['Q-NAAF'] + sum_329['R-NAAF'] + \
                                sum_329['Y-NAAF']

sum_329['Cys. w/ carb.'] = sum_329['C-carb-NAAF'] / sum_329['C-NAAF'] #1
sum_329['Met. w/ oxid.'] = sum_329['M-oxid-NAAF'] / sum_329['M-NAAF'] #2
sum_329['Lys. w/ oxid.'] = sum_329['K-oxid-NAAF'] / sum_329['K-NAAF'] #3
sum_329['Pro. w/ oxid.'] = sum_329['P-oxid-NAAF'] / sum_329['P-NAAF'] #4
sum_329['Arg. w/ oxid.'] = sum_329['R-oxid-NAAF'] / sum_329['R-NAAF'] #5
sum_329['Tyr. w/ oxid.'] = sum_329['Y-oxid-NAAF'] / sum_329['Y-NAAF'] #6
sum_329['Asn. w/ deam.'] = sum_329['N-deam-NAAF'] / sum_329['N-NAAF'] #7
sum_329['Lys. w/ meth.'] = sum_329['K-meth-NAAF'] / sum_329['K-NAAF'] #8
sum_329['Arg. w/ meth.'] = sum_329['R-meth-NAAF'] / sum_329['R-NAAF'] #9
sum_329['Glu. w/ pyro.'] = sum_329['Q-pyro-NAAF'] / sum_329['Q-NAAF'] #10
sum_329['Lys. w/ acet.'] = sum_329['K-acet-NAAF'] / sum_329['K-NAAF'] #11

sum_329['Overall modified'] = 0.01*((sum_329['Met. w/ oxid.']*sum_329['M-NAAF']) + \
                                     (sum_329['Lys. w/ oxid.']*sum_329['K-NAAF']) + \
                                     (sum_329['Pro. w/ oxid.']*sum_329['P-NAAF']) + \
                                     (sum_329['Arg. w/ oxid.']*sum_329['R-NAAF']) + \
                                     (sum_329['Tyr. w/ oxid.']*sum_329['Y-NAAF']) + \
                                     (sum_329['Asn. w/ deam.']*sum_329['N-NAAF']) + \
                                     (sum_329['Lys. w/ meth.']*sum_329['K-NAAF']) + \
                                     (sum_329['Arg. w/ meth.']*sum_329['R-NAAF']) + \
                                     (sum_329['Glu. w/ pyro.']*sum_329['Q-NAAF']) + \
                                     (sum_329['Lys. w/ acet.']*sum_329['K-NAAF']) / (sum_329['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_329.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-329.csv")

dn329.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,1996.191175,968.471846,63.150623,1314.102686,1402.430541,796.978215,1960.971736,589.472258,0.0,756.386892,...,0.061381,0.206446,0.104182,0.199393,0.636713,0.128159,0.086739,0.027509,0.057292,0.007941


In [8]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 330 T2 undigested

dn330 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_330_T2_undigested_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn330 = dn330.loc[:, ~dn330.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['330']

data = {'NAAF': dn330['NAAF'].sum(),
        'A-NAAF': dn330['A'].sum(),
        'C-NAAF': dn330['C'].sum(),
        'D-NAAF': dn330['D'].sum(),
        'E-NAAF': dn330['E'].sum(),
        'F-NAAF': dn330['F'].sum(),
        'G-NAAF': dn330['G'].sum(),
        'H-NAAF': dn330['H'].sum(),
        'I-NAAF': dn330['I'].sum(),
        'K-NAAF': dn330['K'].sum(),
        'L-NAAF': dn330['L'].sum(),
        'M-NAAF': dn330['M'].sum(),
        'N-NAAF': dn330['N'].sum(),
        'P-NAAF': dn330['P'].sum(),
        'Q-NAAF': dn330['Q'].sum(),
        'R-NAAF': dn330['R'].sum(),
        'S-NAAF': dn330['S'].sum(),
        'T-NAAF': dn330['T'].sum(),
        'V-NAAF': dn330['V'].sum(),
        'W-NAAF': dn330['W'].sum(),
        'Y-NAAF': dn330['Y'].sum(),
        'C-carb-NAAF': dn330['c-carb'].sum(),
        'M-oxid-NAAF': dn330['m-oxid'].sum(),
        'K-oxid-NAAF': dn330['k-oxid'].sum(),
        'P-oxid-NAAF': dn330['p-oxid'].sum(),
        'R-oxid-NAAF': dn330['r-oxid'].sum(),
        'Y-oxid-NAAF': dn330['y-oxid'].sum(),
        'N-deam-NAAF': dn330['n-deam'].sum(),
        'K-meth-NAAF': dn330['k-meth'].sum(),
        'R-meth-NAAF': dn330['r-meth'].sum(),
        'Q-pyro-NAAF': dn330['q-pyro'].sum(),
        'K-acet-NAAF': dn330['k-acet'].sum()
       }

sum_330 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_330['AA-NAAF'] = sum_330['A-NAAF'] + sum_330['C-NAAF'] + sum_330['D-NAAF'] + sum_330['E-NAAF'] + \
                     sum_330['F-NAAF'] + sum_330['G-NAAF'] + sum_330['H-NAAF'] + sum_330['I-NAAF'] + \
                     sum_330['K-NAAF'] + sum_330['L-NAAF'] + sum_330['M-NAAF'] + sum_330['N-NAAF'] + \
                     sum_330['P-NAAF'] + sum_330['Q-NAAF'] + sum_330['R-NAAF'] + sum_330['S-NAAF'] + \
                     sum_330['T-NAAF'] + sum_330['V-NAAF'] + sum_330['W-NAAF'] + sum_330['Y-NAAF'] 

sum_330['AA-modifiable-NAAF'] = sum_330['K-NAAF'] + sum_330['M-NAAF'] + sum_330['N-NAAF'] + \
                                sum_330['P-NAAF'] + sum_330['Q-NAAF'] + sum_330['R-NAAF'] + \
                                sum_330['Y-NAAF']

sum_330['Cys. w/ carb.'] = sum_330['C-carb-NAAF'] / sum_330['C-NAAF'] #1
sum_330['Met. w/ oxid.'] = sum_330['M-oxid-NAAF'] / sum_330['M-NAAF'] #2
sum_330['Lys. w/ oxid.'] = sum_330['K-oxid-NAAF'] / sum_330['K-NAAF'] #3
sum_330['Pro. w/ oxid.'] = sum_330['P-oxid-NAAF'] / sum_330['P-NAAF'] #4
sum_330['Arg. w/ oxid.'] = sum_330['R-oxid-NAAF'] / sum_330['R-NAAF'] #5
sum_330['Tyr. w/ oxid.'] = sum_330['Y-oxid-NAAF'] / sum_330['Y-NAAF'] #6
sum_330['Asn. w/ deam.'] = sum_330['N-deam-NAAF'] / sum_330['N-NAAF'] #7
sum_330['Lys. w/ meth.'] = sum_330['K-meth-NAAF'] / sum_330['K-NAAF'] #8
sum_330['Arg. w/ meth.'] = sum_330['R-meth-NAAF'] / sum_330['R-NAAF'] #9
sum_330['Glu. w/ pyro.'] = sum_330['Q-pyro-NAAF'] / sum_330['Q-NAAF'] #10
sum_330['Lys. w/ acet.'] = sum_330['K-acet-NAAF'] / sum_330['K-NAAF'] #11

sum_330['Overall modified'] = 0.01*((sum_330['Met. w/ oxid.']*sum_330['M-NAAF']) + \
                                     (sum_330['Lys. w/ oxid.']*sum_330['K-NAAF']) + \
                                     (sum_330['Pro. w/ oxid.']*sum_330['P-NAAF']) + \
                                     (sum_330['Arg. w/ oxid.']*sum_330['R-NAAF']) + \
                                     (sum_330['Tyr. w/ oxid.']*sum_330['Y-NAAF']) + \
                                     (sum_330['Asn. w/ deam.']*sum_330['N-NAAF']) + \
                                     (sum_330['Lys. w/ meth.']*sum_330['K-NAAF']) + \
                                     (sum_330['Arg. w/ meth.']*sum_330['R-NAAF']) + \
                                     (sum_330['Glu. w/ pyro.']*sum_330['Q-NAAF']) + \
                                     (sum_330['Lys. w/ acet.']*sum_330['K-NAAF']) / (sum_330['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_330.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-330.csv")

dn330.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,3939.393728,1986.026567,74.515359,1618.230008,3915.376662,928.431074,2186.64624,503.795621,0.0,2324.116949,...,0.132248,0.23497,0.070494,0.206837,0.372215,0.187196,0.169599,0.169958,0.149918,0.006645


In [9]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 331 T5 undigested

dn331 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_331_T5_undigested_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn331 = dn331.loc[:, ~dn331.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['331']

data = {'NAAF': dn331['NAAF'].sum(),
        'A-NAAF': dn331['A'].sum(),
        'C-NAAF': dn331['C'].sum(),
        'D-NAAF': dn331['D'].sum(),
        'E-NAAF': dn331['E'].sum(),
        'F-NAAF': dn331['F'].sum(),
        'G-NAAF': dn331['G'].sum(),
        'H-NAAF': dn331['H'].sum(),
        'I-NAAF': dn331['I'].sum(),
        'K-NAAF': dn331['K'].sum(),
        'L-NAAF': dn331['L'].sum(),
        'M-NAAF': dn331['M'].sum(),
        'N-NAAF': dn331['N'].sum(),
        'P-NAAF': dn331['P'].sum(),
        'Q-NAAF': dn331['Q'].sum(),
        'R-NAAF': dn331['R'].sum(),
        'S-NAAF': dn331['S'].sum(),
        'T-NAAF': dn331['T'].sum(),
        'V-NAAF': dn331['V'].sum(),
        'W-NAAF': dn331['W'].sum(),
        'Y-NAAF': dn331['Y'].sum(),
        'C-carb-NAAF': dn331['c-carb'].sum(),
        'M-oxid-NAAF': dn331['m-oxid'].sum(),
        'K-oxid-NAAF': dn331['k-oxid'].sum(),
        'P-oxid-NAAF': dn331['p-oxid'].sum(),
        'R-oxid-NAAF': dn331['r-oxid'].sum(),
        'Y-oxid-NAAF': dn331['y-oxid'].sum(),
        'N-deam-NAAF': dn331['n-deam'].sum(),
        'K-meth-NAAF': dn331['k-meth'].sum(),
        'R-meth-NAAF': dn331['r-meth'].sum(),
        'Q-pyro-NAAF': dn331['q-pyro'].sum(),
        'K-acet-NAAF': dn331['k-acet'].sum()
       }

sum_331 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_331['AA-NAAF'] = sum_331['A-NAAF'] + sum_331['C-NAAF'] + sum_331['D-NAAF'] + sum_331['E-NAAF'] + \
                     sum_331['F-NAAF'] + sum_331['G-NAAF'] + sum_331['H-NAAF'] + sum_331['I-NAAF'] + \
                     sum_331['K-NAAF'] + sum_331['L-NAAF'] + sum_331['M-NAAF'] + sum_331['N-NAAF'] + \
                     sum_331['P-NAAF'] + sum_331['Q-NAAF'] + sum_331['R-NAAF'] + sum_331['S-NAAF'] + \
                     sum_331['T-NAAF'] + sum_331['V-NAAF'] + sum_331['W-NAAF'] + sum_331['Y-NAAF'] 

sum_331['AA-modifiable-NAAF'] = sum_331['K-NAAF'] + sum_331['M-NAAF'] + sum_331['N-NAAF'] + \
                                sum_331['P-NAAF'] + sum_331['Q-NAAF'] + sum_331['R-NAAF'] + \
                                sum_331['Y-NAAF']

sum_331['Cys. w/ carb.'] = sum_331['C-carb-NAAF'] / sum_331['C-NAAF'] #1
sum_331['Met. w/ oxid.'] = sum_331['M-oxid-NAAF'] / sum_331['M-NAAF'] #2
sum_331['Lys. w/ oxid.'] = sum_331['K-oxid-NAAF'] / sum_331['K-NAAF'] #3
sum_331['Pro. w/ oxid.'] = sum_331['P-oxid-NAAF'] / sum_331['P-NAAF'] #4
sum_331['Arg. w/ oxid.'] = sum_331['R-oxid-NAAF'] / sum_331['R-NAAF'] #5
sum_331['Tyr. w/ oxid.'] = sum_331['Y-oxid-NAAF'] / sum_331['Y-NAAF'] #6
sum_331['Asn. w/ deam.'] = sum_331['N-deam-NAAF'] / sum_331['N-NAAF'] #7
sum_331['Lys. w/ meth.'] = sum_331['K-meth-NAAF'] / sum_331['K-NAAF'] #8
sum_331['Arg. w/ meth.'] = sum_331['R-meth-NAAF'] / sum_331['R-NAAF'] #9
sum_331['Glu. w/ pyro.'] = sum_331['Q-pyro-NAAF'] / sum_331['Q-NAAF'] #10
sum_331['Lys. w/ acet.'] = sum_331['K-acet-NAAF'] / sum_331['K-NAAF'] #11

sum_331['Overall modified'] = 0.01*((sum_331['Met. w/ oxid.']*sum_331['M-NAAF']) + \
                                     (sum_331['Lys. w/ oxid.']*sum_331['K-NAAF']) + \
                                     (sum_331['Pro. w/ oxid.']*sum_331['P-NAAF']) + \
                                     (sum_331['Arg. w/ oxid.']*sum_331['R-NAAF']) + \
                                     (sum_331['Tyr. w/ oxid.']*sum_331['Y-NAAF']) + \
                                     (sum_331['Asn. w/ deam.']*sum_331['N-NAAF']) + \
                                     (sum_331['Lys. w/ meth.']*sum_331['K-NAAF']) + \
                                     (sum_331['Arg. w/ meth.']*sum_331['R-NAAF']) + \
                                     (sum_331['Glu. w/ pyro.']*sum_331['Q-NAAF']) + \
                                     (sum_331['Lys. w/ acet.']*sum_331['K-NAAF']) / (sum_331['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_331.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-331.csv")

dn331.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,1993.609329,468.576717,50.760615,1774.324699,1037.287076,576.627903,1977.223986,956.841628,0.0,686.817106,...,0.042243,0.0707,0.081032,0.033224,0.858076,0.100644,0.268162,0.005599,0.111865,0.002893


In [10]:
# for each of the 8 POM samples:
# read in NAAF totals csvs made DN (de novo sequencing) notebooks

# 332 T12 undigested

dn332 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDN/TW_332_T12_undigested_combine_PTMopt_DN80_NAAF_totals.csv")

# there's a useless column in there
dn332 = dn332.loc[:, ~dn332.columns.str.contains('^Unnamed')]

# sum the AAs and PTMs

index = ['332']

data = {'NAAF': dn332['NAAF'].sum(),
        'A-NAAF': dn332['A'].sum(),
        'C-NAAF': dn332['C'].sum(),
        'D-NAAF': dn332['D'].sum(),
        'E-NAAF': dn332['E'].sum(),
        'F-NAAF': dn332['F'].sum(),
        'G-NAAF': dn332['G'].sum(),
        'H-NAAF': dn332['H'].sum(),
        'I-NAAF': dn332['I'].sum(),
        'K-NAAF': dn332['K'].sum(),
        'L-NAAF': dn332['L'].sum(),
        'M-NAAF': dn332['M'].sum(),
        'N-NAAF': dn332['N'].sum(),
        'P-NAAF': dn332['P'].sum(),
        'Q-NAAF': dn332['Q'].sum(),
        'R-NAAF': dn332['R'].sum(),
        'S-NAAF': dn332['S'].sum(),
        'T-NAAF': dn332['T'].sum(),
        'V-NAAF': dn332['V'].sum(),
        'W-NAAF': dn332['W'].sum(),
        'Y-NAAF': dn332['Y'].sum(),
        'C-carb-NAAF': dn332['c-carb'].sum(),
        'M-oxid-NAAF': dn332['m-oxid'].sum(),
        'K-oxid-NAAF': dn332['k-oxid'].sum(),
        'P-oxid-NAAF': dn332['p-oxid'].sum(),
        'R-oxid-NAAF': dn332['r-oxid'].sum(),
        'Y-oxid-NAAF': dn332['y-oxid'].sum(),
        'N-deam-NAAF': dn332['n-deam'].sum(),
        'K-meth-NAAF': dn332['k-meth'].sum(),
        'R-meth-NAAF': dn332['r-meth'].sum(),
        'Q-pyro-NAAF': dn332['q-pyro'].sum(),
        'K-acet-NAAF': dn332['k-acet'].sum()
       }

sum_332 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'K-oxid-NAAF', 'P-oxid-NAAF', 'R-oxid-NAAF', \
                                                   'Y-oxid-NAAF', 'N-deam-NAAF', 'K-meth-NAAF', \
                                                   'R-meth-NAAF', 'Q-pyro-NAAF', 'K-acet-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, P, Y, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_332['AA-NAAF'] = sum_332['A-NAAF'] + sum_332['C-NAAF'] + sum_332['D-NAAF'] + sum_332['E-NAAF'] + \
                     sum_332['F-NAAF'] + sum_332['G-NAAF'] + sum_332['H-NAAF'] + sum_332['I-NAAF'] + \
                     sum_332['K-NAAF'] + sum_332['L-NAAF'] + sum_332['M-NAAF'] + sum_332['N-NAAF'] + \
                     sum_332['P-NAAF'] + sum_332['Q-NAAF'] + sum_332['R-NAAF'] + sum_332['S-NAAF'] + \
                     sum_332['T-NAAF'] + sum_332['V-NAAF'] + sum_332['W-NAAF'] + sum_332['Y-NAAF'] 

sum_332['AA-modifiable-NAAF'] = sum_332['K-NAAF'] + sum_332['M-NAAF'] + sum_332['N-NAAF'] + \
                                sum_332['P-NAAF'] + sum_332['Q-NAAF'] + sum_332['R-NAAF'] + \
                                sum_332['Y-NAAF']

sum_332['Cys. w/ carb.'] = sum_332['C-carb-NAAF'] / sum_332['C-NAAF'] #1
sum_332['Met. w/ oxid.'] = sum_332['M-oxid-NAAF'] / sum_332['M-NAAF'] #2
sum_332['Lys. w/ oxid.'] = sum_332['K-oxid-NAAF'] / sum_332['K-NAAF'] #3
sum_332['Pro. w/ oxid.'] = sum_332['P-oxid-NAAF'] / sum_332['P-NAAF'] #4
sum_332['Arg. w/ oxid.'] = sum_332['R-oxid-NAAF'] / sum_332['R-NAAF'] #5
sum_332['Tyr. w/ oxid.'] = sum_332['Y-oxid-NAAF'] / sum_332['Y-NAAF'] #6
sum_332['Asn. w/ deam.'] = sum_332['N-deam-NAAF'] / sum_332['N-NAAF'] #7
sum_332['Lys. w/ meth.'] = sum_332['K-meth-NAAF'] / sum_332['K-NAAF'] #8
sum_332['Arg. w/ meth.'] = sum_332['R-meth-NAAF'] / sum_332['R-NAAF'] #9
sum_332['Glu. w/ pyro.'] = sum_332['Q-pyro-NAAF'] / sum_332['Q-NAAF'] #10
sum_332['Lys. w/ acet.'] = sum_332['K-acet-NAAF'] / sum_332['K-NAAF'] #11

sum_332['Overall modified'] = 0.01*((sum_332['Met. w/ oxid.']*sum_332['M-NAAF']) + \
                                     (sum_332['Lys. w/ oxid.']*sum_332['K-NAAF']) + \
                                     (sum_332['Pro. w/ oxid.']*sum_332['P-NAAF']) + \
                                     (sum_332['Arg. w/ oxid.']*sum_332['R-NAAF']) + \
                                     (sum_332['Tyr. w/ oxid.']*sum_332['Y-NAAF']) + \
                                     (sum_332['Asn. w/ deam.']*sum_332['N-NAAF']) + \
                                     (sum_332['Lys. w/ meth.']*sum_332['K-NAAF']) + \
                                     (sum_332['Arg. w/ meth.']*sum_332['R-NAAF']) + \
                                     (sum_332['Glu. w/ pyro.']*sum_332['Q-NAAF']) + \
                                     (sum_332['Lys. w/ acet.']*sum_332['K-NAAF']) / (sum_332['AA-modifiable-NAAF'])) 
                                                            

# write to a csv in the algae rot data directory
sum_332.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/NAAF-sums/NAAF-DN-332.csv")

dn332.head()

Unnamed: 0,NAAF,A,C,D,E,F,G,H,I,K,...,% K w/ oxid,% P w/ oxid,% R w/ oxid,% Y w/ oxid,% N w/ deam,% K w/ meth,% R w/ meth,% Q w/ pyro,% K w/ acet,NAAF check
0,2264.729888,904.667473,19.133144,1621.975928,995.466456,456.938517,2350.739641,913.545673,0.0,1286.442174,...,0.117096,0.119036,0.101622,0.03479,0.741882,0.119619,0.168447,0.011257,0.126078,0.009374
