### Manipulation of kynurenine PeaskDB de novo-assisted database search results of _T. weisflogii_ rot samples LC-MS/MS data using python.

Starting with:

    PeaksDB search results (.csv) of kynurenine searches
    These were all searched with 15 ppm precursor tolerance and 0.5 ppm fragement ion tolerance
    Multiple injections and fragmentation strategies included
    Exported at <1.0% FDR
    
Goal:

    Files with stripped (no PTMs) peptide lists and
    Columns with #'s of each modification in every sequence
    Column with stripped peptide lengths (# amino acids)
    
### To use for a different file:

#### 1. Change the input file name in *IN 4*
#### 2. Use 'find + replace' (Esc + F) to replace the running # (e.g., 233) for another
#### 3. Update the NAAF factor calculated in *IN 6* into *IN 7*

In [1]:
cd /home/millieginty/Documents/git-repos/rot-mayer/data/MED_Weissrot_Fusion_UWPR2021/kynurenine/MED_Weissrot_Fusion_UWPR2021_PEAKS_127_324/

/home/millieginty/Documents/git-repos/rot-mayer/data/MED_Weissrot_Fusion_UWPR2021/kynurenine/MED_Weissrot_Fusion_UWPR2021_PEAKS_127_324


In [2]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [3]:
ls 

324-T5dig-all_PEAKS127_DB-search-psm.csv
324-T5dig-all_PEAKS127_dno.csv
324-T5dig-all_PEAKS127_peptide.csv
324-T5dig-all_PEAKS127_protein-peptides.csv
324-T5dig-all_PEAKS127_proteins.csv
324-T5dig-all_PEAKS127_proteins.fasta


In [4]:
# read the CSV into a dataframe using the pandas read_csv function
peaksdbdup324 = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/MED_Weissrot_Fusion_UWPR2021/kynurenine/MED_Weissrot_Fusion_UWPR2021_PEAKS_127_324/324-T5dig-all_PEAKS127_peptide.csv")

# remove redundant rows
peaksdb324 = pd.DataFrame.drop_duplicates(peaksdbdup324)

print(peaksdb324.columns)

columns = ['Peptide', '-10lgP', 'Mass', 'Length', 'ppm', 'm/z', 'RT',
       'Area', 'Fraction', 'Scan', 'Source_File',
       '#Spec', '#Spec', 'Accession', 'PTM',
       'AScore']

peaksdb324.columns = columns

#remmove # spec and accession columns because they mess parsing up

del peaksdb324['#Spec']
del peaksdb324['Accession']
del peaksdb324['PTM']
del peaksdb324['AScore']

mean_length = peaksdb324['Length'].mean()
print(mean_length)

print("# redundant peaksdb peptides in combined dataframe", len(peaksdbdup324))
print("# nonredundant peaksdb peptides in combined dataframe", len(peaksdb324))

#look at the dataframe
peaksdb324.head()

Index(['Peptide', '-10lgP', 'Mass', 'Length', 'ppm', 'm/z', 'RT',
       'Area 324_T5_trypsin', 'Fraction', 'Scan', 'Source File', '#Spec',
       '#Spec 324_T5_trypsin', 'Accession', 'PTM', 'AScore'],
      dtype='object')
10.65725806451613
# redundant peaksdb peptides in combined dataframe 496
# nonredundant peaksdb peptides in combined dataframe 496


Unnamed: 0,Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Area,Fraction,Scan,Source_File
0,LPQVEGTGGDVQPSQDLVR,85.77,1994.0068,19,0.8,665.6768,73.27,125000000.0,11,16024,20210114_Weissrot_324_T5_trypsin_EThcD_120min_...
1,AIDLIDEAASSIR,72.53,1372.7197,13,1.3,458.5811,106.62,210000000.0,11,24720,20210114_Weissrot_324_T5_trypsin_EThcD_120min_...
2,VIGQNEAVDAVSNAIR,72.29,1654.8638,16,3.1,552.6302,88.8,111000000.0,11,20058,20210114_Weissrot_324_T5_trypsin_EThcD_120min_...
3,DVPGTGNEFVGDFR,69.47,1508.6896,14,1.0,503.9043,86.45,56800000.0,11,19441,20210114_Weissrot_324_T5_trypsin_EThcD_120min_...
4,VTDAEIAEVLAR,68.89,1285.6877,12,1.1,429.5703,95.26,207000000.0,11,21740,20210114_Weissrot_324_T5_trypsin_EThcD_120min_...


In [5]:
# use a count function to enumerate the # of A's (alanines) in each peptide
peaksdb324['A'] = peaksdb324['Peptide'].str.count("A")

# use a count function to enumerate the # of C's (cysteines) in each peptide
peaksdb324['C'] = peaksdb324['Peptide'].str.count("C")

# use a count function to enumerate the # of D's (aspartic acids) in each peptide
peaksdb324['D'] = peaksdb324['Peptide'].str.count("D")

# use a count function to enumerate the # of E's (glutamic acids) in each peptide
peaksdb324['E'] = peaksdb324['Peptide'].str.count("E")

# use a count function to enumerate the # of F's (phenylalanines) in each peptide
peaksdb324['F'] = peaksdb324['Peptide'].str.count("F")

# use a count function to enumerate the # of G's (glycines) in each peptide
peaksdb324['G'] = peaksdb324['Peptide'].str.count("G")

# use a count function to enumerate the # of H's (histidines) in each peptide
peaksdb324['H'] = peaksdb324['Peptide'].str.count("H")

# use a count function to enumerate the # of I's (isoleucines) in each peptide
# in peaksdb324 output, there will be no isoleucines (they're lumped in with leucines)
peaksdb324['I'] = peaksdb324['Peptide'].str.count("I")

# use a count function to enumerate the # of K's (lysines) in each peptide
peaksdb324['K'] = peaksdb324['Peptide'].str.count("K")

# use a count function to enumerate the # of L's (leucines) in each peptide
# also these include the isoleucines
peaksdb324['L'] = peaksdb324['Peptide'].str.count("L")

# use a count function to enumerate the # of M's (methionines) in each peptide
peaksdb324['M'] = peaksdb324['Peptide'].str.count("M")

# use a count function to enumerate the # of N's (asparagines) in each peptide
peaksdb324['N'] = peaksdb324['Peptide'].str.count("N")

# use a count function to enumerate the # of P's ([prolines]) in each peptide
peaksdb324['P'] = peaksdb324['Peptide'].str.count("P")

# use a count function to enumerate the # of Q's (glutamines) in each peptide
peaksdb324['Q'] = peaksdb324['Peptide'].str.count("Q")

# use a count function to enumerate the # of R's (arginines) in each peptide
peaksdb324['R'] = peaksdb324['Peptide'].str.count("R")

# use a count function to enumerate the # of S's (serines) in each peptide
peaksdb324['S'] = peaksdb324['Peptide'].str.count("S")

# use a count function to enumerate the # of T's (threonines) in each peptide
peaksdb324['T'] = peaksdb324['Peptide'].str.count("T")

# use a count function to enumerate the # of V's (valines) in each peptide
peaksdb324['V'] = peaksdb324['Peptide'].str.count("V")

# use a count function to enumerate the # of W's (tryptophans) in each peptide
peaksdb324['W'] = peaksdb324['Peptide'].str.count("W")

# use a count function to enumerate the # of Y's (tyrosines) in each peptide
peaksdb324['Y'] = peaksdb324['Peptide'].str.count("Y")

# use a count function to enumerate the # of carbamidomethylated C's in each peptide
peaksdb324['c-carb'] = peaksdb324['Peptide'].str.count("57.02")

# use a count function to enumerate the # of oxidized M's in each peptide
peaksdb324['m-oxid'] = peaksdb324['Peptide'].apply(lambda x: x.count('M(+15.99)'))

# use a lamba function to enumerate the # of deamidated N's in each peptide
peaksdb324['n-deam'] = peaksdb324['Peptide'].apply(lambda x: x.count('N(+.98)'))

# use a count function to enumerate the # of acetylation of K's in each peptide
peaksdb324['q-deam'] = peaksdb324['Peptide'].apply(lambda x: x.count('Q(+.98)'))

# use a count function to enumerate the # of acetylation of K's in each peptide
peaksdb324['x-kynu'] = peaksdb324['Peptide'].apply(lambda x: x.count('(+3.99)'))

# create a column with 'stripped' peptide sequences using strip
peaksdb324['stripped_peptide'] = peaksdb324['Peptide'].str.replace(r"\(.*\)","")

# add a column with the stripped peptide length (number of AAs)
peaksdb324['stripped_length'] = peaksdb324['stripped_peptide'].apply(len)

peaksdb324['NAAF_num.'] = peaksdb324['Area'] / peaksdb324['stripped_length']

# total the number of modifications in sequence
peaksdb324['ptm-total'] = peaksdb324['c-carb'] + peaksdb324['m-oxid'] + peaksdb324['n-deam'] + \
peaksdb324['q-deam'] + peaksdb324['x-kynu']

# turn all isoleucines into leucines
# this helps later in comparing Unipept peptides to PeaksDB and Comet ones
peaksdb324['stripped_IL']= peaksdb324['stripped_peptide'].str.replace('I','L')

# write modified dataframe to new txt file
peaksdb324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/kynurenine/324_T5_trypsin_combine_kynur_DB_FDR1.csv")

# check out the results
peaksdb324.head()

Unnamed: 0,Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Area,Fraction,Scan,...,c-carb,m-oxid,n-deam,q-deam,x-kynu,stripped_peptide,stripped_length,NAAF_num.,ptm-total,stripped_IL
0,LPQVEGTGGDVQPSQDLVR,85.77,1994.0068,19,0.8,665.6768,73.27,125000000.0,11,16024,...,0,0,0,0,0,LPQVEGTGGDVQPSQDLVR,19,6578947.0,0,LPQVEGTGGDVQPSQDLVR
1,AIDLIDEAASSIR,72.53,1372.7197,13,1.3,458.5811,106.62,210000000.0,11,24720,...,0,0,0,0,0,AIDLIDEAASSIR,13,16153850.0,0,ALDLLDEAASSLR
2,VIGQNEAVDAVSNAIR,72.29,1654.8638,16,3.1,552.6302,88.8,111000000.0,11,20058,...,0,0,0,0,0,VIGQNEAVDAVSNAIR,16,6937500.0,0,VLGQNEAVDAVSNALR
3,DVPGTGNEFVGDFR,69.47,1508.6896,14,1.0,503.9043,86.45,56800000.0,11,19441,...,0,0,0,0,0,DVPGTGNEFVGDFR,14,4057143.0,0,DVPGTGNEFVGDFR
4,VTDAEIAEVLAR,68.89,1285.6877,12,1.1,429.5703,95.26,207000000.0,11,21740,...,0,0,0,0,0,VTDAEIAEVLAR,12,17250000.0,0,VTDAELAEVLAR


In [6]:
# keep only stripped peptide I/L and NAAF
db_324 = peaksdb324[['stripped_IL', 'Area', 'NAAF_num.']]

db_324.set_index('stripped_IL')

# write modified dataframe to new txt file
db_324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/kynurenine/I-L_NAAFs/T5_324_trypsin_DB_kynur_peptides.csv")

db_324.head()

Unnamed: 0,stripped_IL,Area,NAAF_num.
0,LPQVEGTGGDVQPSQDLVR,125000000.0,6578947.0
1,ALDLLDEAASSLR,210000000.0,16153850.0
2,VLGQNEAVDAVSNALR,111000000.0,6937500.0
3,DVPGTGNEFVGDFR,56800000.0,4057143.0
4,VTDAELAEVLAR,207000000.0,17250000.0


### Exporting txt files of stripped peptides at confidence cutoffs:

In [7]:
# keep only peptide column peptides <1% FDR (this is what we exported)
pep324moddup = peaksdb324[["Peptide"]]

# keep only the stripped peptide column <1% FDR
# this is what we'll use for UniPept input, etc
pep324dup = peaksdb324[["stripped_peptide"]]

# deduplicate both of these lists
pep324mod = pep324moddup.drop_duplicates()
pep324 = pep324dup.drop_duplicates()

# print out the #s of modified and stripped peptides, deduplicated and not

print("Total modified PeaksDB peptides in 324:", len(pep324moddup))
print("Deduplicated modified PeaksDB peptides in 324:", len(pep324mod))
print("Total stripped PeaksDB peptides in 324:", len(pep324dup))
print("Deduplicated stripped PeaksDB peptides in 324:", len(pep324))

# write altered dataframe to new txt file
# used header and index parameters to get rid of 'Peptide' header and the indexing
pep324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/kynurenine/324_T5_trypsin_combine_kynur_DB_FDR1_stripped_peptides.txt", header=False, index=False)

# made the text file into a FASTA 
!awk '{print ">"NR"\n"$0}' /home/millieginty/Documents/git-repos/rot-mayer/data/processed/kynurenine/324_T5_trypsin_combine_kynur_DB_FDR1_stripped_peptides.txt > \
/home/millieginty/Documents/git-repos/rot-mayer/data/processed/kynurenine/324_T5_trypsin_combine_kynur_DB_FDR1_stripped_peptides.fas

# look at the stripped peptides
pep324.head()

Total modified PeaksDB peptides in 324: 496
Deduplicated modified PeaksDB peptides in 324: 496
Total stripped PeaksDB peptides in 324: 496
Deduplicated stripped PeaksDB peptides in 324: 472


Unnamed: 0,stripped_peptide
0,LPQVEGTGGDVQPSQDLVR
1,AIDLIDEAASSIR
2,VIGQNEAVDAVSNAIR
3,DVPGTGNEFVGDFR
4,VTDAEIAEVLAR


## NAAF correction and exporting files with AA and PTM totals:

In [8]:
# made a new dataframe that contains the sums of certain columns in the modified
# peptide dataframe above 

index = ['sample total']

data = {'A': peaksdb324['A'].sum(),
        'C': peaksdb324['C'].sum(),
        'D': peaksdb324['D'].sum(),
        'E': peaksdb324['E'].sum(),
        'F': peaksdb324['F'].sum(),
        'G': peaksdb324['G'].sum(),
        'H': peaksdb324['H'].sum(),
        'I': peaksdb324['I'].sum(),
        'K': peaksdb324['K'].sum(),
        'L': peaksdb324['L'].sum(),
        'M': peaksdb324['M'].sum(),
        'N': peaksdb324['N'].sum(),
        'P': peaksdb324['P'].sum(),
        'Q': peaksdb324['Q'].sum(),
        'R': peaksdb324['R'].sum(),
        'S': peaksdb324['S'].sum(),
        'T': peaksdb324['T'].sum(),
        'V': peaksdb324['V'].sum(),
        'W': peaksdb324['W'].sum(),
        'Y': peaksdb324['Y'].sum(),
        'c-carb': peaksdb324['c-carb'].sum(),
        'm-oxid': peaksdb324['m-oxid'].sum(),
        'n-deam': peaksdb324['n-deam'].sum(),
        'q-deam': peaksdb324['q-deam'].sum(),
        'x-kynu': peaksdb324['x-kynu'].sum(),
        'Total area': peaksdb324['Area'].sum(),
        'Total length': peaksdb324['stripped_length'].sum()
       }

totalpeaksdb324 = pd.DataFrame(data, columns=['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', \
                                              'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', \
                                              'c-carb', 'm-oxid', 'n-deam', 'q-deam', 'x-kynu'], index=index)

# calculate percentage of C's with carb (should be 1.0)
totalpeaksdb324['% C w/ carb'] = totalpeaksdb324['c-carb'] / totalpeaksdb324['C'] 

# calculate percentage of M's that are oxidized
totalpeaksdb324['% M w/ oxid'] = totalpeaksdb324['m-oxid'] / totalpeaksdb324['M'] 

# calculate percentage of N's that are deamidated
totalpeaksdb324['% N w/ deam'] = totalpeaksdb324['n-deam'] / totalpeaksdb324['N'] 

# calculate percentage of N's that are deamidated
totalpeaksdb324['% Q w/ deam'] = totalpeaksdb324['q-deam'] / totalpeaksdb324['Q'] 

# calculate percentage of X's that are deamidated
totalpeaksdb324['% X w/ kyur'] = totalpeaksdb324['x-kynu'] / totalpeaksdb324['N'] 

# calculate NAAF denominator for all peptides in dataset i
#totalpeaksdb324['NAAF denom.'] = peaksdb324['Total area'] / peaksdb324['Total length']

# write modified dataframe to new txt file
totalpeaksdb324.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/kynurenine/324_T5_trypsin_combine_kynur_DB_FDR1_totals.csv")

totalpeaksdb324.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,c-carb,m-oxid,n-deam,q-deam,x-kynu,% C w/ carb,% M w/ oxid,% N w/ deam,% Q w/ deam,% X w/ kyur
sample total,369,42,401,342,245,716,96,347,241,404,...,42,15,9,8,1,1.0,0.46875,0.032847,0.039604,0.00365


In [None]:
# use the calculated NAAF factor (in totalpeaksdb324 dataframe, above) to caluclate the NAAF 
# NAAF: normalized normalized area abundance factor

NAAF20 = 555224.331024

# we'll use -10lgP > 20 as an approximate cutoff for the <1.0% FDR which is what we actually have
peaksdb324['NAAF factor'] = (peaksdb324['NAAF_num.'])/NAAF20

# make a dataframe that contains only what we need: sequences, AAs, PTMs
peaksdb324_NAAF = peaksdb324[['stripped_peptide', 'NAAF factor', 'A', 'C', 'D', 'E', 'F', 'G', 'H', \
                              'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', \
                              'c-carb', 'm-oxid', 'k-oxid', 'p-oxid', 'r-oxid', 'y-oxid', 'n-deam', \
                              'k-meth', 'r-meth', 'q-pyro', 'k-acet']].copy()

# multiply the NAAF20 factor by the AAs to normalize its abundance by peak area and peptide length

peaksdb324_NAAF['A-NAAF20'] = peaksdb324_NAAF['A'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['C-NAAF20'] = peaksdb324_NAAF['C'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['D-NAAF20'] = peaksdb324_NAAF['D'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['E-NAAF20'] = peaksdb324_NAAF['E'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['F-NAAF20'] = peaksdb324_NAAF['F'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['G-NAAF20'] = peaksdb324_NAAF['G'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['H-NAAF20'] = peaksdb324_NAAF['H'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['I-NAAF20'] = peaksdb324_NAAF['I'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['K-NAAF20'] = peaksdb324_NAAF['K'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['L-NAAF20'] = peaksdb324_NAAF['L'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['M-NAAF20'] = peaksdb324_NAAF['M'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['N-NAAF20'] = peaksdb324_NAAF['N'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['P-NAAF20'] = peaksdb324_NAAF['P'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['Q-NAAF20'] = peaksdb324_NAAF['Q'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['R-NAAF20'] = peaksdb324_NAAF['R'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['S-NAAF20'] = peaksdb324_NAAF['S'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['T-NAAF20'] = peaksdb324_NAAF['T'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['V-NAAF20'] = peaksdb324_NAAF['V'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['W-NAAF20'] = peaksdb324_NAAF['W'] * peaksdb324['NAAF factor']
peaksdb324_NAAF['Y-NAAF20'] = peaksdb324_NAAF['Y'] * peaksdb324['NAAF factor']

# multiply the NAAF20 factor by the PTMs normalize its abundance by peak area and peptide length

peaksdb324_NAAF['ccarb-NAAF20'] = peaksdb324_NAAF['c-carb'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['moxid-NAAF20'] = peaksdb324_NAAF['m-oxid'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['koxid-NAAF20'] = peaksdb324_NAAF['k-oxid'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['poxid-NAAF20'] = peaksdb324_NAAF['p-oxid'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['roxid-NAAF20'] = peaksdb324_NAAF['r-oxid'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['yoxid-NAAF20'] = peaksdb324_NAAF['y-oxid'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['ndeam-NAAF20'] = peaksdb324_NAAF['n-deam'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['kmeth-NAAF20'] = peaksdb324_NAAF['k-meth'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['rmeth-NAAF20'] = peaksdb324_NAAF['r-meth'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['qpyro-NAAF20'] = peaksdb324_NAAF['q-pyro'] * peaksdb324_NAAF['NAAF factor']
peaksdb324_NAAF['kacet-NAAF20'] = peaksdb324_NAAF['k-acet'] * peaksdb324_NAAF['NAAF factor']

# write the dataframe to a new csv
peaksdb324_NAAF.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_324_T5_trypsin_combine_PTMopt_DB_FDR1_NAAF.csv")

peaksdb324_NAAF.head()

In [None]:
# made a new dataframe that contains the sums of NAAF normalized AAs for peaksdb324 peaksdb results
# also contains the sums of the NAAF-corrected PTMs occurances for each affected residue

index = ['sample total']

data = {'NAAF': peaksdb324_NAAF['NAAF factor'].sum(),
        'A': peaksdb324_NAAF['A-NAAF20'].sum(),
        'C': peaksdb324_NAAF['C-NAAF20'].sum(),
        'D': peaksdb324_NAAF['D-NAAF20'].sum(),
        'E': peaksdb324_NAAF['E-NAAF20'].sum(),
        'F': peaksdb324_NAAF['F-NAAF20'].sum(),
        'G': peaksdb324_NAAF['G-NAAF20'].sum(),
        'H': peaksdb324_NAAF['H-NAAF20'].sum(),
        'I': peaksdb324_NAAF['I-NAAF20'].sum(),
        'K': peaksdb324_NAAF['K-NAAF20'].sum(),
        'L': peaksdb324_NAAF['L-NAAF20'].sum(),
        'M': peaksdb324_NAAF['M-NAAF20'].sum(),
        'N': peaksdb324_NAAF['N-NAAF20'].sum(),
        'P': peaksdb324_NAAF['P-NAAF20'].sum(),
        'Q': peaksdb324_NAAF['Q-NAAF20'].sum(),
        'R': peaksdb324_NAAF['R-NAAF20'].sum(),
        'S': peaksdb324_NAAF['S-NAAF20'].sum(),
        'T': peaksdb324_NAAF['T-NAAF20'].sum(),
        'V': peaksdb324_NAAF['V-NAAF20'].sum(),
        'W': peaksdb324_NAAF['W-NAAF20'].sum(),
        'Y': peaksdb324_NAAF['Y-NAAF20'].sum(),
        'c-carb': peaksdb324_NAAF['ccarb-NAAF20'].sum(),
        'm-oxid': peaksdb324_NAAF['moxid-NAAF20'].sum(),
        'k-oxid': peaksdb324_NAAF['koxid-NAAF20'].sum(),
        'p-oxid': peaksdb324_NAAF['poxid-NAAF20'].sum(),
        'r-oxid': peaksdb324_NAAF['roxid-NAAF20'].sum(),
        'y-oxid': peaksdb324_NAAF['yoxid-NAAF20'].sum(),
        'n-deam': peaksdb324_NAAF['ndeam-NAAF20'].sum(),
        'k-meth': peaksdb324_NAAF['kmeth-NAAF20'].sum(),
        'r-meth': peaksdb324_NAAF['rmeth-NAAF20'].sum(),
        'q-pyro': peaksdb324_NAAF['qpyro-NAAF20'].sum(),
        'k-acet': peaksdb324_NAAF['kacet-NAAF20'].sum()
       }

totalpeaksdb324_NAAF = pd.DataFrame(data, columns=['NAAF', 'A', 'C', 'D', 'E', 'F', \
                                                   'G', 'H', 'I','K', 'L', 'M', \
                                                   'N', 'P', 'Q', 'R', 'S', \
                                                   'T', 'V', 'W', 'Y', 'c-carb', \
                                                   'm-oxid', 'k-oxid', 'p-oxid', 'r-oxid', \
                                                   'y-oxid', 'n-deam', 'k-meth', 'r-meth', \
                                                   'q-pyro', \
                                                   'k-acet'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


# calculate NAAF-corrected percentage of C's with carb (should be 1.0)
totalpeaksdb324_NAAF['% C w/ carb.'] = totalpeaksdb324_NAAF['c-carb'] / totalpeaksdb324_NAAF['C'] 

# calculate NAAF-corrected percentage of M's that are oxidized
totalpeaksdb324_NAAF['% M w/ oxid'] = totalpeaksdb324_NAAF['m-oxid'] / totalpeaksdb324_NAAF['M'] 

# calculate NAAF-corrected percentage of K's that are oxidized
totalpeaksdb324_NAAF['% K w/ oxid'] = totalpeaksdb324_NAAF['k-oxid'] / totalpeaksdb324_NAAF['K'] 

# calculate NAAF-corrected percentage of P's that are oxidized
totalpeaksdb324_NAAF['% P w/ oxid'] = totalpeaksdb324_NAAF['p-oxid'] / totalpeaksdb324_NAAF['P'] 

# calculate NAAF-corrected percentage of R's that are oxidized
totalpeaksdb324_NAAF['% R w/ oxid'] = totalpeaksdb324_NAAF['r-oxid'] / totalpeaksdb324_NAAF['R'] 

# calculate NAAF-corrected percentage of Y's that are oxidized
totalpeaksdb324_NAAF['% Y w/ oxid'] = totalpeaksdb324_NAAF['y-oxid'] / totalpeaksdb324_NAAF['Y'] 

# calculate NAAF-corrected percentage of N's that are deamidated
totalpeaksdb324_NAAF['% N w/ deam'] = totalpeaksdb324_NAAF['n-deam'] / totalpeaksdb324_NAAF['N'] 

# calculate NAAF-corrected percentage of K's that are methylated
totalpeaksdb324_NAAF['% K w/ meth'] = totalpeaksdb324_NAAF['k-meth'] / totalpeaksdb324_NAAF['K'] 

# calculate NAAF-corrected percentage of R's that are methylated
totalpeaksdb324_NAAF['% R w/ meth'] = totalpeaksdb324_NAAF['r-meth'] / totalpeaksdb324_NAAF['R'] 

# calculate NAAF-corrected percentage of Q's that are pyro glu'd
totalpeaksdb324_NAAF['% Q w/ pyro'] = totalpeaksdb324_NAAF['q-pyro'] / totalpeaksdb324_NAAF['Q'] 

# calculate NAAF-corrected percentage of K's that are methylated
totalpeaksdb324_NAAF['% K w/ acet'] = totalpeaksdb324_NAAF['k-acet'] / totalpeaksdb324_NAAF['K'] 

# calculate NAAF summed numerator over denominator (in above cell) for all peptides in dataset i: a check
totalpeaksdb324_NAAF['NAAF check'] = totalpeaksdb324_NAAF['NAAF'] / 555224.331024

# write modified dataframe to new txt file, same name + totals
totalpeaksdb324_NAAF.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/PeaksDB/TW_324_T5_trypsin_combine_PTMopt_DB_FDR1_NAAF_totals.csv")

totalpeaksdb324_NAAF.head()