### Combining NAAF-corrected *de novo*, PeaksDB, and Comet peptides for each sample:

Starting with:

    Peaks de novo results of PTM-optimized sequencing, NAAF corrected per sample
    Comet results from PTM-optimized database searches, NAAF corrected per sample
    PeaksDB de novo-assisted results from PTM-optimized database searches, NAAF corrected per sample

Goal:

    CSVs with combined de novo, PeaksDB, and Comet peptides for each sample, normalized to Waters Hi3 peptides (6 unique E. coli heat shock protein-dervied peptides). 
    
Using:

    - pandas
    - matplotlib
    - numpy

In [1]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [20]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 231 100 m suspended

peaks80_231 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/231/ETNP-SKQ17-231-100m-0.3-JA2_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_231 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/231/ETNP-SKQ17-231-100m-0.3-JA2_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_231 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/231/ETNP_SKQ17_PEAKSDB20_231-100m-0.3-JA2_15ppm_NAAF_totals")

frames = [peaks80_231, comet_231, peaksdb_231]
index = ['peaks80_231', 'comet_231', 'peaksdb_231']

# concatenate dataframes
tot_231 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_231 = tot_231.loc[:, ~tot_231.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_231', 'comet_231', 'peaksdb_231']
tot_231.insert(loc=0, column='data source', value=names)
tot_231.set_index('data source')

# sum the AAs and PTMs

index = ['231']

data = {'NAAF': tot_231['NAAF'].sum(),
        'A-NAAF': tot_231['A-NAAF'].sum(),
        'C-NAAF': tot_231['C-NAAF'].sum(),
        'D-NAAF': tot_231['D-NAAF'].sum(),
        'E-NAAF': tot_231['E-NAAF'].sum(),
        'F-NAAF': tot_231['F-NAAF'].sum(),
        'G-NAAF': tot_231['G-NAAF'].sum(),
        'H-NAAF': tot_231['H-NAAF'].sum(),
        'I-NAAF': tot_231['I-NAAF'].sum(),
        'K-NAAF': tot_231['K-NAAF'].sum(),
        'L-NAAF': tot_231['L-NAAF'].sum(),
        'M-NAAF': tot_231['M-NAAF'].sum(),
        'N-NAAF': tot_231['N-NAAF'].sum(),
        'P-NAAF': tot_231['P-NAAF'].sum(),
        'Q-NAAF': tot_231['Q-NAAF'].sum(),
        'R-NAAF': tot_231['R-NAAF'].sum(),
        'S-NAAF': tot_231['S-NAAF'].sum(),
        'T-NAAF': tot_231['T-NAAF'].sum(),
        'V-NAAF': tot_231['V-NAAF'].sum(),
        'W-NAAF': tot_231['W-NAAF'].sum(),
        'Y-NAAF': tot_231['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_231['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_231['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_231['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_231['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_231['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_231['R-meth-NAAF'].sum()
       }

sum_231 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_231['AA-NAAF'] = sum_231['A-NAAF'] + sum_231['C-NAAF'] + sum_231['D-NAAF'] + sum_231['E-NAAF'] + sum_231['F-NAAF'] + \
                     sum_231['G-NAAF'] + sum_231['H-NAAF'] + sum_231['I-NAAF'] + sum_231['K-NAAF'] + sum_231['L-NAAF'] + sum_231['M-NAAF'] + \
                     sum_231['N-NAAF'] + sum_231['P-NAAF'] + sum_231['Q-NAAF'] + sum_231['R-NAAF'] + sum_231['S-NAAF'] + \
                     sum_231['T-NAAF'] + sum_231['V-NAAF'] + sum_231['W-NAAF'] + sum_231['Y-NAAF'] + sum_231['C-carb-NAAF'] + \
                     sum_231['M-oxid-NAAF'] + sum_231['N-deam-NAAF'] + sum_231['Q-deam-NAAF'] + sum_231['K-hydr-NAAF'] +\
                     sum_231['R-meth-NAAF']

sum_231['Cys. w/ carb.'] = sum_231['C-carb-NAAF'] / sum_231['C-NAAF']
sum_231['Met. w/ oxid.'] = sum_231['M-oxid-NAAF'] / sum_231['M-NAAF']
sum_231['Asn. w/ deam.'] = sum_231['N-deam-NAAF'] / sum_231['N-NAAF']
sum_231['Gln. w/ deam.'] = sum_231['Q-deam-NAAF'] / sum_231['Q-NAAF']
sum_231['Lys. w/ hydr.'] = sum_231['K-hydr-NAAF'] / sum_231['K-NAAF']
sum_231['Arg. w/ meth.'] = sum_231['R-meth-NAAF'] / sum_231['R-NAAF']
sum_231['Overall modified'] = 0.001*((sum_231['Met. w/ oxid.']*sum_231['M-NAAF']) + (sum_231['Asn. w/ deam.']*sum_231['N-NAAF']) + \
                            (sum_231['Gln. w/ deam.']*sum_231['Q-NAAF']) + (sum_231['Lys. w/ hydr.']*sum_231['K-NAAF']) + \
                             (sum_231['Arg. w/ meth.']*sum_231['R-NAAF']) / (sum_231['AA-NAAF'])) 
                                                            


# write to a csv in the ETNP data directory
sum_231.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-231.csv")

sum_231.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,K-hydr-NAAF,R-meth-NAAF,AA-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Overall modified
231,2789.546548,3503.230601,96.206531,1583.007014,1817.918086,467.335483,1900.169974,62.427987,1278.213205,951.597499,...,44.865005,118.259304,30818.448782,1.0,0.755277,0.117467,0.069276,0.047147,0.052534,0.369422


In [3]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 233 265 m suspended

peaks80_233 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/233/ETNP-SKQ17-233-265m-0.3-JA4_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_233 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/233/ETNP-SKQ17-233-265m-0.3-JA4_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_233 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/233/ETNP_SKQ17_PEAKSDB20_233-265m-0.3-JA4_15ppm_NAAF_totals")

frames = [peaks80_233, comet_233, peaksdb_233]
index = ['peaks80_233', 'comet_233', 'peaksdb_233']

# concatenate dataframes
tot_233 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_233 = tot_233.loc[:, ~tot_233.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_233', 'comet_233', 'peaksdb_233']
tot_233.insert(loc=0, column='data source', value=names)
tot_233.set_index('data source')

# sum the AAs and PTMs

index = ['233']

data = {'NAAF': tot_233['NAAF'].sum(),
        'A-NAAF': tot_233['A-NAAF'].sum(),
        'C-NAAF': tot_233['C-NAAF'].sum(),
        'D-NAAF': tot_233['D-NAAF'].sum(),
        'E-NAAF': tot_233['E-NAAF'].sum(),
        'F-NAAF': tot_233['F-NAAF'].sum(),
        'G-NAAF': tot_233['G-NAAF'].sum(),
        'H-NAAF': tot_233['H-NAAF'].sum(),
        'I-NAAF': tot_233['I-NAAF'].sum(),
        'K-NAAF': tot_233['K-NAAF'].sum(),
        'L-NAAF': tot_233['L-NAAF'].sum(),
        'M-NAAF': tot_233['M-NAAF'].sum(),
        'N-NAAF': tot_233['N-NAAF'].sum(),
        'P-NAAF': tot_233['P-NAAF'].sum(),
        'Q-NAAF': tot_233['Q-NAAF'].sum(),
        'R-NAAF': tot_233['R-NAAF'].sum(),
        'S-NAAF': tot_233['S-NAAF'].sum(),
        'T-NAAF': tot_233['T-NAAF'].sum(),
        'V-NAAF': tot_233['V-NAAF'].sum(),
        'W-NAAF': tot_233['W-NAAF'].sum(),
        'Y-NAAF': tot_233['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_233['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_233['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_233['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_233['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_233['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_233['R-meth-NAAF'].sum()
       }

sum_233 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_233['Cys. w/ carb.'] = sum_233['C-carb-NAAF'] / sum_233['C-NAAF']
sum_233['Met. w/ oxid.'] = sum_233['M-oxid-NAAF'] / sum_233['M-NAAF']
sum_233['Asn. w/ deam.'] = sum_233['N-deam-NAAF'] / sum_233['N-NAAF']
sum_233['Gln. w/ deam.'] = sum_233['Q-deam-NAAF'] / sum_233['Q-NAAF']
sum_233['Lys. w/ hydr.'] = sum_233['K-hydr-NAAF'] / sum_233['K-NAAF']
sum_233['Arg. w/ meth.'] = sum_233['R-meth-NAAF'] / sum_233['R-NAAF']
sum_233['Total modified'] = (sum_233['Cys. w/ carb.'] + sum_233['Met. w/ oxid.'] + sum_233['Asn. w/ deam.'] + \
                            sum_233['Gln. w/ deam.'] + sum_233['Lys. w/ hydr.'] + sum_233['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_233.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-233.csv")

sum_233.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
233,1985.158672,2618.74519,100.698258,961.508374,1264.337665,230.564805,1245.119421,56.443911,1055.874197,590.502915,...,29.451307,35.800928,70.866825,1.0,0.164092,0.175549,0.041727,0.060628,0.042582,0.296915


In [4]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 243 965 m suspended

peaks80_243 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/243/ETNP-SKQ17-243-965m-0.3-JA14_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_243 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/243/ETNP-SKQ17-243-965m-0.3-JA14_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_243 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/243/ETNP_SKQ17_PEAKSDB20_243-965m-0.3-JA14_15ppm_NAAF_totals")

frames = [peaks80_243, comet_243, peaksdb_243]
index = ['peaks80_243', 'comet_243', 'peaksdb_243']

# concatenate dataframes
tot_243 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_243 = tot_243.loc[:, ~tot_243.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_243', 'comet_243', 'peaksdb_243']
tot_243.insert(loc=0, column='data source', value=names)
tot_243.set_index('data source')

# sum the AAs and PTMs

index = ['243']

data = {'NAAF': tot_243['NAAF'].sum(),
        'A-NAAF': tot_243['A-NAAF'].sum(),
        'C-NAAF': tot_243['C-NAAF'].sum(),
        'D-NAAF': tot_243['D-NAAF'].sum(),
        'E-NAAF': tot_243['E-NAAF'].sum(),
        'F-NAAF': tot_243['F-NAAF'].sum(),
        'G-NAAF': tot_243['G-NAAF'].sum(),
        'H-NAAF': tot_243['H-NAAF'].sum(),
        'I-NAAF': tot_243['I-NAAF'].sum(),
        'K-NAAF': tot_243['K-NAAF'].sum(),
        'L-NAAF': tot_243['L-NAAF'].sum(),
        'M-NAAF': tot_243['M-NAAF'].sum(),
        'N-NAAF': tot_243['N-NAAF'].sum(),
        'P-NAAF': tot_243['P-NAAF'].sum(),
        'Q-NAAF': tot_243['Q-NAAF'].sum(),
        'R-NAAF': tot_243['R-NAAF'].sum(),
        'S-NAAF': tot_243['S-NAAF'].sum(),
        'T-NAAF': tot_243['T-NAAF'].sum(),
        'V-NAAF': tot_243['V-NAAF'].sum(),
        'W-NAAF': tot_243['W-NAAF'].sum(),
        'Y-NAAF': tot_243['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_243['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_243['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_243['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_243['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_243['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_243['R-meth-NAAF'].sum()
       }

sum_243 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_243['Cys. w/ carb.'] = sum_243['C-carb-NAAF'] / sum_243['C-NAAF']
sum_243['Met. w/ oxid.'] = sum_243['M-oxid-NAAF'] / sum_243['M-NAAF']
sum_243['Asn. w/ deam.'] = sum_243['N-deam-NAAF'] / sum_243['N-NAAF']
sum_243['Gln. w/ deam.'] = sum_243['Q-deam-NAAF'] / sum_243['Q-NAAF']
sum_243['Lys. w/ hydr.'] = sum_243['K-hydr-NAAF'] / sum_243['K-NAAF']
sum_243['Arg. w/ meth.'] = sum_243['R-meth-NAAF'] / sum_243['R-NAAF']
sum_243['Total modified'] = (sum_243['Cys. w/ carb.'] + sum_243['Met. w/ oxid.'] + sum_243['Asn. w/ deam.'] + \
                            sum_243['Gln. w/ deam.'] + sum_243['Lys. w/ hydr.'] + sum_243['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_243.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-243.csv")

sum_243.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
243,508.407353,662.141669,14.050445,189.751365,248.538994,14.495422,181.336989,5.883503,173.695735,51.516163,...,7.656642,6.34669,151.792995,1.0,0.188156,0.304075,0.052042,0.123198,0.285675,0.390629


In [5]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 378 100 m sinking

peaks80_378 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/378/ETNP-SKQ17-378-100m-trap_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_378 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/378/ETNP-SKQ17-378-100m-trap_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_378 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/378/ETNP_SKQ17_PEAKSDB20_378-100m-trap_15ppm_NAAF_totals")

frames = [peaks80_378, comet_378, peaksdb_378]
index = ['peaks80_378', 'comet_378', 'peaksdb_378']

# concatenate dataframes
tot_378 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_378 = tot_378.loc[:, ~tot_378.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_378', 'comet_378', 'peaksdb_378']
tot_378.insert(loc=0, column='data source', value=names)
tot_378.set_index('data source')

# sum the AAs and PTMs

index = ['378']

data = {'NAAF': tot_378['NAAF'].sum(),
        'A-NAAF': tot_378['A-NAAF'].sum(),
        'C-NAAF': tot_378['C-NAAF'].sum(),
        'D-NAAF': tot_378['D-NAAF'].sum(),
        'E-NAAF': tot_378['E-NAAF'].sum(),
        'F-NAAF': tot_378['F-NAAF'].sum(),
        'G-NAAF': tot_378['G-NAAF'].sum(),
        'H-NAAF': tot_378['H-NAAF'].sum(),
        'I-NAAF': tot_378['I-NAAF'].sum(),
        'K-NAAF': tot_378['K-NAAF'].sum(),
        'L-NAAF': tot_378['L-NAAF'].sum(),
        'M-NAAF': tot_378['M-NAAF'].sum(),
        'N-NAAF': tot_378['N-NAAF'].sum(),
        'P-NAAF': tot_378['P-NAAF'].sum(),
        'Q-NAAF': tot_378['Q-NAAF'].sum(),
        'R-NAAF': tot_378['R-NAAF'].sum(),
        'S-NAAF': tot_378['S-NAAF'].sum(),
        'T-NAAF': tot_378['T-NAAF'].sum(),
        'V-NAAF': tot_378['V-NAAF'].sum(),
        'W-NAAF': tot_378['W-NAAF'].sum(),
        'Y-NAAF': tot_378['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_378['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_378['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_378['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_378['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_378['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_378['R-meth-NAAF'].sum()
       }

sum_378 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_378['Cys. w/ carb.'] = sum_378['C-carb-NAAF'] / sum_378['C-NAAF']
sum_378['Met. w/ oxid.'] = sum_378['M-oxid-NAAF'] / sum_378['M-NAAF']
sum_378['Asn. w/ deam.'] = sum_378['N-deam-NAAF'] / sum_378['N-NAAF']
sum_378['Gln. w/ deam.'] = sum_378['Q-deam-NAAF'] / sum_378['Q-NAAF']
sum_378['Lys. w/ hydr.'] = sum_378['K-hydr-NAAF'] / sum_378['K-NAAF']
sum_378['Arg. w/ meth.'] = sum_378['R-meth-NAAF'] / sum_378['R-NAAF']
sum_378['Total modified'] = (sum_378['Cys. w/ carb.'] + sum_378['Met. w/ oxid.'] + sum_378['Asn. w/ deam.'] + \
                            sum_378['Gln. w/ deam.'] + sum_378['Lys. w/ hydr.'] + sum_378['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_378.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-378.csv")

sum_378.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
378,565.464559,526.937966,11.057405,118.04945,323.528595,69.20847,227.960506,3.690205,68.997074,152.192401,...,5.937658,6.827074,225.374968,1.0,0.478825,0.238317,0.071509,0.044858,0.471315,0.460965


In [6]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 278 265 m sinking

peaks80_278 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/278/ETNP-SKQ17-278-265m-trap_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_278 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/278/ETNP-SKQ17-278-265m-trap_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_278 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/278/ETNP_SKQ17_PEAKSDB20_278-265m-trap_15ppm_NAAF_totals")

frames = [peaks80_278, comet_278, peaksdb_278]
index = ['peaks80_278', 'comet_278', 'peaksdb_278']

# concatenate dataframes
tot_278 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_278 = tot_278.loc[:, ~tot_278.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_278', 'comet_278', 'peaksdb_278']
tot_278.insert(loc=0, column='data source', value=names)
tot_278.set_index('data source')

# sum the AAs and PTMs

index = ['278']

data = {'NAAF': tot_278['NAAF'].sum(),
        'A-NAAF': tot_278['A-NAAF'].sum(),
        'C-NAAF': tot_278['C-NAAF'].sum(),
        'D-NAAF': tot_278['D-NAAF'].sum(),
        'E-NAAF': tot_278['E-NAAF'].sum(),
        'F-NAAF': tot_278['F-NAAF'].sum(),
        'G-NAAF': tot_278['G-NAAF'].sum(),
        'H-NAAF': tot_278['H-NAAF'].sum(),
        'I-NAAF': tot_278['I-NAAF'].sum(),
        'K-NAAF': tot_278['K-NAAF'].sum(),
        'L-NAAF': tot_278['L-NAAF'].sum(),
        'M-NAAF': tot_278['M-NAAF'].sum(),
        'N-NAAF': tot_278['N-NAAF'].sum(),
        'P-NAAF': tot_278['P-NAAF'].sum(),
        'Q-NAAF': tot_278['Q-NAAF'].sum(),
        'R-NAAF': tot_278['R-NAAF'].sum(),
        'S-NAAF': tot_278['S-NAAF'].sum(),
        'T-NAAF': tot_278['T-NAAF'].sum(),
        'V-NAAF': tot_278['V-NAAF'].sum(),
        'W-NAAF': tot_278['W-NAAF'].sum(),
        'Y-NAAF': tot_278['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_278['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_278['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_278['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_278['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_278['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_278['R-meth-NAAF'].sum()
       }

sum_278 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_278['Cys. w/ carb.'] = sum_278['C-carb-NAAF'] / sum_278['C-NAAF']
sum_278['Met. w/ oxid.'] = sum_278['M-oxid-NAAF'] / sum_278['M-NAAF']
sum_278['Asn. w/ deam.'] = sum_278['N-deam-NAAF'] / sum_278['N-NAAF']
sum_278['Gln. w/ deam.'] = sum_278['Q-deam-NAAF'] / sum_278['Q-NAAF']
sum_278['Lys. w/ hydr.'] = sum_278['K-hydr-NAAF'] / sum_278['K-NAAF']
sum_278['Arg. w/ meth.'] = sum_278['R-meth-NAAF'] / sum_278['R-NAAF']
sum_278['Total modified'] = (sum_278['Cys. w/ carb.'] + sum_278['Met. w/ oxid.'] + sum_278['Asn. w/ deam.'] + \
                            sum_278['Gln. w/ deam.'] + sum_278['Lys. w/ hydr.'] + sum_278['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_278.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-278.csv")

sum_278.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
278,1961.554832,1659.680289,184.520785,881.01051,1541.172225,344.476099,970.647311,163.093872,400.993457,1104.15373,...,25.614705,180.842895,359.481506,1.0,0.665736,0.285893,0.057439,0.163784,0.292309,0.493032


In [7]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 273 965 m sinking

peaks80_273 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/273/ETNP-SKQ17-273-965m-trap_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_273 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/273/ETNP-SKQ17-273-965m-trap_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_273 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/273/ETNP_SKQ17_PEAKSDB20_273-965m-trap_15ppm_NAAF_totals")

frames = [peaks80_273, comet_273, peaksdb_273]
index = ['peaks80_273', 'comet_273', 'peaksdb_273']

# concatenate dataframes
tot_273 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_273 = tot_273.loc[:, ~tot_273.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_273', 'comet_273', 'peaksdb_273']
tot_273.insert(loc=0, column='data source', value=names)
tot_273.set_index('data source')

# sum the AAs and PTMs

index = ['273']

data = {'NAAF': tot_273['NAAF'].sum(),
        'A-NAAF': tot_273['A-NAAF'].sum(),
        'C-NAAF': tot_273['C-NAAF'].sum(),
        'D-NAAF': tot_273['D-NAAF'].sum(),
        'E-NAAF': tot_273['E-NAAF'].sum(),
        'F-NAAF': tot_273['F-NAAF'].sum(),
        'G-NAAF': tot_273['G-NAAF'].sum(),
        'H-NAAF': tot_273['H-NAAF'].sum(),
        'I-NAAF': tot_273['I-NAAF'].sum(),
        'K-NAAF': tot_273['K-NAAF'].sum(),
        'L-NAAF': tot_273['L-NAAF'].sum(),
        'M-NAAF': tot_273['M-NAAF'].sum(),
        'N-NAAF': tot_273['N-NAAF'].sum(),
        'P-NAAF': tot_273['P-NAAF'].sum(),
        'Q-NAAF': tot_273['Q-NAAF'].sum(),
        'R-NAAF': tot_273['R-NAAF'].sum(),
        'S-NAAF': tot_273['S-NAAF'].sum(),
        'T-NAAF': tot_273['T-NAAF'].sum(),
        'V-NAAF': tot_273['V-NAAF'].sum(),
        'W-NAAF': tot_273['W-NAAF'].sum(),
        'Y-NAAF': tot_273['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_273['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_273['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_273['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_273['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_273['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_273['R-meth-NAAF'].sum()
       }

sum_273 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_273['Cys. w/ carb.'] = sum_273['C-carb-NAAF'] / sum_273['C-NAAF']
sum_273['Met. w/ oxid.'] = sum_273['M-oxid-NAAF'] / sum_273['M-NAAF']
sum_273['Asn. w/ deam.'] = sum_273['N-deam-NAAF'] / sum_273['N-NAAF']
sum_273['Gln. w/ deam.'] = sum_273['Q-deam-NAAF'] / sum_273['Q-NAAF']
sum_273['Lys. w/ hydr.'] = sum_273['K-hydr-NAAF'] / sum_273['K-NAAF']
sum_273['Arg. w/ meth.'] = sum_273['R-meth-NAAF'] / sum_273['R-NAAF']
sum_273['Total modified'] = (sum_273['Cys. w/ carb.'] + sum_273['Met. w/ oxid.'] + sum_273['Asn. w/ deam.'] + \
                            sum_273['Gln. w/ deam.'] + sum_273['Lys. w/ hydr.'] + sum_273['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_273.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-273.csv")

sum_273.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
273,1290.914966,1472.0329,83.90502,571.40465,874.848043,222.385169,659.126571,60.43943,240.47614,519.073151,...,49.486957,67.699017,300.932935,1.0,0.761132,0.313974,0.134326,0.130423,0.29943,0.527857
