### Combining NAAF-corrected *de novo*, PeaksDB, and Comet peptides for each sample:

Starting with:

    Peaks de novo results of PTM-optimized sequencing, NAAF corrected per sample
    Comet results from PTM-optimized database searches, NAAF corrected per sample
    PeaksDB de novo-assisted results from PTM-optimized database searches, NAAF corrected per sample

Goal:

    CSVs with combined de novo, PeaksDB, and Comet peptides for each sample, normalized to Waters Hi3 peptides (6 unique E. coli heat shock protein-dervied peptides). 
    
Using:

    - pandas
    - matplotlib
    - numpy

In [2]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [3]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 231 100 m suspended

peaks80_231 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/231/ETNP-SKQ17-231-100m-0.3-JA2_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_231 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/231/ETNP-SKQ17-231-100m-0.3-JA2_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_231 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/231/ETNP_SKQ17_PEAKSDB20_231-100m-0.3-JA2_15ppm_NAAF_totals")

frames = [peaks80_231, comet_231, peaksdb_231]
index = ['peaks80_231', 'comet_231', 'peaksdb_231']

# concatenate dataframes
tot_231 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_231 = tot_231.loc[:, ~tot_231.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_231', 'comet_231', 'peaksdb_231']
tot_231.insert(loc=0, column='data source', value=names)
tot_231.set_index('data source')

# sum the AAs and PTMs

index = ['231']

data = {'NAAF': tot_231['NAAF'].sum(),
        'A-NAAF': tot_231['A-NAAF'].sum(),
        'C-NAAF': tot_231['C-NAAF'].sum(),
        'D-NAAF': tot_231['D-NAAF'].sum(),
        'E-NAAF': tot_231['E-NAAF'].sum(),
        'F-NAAF': tot_231['F-NAAF'].sum(),
        'G-NAAF': tot_231['G-NAAF'].sum(),
        'H-NAAF': tot_231['H-NAAF'].sum(),
        'I-NAAF': tot_231['I-NAAF'].sum(),
        'K-NAAF': tot_231['K-NAAF'].sum(),
        'L-NAAF': tot_231['L-NAAF'].sum(),
        'M-NAAF': tot_231['M-NAAF'].sum(),
        'N-NAAF': tot_231['N-NAAF'].sum(),
        'P-NAAF': tot_231['P-NAAF'].sum(),
        'Q-NAAF': tot_231['Q-NAAF'].sum(),
        'R-NAAF': tot_231['R-NAAF'].sum(),
        'S-NAAF': tot_231['S-NAAF'].sum(),
        'T-NAAF': tot_231['T-NAAF'].sum(),
        'V-NAAF': tot_231['V-NAAF'].sum(),
        'W-NAAF': tot_231['W-NAAF'].sum(),
        'Y-NAAF': tot_231['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_231['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_231['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_231['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_231['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_231['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_231['R-meth-NAAF'].sum()
       }

sum_231 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_231['Cys. w/ carb.'] = sum_231['C-carb-NAAF'] / sum_231['C-NAAF']
sum_231['Met. w/ oxid.'] = sum_231['M-oxid-NAAF'] / sum_231['M-NAAF']
sum_231['Asn. w/ deam.'] = sum_231['N-deam-NAAF'] / sum_231['N-NAAF']
sum_231['Gln. w/ deam.'] = sum_231['Q-deam-NAAF'] / sum_231['Q-NAAF']
sum_231['Lys. w/ hydr.'] = sum_231['K-hydr-NAAF'] / sum_231['K-NAAF']
sum_231['Arg. w/ meth.'] = sum_231['R-meth-NAAF'] / sum_231['R-NAAF']
sum_231['Total modified'] = (sum_231['Cys. w/ carb.'] + sum_231['Met. w/ oxid.'] + sum_231['Asn. w/ deam.'] + \
                            sum_231['Gln. w/ deam.'] + sum_231['Lys. w/ hydr.'] + sum_231['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_231.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-231.csv")

sum_231.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
231,2794.464747,3508.193338,93.186706,1591.921611,1825.15303,466.477373,1918.165229,62.720219,1284.177015,952.31042,...,62.213773,44.806131,118.259304,1.0,0.751435,0.11705,0.067912,0.04705,0.05226,0.407141


In [5]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 233 265 m suspended

peaks80_233 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/233/ETNP-SKQ17-233-265m-0.3-JA4_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_233 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/233/ETNP-SKQ17-233-265m-0.3-JA4_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_233 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/233/ETNP_SKQ17_PEAKSDB20_233-265m-0.3-JA4_15ppm_NAAF_totals")

frames = [peaks80_233, comet_233, peaksdb_233]
index = ['peaks80_233', 'comet_233', 'peaksdb_233']

# concatenate dataframes
tot_233 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_233 = tot_233.loc[:, ~tot_233.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_233', 'comet_233', 'peaksdb_233']
tot_233.insert(loc=0, column='data source', value=names)
tot_233.set_index('data source')

# sum the AAs and PTMs

index = ['233']

data = {'NAAF': tot_233['NAAF'].sum(),
        'A-NAAF': tot_233['A-NAAF'].sum(),
        'C-NAAF': tot_233['C-NAAF'].sum(),
        'D-NAAF': tot_233['D-NAAF'].sum(),
        'E-NAAF': tot_233['E-NAAF'].sum(),
        'F-NAAF': tot_233['F-NAAF'].sum(),
        'G-NAAF': tot_233['G-NAAF'].sum(),
        'H-NAAF': tot_233['H-NAAF'].sum(),
        'I-NAAF': tot_233['I-NAAF'].sum(),
        'K-NAAF': tot_233['K-NAAF'].sum(),
        'L-NAAF': tot_233['L-NAAF'].sum(),
        'M-NAAF': tot_233['M-NAAF'].sum(),
        'N-NAAF': tot_233['N-NAAF'].sum(),
        'P-NAAF': tot_233['P-NAAF'].sum(),
        'Q-NAAF': tot_233['Q-NAAF'].sum(),
        'R-NAAF': tot_233['R-NAAF'].sum(),
        'S-NAAF': tot_233['S-NAAF'].sum(),
        'T-NAAF': tot_233['T-NAAF'].sum(),
        'V-NAAF': tot_233['V-NAAF'].sum(),
        'W-NAAF': tot_233['W-NAAF'].sum(),
        'Y-NAAF': tot_233['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_233['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_233['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_233['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_233['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_233['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_233['R-meth-NAAF'].sum()
       }

sum_233 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_233['Cys. w/ carb.'] = sum_233['C-carb-NAAF'] / sum_233['C-NAAF']
sum_233['Met. w/ oxid.'] = sum_233['M-oxid-NAAF'] / sum_233['M-NAAF']
sum_233['Asn. w/ deam.'] = sum_233['N-deam-NAAF'] / sum_233['N-NAAF']
sum_233['Gln. w/ deam.'] = sum_233['Q-deam-NAAF'] / sum_233['Q-NAAF']
sum_233['Lys. w/ hydr.'] = sum_233['K-hydr-NAAF'] / sum_233['K-NAAF']
sum_233['Arg. w/ meth.'] = sum_233['R-meth-NAAF'] / sum_233['R-NAAF']
sum_233['Total modified'] = (sum_233['Cys. w/ carb.'] + sum_233['Met. w/ oxid.'] + sum_233['Asn. w/ deam.'] + \
                            sum_233['Gln. w/ deam.'] + sum_233['Lys. w/ hydr.'] + sum_233['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_233.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-233.csv")

sum_233.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
233,1985.158672,2618.74519,100.698258,961.508374,1264.337665,230.564805,1245.119421,56.443911,1055.874197,590.502915,...,29.451307,35.800928,70.866825,1.0,0.164092,0.175549,0.041727,0.060628,0.042582,0.296915


In [6]:
# for each of the 6 POM samples:
# read in NAAF totals csvs made in SKQ-Peaks (de novo), SKQ-PeaksDB, and SKQ-Comet notebooks
# bringing in Comet > XCorr 2.17 (will say 3) and de novo peptides >80% ALC
# also reading in Peaks DB peptides >20 -10lgP (all we imported)

# 243 965 m suspended

peaks80_243 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/243/ETNP-SKQ17-243-965m-0.3-JA14_PTMopt_15ppm_DN80_NAAF_totals.csv")
comet_243 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/243/ETNP-SKQ17-243-965m-0.3-JA14_PTMopt_Comet15_1FDR_NAAF_totals.csv")
peaksdb_243 = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/243/ETNP_SKQ17_PEAKSDB20_243-965m-0.3-JA14_15ppm_NAAF_totals")

frames = [peaks80_243, comet_243, peaksdb_243]
index = ['peaks80_243', 'comet_243', 'peaksdb_243']

# concatenate dataframes
tot_243 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_243 = tot_243.loc[:, ~tot_243.columns.str.contains('^Unnamed')]

# reindex with peaks and comet
names = ['peaks80_243', 'comet_243', 'peaksdb_243']
tot_243.insert(loc=0, column='data source', value=names)
tot_243.set_index('data source')

# sum the AAs and PTMs

index = ['243']

data = {'NAAF': tot_243['NAAF'].sum(),
        'A-NAAF': tot_243['A-NAAF'].sum(),
        'C-NAAF': tot_243['C-NAAF'].sum(),
        'D-NAAF': tot_243['D-NAAF'].sum(),
        'E-NAAF': tot_243['E-NAAF'].sum(),
        'F-NAAF': tot_243['F-NAAF'].sum(),
        'G-NAAF': tot_243['G-NAAF'].sum(),
        'H-NAAF': tot_243['H-NAAF'].sum(),
        'I-NAAF': tot_243['I-NAAF'].sum(),
        'K-NAAF': tot_243['K-NAAF'].sum(),
        'L-NAAF': tot_243['L-NAAF'].sum(),
        'M-NAAF': tot_243['M-NAAF'].sum(),
        'N-NAAF': tot_243['N-NAAF'].sum(),
        'P-NAAF': tot_243['P-NAAF'].sum(),
        'Q-NAAF': tot_243['Q-NAAF'].sum(),
        'R-NAAF': tot_243['R-NAAF'].sum(),
        'S-NAAF': tot_243['S-NAAF'].sum(),
        'T-NAAF': tot_243['T-NAAF'].sum(),
        'V-NAAF': tot_243['V-NAAF'].sum(),
        'W-NAAF': tot_243['W-NAAF'].sum(),
        'Y-NAAF': tot_243['Y-NAAF'].sum(),
        'C-carb-NAAF': tot_243['C-carb-NAAF'].sum(),
        'M-oxid-NAAF': tot_243['M-oxid-NAAF'].sum(),
        'N-deam-NAAF': tot_243['N-deam-NAAF'].sum(),
        'Q-deam-NAAF': tot_243['Q-deam-NAAF'].sum(),
        'K-hydr-NAAF': tot_243['K-hydr-NAAF'].sum(),
        'R-meth-NAAF': tot_243['R-meth-NAAF'].sum()
       }

sum_243 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF', 'K-hydr-NAAF',\
                                                   'R-meth-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q, K, and Rs


sum_243['Cys. w/ carb.'] = sum_243['C-carb-NAAF'] / sum_243['C-NAAF']
sum_243['Met. w/ oxid.'] = sum_243['M-oxid-NAAF'] / sum_243['M-NAAF']
sum_243['Asn. w/ deam.'] = sum_243['N-deam-NAAF'] / sum_243['N-NAAF']
sum_243['Gln. w/ deam.'] = sum_243['Q-deam-NAAF'] / sum_243['Q-NAAF']
sum_243['Lys. w/ hydr.'] = sum_243['K-hydr-NAAF'] / sum_243['K-NAAF']
sum_243['Arg. w/ meth.'] = sum_243['R-meth-NAAF'] / sum_243['R-NAAF']
sum_243['Total modified'] = (sum_243['Cys. w/ carb.'] + sum_243['Met. w/ oxid.'] + sum_243['Asn. w/ deam.'] + \
                            sum_243['Gln. w/ deam.'] + sum_243['Lys. w/ hydr.'] + sum_243['Arg. w/ meth.']) / 5


# write to a csv in the ETNP data directory
sum_243.to_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/NAAF-sums/NAAF-sum-243.csv")

sum_243.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,Q-deam-NAAF,K-hydr-NAAF,R-meth-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Gln. w/ deam.,Lys. w/ hydr.,Arg. w/ meth.,Total modified
243,508.407353,662.141669,14.050445,189.751365,248.538994,14.495422,181.336989,5.883503,173.695735,51.516163,...,7.656642,6.34669,151.792995,1.0,0.188156,0.304075,0.052042,0.123198,0.285675,0.390629
