### Combining NAAF-corrected *de novo* and PeaksDB  peptides for Trocas 8 non-incubation samples

The dataset:

    4 stations, 2 depths (50% and surface), 1 size fractions (GF75):
    
    Stations: 
    
        - Macapa South (MS) South stem, upriver (have left, middle, and right channels)
        - Macapa North (MN) North stem, upriver (have left, middle, and right channels)
        - Chaves (CV) South stem, downriver
        - Baylique (BY) North stem, downriver


    Proteomics samples from a trips to UWPR (June 2021 on the Fusion)
    There were some duplicate injections for select samples 
    Each sample has a 

Starting with:

    Peaks de novo results of PTM-optimized sequencing, NAAF corrected per sample
    PeaksDB de novo-assisted results from PTM-optimized database searches, NAAF corrected per sample
    
    Multiple samples per treatment

Goal:

    CSVs with combined de novo and PeaksDB for each sample, normalized to by NAAF
    
Using:

    - pandas
    - matplotlib
    - numpy

In [5]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [6]:
cd /home/millieginty/Documents/git-repos/amazon/data/Trocas8-notincs/

/home/millieginty/Documents/git-repos/amazon/data/Trocas8-notincs


### 1. Baylique, 50%, size fraction 0.3-0.7 um (denoted as BY_50_GF75)
### T8 samples #s: 675, 683
### Exported NAAF and stripped peptides contained in the following directories:

        data/Trocas8-notincs/processed/PeaksDB/675_BY_50_GF75_PDB
        data/Trocas8-notincs/processed/PeaksDN/675_BY_50_GF75_PDN


In [9]:
# for each of sample versions
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >50% ALC and PeaksDB peptides <1% FDR

peaks50_675 = pd.read_csv("processed/PeaksDN/675_BY_50_GF75_DN/675_BY_50_GF75_DN50_naaf_totals.csv")
peaks50_683 = pd.read_csv("processed/PeaksDN/675_BY_50_GF75_DN/683_BY_50_GF75_DN50_naaf_totals.csv")

peaksdb_675 = pd.read_csv("processed/PeaksDB/675_BY_50_GF75_PDB/675_BY_50_GF75_PDB_naaf_totals.csv")
peaksdb_683 = pd.read_csv("processed/PeaksDB/675_BY_50_GF75_PDB/683_BY_50_GF75_PDB_naaf_totals.csv")


frames = [peaks50_675, peaks50_683, peaksdb_675, peaksdb_683]
index = ['peaks50_675', 'peaks50_683', 'peaksdb_675', 'peaksdb_683']

# concatenate dataframes
tot_675 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_675 = tot_675.loc[:, ~tot_675.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks50_675', 'peaks50_683', 'peaksdb_675', 'peaksdb_683']
tot_675.insert(loc=0, column='data source', value=names)
tot_675.set_index('data source')

# sum the AAs and PTMN

index = ['675']

data = {'NAAF': tot_675['NAAF'].sum(),
        'A-NAAF': tot_675['A'].sum(),
        'C-NAAF': tot_675['C'].sum(),
        'D-NAAF': tot_675['D'].sum(),
        'E-NAAF': tot_675['E'].sum(),
        'F-NAAF': tot_675['F'].sum(),
        'G-NAAF': tot_675['G'].sum(),
        'H-NAAF': tot_675['H'].sum(),
        'I-NAAF': tot_675['I'].sum(),
        'K-NAAF': tot_675['K'].sum(),
        'L-NAAF': tot_675['L'].sum(),
        'M-NAAF': tot_675['M'].sum(),
        'N-NAAF': tot_675['N'].sum(),
        'P-NAAF': tot_675['P'].sum(),
        'Q-NAAF': tot_675['Q'].sum(),
        'R-NAAF': tot_675['R'].sum(),
        'S-NAAF': tot_675['S'].sum(),
        'T-NAAF': tot_675['T'].sum(),
        'V-NAAF': tot_675['V'].sum(),
        'W-NAAF': tot_675['W'].sum(),
        'Y-NAAF': tot_675['Y'].sum(),
        'C-carb-NAAF': tot_675['c-carb'].sum(),
        'M-oxid-NAAF': tot_675['m-oxid'].sum(),
        'N-deam-NAAF': tot_675['n-deam'].sum(),
        'Q-deam-NAAF': tot_675['q-deam'].sum()
       }

sum_675 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_675['AA-NAAF'] = sum_675['A-NAAF'] + sum_675['C-NAAF'] + sum_675['D-NAAF'] + sum_675['E-NAAF'] + \
                     sum_675['F-NAAF'] + sum_675['G-NAAF'] + sum_675['H-NAAF'] + sum_675['I-NAAF'] + \
                     sum_675['K-NAAF'] + sum_675['L-NAAF'] + sum_675['M-NAAF'] + sum_675['N-NAAF'] + \
                     sum_675['P-NAAF'] + sum_675['Q-NAAF'] + sum_675['R-NAAF'] + sum_675['S-NAAF'] + \
                     sum_675['T-NAAF'] + sum_675['V-NAAF'] + sum_675['W-NAAF'] + sum_675['Y-NAAF'] 

sum_675['AA-modifiable-NAAF'] = sum_675['M-NAAF'] + sum_675['N-NAAF'] + sum_675['Q-NAAF']
                                

sum_675['Cys. w/ carb.'] = sum_675['C-carb-NAAF'] / sum_675['C-NAAF'] #1
sum_675['Met. w/ oxid.'] = sum_675['M-oxid-NAAF'] / sum_675['M-NAAF'] #2
sum_675['Asn. w/ deam.'] = sum_675['N-deam-NAAF'] / sum_675['N-NAAF'] #3
sum_675['Glu. w/ deam.'] = sum_675['Q-deam-NAAF'] / sum_675['Q-NAAF'] #3


sum_675['Overall modified'] = 0.001*((sum_675['Met. w/ oxid.']*sum_675['M-NAAF']) + \
                                     (sum_675['Asn. w/ deam.']*sum_675['N-NAAF']) + \
                                     (sum_675['Glu. w/ deam.']*sum_675['Q-NAAF']) / (sum_675['AA-modifiable-NAAF'])) 
                                                            

# write to a csv T8 combined data directory in /processed
sum_675.to_csv("processed/NAAF-sums/NAAF-sum-675.csv")

sum_675.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,M-oxid-NAAF,N-deam-NAAF,Q-deam-NAAF,AA-NAAF,AA-modifiable-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Glu. w/ deam.,Overall modified
675,266.91019,158.3525,0.172654,1.20592,1.509804,1.41651,14.95852,10.886413,38.068027,11.346827,...,0.069595,0.350593,0.119044,2239.568685,60.097134,1.0,0.006951,0.007275,0.062857,0.000422


### 1. Baylique, surface, size fraction 0.3-0.7 um (denoted as BY_surf_GF75)
### T8 samples #s: 676, 684
### Exported NAAF and stripped peptides contained in the following directories:

        data/Trocas8-notincs/processed/PeaksDB/676_BY_surf_GF75_PDB
        data/Trocas8-notincs/processed/PeaksDN/676_BY_surf_GF75_DN

In [10]:
# for each of sample versions
# read in NAAF totals csvs made in PeaksDN (de novo), and PeaksDB (de novo assisted database search) notebooks
# bringing de novo peptides >50% ALC and PeaksDB peptides <1% FDR

peaks50_676 = pd.read_csv("processed/PeaksDN/676_BY_surf_GF75_DN/676_BY_surf_GF75_DN50_naaf_totals.csv")
peaks50_684 = pd.read_csv("processed/PeaksDN/676_BY_surf_GF75_DN/684_BY_surf_GF75_DN50_naaf_totals.csv")

peaksdb_676 = pd.read_csv("processed/PeaksDB/676_BY_surf_GF75_PDB/676_BY_surf_GF75_PDB_naaf_totals.csv")
peaksdb_684 = pd.read_csv("processed/PeaksDB/676_BY_surf_GF75_PDB/684_BY_surf_GF75_PDB_naaf_totals.csv")


frames = [peaks50_676, peaks50_684, peaksdb_676, peaksdb_684]
index = ['peaks50_676', 'peaks50_684', 'peaksdb_676', 'peaksdb_684']

# concatenate dataframes
tot_676 = pd.concat(frames, sort=False, ignore_index=True)

# there's a useless column in there
tot_676 = tot_676.loc[:, ~tot_676.columns.str.contains('^Unnamed')]

# reindex with peaks 
names = ['peaks50_676', 'peaks50_684', 'peaksdb_676', 'peaksdb_684']
tot_676.insert(loc=0, column='data source', value=names)
tot_676.set_index('data source')

# sum the AAs and PTMN

index = ['676']

data = {'NAAF': tot_676['NAAF'].sum(),
        'A-NAAF': tot_676['A'].sum(),
        'C-NAAF': tot_676['C'].sum(),
        'D-NAAF': tot_676['D'].sum(),
        'E-NAAF': tot_676['E'].sum(),
        'F-NAAF': tot_676['F'].sum(),
        'G-NAAF': tot_676['G'].sum(),
        'H-NAAF': tot_676['H'].sum(),
        'I-NAAF': tot_676['I'].sum(),
        'K-NAAF': tot_676['K'].sum(),
        'L-NAAF': tot_676['L'].sum(),
        'M-NAAF': tot_676['M'].sum(),
        'N-NAAF': tot_676['N'].sum(),
        'P-NAAF': tot_676['P'].sum(),
        'Q-NAAF': tot_676['Q'].sum(),
        'R-NAAF': tot_676['R'].sum(),
        'S-NAAF': tot_676['S'].sum(),
        'T-NAAF': tot_676['T'].sum(),
        'V-NAAF': tot_676['V'].sum(),
        'W-NAAF': tot_676['W'].sum(),
        'Y-NAAF': tot_676['Y'].sum(),
        'C-carb-NAAF': tot_676['c-carb'].sum(),
        'M-oxid-NAAF': tot_676['m-oxid'].sum(),
        'N-deam-NAAF': tot_676['n-deam'].sum(),
        'Q-deam-NAAF': tot_676['q-deam'].sum()
       }

sum_676 = pd.DataFrame(data, columns=['NAAF', 'A-NAAF', 'C-NAAF', 'D-NAAF', 'E-NAAF', 'F-NAAF', \
                                                   'G-NAAF', 'H-NAAF', 'I-NAAF','K-NAAF', 'L-NAAF', 'M-NAAF', \
                                                   'N-NAAF', 'P-NAAF', 'Q-NAAF', 'R-NAAF', 'S-NAAF', \
                                                   'T-NAAF', 'V-NAAF', 'W-NAAF', 'Y-NAAF', 'C-carb-NAAF', \
                                                   'M-oxid-NAAF', 'N-deam-NAAF', 'Q-deam-NAAF'], index=index)

# calculate the NAAF-corrected % modified C, M, N, Q
# also calculate the NAAF-corrected % of 'total modifiable AA space'
# this is a weighted average of modified residues wrt their relative abundance in the combined peptide set

sum_676['AA-NAAF'] = sum_676['A-NAAF'] + sum_676['C-NAAF'] + sum_676['D-NAAF'] + sum_676['E-NAAF'] + \
                     sum_676['F-NAAF'] + sum_676['G-NAAF'] + sum_676['H-NAAF'] + sum_676['I-NAAF'] + \
                     sum_676['K-NAAF'] + sum_676['L-NAAF'] + sum_676['M-NAAF'] + sum_676['N-NAAF'] + \
                     sum_676['P-NAAF'] + sum_676['Q-NAAF'] + sum_676['R-NAAF'] + sum_676['S-NAAF'] + \
                     sum_676['T-NAAF'] + sum_676['V-NAAF'] + sum_676['W-NAAF'] + sum_676['Y-NAAF'] 

sum_676['AA-modifiable-NAAF'] = sum_676['M-NAAF'] + sum_676['N-NAAF'] + sum_676['Q-NAAF']
                                

sum_676['Cys. w/ carb.'] = sum_676['C-carb-NAAF'] / sum_676['C-NAAF'] #1
sum_676['Met. w/ oxid.'] = sum_676['M-oxid-NAAF'] / sum_676['M-NAAF'] #2
sum_676['Asn. w/ deam.'] = sum_676['N-deam-NAAF'] / sum_676['N-NAAF'] #3
sum_676['Glu. w/ deam.'] = sum_676['Q-deam-NAAF'] / sum_676['Q-NAAF'] #3


sum_676['Overall modified'] = 0.001*((sum_676['Met. w/ oxid.']*sum_676['M-NAAF']) + \
                                     (sum_676['Asn. w/ deam.']*sum_676['N-NAAF']) + \
                                     (sum_676['Glu. w/ deam.']*sum_676['Q-NAAF']) / (sum_676['AA-modifiable-NAAF'])) 
                                                            

# write to a csv T8 combined data directory in /processed
sum_676.to_csv("processed/NAAF-sums/NAAF-sum-676.csv")

sum_676.head()

Unnamed: 0,NAAF,A-NAAF,C-NAAF,D-NAAF,E-NAAF,F-NAAF,G-NAAF,H-NAAF,I-NAAF,K-NAAF,...,M-oxid-NAAF,N-deam-NAAF,Q-deam-NAAF,AA-NAAF,AA-modifiable-NAAF,Cys. w/ carb.,Met. w/ oxid.,Asn. w/ deam.,Glu. w/ deam.,Overall modified
676,213.394672,154.97724,1.020673,1.836981,2.758512,1.102961,9.200438,2.076114,34.657824,3.469712,...,0.432756,0.590057,0.090918,1784.559873,42.946912,1.0,0.185964,0.014678,0.216262,0.001025
