## Manipulation of PeaskDB de novo-assisted database search results of Trocas 7 (April, 2019 high water) lower Amazon river proteomics LC-MS/MS data using python.

Starting with:

PeaksDB search results (.csv) of database searches against Henrique's Amazon metagenome (+Hi3)
All samples (duplicates of most) included, so `Area` and `Spectral Counts` columns for each injection
These were all searched with 15 ppm precursor tolerance and 0.5 ppm fragement ion tolerance
Exported at <1.0% FDR

Goal:

Files with stripped (no PTMs) peptide lists and
Columns with #'s of each modification in every sequence
Column with stripped peptide lengths (# amino acids)


In [1]:
cd /home/millieginty/Documents/git-repos/amazon/data/TROCAS7_Fusion_Apr2021_PEAKS_76-all-samples/

/home/millieginty/Documents/git-repos/amazon/data/TROCAS7_Fusion_Apr2021_PEAKS_76-all-samples


In [2]:
ls

Apr21-peaks76-DB-peptide.csv           Apr21-peaks76-DB-proteins.fasta
Apr21-peaks76-DB-protein-peptides.csv  Apr21-peaks76-DB-search-psm.csv
Apr21-peaks76-DB-proteins.csv          Apr21-peaks76-dno.csv


In [3]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [12]:
# read the CSV into a dataframe using the pandas read_csv function
pdb_dup = pd.read_csv("/home/millieginty/Documents/git-repos/amazon/data/TROCAS7_Fusion_Apr2021_PEAKS_76-all-samples/Apr21-peaks76-DB-peptide.csv")

# remove redundant rows
pdb = pd.DataFrame.drop_duplicates(pdb_dup)

print(pdb.columns)

##columns = ['Peptide', '-10lgP', 'Mass', 'Length', 'ppm', 'm/z', 'RT',
       ##'Area', 'Fraction', 'Scan', 'Source_File',
       ##'#Spec', '#Spec', 'Accession', 'PTM',
       ##'AScore']

##peaksdb322.columns = columns

#remmove # spec and accession columns because they mess parsing up

##del peaksdb322['#Spec']
##del peaksdb322['Accession']
del pdb['PTM']
del pdb['AScore']

# get rid of all the spectral count #s, we're fine with Area
pdb = pdb[pdb.columns.drop(list(pdb.filter(regex='Spec')))]

mean_length = pdb['Length'].mean()
print('mean peptide length:', mean_length)

print("# redundant peaksdb peptides in combined dataframe", len(pdb_dup))
print("# nonredundant peaksdb peptides in combined dataframe", len(pdb))

#look at the dataframe
pdb.head()

Index(['Peptide', '-10lgP', 'Mass', 'Length', 'ppm', 'm/z', 'RT',
       'Area Trocas7-302-Bay', 'Area Trocas7-306-Chav',
       'Area Trocas7-310-SMCP', 'Area Trocas7-318-NMCP',
       'Area Trocas7-402-Bay', 'Area Trocas7-406-Chav',
       'Area Trocas7-410-SMCP', 'Area Trocas7-417-NMCP',
       'Area Trocas7-102-Bay', 'Area Trocas7-106-Chav',
       'Area Trocas7-206-Chav', 'Area Trocas7-110-SMCP',
       'Area Trocas7-126-NMCP', 'Area Trocas7-202-Bay',
       'Area Trocas7-210-SMCP', 'Area Trocas7-410-SMCP-DUP',
       'Area Trocas7-226-NMCP', 'Area Trocas7-303-Bay',
       'Area Trocas7-310-SMCP-DUP', 'Area Trocas7-102-Bay-DUP',
       'Area Trocas7-106-Chav-DUP', 'Area Trocas7-302-Bay-DUP',
       'Area Trocas7-306-Chav-DUP', 'Area Trocas7-503-Bay',
       'Area Trocas7-519-NMCP', 'Area Trocas7-318-NMCP-DUP',
       'Area Trocas7-402-Bay-DUP', 'Area Trocas7-406-Chav-DUP',
       'Area Trocas7-417-NMCP-DUP', 'Area Trocas7-307-Chav',
       'Area Trocas7-311-SMCP', 'Area Trocas7-31

Unnamed: 0,Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Area Trocas7-302-Bay,Area Trocas7-306-Chav,Area Trocas7-310-SMCP,...,Area Trocas7-417-NMCP-DUP,Area Trocas7-307-Chav,Area Trocas7-311-SMCP,Area Trocas7-319-NMCP,Area Trocas7-507-Chav,Area Trocas7-511-SMCP,Fraction,Scan,Source File,Accession
0,LGEHNIDVLEGNEQFINAAK,112.16,2210.0967,20,-2.8,1106.0525,95.57,4370000.0,5920000.0,247000.0,...,,18200.0,2030000.0,155000.0,283000.0,1050000.0,112,16192,20210411_Trocas7_668_SMCP311_DDA_120min_1.raw,
1,SC(+57.02)AAAGTEC(+57.02)LISGWGNTK,104.26,1881.835,18,2.1,941.9268,93.16,3330000.0,3860000.0,1730000.0,...,570000.0,3370000.0,3140000.0,6150000.0,608000.0,534000.0,111,16381,20210411_Trocas7_667_Chav307_DDA_120min_1.raw,
2,SSGSSYPSLLQC(+57.02)LK,88.49,1525.7446,14,2.0,763.8811,101.67,371000.0,1150000.0,367000.0,...,,253000.0,178000.0,570000.0,,135000.0,111,18330,20210411_Trocas7_667_Chav307_DDA_120min_1.raw,
3,SGGGGGGGLGSGGSIR,85.72,1231.5905,16,1.9,616.8036,39.3,,,,...,,204000.0,114000.0,270000.0,,,113,6581,20210411_Trocas7_669_NMCP319_DDA_120min_1.raw,
4,RHPYFYAPELLFFAKR,83.71,2054.0889,16,2.1,514.5306,131.89,,,,...,,,,,4900.0,,81,25155,20210411_Trocas7_666_Bay303_DDA_120min_1.raw,


In [13]:
# use a count function to enumerate the # of A's (alanines) in each peptide
pdb['A'] = pdb['Peptide'].str.count("A")

# use a count function to enumerate the # of C's (cysteines) in each peptide
pdb['C'] = pdb['Peptide'].str.count("C")

# use a count function to enumerate the # of D's (aspartic acids) in each peptide
pdb['D'] = pdb['Peptide'].str.count("D")

# use a count function to enumerate the # of E's (glutamic acids) in each peptide
pdb['E'] = pdb['Peptide'].str.count("E")

# use a count function to enumerate the # of F's (phenylalanines) in each peptide
pdb['F'] = pdb['Peptide'].str.count("F")

# use a count function to enumerate the # of G's (glycines) in each peptide
pdb['G'] = pdb['Peptide'].str.count("G")

# use a count function to enumerate the # of H's (histidines) in each peptide
pdb['H'] = pdb['Peptide'].str.count("H")

# use a count function to enumerate the # of I's (isoleucines) in each peptide
# in pdb output, there will be no isoleucines (they're lumped in with leucines)
pdb['I'] = pdb['Peptide'].str.count("I")

# use a count function to enumerate the # of K's (lysines) in each peptide
pdb['K'] = pdb['Peptide'].str.count("K")

# use a count function to enumerate the # of L's (leucines) in each peptide
# also these include the isoleucines
pdb['L'] = pdb['Peptide'].str.count("L")

# use a count function to enumerate the # of M's (methionines) in each peptide
pdb['M'] = pdb['Peptide'].str.count("M")

# use a count function to enumerate the # of N's (asparagines) in each peptide
pdb['N'] = pdb['Peptide'].str.count("N")

# use a count function to enumerate the # of P's ([prolines]) in each peptide
pdb['P'] = pdb['Peptide'].str.count("P")

# use a count function to enumerate the # of Q's (glutamines) in each peptide
pdb['Q'] = pdb['Peptide'].str.count("Q")

# use a count function to enumerate the # of R's (arginines) in each peptide
pdb['R'] = pdb['Peptide'].str.count("R")

# use a count function to enumerate the # of S's (serines) in each peptide
pdb['S'] = pdb['Peptide'].str.count("S")

# use a count function to enumerate the # of T's (threonines) in each peptide
pdb['T'] = pdb['Peptide'].str.count("T")

# use a count function to enumerate the # of V's (valines) in each peptide
pdb['V'] = pdb['Peptide'].str.count("V")

# use a count function to enumerate the # of W's (tryptophans) in each peptide
pdb['W'] = pdb['Peptide'].str.count("W")

# use a count function to enumerate the # of Y's (tyrosines) in each peptide
pdb['Y'] = pdb['Peptide'].str.count("Y")

# use a count function to enumerate the # of carbamidomethylated C's in each peptide
pdb['c-carb'] = pdb['Peptide'].str.count("57.02")

# use a count function to enumerate the # of oxidized M's in each peptide
pdb['m-oxid'] = pdb['Peptide'].apply(lambda x: x.count('M(+15.99)'))

# use a count function to enumerate the # of oxidized K's in each peptide
#pdb['k-oxid'] = pdb['Peptide'].apply(lambda x: x.count('K(+15.99)'))

# use a count function to enumerate the # of oxidized P's in each peptide
#pdb['p-oxid'] = pdb['Peptide'].apply(lambda x: x.count('P(+15.99)'))

# use a count function to enumerate the # of oxidized R's in each peptide
#pdb['r-oxid'] = pdb['Peptide'].apply(lambda x: x.count('R(+15.99)'))

# use a count function to enumerate the # of oxidized Y's in each peptide
#pdb['y-oxid'] = pdb['Peptide'].apply(lambda x: x.count('Y(+15.99)'))

# use a lamba function to enumerate the # of deamidated N's in each peptide
pdb['n-deam'] = pdb['Peptide'].apply(lambda x: x.count('N(+.98)'))

# use a lamba function to enumerate the # of deamidated N's in each peptide
pdb['q-deam'] = pdb['Peptide'].apply(lambda x: x.count('Q(+.98)'))

# use a count function to enumerate the # of methylated K's in each peptide
#pdb['k-meth'] = pdb['Peptide'].apply(lambda x: x.count('K(+14.02)'))

# use a count function to enumerate the # of methylated R's in each peptide
#pdb['r-meth'] = pdb['Peptide'].apply(lambda x: x.count('R(+14.02)'))

# use a count function to enumerate the # of pyro glu Q's in each peptide
#pdb['q-pyro'] = pdb['Peptide'].apply(lambda x: x.count('Q(-17.03)'))

# use a count function to enumerate the # of acetylation of K's in each peptide
#pdb['k-acet'] = pdb['Peptide'].apply(lambda x: x.count('K(+42.01)'))

# create a column with 'stripped' peptide sequences using strip
pdb['stripped_peptide'] = pdb['Peptide'].str.replace(r"\(.*\)","")

# add a column with the stripped peptide length (number of AAs)
pdb['stripped_length'] = pdb['stripped_peptide'].apply(len)

##pdb['NAAF_num.'] = pdb['Area'] / pdb['stripped_length']

# total the number of modifications in sequence
pdb['ptm-total'] = pdb['c-carb'] + pdb['m-oxid'] + pdb['n-deam'] + pdb['q-deam'] 

# turn all isoleucines into leucines
# this helps later in comparing Unipept peptides to PeaksDB and Comet ones
pdb['stripped_IL']= pdb['stripped_peptide'].str.replace('I','L')

# write modified dataframe to new txt file
pdb.to_csv("/home/millieginty/Documents/git-repos/amazon/data/processed/TROCAS7_Fusion_Apr2021-all-samples/Apr21-peaks76-DB-peptide-proc.csv")

# check out the results
pdb.head()

Unnamed: 0,Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Area Trocas7-302-Bay,Area Trocas7-306-Chav,Area Trocas7-310-SMCP,...,W,Y,c-carb,m-oxid,n-deam,q-deam,stripped_peptide,stripped_length,ptm-total,stripped_IL
0,LGEHNIDVLEGNEQFINAAK,112.16,2210.0967,20,-2.8,1106.0525,95.57,4370000.0,5920000.0,247000.0,...,0,0,0,0,0,0,LGEHNIDVLEGNEQFINAAK,20,0,LGEHNLDVLEGNEQFLNAAK
1,SC(+57.02)AAAGTEC(+57.02)LISGWGNTK,104.26,1881.835,18,2.1,941.9268,93.16,3330000.0,3860000.0,1730000.0,...,1,0,2,0,0,0,SCLISGWGNTK,11,2,SCLLSGWGNTK
2,SSGSSYPSLLQC(+57.02)LK,88.49,1525.7446,14,2.0,763.8811,101.67,371000.0,1150000.0,367000.0,...,0,1,1,0,0,0,SSGSSYPSLLQCLK,14,1,SSGSSYPSLLQCLK
3,SGGGGGGGLGSGGSIR,85.72,1231.5905,16,1.9,616.8036,39.3,,,,...,0,0,0,0,0,0,SGGGGGGGLGSGGSIR,16,0,SGGGGGGGLGSGGSLR
4,RHPYFYAPELLFFAKR,83.71,2054.0889,16,2.1,514.5306,131.89,,,,...,0,2,0,0,0,0,RHPYFYAPELLFFAKR,16,0,RHPYFYAPELLFFAKR
