### Manipulation of PEAKS de novo results of Trocas7 (April, 2019 high water) lower Amazon River proteomics LC-MS/MS data using Python.

Starting with:

PEAKS 8.5 de novo results (.csv) of all combined samples sequencing >580% ALC
from Thermo Fusion tribrid runs at the UW Proteomics Resource center, April 2021
combined from multiple injections

Goal:

Files with stripped (no PTMs) peptide lists and
Columns with #'s of each modification in every sequence
Column with stripped peptide lengths (# amino acids)

In [1]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

In [2]:
cd /home/millieginty/Documents/git-repos/amazon/data/TROCAS7_Fusion_Apr2021_DENOVO-75-all-samples/

/home/millieginty/Documents/git-repos/amazon/data/TROCAS7_Fusion_Apr2021_DENOVO-75-all-samples


In [3]:
ls

Apr21-all-DN50.csv


In [12]:
# read the CSVs into a dataframe using the read_csv function and call 'peaks'

peaks = pd.read_csv("/home/millieginty/Documents/git-repos/amazon/data/TROCAS7_Fusion_Apr2021_DENOVO-75-all-samples/Apr21-all-DN50.csv")

print("# redundant Peaks peptides >50% ALC in combined dataframe:", len(peaks))

print(peaks.columns)

# These columns mess things up- get rid of them

del peaks['Fraction']
del peaks['Scan']
del peaks['Source File']
del peaks['Tag Length']
del peaks['PTM']
del peaks['tag (>=0%)']
del peaks['mode']
del peaks['local confidence (%)']


columns = ['Peptide', 'ALC', 'length', 'm/z', 'z', 'RT', 'Area',
       'Mass', 'ppm']

peaks.columns = columns

mean_len = peaks['length'].mean()
print(mean_len)

# look at the dataframe
peaks.head()

# redundant Peaks peptides >50% ALC in combined dataframe: 7201
Index(['Fraction', 'Scan', 'Source File', 'Peptide', 'Tag Length', 'ALC (%)',
       'length', 'm/z', 'z', 'RT', 'Area', 'Mass', 'ppm', 'PTM',
       'local confidence (%)', 'tag (>=0%)', 'mode'],
      dtype='object')
10.145535342313568


Unnamed: 0,Peptide,ALC,length,m/z,z,RT,Area,Mass,ppm
0,EC(+57.02)ELSC(+57.02)K,99,7,463.1921,2,37.66,70400.0,924.3681,1.6
1,VLEGNEQFLNAAK,99,13,716.8768,2,75.43,3970000.0,1431.7358,2.3
2,VLEGNEQFLNAAK,99,13,716.8758,2,71.3,618000.0,1431.7358,0.8
3,SC(+57.02)ELSC(+57.02)K,99,7,442.1857,2,22.79,242000.0,882.3575,-0.7
4,EC(+57.02)ELSC(+57.02)K,98,7,463.1911,2,37.47,48500.0,924.3681,-0.4


In [11]:
# use a count function to enumerate the # of A's (alanines) in each peptide
peaks['A'] = peaks['Peptide'].str.count("A")

# use a count function to enumerate the # of C's (cysteines) in each peptide
peaks['C'] = peaks['Peptide'].str.count("C")

# use a count function to enumerate the # of D's (aspartic acids) in each peptide
peaks['D'] = peaks['Peptide'].str.count("D")

# use a count function to enumerate the # of E's (glutamic acids) in each peptide
peaks['E'] = peaks['Peptide'].str.count("E")

# use a count function to enumerate the # of F's (phenylalanines) in each peptide
peaks['F'] = peaks['Peptide'].str.count("F")

# use a count function to enumerate the # of G's (glycines) in each peptide
peaks['G'] = peaks['Peptide'].str.count("G")

# use a count function to enumerate the # of H's (histidines) in each peptide
peaks['H'] = peaks['Peptide'].str.count("H")

# use a count function to enumerate the # of I's (isoleucines) in each peptide
# in peaks output, there will be no isoleucines (they're lumped in with leucines)
peaks['I'] = peaks['Peptide'].str.count("I")

# use a count function to enumerate the # of K's (lysines) in each peptide
peaks['K'] = peaks['Peptide'].str.count("K")

# use a count function to enumerate the # of L's (leucines) in each peptide
# also these include the isoleucines
peaks['L'] = peaks['Peptide'].str.count("L")

# use a count function to enumerate the # of M's (methionines) in each peptide
peaks['M'] = peaks['Peptide'].str.count("M")

# use a count function to enumerate the # of N's (asparagines) in each peptide
peaks['N'] = peaks['Peptide'].str.count("N")

# use a count function to enumerate the # of P's ([prolines]) in each peptide
peaks['P'] = peaks['Peptide'].str.count("P")

# use a count function to enumerate the # of Q's (glutamines) in each peptide
peaks['Q'] = peaks['Peptide'].str.count("Q")

# use a count function to enumerate the # of R's (arginines) in each peptide
peaks['R'] = peaks['Peptide'].str.count("R")

# use a count function to enumerate the # of S's (serines) in each peptide
peaks['S'] = peaks['Peptide'].str.count("S")

# use a count function to enumerate the # of T's (threonines) in each peptide
peaks['T'] = peaks['Peptide'].str.count("T")

# use a count function to enumerate the # of V's (valines) in each peptide
peaks['V'] = peaks['Peptide'].str.count("V")

# use a count function to enumerate the # of W's (tryptophans) in each peptide
peaks['W'] = peaks['Peptide'].str.count("W")

# use a count function to enumerate the # of Y's (tyrosines) in each peptide
peaks['Y'] = peaks['Peptide'].str.count("Y")

# use a count function to enumerate the # of carbamidomethylated C's in each peptide
peaks['c-carb'] = peaks['Peptide'].str.count("57.02")

# use a count function to enumerate the # of oxidized M's in each peptide
peaks['m-oxid'] = peaks['Peptide'].apply(lambda x: x.count('M(+15.99)'))

# use a count function to enumerate the # of oxidized K's in each peptide
#peaks['k-oxid'] = peaks['Peptide'].apply(lambda x: x.count('K(+15.99)'))

# use a count function to enumerate the # of oxidized P's in each peptide
#peaks['p-oxid'] = peaks['Peptide'].apply(lambda x: x.count('P(+15.99)'))

# use a count function to enumerate the # of oxidized R's in each peptide
#peaks['r-oxid'] = peaks['Peptide'].apply(lambda x: x.count('R(+15.99)'))

# use a count function to enumerate the # of oxidized Y's in each peptide
#peaks['y-oxid'] = peaks['Peptide'].apply(lambda x: x.count('Y(+15.99)'))

# use a lamba function to enumerate the # of deamidated N's in each peptide
peaks['n-deam'] = peaks['Peptide'].apply(lambda x: x.count('N(+.98)'))

# use a count function to enumerate the # of deamidated Q's in each peptide
peaks['q-deam'] = peaks['Peptide'].apply(lambda x: x.count('Q(+.98)'))

# use a count function to enumerate the # of methylated K's in each peptide
#peaks['k-meth'] = peaks['Peptide'].apply(lambda x: x.count('K(+14.02)'))

# use a count function to enumerate the # of methylated R's in each peptide
#peaks['r-meth'] = peaks['Peptide'].apply(lambda x: x.count('R(+14.02)'))

# use a count function to enumerate the # of pyro glu Q's in each peptide
#peaks['q-pyro'] = peaks['Peptide'].apply(lambda x: x.count('Q(-17.03)'))

# use a count function to enumerate the # of acetylation of K's in each peptide
#peaks['k-acet'] = peaks['Peptide'].apply(lambda x: x.count('K(+42.01)'))

# create a column with 'stripped' peptide sequences using strip
peaks['stripped_peptide'] = peaks['Peptide'].str.replace(r"\(.*\)","")

# add a column with the stripped peptide length (number of AAs)
peaks['stripped_length'] = peaks['stripped_peptide'].apply(len)

# total the number of modifications in sequence
peaks['ptm-total'] = peaks['c-carb'] + peaks['m-oxid'] + peaks['n-deam'] + peaks['q-deam'] 

# calculate NAAF numerator for each peptide k
peaks['NAAF_num.'] = peaks['Area'] / peaks['stripped_length']

# write modified dataframe to new txt file, same name + 'stripped'
peaks.to_csv("/home/millieginty/Documents/git-repos/amazon/data/processed/TROCAS7_Fusion_Apr2021-all-samples/Apr21-peaks76-DN50-peptide-proc.csv")

# check out the results
peaks.head()

Unnamed: 0,Peptide,ALC,length,m/z,z,RT,Area,Mass,ppm,A,...,W,Y,c-carb,m-oxid,n-deam,q-deam,stripped_peptide,stripped_length,ptm-total,NAAF_num.
0,EC(+57.02)ELSC(+57.02)K,99,7,463.1921,2,37.66,70400.0,924.3681,1.6,0,...,0,0,2,0,0,0,ECK,3,2,23466.666667
1,VLEGNEQFLNAAK,99,13,716.8768,2,75.43,3970000.0,1431.7358,2.3,2,...,0,0,0,0,0,0,VLEGNEQFLNAAK,13,0,305384.615385
2,VLEGNEQFLNAAK,99,13,716.8758,2,71.3,618000.0,1431.7358,0.8,2,...,0,0,0,0,0,0,VLEGNEQFLNAAK,13,0,47538.461538
3,SC(+57.02)ELSC(+57.02)K,99,7,442.1857,2,22.79,242000.0,882.3575,-0.7,0,...,0,0,2,0,0,0,SCK,3,2,80666.666667
4,EC(+57.02)ELSC(+57.02)K,98,7,463.1911,2,37.47,48500.0,924.3681,-0.4,0,...,0,0,2,0,0,0,ECK,3,2,16166.666667
