## Unipept lowest common ancestor peptide analysis:

### This tool returns the taxonomic lowest common ancestor for a given tryptic peptide. Here we're running all our de novo (PeaksDN), database search (Comet) and de novo-assisted database searched peptides through to determine their specificity and ability to ID organismal and functional source.

### You can run the `pept2lca` command as part of a [web server](https://unipept.ugent.be/datasets) or using the command line interface ([info here](https://unipept.ugent.be/clidocs)).

### I exported the LCA results to a .csv and placed into my /analysis/unipept directory:

In [1]:
cd /home/millieginty/Documents/git-repos/2017-etnp/analyses/pronovo-2020/unipept/SKQ17-PTMopt/

/home/millieginty/Documents/git-repos/2017-etnp/analyses/pronovo-2020/unipept/SKQ17-PTMopt


In [2]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

### 231: 100 m 0.3 um McLane pump MSMS

### There were phylum level peptide IDs from the PeaksDB, Comet, and de novo peptides for this sample.

In [3]:
ls De-novo/

231-dn80-unipept-phy-NAAF.csv  273-PTMopt-DN80_lca.csv
231-PTMopt-DN80_lca.csv        273-PTMopt-DN80-lca-tax.csv
231-PTMopt-DN80-lca-tax.csv    278-PTMopt-DN80_lca.csv
233-PTMopt-DN80_lca.csv        278-PTMopt-DN80-lca-tax.csv
233-PTMopt-DN80-lca-tax.csv    378-PTMopt-DN80_lca.csv
243-PTMopt-DN80_lca.csv        378-PTMopt-DN80-lca-tax.csv
243-PTMopt-DN80-lca-tax.csv    [0m[01;34mPhylum-out[0m/


In [28]:
dn80_lca231 = pd.read_csv('De-novo/231-PTMopt-DN80-lca-tax.csv')

# drop the peptide that aren't specific to the phylum level
dn80_phy231 = dn80_lca231[dn80_lca231.phylum.notnull()]

# drop everything else
dn80_phy231 = dn80_phy231[['peptide', 'phylum']].copy()

print('Peptides specific to the Phylum level:', len(dn80_phy231))
phylum = len(dn80_phy231)

#Cyanobacteria
print('Peptide specific to Cyanobacteria:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Cyanobacteria')]))

Cyanobacteria = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Cyanobacteria')])

#Fungi and fungi-like
print('Peptide specific to Ascomycota:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Ascomycota')]))
print('Peptide specific to Basidiomycota:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Basidiomycota')]))
print('Peptide specific to Chytridiomycota:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Chytridiomycota')]))
print('Peptide specific to Mucoromycota:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Mucoromycota')]))
print('Peptide specific to Zoopagomycota:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Zoopagomycota')]))

Ascomycota = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Ascomycota')])
Basidiomycota = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Basidiomycota')])
Chytridiomycota= len(dn80_phy231[dn80_phy231['phylum'].str.contains('Chytridiomycota')])
Mucoromycota = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Mucoromycota')])
Zoopagomycota = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Zoopagomycota')])

#Hetertrophic bacteria
print('Peptide specific to Actinobacteria:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Actinobacteria')]))
print('Peptide specific to Bacteroidetes:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Bacteroidetes')]))
print('Peptide specific to Firmicutes:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Firmicutes')]))
print('Peptide specific to Proteobacteria:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Proteobacteria')]))
print('Peptide specific to Candidatus Marinimicrobia:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Candidatus Marinimicrobia')]))

Actinobacteria = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Actinobacteria')])
Bacteroidetes = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Bacteroidetes')])
Firmicutes = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Firmicutes')])
Proteobacteria = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Proteobacteria')])
Candidatus_Marinimicrobia = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Candidatus Marinimicrobia')])

#Nitrospina
print('Peptide specific to Nitrospinae:', len(dn80_phy231[dn80_phy231['phylum'].str.contains('Nitrospinae')]))

Nitrospinae = len(dn80_phy231[dn80_phy231['phylum'].str.contains('Nitrospinae')])

# make a dictionary of phylum output
phy_db80_231 = {"Cyanobacteria" : Cyanobacteria, "Ascomycota" : Ascomycota, "Basidiomycota" : Basidiomycota,\
                   "Chytridiomycota" : Chytridiomycota, "Mucoromycota" : Mucoromycota, \
                "Zoopagomycota" : Zoopagomycota, "Actinobacteria" : Actinobacteria, "Bacteroidetes" : Bacteroidetes, \
               "Firmicutes" : Firmicutes, "Proteobacteria" : Proteobacteria, "Nitrospinae" : Nitrospinae, \
              "Candidatus Marinimicrobia" : Candidatus_Marinimicrobia}

# make phylum specific dataframes
dn80_231_cyano = dn80_phy231[dn80_phy231['phylum'].str.contains('Cyanobacteria')]
dn80_231_fungi = dn80_phy231[dn80_phy231['phylum'].str.contains('Ascomycota|Basidiomycota|Chytridiomycota|Mucoromycota|Zoopagomycota')]
dn80_231_hetb = dn80_phy231[dn80_phy231['phylum'].str.contains('Actinobacteria|Bacteroidetes|Firmicutes|Proteobacteria|Candidatus Marinimicrobia')]
dn80_231_nitro = dn80_phy231[dn80_phy231['phylum'].str.contains('Nitrospinae')]

# save as a csv
dn80_phy231.to_csv("De-novo/Phylum-out/231/231-PTMopt-DN80-lca-phy.csv")
dn80_231_cyano.to_csv("De-novo/Phylum-out/231/231-PTMopt-DN80-lca-cyano.csv")
dn80_231_fungi.to_csv("De-novo/Phylum-out/231/231-PTMopt-DN80-lca-fungi.csv")
dn80_231_hetb.to_csv("De-novo/Phylum-out/231/231-PTMopt-DN80-lca-hetb.csv")
dn80_231_nitro.to_csv("De-novo/Phylum-out/231/231-PTMopt-DN80-lca-nitro.csv")

dn80_231_hetb.head()

Peptides specific to the Phylum level: 265
Peptide specific to Cyanobacteria: 15
Peptide specific to Ascomycota: 27
Peptide specific to Basidiomycota: 5
Peptide specific to Chytridiomycota: 2
Peptide specific to Mucoromycota: 1
Peptide specific to Zoopagomycota: 1
Peptide specific to Actinobacteria: 21
Peptide specific to Bacteroidetes: 24
Peptide specific to Firmicutes: 24
Peptide specific to Proteobacteria: 73
Peptide specific to Candidatus Marinimicrobia: 0
Peptide specific to Nitrospinae: 3


Unnamed: 0,peptide,phylum
2,LTGVLYGR,Firmicutes
37,LTGNLYGR,Proteobacteria
41,LATVLSPR,Proteobacteria
70,TGGAFVEK,Proteobacteria
73,WSVLQDR,Proteobacteria


In [29]:
phy_db80_231

{'Cyanobacteria': 15,
 'Ascomycota': 27,
 'Basidiomycota': 5,
 'Chytridiomycota': 2,
 'Mucoromycota': 1,
 'Zoopagomycota': 1,
 'Actinobacteria': 21,
 'Bacteroidetes': 24,
 'Firmicutes': 24,
 'Proteobacteria': 73,
 'Nitrospinae': 3,
 'Candidatus Marinimicrobia': 0}

In [30]:
pdb_lca231 = pd.read_csv('PeaksDB/231-PTMopt-PeaksDB-lca-tax.csv')

# drop the peptide that aren't specific to the phylum level
pdb_phy231 = pdb_lca231[pdb_lca231.phylum.notnull()]

# drop everything else
pdb_phy231 = pdb_phy231[['peptide', 'phylum']].copy()

print('Peptides specific to the Phylum level:', len(pdb_phy231))
phylum = len(pdb_phy231)

#Cyanobacteria
print('Peptide specific to Cyanobacteria:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Cyanobacteria')]))

Cyanobacteria = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Cyanobacteria')])

#Fungi and fungi-like
print('Peptide specific to Ascomycota:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Ascomycota')]))
print('Peptide specific to Basidiomycota:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Basidiomycota')]))
print('Peptide specific to Chytridiomycota:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Chytridiomycota')]))
print('Peptide specific to Mucoromycota:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Mucoromycota')]))
print('Peptide specific to Zoopagomycota:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Zoopagomycota')]))

Ascomycota = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Ascomycota')])
Basidiomycota = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Basidiomycota')])
Chytridiomycota= len(pdb_phy231[pdb_phy231['phylum'].str.contains('Chytridiomycota')])
Mucoromycota = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Mucoromycota')])
Zoopagomycota = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Zoopagomycota')])

#Hetertrophic bacteria
print('Peptide specific to Actinobacteria:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Actinobacteria')]))
print('Peptide specific to Bacteroidetes:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Bacteroidetes')]))
print('Peptide specific to Firmicutes:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Firmicutes')]))
print('Peptide specific to Proteobacteria:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Proteobacteria')]))
print('Peptide specific to Candidatus Marinimicrobia:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Candidatus Marinimicrobia')]))

Actinobacteria = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Actinobacteria')])
Bacteroidetes = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Bacteroidetes')])
Firmicutes = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Firmicutes')])
Proteobacteria = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Proteobacteria')])
Candidatus_Marinimicrobia = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Candidatus Marinimicrobia')])

#Nitrospina
print('Peptide specific to Nitrospinae:', len(pdb_phy231[pdb_phy231['phylum'].str.contains('Nitrospinae')]))

Nitrospinae = len(pdb_phy231[pdb_phy231['phylum'].str.contains('Nitrospinae')])

# make a dictionary of phylum output
phy_pdb_231 = {"Cyanobacteria" : Cyanobacteria, "Ascomycota" : Ascomycota, "Basidiomycota" : Basidiomycota,\
                   "Chytridiomycota" : Chytridiomycota, "Mucoromycota" : Mucoromycota, \
                "Zoopagomycota" : Zoopagomycota, "Actinobacteria" : Actinobacteria, "Bacteroidetes" : Bacteroidetes, \
               "Firmicutes" : Firmicutes, "Proteobacteria" : Proteobacteria, "Nitrospinae" : Nitrospinae, \
              "Candidatus Marinimicrobia" : Candidatus_Marinimicrobia}

# make phylum specific dataframes
pdb_231_cyano = pdb_phy231[pdb_phy231['phylum'].str.contains('Cyanobacteria')]
pdb_231_fungi = pdb_phy231[pdb_phy231['phylum'].str.contains('Ascomycota|Basidiomycota|Chytridiomycota|Mucoromycota|Zoopagomycota')]
pdb_231_hetb = pdb_phy231[pdb_phy231['phylum'].str.contains('Actinobacteria|Bacteroidetes|Firmicutes|Proteobacteria|Candidatus Marinimicrobia')]
pdb_231_nitro = pdb_phy231[pdb_phy231['phylum'].str.contains('Nitrospinae')]

# save as a csv
pdb_phy231.to_csv("PeaksDB/Phylum-out/231/231-PTMopt-pdb-lca-phy.csv")
pdb_231_cyano.to_csv("PeaksDB/Phylum-out/231/231-PTMopt-pdb-lca-cyano.csv")
pdb_231_fungi.to_csv("PeaksDB/Phylum-out/231/231-PTMopt-pdb-lca-fungi.csv")
pdb_231_hetb.to_csv("PeaksDB/Phylum-out/231/231-PTMopt-pdb-lca-hetb.csv")
pdb_231_nitro.to_csv("PeaksDB/Phylum-out/231/231-PTMopt-pdb-lca-nitro.csv")

pdb_phy231.head()

Peptides specific to the Phylum level: 164
Peptide specific to Cyanobacteria: 75
Peptide specific to Ascomycota: 6
Peptide specific to Basidiomycota: 0
Peptide specific to Chytridiomycota: 0
Peptide specific to Mucoromycota: 0
Peptide specific to Zoopagomycota: 0
Peptide specific to Actinobacteria: 3
Peptide specific to Bacteroidetes: 2
Peptide specific to Firmicutes: 1
Peptide specific to Proteobacteria: 40
Peptide specific to Candidatus Marinimicrobia: 0
Peptide specific to Nitrospinae: 17


Unnamed: 0,peptide,phylum
8,LVVGGPYSSVSDASSGLDGSQK,Cyanobacteria
9,YLGSTGGLLNSAETEEK,Cyanobacteria
11,ALTTGVDYAQGLVALGGDDK,Cyanobacteria
15,YNSGEGGCFYSVDTLEAPWNSGR,Cyanobacteria
21,TQFYNDEPEALEYGENFLVHR,Nitrospinae


In [31]:
comet_lca231 = pd.read_csv('Comet/231-PTMopt-Comet-lca-tax.csv')

# drop the peptide that aren't specific to the phylum level
comet_phy231 = comet_lca231[comet_lca231.phylum.notnull()]

# drop everything else
comet_phy231 = comet_phy231[['peptide', 'phylum']].copy()

print('Peptides specific to the Phylum level:', len(comet_phy231))
phylum = len(comet_phy231)

#Cyanobacteria
print('Peptide specific to Cyanobacteria:', len(comet_phy231[comet_phy231['phylum'].str.contains('Cyanobacteria')]))

Cyanobacteria = len(comet_phy231[comet_phy231['phylum'].str.contains('Cyanobacteria')])

#Fungi and fungi-like
print('Peptide specific to Ascomycota:', len(comet_phy231[comet_phy231['phylum'].str.contains('Ascomycota')]))
print('Peptide specific to Basidiomycota:', len(comet_phy231[comet_phy231['phylum'].str.contains('Basidiomycota')]))
print('Peptide specific to Chytridiomycota:', len(comet_phy231[comet_phy231['phylum'].str.contains('Chytridiomycota')]))
print('Peptide specific to Mucoromycota:', len(comet_phy231[comet_phy231['phylum'].str.contains('Mucoromycota')]))
print('Peptide specific to Zoopagomycota:', len(comet_phy231[comet_phy231['phylum'].str.contains('Zoopagomycota')]))

Ascomycota = len(comet_phy231[comet_phy231['phylum'].str.contains('Ascomycota')])
Basidiomycota = len(comet_phy231[comet_phy231['phylum'].str.contains('Basidiomycota')])
Chytridiomycota= len(comet_phy231[comet_phy231['phylum'].str.contains('Chytridiomycota')])
Mucoromycota = len(comet_phy231[comet_phy231['phylum'].str.contains('Mucoromycota')])
Zoopagomycota = len(comet_phy231[comet_phy231['phylum'].str.contains('Zoopagomycota')])

#Hetertrophic bacteria
print('Peptide specific to Actinobacteria:', len(comet_phy231[comet_phy231['phylum'].str.contains('Actinobacteria')]))
print('Peptide specific to Bacteroidetes:', len(comet_phy231[comet_phy231['phylum'].str.contains('Bacteroidetes')]))
print('Peptide specific to Firmicutes:', len(comet_phy231[comet_phy231['phylum'].str.contains('Firmicutes')]))
print('Peptide specific to Proteobacteria:', len(comet_phy231[comet_phy231['phylum'].str.contains('Proteobacteria')]))
print('Peptide specific to Candidatus Marinimicrobia:', len(comet_phy231[comet_phy231['phylum'].str.contains('Candidatus Marinimicrobia')]))

Actinobacteria = len(comet_phy231[comet_phy231['phylum'].str.contains('Actinobacteria')])
Bacteroidetes = len(comet_phy231[comet_phy231['phylum'].str.contains('Bacteroidetes')])
Firmicutes = len(comet_phy231[comet_phy231['phylum'].str.contains('Firmicutes')])
Proteobacteria = len(comet_phy231[comet_phy231['phylum'].str.contains('Proteobacteria')])
Candidatus_Marinimicrobia = len(comet_phy231[comet_phy231['phylum'].str.contains('Candidatus Marinimicrobia')])

#Nitrospina
print('Peptide specific to Nitrospinae:', len(comet_phy231[comet_phy231['phylum'].str.contains('Nitrospinae')]))

Nitrospinae = len(comet_phy231[comet_phy231['phylum'].str.contains('Nitrospinae')])

# make a dictionary of phylum output
phy_comet_231 = {"Cyanobacteria" : Cyanobacteria, "Ascomycota" : Ascomycota, "Basidiomycota" : Basidiomycota,\
                   "Chytridiomycota" : Chytridiomycota, "Mucoromycota" : Mucoromycota, \
                "Zoopagomycota" : Zoopagomycota, "Actinobacteria" : Actinobacteria, "Bacteroidetes" : Bacteroidetes, \
               "Firmicutes" : Firmicutes, "Proteobacteria" : Proteobacteria, "Nitrospinae" : Nitrospinae, \
              "Candidatus Marinimicrobia" : Candidatus_Marinimicrobia}


# make phylum specific dataframes
comet_231_cyano = comet_phy231[comet_phy231['phylum'].str.contains('Cyanobacteria')]
comet_231_fungi = comet_phy231[comet_phy231['phylum'].str.contains('Ascomycota|Basidiomycota|Chytridiomycota|Mucoromycota|Zoopagomycota')]
comet_231_hetb = comet_phy231[comet_phy231['phylum'].str.contains('Actinobacteria|Bacteroidetes|Firmicutes|Proteobacteria|Candidatus Marinimicrobia')]
comet_231_nitro = comet_phy231[comet_phy231['phylum'].str.contains('Nitrospinae')]

# save as a csv
comet_phy231.to_csv("Comet/Phylum-out/231/231-PTMopt-comet-lca-phy.csv")
comet_231_cyano.to_csv("Comet/Phylum-out/231/231-PTMopt-comet-lca-cyano.csv")
comet_231_fungi.to_csv("Comet/Phylum-out/231/231-PTMopt-comet-lca-fungi.csv")
comet_231_hetb.to_csv("Comet/Phylum-out/231/231-PTMopt-comet-lca-hetb.csv")
comet_231_nitro.to_csv("Comet/Phylum-out/231/231-PTMopt-comet-lca-nitro.csv")

comet_phy231.head()

Peptides specific to the Phylum level: 14
Peptide specific to Cyanobacteria: 10
Peptide specific to Ascomycota: 0
Peptide specific to Basidiomycota: 0
Peptide specific to Chytridiomycota: 0
Peptide specific to Mucoromycota: 0
Peptide specific to Zoopagomycota: 0
Peptide specific to Actinobacteria: 0
Peptide specific to Bacteroidetes: 0
Peptide specific to Firmicutes: 0
Peptide specific to Proteobacteria: 0
Peptide specific to Candidatus Marinimicrobia: 0
Peptide specific to Nitrospinae: 4


Unnamed: 0,peptide,phylum
10,TNQNVGLDPETLALATPAR,Cyanobacteria
13,YLGSTGGLLNSAETEEK,Cyanobacteria
21,LFLSPVDSVVR,Cyanobacteria
33,AYTALLDLKPGDEFQLK,Cyanobacteria
36,YNSGEGGCFYSVDTLEAPWNSGR,Cyanobacteria


In [32]:
# made a dataframe from the peaks dn and peaks db dictionaries

phy_231 = pd.DataFrame({'phy_db80_231':pd.Series(phy_db80_231),'phy_pdb_231':pd.Series(phy_pdb_231), \
                        'phy_comet_231':pd.Series(phy_comet_231)})

uni_231 = phy_231.T

uni_231['Fungi tot'] = uni_231['Ascomycota'] + uni_231['Basidiomycota'] + uni_231['Chytridiomycota'] + \
                        uni_231['Mucoromycota'] + uni_231['Zoopagomycota']

uni_231['Het tot'] = uni_231['Actinobacteria'] + uni_231['Bacteroidetes'] + uni_231['Firmicutes'] + \
                        uni_231['Proteobacteria'] + uni_231['Candidatus Marinimicrobia']


uni_231.head()

Unnamed: 0,Cyanobacteria,Ascomycota,Basidiomycota,Chytridiomycota,Mucoromycota,Zoopagomycota,Actinobacteria,Bacteroidetes,Firmicutes,Proteobacteria,Nitrospinae,Candidatus Marinimicrobia,Fungi tot,Het tot
phy_db80_231,15,27,5,2,1,1,21,24,24,73,3,0,36,142
phy_pdb_231,75,6,0,0,0,0,3,2,1,40,17,0,6,46
phy_comet_231,10,0,0,0,0,0,0,0,0,0,4,0,0,0


## Get NAAF values in order to correct UniPept peptides:

In [33]:
# read in the PeaksDB NAAF corrected peptides (from PeaksDB nb)
peaksdb231_whole = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDB-PTMopt/231/ETNP_SKQ17_PEAKSDB20_231-100m-0.3-JA2_15ppm.csv")

print(peaksdb231_whole.columns)

# winnow down to just what we need
peaksdb231 = peaksdb231_whole[['stripped I-L', 'Peptide', 'Accession', 'Area', \
                              'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'c-carb',
       'm-oxid', 'n-deam', 'q-deam', 'k-hydr', 'r-meth', 'stripped length', 'NAAF num.']].copy()

# rename PeaksDB UniPept columns
pdb_phy231.columns = ['stripped I-L', 'phylum']

# inner join of PeaksDB NAAF containing df and PeaksDB Unipept df
pdb231 = pd.concat([peaksdb231, pdb_phy231], axis=1, join='inner')

print('Peptides found between the two:', len(pdb231))

# write to a csv

pdb231.to_csv("PeaksDB/231-PeaksDB-unipept-phy-NAAF.csv")

pdb231.head()

Index(['Unnamed: 0', 'Peptide', '-10lgP', 'Mass', 'Length', 'ppm', 'm/z', 'RT',
       'Area', 'Fraction', 'Scan', 'Source File', '#Spec', '#Spec.1',
       'Accession', 'PTM', 'AScore', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'c-carb',
       'm-oxid', 'n-deam', 'q-deam', 'k-hydr', 'r-meth', 'stripped peptide',
       'stripped length', 'NAAF num.', 'ptm-total', 'stripped I-L'],
      dtype='object')
Peptides found between the two: 164


Unnamed: 0,stripped I-L,Peptide,Accession,Area,A,C,D,E,F,G,...,c-carb,m-oxid,n-deam,q-deam,k-hydr,r-meth,stripped length,NAAF num.,stripped I-L.1,phylum
8,AGAGDDEVNAGSGDDLVR,AGAGDDEVNAGSGDDIVR,ETNP_120m_PROKKA_15747,7880000.0,3,0,4,1,0,4,...,0,0,0,0,0,0,18,437777.777778,LVVGGPYSSVSDASSGLDGSQK,Cyanobacteria
9,ALSADSSGGFLGGAELSQLK,AISADSSGGFIGGAELSQLK,ETNP_90m_PROKKA_138320,1030000.0,3,0,1,1,1,4,...,0,0,0,0,0,0,20,51500.0,YLGSTGGLLNSAETEEK,Cyanobacteria
11,TNYFGLQGTDNGNLTNSFAESELERA,TNYFGIQGTDNGNLTNSFAESELERA,NP_100m_PROKKA_25090:ETNP_140m_PROKKA_82407:ET...,3790000.0,2,0,1,3,2,3,...,0,0,0,0,0,0,26,145769.230769,ALTTGVDYAQGLVALGGDDK,Cyanobacteria
15,GGQPLFFGEGTYANLSQTAR,GGQPLFFGEGTYANLSQTAR,ETNP_100m_particle_PROKKA_120141:WP_011129980:...,3800000.0,2,0,0,1,2,4,...,0,0,0,0,0,0,20,190000.0,YNSGEGGCFYSVDTLEAPWNSGR,Cyanobacteria
21,YNSGEGGCFYSVDTLEAPWNSGR,YNSGEGGC(+57.02)FYSVDTIEAPWNSGR,WP_011129980:WP_036915357,1780000.0,1,1,1,2,1,4,...,1,0,0,0,0,0,23,77391.304348,TQFYNDEPEALEYGENFLVHR,Nitrospinae


In [34]:
# read in the de novo NAAF corrected peptides (from de novo nb)
dn80231_whole = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/PeaksDN-PTMopt/231/ETNP-SKQ17-231-100m-0.3-JA2_PTMopt_15ppm_DN50.csv")

print(dn80231_whole.columns)

# winnow down to just what we need
dn80_231 = dn80231_whole[['stripped peptide', 'Peptide', 'Area', \
                              'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'c-carb',
       'm-oxid', 'n-deam', 'q-deam', 'k-hydr', 'r-meth', 'stripped length', 'NAAF num.']].copy()

# rename the orignial de novo peptides
dn80_231.columns = ['stripped I-L', 'Peptide', 'Area', \
                              'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'c-carb',
       'm-oxid', 'n-deam', 'q-deam', 'k-hydr', 'r-meth', 'stripped length', 'NAAF num.']

# rename dn80 UniPept columns
dn80_phy231.columns = ['stripped I-L', 'phylum']

# inner join of dn80 NAAF containing df and dn80 Unipept df
dn80231 = pd.concat([dn80_231, dn80_phy231], axis=1, join='inner')

print('Peptides found between the two:', len(dn80231))

# write to a csv

dn80231.to_csv("De-novo/231-dn80-unipept-phy-NAAF.csv")

dn80231.head()

Index(['Unnamed: 0', 'Fraction', 'Scan', 'Source File', 'Peptide',
       'Tag Length', 'ALC (%)', 'length', 'm/z', 'z', 'RT', 'Area', 'Mass',
       'ppm', 'PTM', 'local confidence (%)', 'tag (>=0%)', 'mode', 'A', 'C',
       'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S',
       'T', 'V', 'W', 'Y', 'c-carb', 'm-oxid', 'n-deam', 'q-deam', 'k-hydr',
       'r-meth', 'stripped peptide', 'stripped length', 'ptm-total',
       'NAAF num.'],
      dtype='object')
Peptides found between the two: 265


Unnamed: 0,stripped I-L,Peptide,Area,A,C,D,E,F,G,H,...,c-carb,m-oxid,n-deam,q-deam,k-hydr,r-meth,stripped length,NAAF num.,stripped I-L.1,phylum
2,LTGVLYGR,LTGVLYGR,11200000.0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,8,1400000.0,LTGVLYGR,Firmicutes
4,FYEALLEAK,FYEALLEAK,4190000.0,2,0,0,2,1,0,0,...,0,0,0,0,0,0,9,465555.6,FYEALLEAK,Acidobacteria
13,QQLAASPR,QQLAASPR,3990000.0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,8,498750.0,QQLAASPR,Chlorophyta
37,TSGEAFR,TSGEAFR,1760000.0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,7,251428.6,LTGNLYGR,Proteobacteria
41,LNLNNASVR,LN(+.98)LNNASVR,13500000.0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,9,1500000.0,LATVLSPR,Proteobacteria


In [35]:
# read in the Comet NAAF corrected peptides (from Comet nb)

comet231_whole = pd.read_csv("/home/millieginty/Documents/git-repos/2017-etnp/data/pro2020/ETNP-SKQ17/TPP-PTMopt/ETNP-SKQ17-TPP-PTM-opt_15ppm_FUNGI/231/ETNP-SKQ17-231-100m-0.3-JA2_PTMopt_Comet15_1FDR_peptides.csv")

print(comet231_whole.columns)

comet_231 = comet231_whole[['stripped I-L', 'peptide', 'protein', 'precursor_intensity',\
                              'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'c-carb',
       'm-oxid', 'n-deam', 'q-deam', 'k-hydr', 'r-meth', 'stripped length', 'NAAF num.']].copy()

# rename comet UniPept columns
comet_phy231.columns = ['stripped I-L', 'phylum']

# inner join of comet NAAF containing df and comet Unipept df
comet231 = pd.concat([comet_231, comet_phy231], axis=1, join='inner')

print('Peptides found between the two:', len(comet231))

# write to a csv

comet231.to_csv("Comet/231-comet-unipept-phy-NAAF.csv")

comet231.head(10)

Index(['Unnamed: 0', 'spectrum', 'xcorr', 'deltacn', 'expect', 'ions',
       'peptide', 'calc_neutral_pep_mass', 'precursor_intensity', 'protein',
       'L terminus', 'R terminus', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K',
       'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'c-carb',
       'm-oxid', 'n-deam', 'q-deam', 'k-hydr', 'r-meth', 'stripped peptide',
       'stripped length', 'NAAF num.', 'ptm-total', 'stripped I-L'],
      dtype='object')
Peptides found between the two: 14


Unnamed: 0,stripped I-L,peptide,protein,precursor_intensity,A,C,D,E,F,G,...,c-carb,m-oxid,n-deam,q-deam,k-hydr,r-meth,stripped length,NAAF num.,stripped I-L.1,phylum
10,LPQVEGTGGDVQPSQDLVR,R.LPQVEGTGGDVQPSQDLVR.V,gi|54036848|sp|P63284.1|CLPB_ECOLI,89316600.0,0,0,2,1,0,3,...,0,0,0,0,0,0,19,4700874.0,TNQNVGLDPETLALATPAR,Cyanobacteria
13,VAQEAVDAVYAVHPNGAVGR,K.VAQEAVDAVYAVHPNGAVGR.Q,"ETNP_120m_PROKKA_239026,ETNP_100m_particle_PRO...",867407.0,5,0,1,1,0,2,...,0,0,0,0,0,0,20,43370.35,YLGSTGGLLNSAETEEK,Cyanobacteria
21,ALSSGAGNSDFVVGPER,A.AISSGAGNSDFVVGPER.S,"ETNP_100m_PROKKA_57551,ETNP_90m_PROKKA_17321,E...",1994170.0,3,0,1,1,1,3,...,0,0,0,0,0,0,17,117304.1,LFLSPVDSVVR,Cyanobacteria
33,MNKADLVNLVAAR,-.M[147.04]NKADLVNLVAAR.T,"ETNP_120m_PROKKA_53703,ETNP_120m_PROKKA_328578...",13894700.0,3,0,1,0,0,0,...,0,1,0,0,0,0,13,1068823.0,AYTALLDLKPGDEFQLK,Cyanobacteria
36,DLEGDVDVTFNDASTK,K.DLEGDVDVTFNDASTK.L,ETNP_120m_PROKKA_213547,813032.0,1,0,4,1,1,1,...,0,0,0,0,0,0,16,50814.5,YNSGEGGCFYSVDTLEAPWNSGR,Cyanobacteria
38,VLGQNEAVDAVSNALR,R.VIGQNEAVDAVSNAIR.R,gi|54036848|sp|P63284.1|CLPB_ECOLI,84607800.0,3,0,1,1,0,1,...,0,0,0,0,0,0,16,5287988.0,LAENAGANGAVVAENVK,Cyanobacteria
44,GGSGNDTLNGGGGDDLLR,R.GGSGNDTINGGGGDDLIR.M,ETNP_120m_PROKKA_213547,1452480.0,0,0,3,0,0,7,...,0,0,0,0,0,0,18,80693.33,LVVGGPYSSVSDASSGLDGSQK,Cyanobacteria
58,CDTYTTDKSGLAAQR,R.C[160.03]DTYTTDKSGLAAQR.T,"ETNP_120m_PROKKA_281969,ETNP_120m_PROKKA_31399...",677639.0,2,1,2,0,0,1,...,1,0,0,0,0,0,15,45175.93,LDLNNASVR,Cyanobacteria
60,DNFTVTAPELALNEGFDR,K.DNFTVTAPEIALNEGFDR.L,"ETNP_120m_PROKKA_24357,ETNP_120m_PROKKA_126325...",2823130.0,2,0,2,2,2,1,...,0,0,0,0,0,0,18,156840.6,SAQAHGLLPLVCVGESDEQR,Cyanobacteria
64,LAENAGANGAVVAENVK,R.IAENAGANGAVVAENVK.S,"ETNP_120m_PROKKA_99758,ETNP_120m_PROKKA_174478...",843605.0,5,0,0,2,0,2,...,0,0,0,0,0,0,17,49623.82,VTVEEPFYVRPEEHPGAL,Nitrospinae


In [41]:
### Sum NAAF numerators and get NAAF corrected taxa % 

# Get the NAAF numerator totals for phylum-level ID'd peptides (NAAF num = Area/length)

com_NAAFsum = comet231['NAAF num.'].sum()

# comet
# sum the NAAF numerators for each catergory

cometNAAF_231_cyano = comet231[comet231['phylum'].str.contains('Cyanobacteria')]
cometNAAF_231_fungi = comet231[comet231['phylum'].str.contains('Ascomycota|Basidiomycota|Chytridiomycota|Mucoromycota|Zoopagomycota')]
cometNAAF_231_hetb = comet231[comet231['phylum'].str.contains('Actinobacteria|Bacteroidetes|Firmicutes|Proteobacteria|Candidatus Marinimicrobia')]
cometNAAF_231_nitro = comet231[comet231['phylum'].str.contains('Nitrospinae')]

cometNAAF231_sum_cyano = cometNAAF_231_cyano['NAAF num.'].sum()
cometNAAF231_sum_fungi = cometNAAF_231_fungi['NAAF num.'].sum()
cometNAAF231_sum_hetb = cometNAAF_231_hetb['NAAF num.'].sum()
cometNAAF231_sum_nitro = cometNAAF_231_nitro['NAAF num.'].sum()

print("% Comet Cyano = ", (cometNAAF231_sum_cyano / com_NAAFsum)*100)
print("% Comet Fungi = ", (cometNAAF231_sum_fungi / com_NAAFsum)*100)
print("% Comet Het. Bacteria = ", (cometNAAF231_sum_hetb / com_NAAFsum)*100)
print("% Comet Nitrospina = ", (cometNAAF231_sum_nitro / com_NAAFsum)*100)
print("% Comet covered by these catergories = ", ((cometNAAF231_sum_cyano + cometNAAF231_sum_fungi + \
                                             cometNAAF231_sum_hetb + cometNAAF231_sum_nitro)/com_NAAFsum)*100)

% Comet Cyano =  96.53831352505613
% Comet Fungi =  0.0
% Comet Het. Bacteria =  0.0
% Comet Nitrospina =  3.461686474943874
% Comet covered by these catergories =  100.0


In [42]:
### Sum NAAF numerators and get NAAF corrected taxa % 

# Get the NAAF numerator totals for phylum-level ID'd peptides (NAAF num = Area/length)

pdb_NAAFsum = pdb231['NAAF num.'].sum()

# PeaksDB
# sum the NAAF numerators for Cyanobacteria peptides

pdbNAAF_231_cyano = pdb231[pdb231['phylum'].str.contains('Cyanobacteria')]
pdbNAAF_231_fungi = pdb231[pdb231['phylum'].str.contains('Ascomycota|Basidiomycota|Chytridiomycota|Mucoromycota|Zoopagomycota')]
pdbNAAF_231_hetb = pdb231[pdb231['phylum'].str.contains('Actinobacteria|Bacteroidetes|Firmicutes|Proteobacteria|Candidatus Marinimicrobia')]
pdbNAAF_231_nitro = pdb231[pdb231['phylum'].str.contains('Nitrospinae')]

pdbNAAF231_sum_cyano = pdbNAAF_231_cyano['NAAF num.'].sum()
pdbNAAF231_sum_fungi = pdbNAAF_231_fungi['NAAF num.'].sum()
pdbNAAF231_sum_hetb = pdbNAAF_231_hetb['NAAF num.'].sum()
pdbNAAF231_sum_nitro = pdbNAAF_231_nitro['NAAF num.'].sum()

print("% pdb Cyano = ", (pdbNAAF231_sum_cyano / pdb_NAAFsum)*100)
print("% pdb Fungi = ", (pdbNAAF231_sum_fungi / pdb_NAAFsum)*100)
print("% pdb Het. Bacteria = ", (pdbNAAF231_sum_hetb / pdb_NAAFsum)*100)
print("% pdb Nitrospina = ", (pdbNAAF231_sum_nitro / pdb_NAAFsum)*100)
print("% pdb covered by these catergories = ", ((pdbNAAF231_sum_cyano + pdbNAAF231_sum_fungi + \
                                             pdbNAAF231_sum_hetb + pdbNAAF231_sum_nitro)/pdb_NAAFsum)*100)

% pdb Cyano =  76.1100429308966
% pdb Fungi =  3.9040372731237465
% pdb Het. Bacteria =  8.33955541090671
% pdb Nitrospina =  4.211732530189916
% pdb covered by these catergories =  92.56536814511695


In [43]:
### Sum NAAF numerators and get NAAF corrected taxa % 

# Get the NAAF numerator totals for phylum-level ID'd peptides (NAAF num = Area/length)

dn80_NAAFsum = dn80231['NAAF num.'].sum()

# PeaksDB
# sum the NAAF numerators for Cyanobacteria peptides

dn80NAAF_231_cyano = dn80231[dn80231['phylum'].str.contains('Cyanobacteria')]
dn80NAAF_231_fungi = dn80231[dn80231['phylum'].str.contains('Ascomycota|Basidiomycota|Chytridiomycota|Mucoromycota|Zoopagomycota')]
dn80NAAF_231_hetb = dn80231[dn80231['phylum'].str.contains('Actinobacteria|Bacteroidetes|Firmicutes|Proteobacteria|Candidatus Marinimicrobia')]
dn80NAAF_231_nitro = dn80231[dn80231['phylum'].str.contains('Nitrospinae')]

dn80NAAF231_sum_cyano = dn80NAAF_231_cyano['NAAF num.'].sum()
dn80NAAF231_sum_fungi = dn80NAAF_231_fungi['NAAF num.'].sum()
dn80NAAF231_sum_hetb = dn80NAAF_231_hetb['NAAF num.'].sum()
dn80NAAF231_sum_nitro = dn80NAAF_231_nitro['NAAF num.'].sum()

print("% dn80 Cyano = ", (dn80NAAF231_sum_cyano / dn80_NAAFsum)*100)
print("% dn80 Fungi = ", (dn80NAAF231_sum_fungi / dn80_NAAFsum)*100)
print("% dn80 Het. Bacteria = ", (dn80NAAF231_sum_hetb / dn80_NAAFsum)*100)
print("% dn80 Nitrospina = ", (dn80NAAF231_sum_nitro / dn80_NAAFsum)*100)
print("% dn80 covered by these catergories = ", ((dn80NAAF231_sum_cyano + dn80NAAF231_sum_fungi + \
                                             dn80NAAF231_sum_hetb + dn80NAAF231_sum_nitro)/dn80_NAAFsum)*100)

% dn80 Cyano =  23.261588380841925
% dn80 Fungi =  3.8591469937694334
% dn80 Het. Bacteria =  58.46178024167429
% dn80 Nitrospina =  0.7559061221171564
% dn80 covered by these catergories =  86.33842173840279


In [51]:
# calculate a weighted average (based on # of peptides) of taxonomic peptide contribution
# of all method (Comet, PeaksDB, de novo)

comet_peps = len(comet231)
pdb_peps = len(pdb231)
dn_peps = len(dn80231)

ave_231_cyano = ((comet_peps*(cometNAAF231_sum_cyano / com_NAAFsum)) + \
                (pdb_peps*(pdbNAAF231_sum_cyano / pdb_NAAFsum)) + \
                (dn_peps*(dn80NAAF231_sum_cyano / dn80_NAAFsum))) / (comet_peps + pdb_peps + dn_peps)


ave_231_fungi = ((comet_peps*(cometNAAF231_sum_fungi / com_NAAFsum)) + \
                (pdb_peps*(pdbNAAF231_sum_fungi / pdb_NAAFsum)) + \
                (dn_peps*(dn80NAAF231_sum_fungi / dn80_NAAFsum))) / (comet_peps + pdb_peps + dn_peps)

ave_231_hetb = ((comet_peps*(cometNAAF231_sum_hetb / com_NAAFsum)) + \
                (pdb_peps*(pdbNAAF231_sum_hetb / pdb_NAAFsum)) + \
                (dn_peps*(dn80NAAF231_sum_hetb / dn80_NAAFsum))) / (comet_peps + pdb_peps + dn_peps)


ave_231_nitro = ((comet_peps*(cometNAAF231_sum_nitro / com_NAAFsum)) + \
                (pdb_peps*(pdbNAAF231_sum_nitro / pdb_NAAFsum)) + \
                (dn_peps*(dn80NAAF231_sum_nitro / dn80_NAAFsum))) / (comet_peps + pdb_peps + dn_peps)


print("Weighted ave. contribution of Cyano peptides =", ave_231_cyano*100)
print("Weighted ave. contribution of Fungi peptides =", ave_231_fungi*100)
print("Weighted ave. contribution of heterotrophic bac. peptides =", ave_231_hetb*100)
print("Weighted ave. contribution of Nitrospina peptides =", ave_231_nitro*100)

Weighted ave. contribution of Cyano peptides = 45.141996277519056
Weighted ave. contribution of Fungi peptides = 3.753806018377414
Weighted ave. contribution of heterotrophic bac. peptides = 38.05882359239817
Weighted ave. contribution of Nitrospina peptides = 2.1207739683101736
