### Sorting and visualizing UniPept lowest common ancestor analyses in Trocas7

#### Beginning with: LCA peptides in csvs

#### Goal: spectral abundance-corrected (NAAF) taxonomic peptide compositions at the 4 stations before and after 24hrs

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
cd /home/millieginty/Documents/git-repos/amazon/analyses/T7-incubations/unipept/

/home/millieginty/Documents/git-repos/amazon/analyses/T7-incubations/unipept


In [4]:
# Load the cleaned UniPept LCA table for sample 206 (CV, T00, GD).
CV_T00_GD_206a = pd.read_csv("lca/cleaned/206_CV_T00_GD_lca.csv")

# Drop rows whose kingdom is 'Metazoa' (treated here as trypsin hits).
# Rows with no kingdom assigned (NaN) are kept by this comparison.
CV_T00_GD_206b = CV_T00_GD_206a.loc[CV_T00_GD_206a['kingdom'].ne('Metazoa')]

# Keep only peptides whose LCA resolves at least to the phylum level.
CV_T00_GD_206 = CV_T00_GD_206b.loc[CV_T00_GD_206b['phylum'].notna()]

# Report how many peptides survive the phylum-level filter.
print('# of phylum peptides = ', len(CV_T00_GD_206))

CV_T00_GD_206.head(6)

# of phylum peptides =  12


Unnamed: 0,peptide,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,...,tribe,subtribe,genus,subgenus,species group,species subgroup,species,subspecies,varietas,forma
0,VVEVSLPR,Planctomycetales,Bacteria,,,,Planctomycetes,,,Planctomycetia,...,,,,,,,,,,
7,LATVLSPR,Gammaproteobacteria,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,...,,,,,,,,,,
12,VATVSPLR,Solirubrobacteraceae,Bacteria,,,,Actinobacteria,,,Thermoleophilia,...,,,,,,,,,,
17,QEFLNAAK,Phorcysia thermohydrogeniphila,Bacteria,,,,Aquificae,,,Aquificae,...,Phorcysia,,,,Phorcysia thermohydrogeniphila,,,,,
27,VGHWLPR,Streptoalloteichus hindustanus,Bacteria,,,,Actinobacteria,,,Actinomycetia,...,Streptoalloteichus,,,,Streptoalloteichus hindustanus,,,,,
68,MTNGNFK,Moraxella,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,...,Moraxella,,,,,,,,,


### Now I want to read in the file containing the stripped peptides with NAAF values
####  - NAAF stands for 'normalized area abundance factor'

### I want to join the dataframes if they share an index (stripped peptide with equated leucine and isoleucines)
#### - That means I'll reindex the processed peptide file

In [5]:
# Load the stripped (I/L-equated) peptides with their Area and NAAF values for sample 206.
CV_T00_GD_206_NAAFa = pd.read_csv("/home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations/processed/I-L_NAAFs/206_CV_T00_GD_DN50_ILnaafs.csv")
# Optional second input (206B), currently disabled:
#CV_T00_GD_206_NAAFb = pd.read_csv("/home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations/processed/I-L_NAAFs/206B_CV_T00_GD_DN50_ILnaafs.csv")

# Only one file is loaded right now; keeping the list makes it easy to add more inputs to the concat.
frames = [CV_T00_GD_206_NAAFa]

CV_T00_GD_206_NAAF = pd.concat(frames, sort=False)

# Drop the leftover 'Unnamed...' columns and rename the key column so it matches
# the 'peptide' column of the UniPept LCA table.
# The previous bare `set_index('stripped_peptide')` call discarded its result and so did
# nothing; it has been removed. The index is set later, when the frames are joined.
# errors='raise' keeps the cell failing loudly (KeyError) if 'stripped_peptide' is missing.
CV_T00_GD_206_NAAF = (
    CV_T00_GD_206_NAAF
    .loc[:, ~CV_T00_GD_206_NAAF.columns.str.contains('^Unnamed')]
    .rename(columns={'stripped_peptide': 'peptide'}, errors='raise')
)

print('# of total peptides = ', len(CV_T00_GD_206_NAAF))

print('column names:', CV_T00_GD_206_NAAF.columns)

CV_T00_GD_206_NAAF.head()

# of total peptides =  283
column names: Index(['peptide', 'Area', 'NAAF_num.'], dtype='object')


Unnamed: 0,peptide,Area,NAAF_num.
0,VVEVSLPR,1990000.0,248750.0
1,VLEGNEQFLNAAK,681000.0,52384.62
2,LSSPATLNSR,73100000.0,7310000.0
3,LSSPATLNSR,73100000.0,7310000.0
4,LSSPATLNSR,73100000.0,7310000.0


In [6]:
# Peptides found in both tables: the phylum-level UniPept hits and the NAAF peptide list.
# np.intersect1d returns the sorted, unique values common to both inputs.
over = np.intersect1d(
    CV_T00_GD_206["peptide"],
    CV_T00_GD_206_NAAF["peptide"],
)

print(over)

['APVLSMR' 'EFHPGLAR' 'LATVLSPR' 'QEFLNAAK' 'VATVSPLR' 'VGHWLPR'
 'VPGVDLPR' 'VVEVSLPR']


In [7]:
# Attach Area / NAAF values to the phylum-level peptides with a left join on the peptide sequence.
# Since a couple are de novo only (more for bacteria), not every UniPept peptide has a match;
# the left join keeps those rows with NaN in the NAAF columns.

# Index both frames by peptide. Guarded (and rebound rather than mutated in place) so the
# cell can be re-run: once 'peptide' is the index it is no longer a column, and a second
# set_index('peptide') would raise KeyError.
if CV_T00_GD_206.index.name != 'peptide':
    CV_T00_GD_206 = CV_T00_GD_206.set_index('peptide')
if CV_T00_GD_206_NAAF.index.name != 'peptide':
    CV_T00_GD_206_NAAF = CV_T00_GD_206_NAAF.set_index('peptide')

# NOTE(review): the NAAF table can list the same peptide on several rows, so the join emits
# one output row per matching NAAF row and the row count below can exceed the number of
# phylum-level peptides. Confirm whether duplicates should be collapsed before summing NAAFs.
CV_T00_GD_206_Phy = CV_T00_GD_206.join(CV_T00_GD_206_NAAF, how='left', rsuffix='_other')


print('# of total phylum-level peptides = ', len(CV_T00_GD_206_Phy))

CV_T00_GD_206_Phy.head()

# of total phylum-level peptides =  23


Unnamed: 0_level_0,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,subclass,...,genus,subgenus,species group,species subgroup,species,subspecies,varietas,forma,Area,NAAF_num.
peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APVLSMR,Brevundimonas,Bacteria,,,,Proteobacteria,,,Alphaproteobacteria,,...,,,,,,,,,49900.0,7128.571
EFHPGLAR,Oleiphilus messinensis,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,,...,,,Oleiphilus messinensis,,,,,,13600.0,1700.0
FDHLPAAR,Proteobacteria,Bacteria,,,,Proteobacteria,,,,,...,,,,,,,,,,
LAPLLNVFK,Croceitalea dokdonensis DOKDO 023,Bacteria,,,,Bacteroidetes,,,Flavobacteriia,,...,,,Croceitalea dokdonensis,,Croceitalea dokdonensis DOKDO 023,,,,,
LATVLSPR,Gammaproteobacteria,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,,...,,,,,,,,,42100000.0,5262500.0


In [9]:
# Save the joined phylum-level table (peptide index plus taxonomy and NAAF columns) as a CSV.
# The path is relative to the current working directory.
CV_T00_GD_206_Phy.to_csv(path_or_buf="lca/NAAF/CV_T00_GD_206_Phy_naaf.csv")