### Sorting and visualizing UniPept lowest common ancestor analyses in Trocas7

#### Beginning with: LCA peptides in csvs

#### Goal: spectral abundance-corrected (NAAF) taxonomic peptide compositions at the 4 stations before and after 24hrs

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
cd /home/millieginty/Documents/git-repos/amazon/analyses/T7-incubations/unipept/

/home/millieginty/Documents/git-repos/amazon/analyses/T7-incubations/unipept


In [3]:
# Load the cleaned UniPept LCA table into a DataFrame.
CV_T24_GD_406a = pd.read_csv("lca/cleaned/406_CV_T24_GD_lca.csv")

# Metazoa hits are treated as trypsin-derived, so exclude those rows.
not_metazoa = CV_T24_GD_406a['kingdom'].ne('Metazoa')
CV_T24_GD_406b = CV_T24_GD_406a[not_metazoa]

# Retain only the rows that carry a phylum assignment.
has_phylum = CV_T24_GD_406b['phylum'].notna()
CV_T24_GD_406 = CV_T24_GD_406b[has_phylum]

# Report how many rows remain at phylum resolution.
print('# of phylum peptides = ', CV_T24_GD_406.shape[0])

CV_T24_GD_406.head(6)

# of phylum peptides =  24


Unnamed: 0,peptide,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,...,tribe,subtribe,genus,subgenus,species group,species subgroup,species,subspecies,varietas,forma
4,LATVLSPR,Gammaproteobacteria,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,...,,,,,,,,,,
7,VATVSPLR,Solirubrobacteraceae,Bacteria,,,,Actinobacteria,,,Thermoleophilia,...,,,,,,,,,,
10,VFATLTGSR,Proteobacteria,Bacteria,,,,Proteobacteria,,,,...,,,,,,,,,,
12,QEFLNAAK,Phorcysia thermohydrogeniphila,Bacteria,,,,Aquificae,,,Aquificae,...,Phorcysia,,,,Phorcysia thermohydrogeniphila,,,,,
15,NLLTADEK,Calothrix,Bacteria,,,,Cyanobacteria,,,,...,Calothrix,,,,,,,,,
23,LTLEGEESR,Archangium,Bacteria,,,,Proteobacteria,delta/epsilon subdivisions,,Deltaproteobacteria,...,Archangium,,,,,,,,,


### Now I want to read in the file containing the stripped peptides with NAAF values
#### - NAAF stands for 'normalized area abundance factor'

### I want to join the dataframes where they share an index (the stripped peptide, with leucine and isoleucine treated as equivalent)
#### - That means I'll reindex the processed peptide file

In [4]:
# Load the NAAF tables from the 406A and 406B files.
CV_T24_GD_406_NAAFa = pd.read_csv("/home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations/processed/I-L_NAAFs/406A_CV_T24_GD_DN50_ILnaafs.csv")
CV_T24_GD_406_NAAFb = pd.read_csv("/home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations/processed/I-L_NAAFs/406B_CV_T24_GD_DN50_ILnaafs.csv")

frames = [CV_T24_GD_406_NAAFa, CV_T24_GD_406_NAAFb]

# Stack the two tables row-wise. Each file's own row labels are kept, so index
# labels (and possibly peptides) can repeat in the combined frame.
CV_T24_GD_406_NAAF = pd.concat(frames, sort=False)

# Drop the leftover 'Unnamed' columns and rename the peptide column so it matches
# the 'peptide' column of the LCA table. The peptide stays a regular column here
# (the earlier set_index call discarded its result and had no effect); indexing by
# peptide happens in the join cell.
CV_T24_GD_406_NAAF = (
    CV_T24_GD_406_NAAF
    .loc[:, ~CV_T24_GD_406_NAAF.columns.str.contains('^Unnamed')]
    .rename(columns={'stripped_peptide': 'peptide'})
)

print('# of total peptides = ', len(CV_T24_GD_406_NAAF))

print('column names:', CV_T24_GD_406_NAAF.columns)

CV_T24_GD_406_NAAF.head()

# of total peptides =  443
column names: Index(['peptide', 'Area', 'NAAF_num.'], dtype='object')


Unnamed: 0,peptide,Area,NAAF_num.
0,ECK,48500.0,16166.67
1,SCK,61200.0,20400.0
2,LLTADEK,42300.0,6042.857
3,SPATLNSR,160000.0,20000.0
4,LSSPATLNSR,35300000.0,3530000.0


In [5]:
# Find the peptide sequences present in both the phylum-level LCA table and the
# NAAF table (np.intersect1d returns the sorted, unique common values).
phylum_peptides = CV_T24_GD_406["peptide"]
naaf_peptides = CV_T24_GD_406_NAAF["peptide"]

over = np.intersect1d(phylum_peptides, naaf_peptides)

print(over)

['EFHPGLAR' 'FFWSLAR' 'GPSVSLPR' 'LATVLSPR' 'LTLEGEESR' 'NLLTADEK'
 'QEFLNAAK' 'SNNVLALR' 'SPGVSLPR' 'TPSAPMMR' 'TTTVSLPR' 'VATVSPLR'
 'VAVTSAHK' 'VFATLTGSR' 'YGPLAVDK']


In [6]:
# Join the phylum-level LCA peptides to their NAAF values on the peptide sequence.
# Since a couple of peptides are de novo only (more for bacteria), not every UniPept
# peptide has a NAAF match; the left join keeps those rows with empty Area/NAAF.

# Index both frames by peptide. The names are re-bound to the indexed frames rather
# than mutated in place, and the guards make the cell safe to re-run (a second
# in-place set_index would raise KeyError because 'peptide' is no longer a column).
if CV_T24_GD_406.index.name != 'peptide':
    CV_T24_GD_406 = CV_T24_GD_406.set_index('peptide')
if CV_T24_GD_406_NAAF.index.name != 'peptide':
    CV_T24_GD_406_NAAF = CV_T24_GD_406_NAAF.set_index('peptide')

CV_T24_GD_406_Phy = CV_T24_GD_406.join(CV_T24_GD_406_NAAF, how='left', rsuffix='_other')

# NOTE: this is the number of joined rows, not of distinct peptides -- a peptide
# that occurs more than once in either table produces one row per matching pair.
print('# of total phylum-level peptides = ', len(CV_T24_GD_406_Phy))

CV_T24_GD_406_Phy.head()

# of total phylum-level peptides =  44


Unnamed: 0_level_0,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,subclass,...,genus,subgenus,species group,species subgroup,species,subspecies,varietas,forma,Area,NAAF_num.
peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ALTVVDPR,Opitutae,Bacteria,,,,Verrucomicrobia,,,Opitutae,,...,,,,,,,,,,
ATLVSVTPR,Verrucomicrobiales,Bacteria,,,,Verrucomicrobia,,,Verrucomicrobiae,,...,,,,,,,,,,
EFHPGLAR,Oleiphilus messinensis,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,,...,,,Oleiphilus messinensis,,,,,,48200.0,6025.0
FFWSLAR,Bacillaceae,Bacteria,,,,Firmicutes,,,Bacilli,,...,,,,,,,,,15200.0,2171.428571
FLGDVVSK,Deltaproteobacteria,Bacteria,,,,Proteobacteria,delta/epsilon subdivisions,,Deltaproteobacteria,,...,,,,,,,,,,


In [7]:
# Save the joined phylum/NAAF table to disk as a CSV file.
naaf_csv_path = "lca/NAAF/CV_T24_GD_406_Phy_naaf.csv"
CV_T24_GD_406_Phy.to_csv(naaf_csv_path)