### Sorting and visualizing UniPept lowest common ancestor analyses in Trocas7

#### Beginning with: LCA peptides in csvs

#### Goal: spectral abundance-corrected (NAAF) taxonomic peptide compositions at the 4 stations before and after 24hrs

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
cd /home/millieginty/Documents/git-repos/amazon/analyses/T7-incubations/unipept/

/home/millieginty/Documents/git-repos/amazon/analyses/T7-incubations/unipept


In [3]:
# Load the cleaned UniPept LCA table for sample 306 (CV, T24, GF).
CV_T24_GF_306a = pd.read_csv("lca/cleaned/306_CV_T24_GF_lca.csv")

# Drop every row whose kingdom is Metazoa (the notebook attributes these hits to trypsin).
CV_T24_GF_306b = CV_T24_GF_306a.loc[CV_T24_GF_306a['kingdom'].ne('Metazoa')]

# Retain only the rows that carry a phylum assignment.
CV_T24_GF_306 = CV_T24_GF_306b.loc[CV_T24_GF_306b['phylum'].notna()]

# Report how many peptides are resolved to phylum level, then preview six rows.
print('# of phylum peptides = ', CV_T24_GF_306.shape[0])

CV_T24_GF_306.head(6)

# of phylum peptides =  21


Unnamed: 0,peptide,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,...,tribe,subtribe,genus,subgenus,species group,species subgroup,species,subspecies,varietas,forma
11,LATVLSPR,Gammaproteobacteria,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,...,,,,,,,,,,
16,VATVSPLR,Solirubrobacteraceae,Bacteria,,,,Actinobacteria,,,Thermoleophilia,...,,,,,,,,,,
19,LDGHTDNTGPR,Acinetobacter,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,...,Acinetobacter,,,,,,,,,
28,VFATLTGSR,Proteobacteria,Bacteria,,,,Proteobacteria,,,,...,,,,,,,,,,
41,LTLEWENK,Acinetobacter,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,...,Acinetobacter,,,,,,,,,
138,LLHLHYSK,Megasphaera,Bacteria,,,,Firmicutes,,,Negativicutes,...,Megasphaera,,,,,,,,,


### Now I want to read in the file containing the stripped peptides with NAAF values
#### - NAAF stands for 'normalized area abundance factor'

### I want to join the dataframes where they share an index (the stripped peptide sequence, with leucine and isoleucine treated as equivalent)
#### - That means I'll reindex the processed peptide file

In [4]:
# Directory holding the per-replicate NAAF tables (I/L-equated stripped peptides).
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
NAAF_DIR = "/home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations/processed/I-L_NAAFs"

# One table per replicate file (306A, 306B, 306C).
CV_T24_GF_306_NAAFa = pd.read_csv(NAAF_DIR + "/306A_CV_T24_GF_DN50_ILnaafs.csv")
CV_T24_GF_306_NAAFb = pd.read_csv(NAAF_DIR + "/306B_CV_T24_GF_DN50_ILnaafs.csv")
CV_T24_GF_306_NAAFc = pd.read_csv(NAAF_DIR + "/306C_CV_T24_GF_DN50_ILnaafs.csv")

frames = [CV_T24_GF_306_NAAFa, CV_T24_GF_306_NAAFb, CV_T24_GF_306_NAAFc]

# Stack the replicates into one table without mutating anything in place:
#   - ignore_index=True gives unique row labels instead of repeating 0..n per replicate
#   - drop the leftover 'Unnamed: ...' columns written by an earlier to_csv
#   - rename 'stripped_peptide' to 'peptide' so it matches the LCA table's column
# (The previous bare `set_index('stripped_peptide')` call discarded its result,
#  so it had no effect and has been removed.)
CV_T24_GF_306_NAAF = (
    pd.concat(frames, sort=False, ignore_index=True)
    .loc[:, lambda df: ~df.columns.str.contains('^Unnamed')]
    .rename(columns={'stripped_peptide': 'peptide'})
)

print('# of total peptides = ', len(CV_T24_GF_306_NAAF))

print('column names:', CV_T24_GF_306_NAAF.columns)

CV_T24_GF_306_NAAF.head()

# of total peptides =  586
column names: Index(['peptide', 'Area', 'NAAF_num.'], dtype='object')


Unnamed: 0,peptide,Area,NAAF_num.
0,SCK,242000.0,80666.666667
1,TEELDR,46300.0,7716.666667
2,TEELNR,17600.0,2933.333333
3,AEYENLAEK,315000.0,35000.0
4,SCK,242000.0,80666.666667


In [5]:
# Peptide sequences found in BOTH the phylum-level LCA table and the NAAF table.
# np.intersect1d returns the sorted, unique values common to the two inputs.
over = np.intersect1d(
    CV_T24_GF_306.loc[:, "peptide"].values,
    CV_T24_GF_306_NAAF.loc[:, "peptide"].values,
)

print(over)

['EFHPGLAR' 'EHDLATLK' 'LATVLSPR' 'LDGHTDNTGPR' 'LTLEWENK' 'QEFLNAAK'
 'VATVSPLR' 'VFATLTGSR' 'VFEAPNK' 'YWMAVLK']


In [6]:
# Left-join the phylum-level LCA peptides to their NAAF rows on the peptide sequence.
# Since a couple are de novo only (more for bacteria), not every UniPept peptide has
# a NAAF match; unmatched peptides are kept with NaN in the Area / NAAF columns.
#
# Both frames are indexed on 'peptide' only for the duration of the join
# (set_index returns a new frame here), so CV_T24_GF_306 and CV_T24_GF_306_NAAF
# are left untouched and this cell — and the intersection cell above — can be
# re-run safely. The previous in-place set_index raised KeyError on a second run.
#
# NOTE(review): the NAAF table can hold several rows for the same peptide
# (one per replicate observation), so the joined table may have more rows than
# there are phylum-level peptides.
CV_T24_GF_306_Phy = (
    CV_T24_GF_306.set_index('peptide')
    .join(CV_T24_GF_306_NAAF.set_index('peptide'), how='left', rsuffix='_other')
)


print('# of total phylum-level peptides = ', len(CV_T24_GF_306_Phy))

CV_T24_GF_306_Phy.head()

# of total phylum-level peptides =  49


Unnamed: 0_level_0,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,subclass,...,genus,subgenus,species group,species subgroup,species,subspecies,varietas,forma,Area,NAAF_num.
peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EFHPGLAR,Oleiphilus messinensis,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,,...,,,Oleiphilus messinensis,,,,,,118000.0,14750.0
EFHPGLAR,Oleiphilus messinensis,Bacteria,,,,Proteobacteria,,,Gammaproteobacteria,,...,,,Oleiphilus messinensis,,,,,,618000.0,77250.0
EHDLATLK,Candidatus Rokubacteria,Bacteria,,,,Candidatus Rokubacteria,,,,,...,,,,,,,,,6970.0,871.25
HGLTLFDLK,Desulfuromonas,Bacteria,,,,Proteobacteria,delta/epsilon subdivisions,,Deltaproteobacteria,,...,,,,,,,,,,
LASQLQALLEK,Fibrobacteres,Bacteria,,,,Fibrobacteres,,,,,...,,,,,,,,,,


In [7]:
# Save the joined phylum/NAAF table to disk as a csv (row index included by default).
CV_T24_GF_306_Phy.to_csv(path_or_buf="lca/NAAF/CV_T24_GF_306_Phy_naaf.csv")