### Sorting and visualizing UniPept lowest common ancestor analyses over diatom degradation experiment

#### Beginning with: LCA peptides in csvs

#### Goal: spectral abundance-corrected (NAAF) algal and bacterial peptide over the 4 timepoints

## Issue: the `join` commands at the end for diatom and bacterial peptides need to be run independently from one another. To do this, run the diatom command, restart the kernal, then run the bacterial. 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# read the data into pandas dataframes
tryp_322_DB = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/analyses/unipept/lca/cleaned/T0-322-trypsin-lca-DB.csv")

tryp_322_DB.head(6)

Unnamed: 0,peptide,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,...,tribe,subtribe,genus,subgenus,species,group,species.1,subgroup,species.2,subspecies
0,LPQVEGTGGDVQPSQDLVR,root,,,,,,,,,...,,,,,,,,,,
1,VLGQNEAVDAVSNALR,root,,,,,,,,,...,,,,,,,,,,
2,ALDLLDEAASSLR,root,,,,,,,,,...,,,,,,,,,,
3,VTDAELAEVLAR,root,,,,,,,,,...,,,,,,,,,,
4,STEFDNLLLVGPLAGK,,,,,,,,,,...,,,,,,,,,,
5,SKNVQVFVEK,Bacillariophyta,Eukaryota,,,,Bacillariophyta,,,,...,,,,,,,,,,


In [3]:
# keep only the rows where kingdom = Bacteria

tryp_322_DB_Bac1 = tryp_322_DB.loc[(tryp_322_DB['kingdom'] == 'Bacteria')]

tryp_322_DB_Bac2 = tryp_322_DB.loc[(tryp_322_DB['subkingdom'] == 'Bacteria')]

print('# of kingdom = ', len(tryp_322_DB_Bac1))

print('# of subkingdom = ', len(tryp_322_DB_Bac2))

frames = [tryp_322_DB_Bac1, tryp_322_DB_Bac2]
  
tryp_322_DB_Bac = pd.concat(frames)

print('# of total Bacteria peptides = ', len(tryp_322_DB_Bac))

tryp_322_DB_Bac.head()

# of kingdom =  8
# of subkingdom =  1
# of total Bacteria peptides =  9


Unnamed: 0,peptide,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,...,tribe,subtribe,genus,subgenus,species,group,species.1,subgroup,species.2,subspecies
239,QGYSNPR,Barnesiella,viscericola,Bacteria,,,,Bacteroidetes,,,...,,,,Barnesiella,,,,Barnesiella,viscericola,
329,DSPPALALK,Verrucomicrobia,bacterium,Bacteria,,,,Verrucomicrobia,,,...,,,,,,,,Verrucomicrobia,bacterium,
413,SVDPYDR,Streptomyces,violaceoruber,Bacteria,,,,Actinobacteria,,,...,,,,Streptomyces,,,,Streptomyces,violaceoruber,
548,NDEALAFLR,Chloroflexi,bacterium,Bacteria,,,,Chloroflexi,,,...,,,,,,,,Chloroflexi,bacterium,
612,SGDCTQER,Myxococcaceae,bacterium,Bacteria,,,,Proteobacteria,delta/epsilon,subdivisions,...,Myxococcaceae,,,,,,,,Myxococcaceae,bacterium


In [4]:
# keep only the rows where phylum = Bacteria

tryp_322_DB_Dia1 = tryp_322_DB.loc[(tryp_322_DB['phylum'] == 'Bacillariophyta')]

tryp_322_DB_Dia2 = tryp_322_DB.loc[(tryp_322_DB['subphylum'] == 'Bacillariophyta')]

print('# of phylum = ', len(tryp_322_DB_Dia1))

print('# of subphylum = ', len(tryp_322_DB_Dia2))

frames = [tryp_322_DB_Dia1, tryp_322_DB_Dia2]
  
tryp_322_DB_Dia = pd.concat(frames)

print('# of total diatom peptides = ', len(tryp_322_DB_Dia))

print('colunm names:', tryp_322_DB_Dia.columns)

tryp_322_DB_Dia.head()

# of phylum =  33
# of subphylum =  28
# of total diatom peptides =  61
colunm names: Index(['peptide', 'lca', 'superkingdom', 'kingdom', 'subkingdom',
       'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass',
       'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
       'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe',
       'genus', 'subgenus', 'species', 'group', 'species.1', 'subgroup',
       'species.2', 'subspecies'],
      dtype='object')


Unnamed: 0,peptide,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,...,tribe,subtribe,genus,subgenus,species,group,species.1,subgroup,species.2,subspecies
5,SKNVQVFVEK,Bacillariophyta,Eukaryota,,,,Bacillariophyta,,,,...,,,,,,,,,,
8,SELLCGTDGLPHLVADGR,Bacillariophyta,Eukaryota,,,,Bacillariophyta,,,,...,,,,,,,,,,
10,DVPGTGNEFVGDFR,Bacillariophyta,Eukaryota,,,,Bacillariophyta,,,,...,,,,,,,,,,
12,DVTGEGEFVGDFR,Thalassiosirales,Eukaryota,,,,Bacillariophyta,,,Coscinodiscophyceae,...,,,,,,,,,,
37,NGALDFGWDK,Bacillariophyta,Eukaryota,,,,Bacillariophyta,,,,...,,,,,,,,,,


### Now I want to read in the file containing the stripped peptides with NAAF values
####  - NAAF stands for 'noramlized area abunace factor'

### I want to join the dataframes if they share an index (stripped peptide with equated leucine and isoleucines)
#### - That means I'll reindex the processed peptide file

In [5]:
tryp_322_DB_NAAF = pd.read_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/I-L_NAAFs/T0_322_trypsin_DB_peptides.csv")

tryp_322_DB_NAAF.set_index('stripped_IL')
tryp_322_DB_NAAF = tryp_322_DB_NAAF.loc[:, ~tryp_322_DB_NAAF.columns.str.contains('^Unnamed')]

tryp_322_DB_NAAF.rename(columns = {'stripped_IL':'peptide'}, inplace = True)

print('# of total peptides = ', len(tryp_322_DB_NAAF))

print('column names:', tryp_322_DB_NAAF.columns)

tryp_322_DB_NAAF.head()

# of total peptides =  826
column names: Index(['peptide', 'Area', 'NAAF_num.'], dtype='object')


Unnamed: 0,peptide,Area,NAAF_num.
0,LPQVEGTGGDVQPSQDLVR,134000000.0,7052632.0
1,VLGQNEAVDAVSNALR,142000000.0,8875000.0
2,ALDLLDEAASSLR,171000000.0,13153850.0
3,VTDAELAEVLAR,192000000.0,16000000.0
4,STEFDNLLLVGPLAGK,78300.0,4893.75


In [6]:
# get the intersection of the diatom peptides and corresponding peptides w/ NAAFs

over = np.intersect1d(tryp_322_DB_Dia["peptide"], tryp_322_DB_NAAF["peptide"])

print(over)

#print(tryp_322_DB_Dia[tryp_322_DB_NAAF.peptide.isin(over)])

#print(tryp_322_DB_NAAF[tryp_322_DB_Dia.peptide.isin(over)])

['AFEPVLLLGQDK' 'ALLAGTTGEELER' 'ALQELQHGR' 'APGDFGLDGGFLK' 'AQELANGR'
 'AQLSPEVK' 'ATMEEVYK' 'CSESPAFTK' 'CSESPAFTKR' 'CVLGEGGGR' 'DENVWVPVTK'
 'DFLGSLLQER' 'DVPGTGNEFVGDFR' 'DVTGEGEFVGDFR' 'DYSDLDGAPEER' 'EGLGYSLGR'
 'FGLFSPAVYGAK' 'FSLGPCAER' 'GLDTVLFER' 'GNEFVGDFR' 'KPGDHSLAQGLKGDA'
 'LAQLAFLGNLLTR' 'LGVEEGQTR' 'LKAQELANGR' 'LLFSPWTETDFR' 'LLGNGQLNVPVLVK'
 'LNFAPTLQPR' 'LSDLQTDLR' 'LSQEKFDEYR' 'NFEDLPVNAQR' 'NGALDFGWDK'
 'NGALDFGWDSFDEETK' 'NGFLDFGWDK' 'NGLENYCYSLK' 'QSLLEANVDAR' 'QVSYAMAK'
 'SEGTDLQTK' 'SELLCGTDGLPHLVADGR' 'SETLQGGALR' 'SKNVQVFVEK' 'SLDDEDDETR'
 'SLLAYYAK' 'SLPFLTVPEK' 'SNEETLAWLR' 'SPPALALKQQLER' 'SQLNSETFK'
 'SYSVQSNPEGSR' 'TFNAVLGTPR' 'TLGDELSALKK' 'TLLPVDYYR' 'TSELNNGR'
 'TVAVDGAGK' 'VEGSVPLLK' 'VGSNGVTSACLLK' 'VLGPLMSK' 'VNPGDLLYVQK'
 'VTNLAPLEFPCK' 'VVNAAQEK' 'VYVEDGALR' 'WCPAGLGK' 'YCATDPDNDLDR']


In [7]:
# join the dataframes if the peptide values are the same using 'join'
# since a couple are de novo only (more for bacteria), we won't have all the UniPept peptides overlap 

tryp_322_DB_Dia.set_index('peptide', inplace=True)
tryp_322_DB_NAAF.set_index('peptide', inplace=True)

tryp_322_DB_Diatoms = tryp_322_DB_Dia.join(tryp_322_DB_NAAF, how='left', rsuffix='_other')


print('# of total diatom peptides = ', len(tryp_322_DB_Diatoms))

tryp_322_DB_Diatoms.head()

# of total diatom peptides =  63


Unnamed: 0_level_0,lca,superkingdom,kingdom,subkingdom,superphylum,phylum,subphylum,superclass,class,subclass,...,genus,subgenus,species,group,species.1,subgroup,species.2,subspecies,Area,NAAF_num.
peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFEPVLLLGQDK,Bacillariophyta,Eukaryota,,,,Bacillariophyta,,,,,...,,,,,,,,,6480.0,540.0
ALLAGTTGEELER,Thalassiosira,pseudonana,Eukaryota,,,,Bacillariophyta,,,Coscinodiscophyceae,...,,Thalassiosira,,,,Thalassiosira,pseudonana,,49300.0,3792.307692
ALQELQHGR,Thalassiosira,pseudonana,Eukaryota,,,,Bacillariophyta,,,Coscinodiscophyceae,...,,Thalassiosira,,,,Thalassiosira,pseudonana,,343000.0,38111.111111
APGDFGLDGGFLK,Thalassiosira,pseudonana,Eukaryota,,,,Bacillariophyta,,,Coscinodiscophyceae,...,,Thalassiosira,,,,Thalassiosira,pseudonana,,255000.0,19615.384615
AQELANGR,Fragilariopsis,cylindrus,Eukaryota,,,,Bacillariophyta,,,Bacillariophyceae,...,,Fragilariopsis,,,,Fragilariopsis,cylindrus,,219000.0,27375.0


In [8]:
# write to a csv

tryp_322_DB_Diatoms.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/I-L_NAAFs/T0_322_trypsin_DB_NAAF_diatom_peptides.csv")

In [9]:
# join the dataframes if the peptide values are the same using 'join'
# since a couple are de novo only (more for bacteria), we won't have all the UniPept peptides overlap 

tryp_322_DB_Bac.set_index('peptide', inplace=True)
tryp_322_DB_NAAF.set_index('peptide', inplace=True)

tryp_322_DB_Bacteria = tryp_322_DB_Bac.join(tryp_322_DB_NAAF, how='left', rsuffix='_other')


print('# of total Bacteria peptides = ', len(tryp_322_DB_Bacteria))

tryp_322_DB_Bacteria.head()

KeyError: "None of ['peptide'] are in the columns"

In [None]:
# write to a csv

tryp_322_DB_Bacteria.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/data/processed/I-L_NAAFs/T0_322_trypsin_DB_NAAF_bacteria_peptides.csv")