### Combining stripped peptides from *de novo* sequencing and PeaksDB search results for Trocas 7 incubation samples

The dataset:

    16 treatments: 4 stations, 2 timepoints (Time 0 and Time 24 hrs), 2 size fractions (GFF; GF75):
    
    Stations: 
    
        - Macapa South (MS) South stem, upriver
        - Macapa North (MN) North stem, upriver
        - Chaves (CV) South stem, downriver
        - Baylique (BY) North stem, downriver


    Proteomics samples from 2 trips to UWPR (July 2020 on the QE; April 2021 on the Fusion)
    There were at least triplicate samples for each treatment
    Many in April 2021 injected twice

Starting with:

    Peaks de novo results of PTM-optimized sequencing
    PeaksDB de novo-assisted results from PTM-optimized database searches
    
    Multiple samples per treatment

Goal:

    Txt files with combined de novo and PeaksDB for each sample
    
Using:

    - pandas
    - matplotlib
    - numpy

In [1]:
# LIBRARIES
# os: used on the next line to query the current working directory
import os
# NOTE: the returned path is discarded here because this is not the cell's last expression
os.getcwd()
# pandas library for working with tabular data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
# regular expressions (regex)
import re
# check pandas version (shown as the cell output)
pd.__version__

'1.0.5'

In [2]:
cd /home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations/

/home/millieginty/Documents/git-repos/amazon/data/Trocas7-incubations


### 1. Chaves, timepoint 0, size fraction 0.3-0.7 um (denoted as CV_T00_GF)
### T7 samples #s: 105, 106
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/106_CV_T00_GF_PDB
    Trocas-incubations/processed/PeaksDN/106_CV_T00_GF_DN

In [5]:
# Sample 106 (CV_T00_GF): combine the stripped-peptide lists from the three
# injections (106A, 106B, 106C) of both workflows into one per-sample table.
# Per the original notes, the inputs are de novo peptides at >50% ALC (PeaksDN)
# and de novo-assisted database-search peptides at <1% FDR (PeaksDB).

# de novo (PeaksDN) stripped peptides, one headerless text file per injection
peaks50_106a = pd.read_csv("processed/PeaksDN/106_CV_T00_GF_DN/106A_CV_T00_GF_DN50_stripped_peptides.txt", header=None)
peaks50_106b = pd.read_csv("processed/PeaksDN/106_CV_T00_GF_DN/106B_CV_T00_GF_DN50_stripped_peptides.txt", header=None)
peaks50_106c = pd.read_csv("processed/PeaksDN/106_CV_T00_GF_DN/106C_CV_T00_GF_DN50_stripped_peptides.txt", header=None)

# database-search (PeaksDB) stripped peptides, one headerless text file per injection
peaksdb_106a = pd.read_csv("processed/PeaksDB/106_CV_T00_GF_PDB/106A_CV_T00_GF_PDB_stripped_peptides.txt", header=None)
peaksdb_106b = pd.read_csv("processed/PeaksDB/106_CV_T00_GF_PDB/106B_CV_T00_GF_PDB_stripped_peptides.txt", header=None)
peaksdb_106c = pd.read_csv("processed/PeaksDB/106_CV_T00_GF_PDB/106C_CV_T00_GF_PDB_stripped_peptides.txt", header=None)

frames = [peaks50_106a, peaks50_106b, peaks50_106c, peaksdb_106a, peaksdb_106b, peaksdb_106c]

# concatenate dataframes (row-wise; the original per-file row indices are kept)
tot_106 = pd.concat(frames)

# deduplicate
tot_106_nr = tot_106.drop_duplicates()

# The labels previously read "210", a copy-paste slip; this cell handles sample 106.
print('total 106 peptides, redundant', len(tot_106))
print('total 106 peptides, nonredundant', len(tot_106_nr))

# NOTE(review): the redundant table is what gets written; tot_106_nr is only
# used for the count in this cell -- confirm that this is intended.
tot_106.to_csv("processed/stripped_peptides/106_CV_T00_GF_stripped_peptides.txt", header=False, index=False)

tot_106.head()

total 210 peptides, redundant 985
total 210 peptides, nonredundant 678


Unnamed: 0,0
0,LSSPATLNSR
1,LSSPATLNSR
2,LSSPATLNSR
3,LSSPATLDSR
4,LATVLSPR


### 2. Chaves, timepoint 0, size fraction >0.7 um (denoted as CV_T00_GD)
### T7 samples #s:  206
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/206_CV_T00_GD_PDB
    Trocas-incubations/processed/PeaksDN/206_CV_T00_GD_DN

In [6]:
# Sample 206 (CV_T00_GD): merge the stripped-peptide lists from both workflows
# into a single per-sample table. Per the original notes, the inputs are
# de novo peptides at >50% ALC (PeaksDN) and de novo-assisted database-search
# peptides at <1% FDR (PeaksDB).
# Only one file per workflow is read for this sample; the reads for a "206B"
# injection were commented out in the original cell.

# de novo (PeaksDN) stripped peptides
peaks50_206a = pd.read_csv(
    "processed/PeaksDN/206_CV_T00_GD_DN/206_CV_T00_GD_DN50_stripped_peptides.txt",
    header=None,
)

# database-search (PeaksDB) stripped peptides
peaksdb_206a = pd.read_csv(
    "processed/PeaksDB/206_CV_T00_GD_PDB/206_CV_T00_GD_PDB_stripped_peptides.txt",
    header=None,
)

frames = [
    peaks50_206a,
    peaksdb_206a,
]

# stack everything row-wise, then derive the non-redundant view
tot_206 = pd.concat(frames)
tot_206_nr = tot_206.drop_duplicates()

# row counts before and after deduplication
print('total 206 peptides, redundant', tot_206.shape[0])
print('total 206 peptides, nonredundant', tot_206_nr.shape[0])

# NOTE(review): the redundant table is what gets written; tot_206_nr is only
# used for the count in this cell -- confirm that this is intended.
tot_206.to_csv(
    "processed/stripped_peptides/206_CV_T00_GD_stripped_peptides.txt",
    index=False,
    header=False,
)

tot_206.head()

total 206 peptides, redundant 372
total 206 peptides, nonredundant 248


Unnamed: 0,0
0,VVEVSLPR
1,VLEGNEQFLNAAK
2,LSSPATLNSR
3,LSSPATLNSR
4,LSSPATLNSR


### 3. Chaves, timepoint 24 hrs, size fraction 0.3-0.7 um (denoted as CV_T24_GF)
### T7 samples #s: 306A, 306B, 306 (306C)
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/306_CV_T24_GF_PDB
    Trocas-incubations/processed/PeaksDN/306_CV_T24_GF_DN

In [7]:
# Sample 306 (CV_T24_GF): merge the stripped-peptide lists from the three
# injections (306A, 306B, 306C) of both workflows into a single per-sample
# table. Per the original notes, the inputs are de novo peptides at >50% ALC
# (PeaksDN) and de novo-assisted database-search peptides at <1% FDR (PeaksDB).

# de novo (PeaksDN) stripped peptides, one headerless text file per injection
peaks50_306a = pd.read_csv(
    "processed/PeaksDN/306_CV_T24_GF_DN/306A_CV_T24_GF_DN50_stripped_peptides.txt",
    header=None,
)
peaks50_306b = pd.read_csv(
    "processed/PeaksDN/306_CV_T24_GF_DN/306B_CV_T24_GF_DN50_stripped_peptides.txt",
    header=None,
)
peaks50_306c = pd.read_csv(
    "processed/PeaksDN/306_CV_T24_GF_DN/306C_CV_T24_GF_DN50_stripped_peptides.txt",
    header=None,
)

# database-search (PeaksDB) stripped peptides, one headerless text file per injection
peaksdb_306a = pd.read_csv(
    "processed/PeaksDB/306_CV_T24_GF_PDB/306A_CV_T24_GF_PDB_stripped_peptides.txt",
    header=None,
)
peaksdb_306b = pd.read_csv(
    "processed/PeaksDB/306_CV_T24_GF_PDB/306B_CV_T24_GF_PDB_stripped_peptides.txt",
    header=None,
)
peaksdb_306c = pd.read_csv(
    "processed/PeaksDB/306_CV_T24_GF_PDB/306C_CV_T24_GF_PDB_stripped_peptides.txt",
    header=None,
)

frames = [
    peaks50_306a,
    peaks50_306b,
    peaks50_306c,
    peaksdb_306a,
    peaksdb_306b,
    peaksdb_306c,
]

# stack everything row-wise, then derive the non-redundant view
tot_306 = pd.concat(frames)
tot_306_nr = tot_306.drop_duplicates()

# row counts before and after deduplication
print('total 306 peptides, redundant', tot_306.shape[0])
print('total 306 peptides, nonredundant', tot_306_nr.shape[0])

# NOTE(review): the redundant table is what gets written; tot_306_nr is only
# used for the count in this cell -- confirm that this is intended.
tot_306.to_csv(
    "processed/stripped_peptides/306_CV_T24_GF_stripped_peptides.txt",
    index=False,
    header=False,
)

tot_306.head()

total 306 peptides, redundant 871
total 306 peptides, nonredundant 564


Unnamed: 0,0
0,SCK
1,TEELDR
2,TEELNR
3,AEYENLAEK
4,SCK


### 4. Chaves, timepoint 24 hrs, size fraction >0.7 um (denoted as CV_T24_GD)
### T7 samples #s: 406A, 406B
### Exported NAAF and stripped peptides contained in the following directories:

    Trocas-incubations/processed/PeaksDB/406_CV_T24_GD_PDB
    Trocas-incubations/processed/PeaksDN/406_CV_T24_GD_DN

In [8]:
# Sample 406 (CV_T24_GD): merge the stripped-peptide lists from the two
# injections (406A, 406B) of both workflows into a single per-sample table.
# Per the original notes, the inputs are de novo peptides at >50% ALC (PeaksDN)
# and de novo-assisted database-search peptides at <1% FDR (PeaksDB).

# de novo (PeaksDN) stripped peptides, one headerless text file per injection
peaks50_406a = pd.read_csv(
    "processed/PeaksDN/406_CV_T24_GD_DN/406A_CV_T24_GD_DN50_stripped_peptides.txt",
    header=None,
)
peaks50_406b = pd.read_csv(
    "processed/PeaksDN/406_CV_T24_GD_DN/406B_CV_T24_GD_DN50_stripped_peptides.txt",
    header=None,
)

# database-search (PeaksDB) stripped peptides, one headerless text file per injection
peaksdb_406a = pd.read_csv(
    "processed/PeaksDB/406_CV_T24_GD_PDB/406A_CV_T24_GD_PDB_stripped_peptides.txt",
    header=None,
)
peaksdb_406b = pd.read_csv(
    "processed/PeaksDB/406_CV_T24_GD_PDB/406B_CV_T24_GD_PDB_stripped_peptides.txt",
    header=None,
)

frames = [
    peaks50_406a,
    peaks50_406b,
    peaksdb_406a,
    peaksdb_406b,
]

# stack everything row-wise, then derive the non-redundant view
tot_406 = pd.concat(frames)
tot_406_nr = tot_406.drop_duplicates()

# row counts before and after deduplication
print('total 406 peptides, redundant', tot_406.shape[0])
print('total 406 peptides, nonredundant', tot_406_nr.shape[0])

# NOTE(review): the redundant table is what gets written; tot_406_nr is only
# used for the count in this cell -- confirm that this is intended.
tot_406.to_csv(
    "processed/stripped_peptides/406_CV_T24_GD_stripped_peptides.txt",
    index=False,
    header=False,
)

tot_406.head()

total 406 peptides, redundant 649
total 406 peptides, nonredundant 441


Unnamed: 0,0
0,ECK
1,SCK
2,LLTADEK
3,SPATLNSR
4,LSSPATLNSR
