In [1]:
# LIBRARIES
#import pandas library for working with tabular data
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde
#import regular expresson (regex)
import re
#check pandas version
pd.__version__

'1.0.5'

## PROTEUS2 secondary structure prediction of Day 5 DB 324 trypsin digested proteins, cytoplasmtic and membrane (respectively).

### Normally create the list of _T. weiss_ protein IDs wanted to pull out of the larger TW database using the Galaxy `Filter sequences by ID from a tabular file` tool. The list I get by searching the UniPept LCA peptides in the IL-adjusted database. 

### Still, since there are few proteins here I can just do it manually, pulling out the desired protein sequences into a new FASTA to submit to Proteus2.

### This worked! Proteus2 accepted the submission. Now the results come in an email from Proteus2 in a garbage format:

### The output of Proteus2 comes in an email in nominally FASTA format, but with extra line breaks and spacings within protein and prediction sequences. Also, the name gets cut off.

#### They look like this:

#### >Thalas

MMKLAALAAL MGSAAAFAPA QTGKASTQLR AFEDELGAQP PLGFFDPFGM 

CCHHHHHHHH HHHHHCCCCC CCCCCCCCCC CCCCCCCCCC CCCCCCCCCC 

LSGDCTQERF DRLRYVEIKH GRICMLAFLG QIVTRAGIHL PGSINYAGDS 

CCCCCCHHHH HHHHHHHHHH HHHHHHHHHH HHHHHHHCCC CCCCCCCCCC 

FDSFPNGVAA LFGPNSIPTA GLVQIIAFIG VLECAFMRDV PGTGNEFVGD 

CCCCCCCCCC CCCCCCCCHH HHHHHHHHHH HHHHHHHHCC CCCCCCCCCC 

FRNGYIDFGW DDFDEETKLQ KRAIQSGTIS NMMKLAALAA LMGSAAAFAP 

CCCCCCCCCC CCCCHHHHHH HHHHHHHHHH HHHHHHHHHC CCCCCCCCCC 



### Output means:

- H = Helix
- E = Beta Strand
- C = Coil
- T = Membrane helix
- B = Membrane strand
- S = Signal peptide
- c = Cleavage site

In [2]:
cd /home/millieginty/Documents/git-repos/rot-mayer/analyses/proteus2/Proteins-Proteus2/

/home/millieginty/Documents/git-repos/rot-mayer/analyses/proteus2/Proteins-Proteus2


In [3]:
!head T5-324-trypsin-DB-diatom-cytoplasm-proteins

>Thalas

RRGNFTEGKR PKTTMRFALP QALLLTLLRS PNTSAFSLRK VSASAFVARA 
CCCCCCCCCC CCCCEEHHHH HHHCCCCCCC CCCCEEEEEE ECCCHHHCCC 

TAAAPFTSSS ASSSPAFSPS SSNSVFSSSS RPSSSLNMAE KERPFTTWTF 
CCCCCCCCCC CCCCCCCCCC CCCCEECCCC CCCCCCCCCC CCCCCCEECC 

DKHCETMDWT PEPTASLSAV DASASAVAEL QDADLVVVGV FAPVKDDDEE 
CCCCCCCCCC CCCCCEEEEE CCCCCCCCCC CECEEEEEEE EECCCCCCCC 


In [4]:
# remove empty lines
# remove empty spaces from lines
# remove carat protein sequence names
# collapse the 2 lines from every protein sequence line (protein seq, secondard stucture pred.) into one line

!sed '/^[[:space:]]*$/d' T5-324-trypsin-DB-diatom-cytoplasm-proteins | cat \
| sed '/>/d' \
| tr -d "[:blank:]" > T5-324-trypsin-DB-diatom-cytoplasm-proteins.txt

!awk '{printf "%s%s",$0,(NR%2?FS:RS)}' T5-324-trypsin-DB-diatom-cytoplasm-proteins.txt > \
T5-324-trypsin-DB-diatom-cytoplasm-proteins_sort.csv

In [5]:
!head T5-324-trypsin-DB-diatom-cytoplasm-proteins_sort.csv

RRGNFTEGKRPKTTMRFALPQALLLTLLRSPNTSAFSLRKVSASAFVARA CCCCCCCCCCCCCCEEHHHHHHHCCCCCCCCCCCEEEEEEECCCHHHCCC
TAAAPFTSSSASSSPAFSPSSSNSVFSSSSRPSSSLNMAEKERPFTTWTF CCCCCCCCCCCCCCCCCCCCCCCCEECCCCCCCCCCCCCCCCCCCCEECC
DKHCETMDWTPEPTASLSAVDASASAVAELQDADLVVVGVFAPVKDDDEE CCCCCCCCCCCCCCCEEEEECCCCCCCCCCCECEEEEEEEEECCCCCCCC
DEDAKAEEKEVDPLEFVGKAKELDEALGGALTDLAAENGKEFRNGGEAGS CCCCCECCCCCCCEECCHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCE
ATPVMRVLEGGKAKRYLLLGLGPQPKDDKPLESTTLMKAAAALATACHDQ EEEEEECCCCCCCEEEEEEEECCCCCCCCHHHHHHHHHHHHHHHHHHHHH
KKVASCNLLLPTALASTPSNVQDLSTAFYSNLYSDNRYRTGKKVVKMAES HHCCCCEHCCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCEEECC
LDAVKLFMESGTASSDDLAASLTTGKQLAKGVSLTKDLVNAPHNVLNSMS CCCCEEEECCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCHHHH
LADTAKRLAEQSGGSLTCTLLDKKDCEERGMGAYLGVARGSETEPQFLHL HHHHHHHHHHHCCEEEEEEEECCHHHHHHCHHHHHHHHHHCCCCEEEEEE
TYKPSEGDLKKKVGVLGKGLLFDTGGYNLKTSMMELMKFDCGGAAAVLGA EEEECCCCCCEEEEEEEEEEEEEECCCCECECCHHHHHCCHHHHHHHHHH
ARAVGDMQPEGVEAHFLVAACENMLSGRAVVPSDVLTASNGKTLEVLNTD HHHHHHHECCCEEEEEEEEEEE

In [6]:
# read into pandas with space delimeter

PeaksDB_324_cyt_prot = pd.read_csv("T5-324-trypsin-DB-diatom-cytoplasm-proteins_sort.csv", delim_whitespace=True, header=None)


# name columns

PeaksDB_324_cyt_prot.columns =['Stripped protein sequence', 'Secondary structure pred.'] 

In [7]:
PeaksDB_324_cyt_prot.head(13)

Unnamed: 0,Stripped protein sequence,Secondary structure pred.
0,RRGNFTEGKRPKTTMRFALPQALLLTLLRSPNTSAFSLRKVSASAF...,CCCCCCCCCCCCCCEEHHHHHHHCCCCCCCCCCCEEEEEEECCCHH...
1,TAAAPFTSSSASSSPAFSPSSSNSVFSSSSRPSSSLNMAEKERPFT...,CCCCCCCCCCCCCCCCCCCCCCCCEECCCCCCCCCCCCCCCCCCCC...
2,DKHCETMDWTPEPTASLSAVDASASAVAELQDADLVVVGVFAPVKD...,CCCCCCCCCCCCCCCEEEEECCCCCCCCCCCECEEEEEEEEECCCC...
3,DEDAKAEEKEVDPLEFVGKAKELDEALGGALTDLAAENGKEFRNGG...,CCCCCECCCCCCCEECCHHHHHHHHHHHHHHHHHHHHHCCCCCCCC...
4,ATPVMRVLEGGKAKRYLLLGLGPQPKDDKPLESTTLMKAAAALATA...,EEEEEECCCCCCCEEEEEEEECCCCCCCCHHHHHHHHHHHHHHHHH...
5,KKVASCNLLLPTALASTPSNVQDLSTAFYSNLYSDNRYRTGKKVVK...,HHCCCCEHCCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCE...
6,LDAVKLFMESGTASSDDLAASLTTGKQLAKGVSLTKDLVNAPHNVL...,CCCCEEEECCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCC...
7,LADTAKRLAEQSGGSLTCTLLDKKDCEERGMGAYLGVARGSETEPQ...,HHHHHHHHHHHCCEEEEEEEECCHHHHHHCHHHHHHHHHHCCCCEE...
8,TYKPSEGDLKKKVGVLGKGLLFDTGGYNLKTSMMELMKFDCGGAAA...,EEEECCCCCCEEEEEEEEEEEEEECCCCECECCHHHHHCCHHHHHH...
9,ARAVGDMQPEGVEAHFLVAACENMLSGRAVVPSDVLTASNGKTLEV...,HHHHHHHECCCEEEEEEEEEEEEEECCCCCCCCEEEECCCCEEEEE...


In [8]:
# add a column with the stripped peptide length (number of AAs)
PeaksDB_324_cyt_prot['Sequence length'] = PeaksDB_324_cyt_prot['Stripped protein sequence'].apply(len)

# use a count function to enumerate the # of C's (coil residues) for each peptide
PeaksDB_324_cyt_prot['C'] = PeaksDB_324_cyt_prot['Secondary structure pred.'].str.count("C")

# use a count function to enumerate the # of H's (helices residues) in each peptide
PeaksDB_324_cyt_prot['H'] = PeaksDB_324_cyt_prot['Secondary structure pred.'].str.count("H")

# use a count function to enumerate the # of E's (beta strand residues) in each peptide
PeaksDB_324_cyt_prot['E'] = PeaksDB_324_cyt_prot['Secondary structure pred.'].str.count("E")

#use a count function to enumerate the # of T's (membrane helix residues) in each peptide
PeaksDB_324_cyt_prot['T'] = PeaksDB_324_cyt_prot['Secondary structure pred.'].str.count("T")

# use a count function to enumerate the # of B's (membrane strand residues) in each peptide
PeaksDB_324_cyt_prot['B'] = PeaksDB_324_cyt_prot['Secondary structure pred.'].str.count("B")

# use a count function to enumerate the # of B's (signal peptide residues) in each peptide
PeaksDB_324_cyt_prot['S'] = PeaksDB_324_cyt_prot['Secondary structure pred.'].str.count("S")

# use a count function to enumerate the # of c's (cleavage site residues) in each peptide
PeaksDB_324_cyt_prot['c'] = PeaksDB_324_cyt_prot['Secondary structure pred.'].str.count("c")

# add a column with the % C
PeaksDB_324_cyt_prot['% C'] = PeaksDB_324_cyt_prot['C'] / PeaksDB_324_cyt_prot['Sequence length']

# add a column with the % H
PeaksDB_324_cyt_prot['% H'] = PeaksDB_324_cyt_prot['H'] / PeaksDB_324_cyt_prot['Sequence length']

# add a column with the % E
PeaksDB_324_cyt_prot['% E'] = PeaksDB_324_cyt_prot['E'] / PeaksDB_324_cyt_prot['Sequence length']

# add a column with the % T
PeaksDB_324_cyt_prot['% T'] = PeaksDB_324_cyt_prot['T'] / PeaksDB_324_cyt_prot['Sequence length']

# add a column with the % B
PeaksDB_324_cyt_prot['% B'] = PeaksDB_324_cyt_prot['B'] / PeaksDB_324_cyt_prot['Sequence length']

# add a column with the % S
PeaksDB_324_cyt_prot['% S'] = PeaksDB_324_cyt_prot['S'] / PeaksDB_324_cyt_prot['Sequence length']

# add a column with the % c
PeaksDB_324_cyt_prot['% c'] = PeaksDB_324_cyt_prot['c'] / PeaksDB_324_cyt_prot['Sequence length']

# additive check

PeaksDB_324_cyt_prot['% check'] = PeaksDB_324_cyt_prot['% C'] + PeaksDB_324_cyt_prot['% H'] + PeaksDB_324_cyt_prot['% E'] \
                                + PeaksDB_324_cyt_prot['% T'] + PeaksDB_324_cyt_prot['% B'] + \
                                PeaksDB_324_cyt_prot['% S'] + PeaksDB_324_cyt_prot['% c']

In [9]:
PeaksDB_324_cyt_prot.head()

Unnamed: 0,Stripped protein sequence,Secondary structure pred.,Sequence length,C,H,E,T,B,S,c,% C,% H,% E,% T,% B,% S,% c,% check
0,RRGNFTEGKRPKTTMRFALPQALLLTLLRSPNTSAFSLRKVSASAF...,CCCCCCCCCCCCCCEEHHHHHHHCCCCCCCCCCCEEEEEEECCCHH...,50,31,10,9,0,0,0,0,0.62,0.2,0.18,0.0,0.0,0.0,0.0,1.0
1,TAAAPFTSSSASSSPAFSPSSSNSVFSSSSRPSSSLNMAEKERPFT...,CCCCCCCCCCCCCCCCCCCCCCCCEECCCCCCCCCCCCCCCCCCCC...,50,46,0,4,0,0,0,0,0.92,0.0,0.08,0.0,0.0,0.0,0.0,1.0
2,DKHCETMDWTPEPTASLSAVDASASAVAELQDADLVVVGVFAPVKD...,CCCCCCCCCCCCCCCEEEEECCCCCCCCCCCECEEEEEEEEECCCC...,50,35,0,15,0,0,0,0,0.7,0.0,0.3,0.0,0.0,0.0,0.0,1.0
3,DEDAKAEEKEVDPLEFVGKAKELDEALGGALTDLAAENGKEFRNGG...,CCCCCECCCCCCCEECCHHHHHHHHHHHHHHHHHHHHHCCCCCCCC...,50,25,21,4,0,0,0,0,0.5,0.42,0.08,0.0,0.0,0.0,0.0,1.0
4,ATPVMRVLEGGKAKRYLLLGLGPQPKDDKPLESTTLMKAAAALATA...,EEEEEECCCCCCCEEEEEEEECCCCCCCCHHHHHHHHHHHHHHHHH...,50,15,21,14,0,0,0,0,0.3,0.42,0.28,0.0,0.0,0.0,0.0,1.0


In [10]:
index = ['324 total']

data = {
        '% C total': PeaksDB_324_cyt_prot['% C'].sum(),
        '% H total': PeaksDB_324_cyt_prot['% H'].sum(),
        '% E total': PeaksDB_324_cyt_prot['% E'].sum(),
        '% T total': PeaksDB_324_cyt_prot['% T'].sum(),
        '% B total': PeaksDB_324_cyt_prot['% B'].sum(),
        '% S total': PeaksDB_324_cyt_prot['% S'].sum(),
        '% c total': PeaksDB_324_cyt_prot['% c'].sum(),
        '% check sum': PeaksDB_324_cyt_prot['% check'].sum()
       }

PeaksDB_324_cyt_prot_totals = pd.DataFrame(data, columns=['% C total', '% H total', '% E total', '% T total', \
                                                      '% B total',  '% S total', '% c total', \
                                                      '% check sum'], index=index)

PeaksDB_324_cyt_prot_totals['overall % sum'] = PeaksDB_324_cyt_prot_totals['% C total'] \
                                            + PeaksDB_324_cyt_prot_totals['% H total'] \
                                            + PeaksDB_324_cyt_prot_totals['% E total'] \
                                            + PeaksDB_324_cyt_prot_totals['% T total'] \
                                            + PeaksDB_324_cyt_prot_totals['% B total'] \
                                            + PeaksDB_324_cyt_prot_totals['% S total'] \
                                            + PeaksDB_324_cyt_prot_totals['% c total'] 


PeaksDB_324_cyt_prot_totals['overall % C'] = PeaksDB_324_cyt_prot_totals['% C total'] / PeaksDB_324_cyt_prot_totals['overall % sum']

PeaksDB_324_cyt_prot_totals['overall % H'] = PeaksDB_324_cyt_prot_totals['% H total'] / PeaksDB_324_cyt_prot_totals['overall % sum']

PeaksDB_324_cyt_prot_totals['overall % E'] = PeaksDB_324_cyt_prot_totals['% E total'] / PeaksDB_324_cyt_prot_totals['overall % sum']

PeaksDB_324_cyt_prot_totals['overall % T'] = PeaksDB_324_cyt_prot_totals['% T total'] / PeaksDB_324_cyt_prot_totals['overall % sum']

PeaksDB_324_cyt_prot_totals['overall % B'] = PeaksDB_324_cyt_prot_totals['% B total'] / PeaksDB_324_cyt_prot_totals['overall % sum']

PeaksDB_324_cyt_prot_totals['overall % S'] = PeaksDB_324_cyt_prot_totals['% S total'] / PeaksDB_324_cyt_prot_totals['overall % sum']

PeaksDB_324_cyt_prot_totals['overall % c'] = PeaksDB_324_cyt_prot_totals['% c total'] / PeaksDB_324_cyt_prot_totals['overall % sum']

# write to csv

PeaksDB_324_cyt_prot_totals.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/analyses/proteus2/Proteins-Proteus2/Day0_324 PeaksDB_dia_cyt_prot_trypsin_totals")

PeaksDB_324_cyt_prot_totals.head()

Unnamed: 0,% C total,% H total,% E total,% T total,% B total,% S total,% c total,% check sum,overall % sum,overall % C,overall % H,overall % E,overall % T,overall % B,overall % S,overall % c
324 total,25.469085,20.885082,13.645834,0.0,0.0,0.0,0.0,60.0,60.0,0.424485,0.348085,0.227431,0.0,0.0,0.0,0.0


# Now for the membrane proteins

In [12]:
!head T5-324-trypsin-DB-diatom-membrane-proteins

>Thalas

MMKLAALAAL MGSAAAFAPA QTGKAFTQLR AFEDELGAQP PLGFFDPFGM 
CCHHHHHHHH HCCCCCCCCC CCCCCCCCCC CCCCCCCCCC CCCCCCCCCC 

LSGDCTQERF DRLRYVELKH GRLCMLAFLG QLVTRAGLHL PGSLNYAGDS 
CCCCCCHHHH HHHHHHHHHH HHHHHHHHHH HHHHHHHCCC CCCCCCCCCC 

FDSFPNGVAA LFGPNSLPTA GLVQLLAFLG VLECAFMRDV PGTGNEFVGD 
HHHHHCCCCC CCCCCCCHHH HHHHHHHHHH HHHHHHHHHH CCCCCCCCCC 


In [13]:
# remove empty lines
# remove empty spaces from lines
# remove carat protein sequence names
# collapse the 2 lines from every protein sequence line (protein seq, secondard stucture pred.) into one line

!sed '/^[[:space:]]*$/d' T5-324-trypsin-DB-diatom-membrane-proteins | cat \
| sed '/>/d' \
| tr -d "[:blank:]" > T5-324-trypsin-DB-diatom-membrane-proteins.txt

!awk '{printf "%s%s",$0,(NR%2?FS:RS)}' T5-324-trypsin-DB-diatom-membrane-proteins.txt > \
T5-324-trypsin-DB-diatom-membrane-proteins_sort.csv

In [14]:
!head T5-324-trypsin-DB-diatom-membrane-proteins_sort.csv

MMKLAALAALMGSAAAFAPAQTGKAFTQLRAFEDELGAQPPLGFFDPFGM CCHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
LSGDCTQERFDRLRYVELKHGRLCMLAFLGQLVTRAGLHLPGSLNYAGDS CCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCC
FDSFPNGVAALFGPNSLPTAGLVQLLAFLGVLECAFMRDVPGTGNEFVGD HHHHHCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCC
FRNGYLDFGWDDFNEETKLQKRALELNNGRRNGYLDFGWDDFDEETKLQK CCCCCCCCCCCCCCCHHHHHHHCCCCCCCCCCCCCCCCCCCCCHHHHHHH
RALELNNGRAAMMGLLGLMVHEELLPLGYDPDLPLLGHLQ HHHHHHHCHHHHHHHHHHHHHHHHHCCCCCCHHHHHHHHC
SKMKLAVLAALFGSAAAFAPAQTGKTTTALNAFESELGAQPPLGFFDPLG CCCHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
LLDDADQERFDRLRYVELKHGRLAQLAFLGNLLTRAGVHLPGNLDYAGDS CCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCC
FDSFPNGWAALSGPDALPGAGFAQLVAFLGALELGVMKDVTGEAEFVGDF CCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCC
RNGALDFGWDSFDEETKLSKRALELNNGRAAMMGLLGLMVHEQLGGELPL CCCCCCCCCCCCCHHHHHHHHHHHHHHCHHHHHHHHHHHHHHHHCCCCCE
VGPM ECCC


In [15]:
# read into pandas with space delimeter

PeaksDB_324_mem_prot = pd.read_csv("T5-324-trypsin-DB-diatom-membrane-proteins_sort.csv", delim_whitespace=True, header=None)


# name columns

PeaksDB_324_mem_prot.columns =['Stripped protein sequence', 'Secondary structure pred.'] 

In [16]:
PeaksDB_324_mem_prot.head(13)

Unnamed: 0,Stripped protein sequence,Secondary structure pred.
0,MMKLAALAALMGSAAAFAPAQTGKAFTQLRAFEDELGAQPPLGFFD...,CCHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
1,LSGDCTQERFDRLRYVELKHGRLCMLAFLGQLVTRAGLHLPGSLNY...,CCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCC...
2,FDSFPNGVAALFGPNSLPTAGLVQLLAFLGVLECAFMRDVPGTGNE...,HHHHHCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCCC...
3,FRNGYLDFGWDDFNEETKLQKRALELNNGRRNGYLDFGWDDFDEET...,CCCCCCCCCCCCCCCHHHHHHHCCCCCCCCCCCCCCCCCCCCCHHH...
4,RALELNNGRAAMMGLLGLMVHEELLPLGYDPDLPLLGHLQ,HHHHHHHCHHHHHHHHHHHHHHHHHCCCCCCHHHHHHHHC
5,SKMKLAVLAALFGSAAAFAPAQTGKTTTALNAFESELGAQPPLGFF...,CCCHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
6,LLDDADQERFDRLRYVELKHGRLAQLAFLGNLLTRAGVHLPGNLDY...,CCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCC...
7,FDSFPNGWAALSGPDALPGAGFAQLVAFLGALELGVMKDVTGEAEF...,CCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCC...
8,RNGALDFGWDSFDEETKLSKRALELNNGRAAMMGLLGLMVHEQLGG...,CCCCCCCCCCCCCHHHHHHHHHHHHHHCHHHHHHHHHHHHHHHHCC...
9,VGPM,ECCC


In [17]:
# add a column with the stripped peptide length (number of AAs)
PeaksDB_324_mem_prot['Sequence length'] = PeaksDB_324_mem_prot['Stripped protein sequence'].apply(len)

# use a count function to enumerate the # of C's (coil residues) for each peptide
PeaksDB_324_mem_prot['C'] = PeaksDB_324_mem_prot['Secondary structure pred.'].str.count("C")

# use a count function to enumerate the # of H's (helices residues) in each peptide
PeaksDB_324_mem_prot['H'] = PeaksDB_324_mem_prot['Secondary structure pred.'].str.count("H")

# use a count function to enumerate the # of E's (beta strand residues) in each peptide
PeaksDB_324_mem_prot['E'] = PeaksDB_324_mem_prot['Secondary structure pred.'].str.count("E")

#use a count function to enumerate the # of T's (membrane helix residues) in each peptide
PeaksDB_324_mem_prot['T'] = PeaksDB_324_mem_prot['Secondary structure pred.'].str.count("T")

# use a count function to enumerate the # of B's (membrane strand residues) in each peptide
PeaksDB_324_mem_prot['B'] = PeaksDB_324_mem_prot['Secondary structure pred.'].str.count("B")

# use a count function to enumerate the # of B's (signal peptide residues) in each peptide
PeaksDB_324_mem_prot['S'] = PeaksDB_324_mem_prot['Secondary structure pred.'].str.count("S")

# use a count function to enumerate the # of c's (cleavage site residues) in each peptide
PeaksDB_324_mem_prot['c'] = PeaksDB_324_mem_prot['Secondary structure pred.'].str.count("c")

# add a column with the % C
PeaksDB_324_mem_prot['% C'] = PeaksDB_324_mem_prot['C'] / PeaksDB_324_mem_prot['Sequence length']

# add a column with the % H
PeaksDB_324_mem_prot['% H'] = PeaksDB_324_mem_prot['H'] / PeaksDB_324_mem_prot['Sequence length']

# add a column with the % E
PeaksDB_324_mem_prot['% E'] = PeaksDB_324_mem_prot['E'] / PeaksDB_324_mem_prot['Sequence length']

# add a column with the % T
PeaksDB_324_mem_prot['% T'] = PeaksDB_324_mem_prot['T'] / PeaksDB_324_mem_prot['Sequence length']

# add a column with the % B
PeaksDB_324_mem_prot['% B'] = PeaksDB_324_mem_prot['B'] / PeaksDB_324_mem_prot['Sequence length']

# add a column with the % S
PeaksDB_324_mem_prot['% S'] = PeaksDB_324_mem_prot['S'] / PeaksDB_324_mem_prot['Sequence length']

# add a column with the % c
PeaksDB_324_mem_prot['% c'] = PeaksDB_324_mem_prot['c'] / PeaksDB_324_mem_prot['Sequence length']

# additive check

PeaksDB_324_mem_prot['% check'] = PeaksDB_324_mem_prot['% C'] + PeaksDB_324_mem_prot['% H'] + PeaksDB_324_mem_prot['% E'] \
                                + PeaksDB_324_mem_prot['% T'] + PeaksDB_324_mem_prot['% B'] + \
                                PeaksDB_324_mem_prot['% S'] + PeaksDB_324_mem_prot['% c']

In [18]:
PeaksDB_324_mem_prot.head()

Unnamed: 0,Stripped protein sequence,Secondary structure pred.,Sequence length,C,H,E,T,B,S,c,% C,% H,% E,% T,% B,% S,% c,% check
0,MMKLAALAALMGSAAAFAPAQTGKAFTQLRAFEDELGAQPPLGFFD...,CCHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,50,41,9,0,0,0,0,0,0.82,0.18,0.0,0.0,0.0,0.0,0.0,1.0
1,LSGDCTQERFDRLRYVELKHGRLCMLAFLGQLVTRAGLHLPGSLNY...,CCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCC...,50,19,31,0,0,0,0,0,0.38,0.62,0.0,0.0,0.0,0.0,0.0,1.0
2,FDSFPNGVAALFGPNSLPTAGLVQLLAFLGVLECAFMRDVPGTGNE...,HHHHHCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCCC...,50,22,28,0,0,0,0,0,0.44,0.56,0.0,0.0,0.0,0.0,0.0,1.0
3,FRNGYLDFGWDDFNEETKLQKRALELNNGRRNGYLDFGWDDFDEET...,CCCCCCCCCCCCCCCHHHHHHHCCCCCCCCCCCCCCCCCCCCCHHH...,50,36,14,0,0,0,0,0,0.72,0.28,0.0,0.0,0.0,0.0,0.0,1.0
4,RALELNNGRAAMMGLLGLMVHEELLPLGYDPDLPLLGHLQ,HHHHHHHCHHHHHHHHHHHHHHHHHCCCCCCHHHHHHHHC,40,8,32,0,0,0,0,0,0.2,0.8,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
index = ['324 total']

data = {
        '% C total': PeaksDB_324_mem_prot['% C'].sum(),
        '% H total': PeaksDB_324_mem_prot['% H'].sum(),
        '% E total': PeaksDB_324_mem_prot['% E'].sum(),
        '% T total': PeaksDB_324_mem_prot['% T'].sum(),
        '% B total': PeaksDB_324_mem_prot['% B'].sum(),
        '% S total': PeaksDB_324_mem_prot['% S'].sum(),
        '% c total': PeaksDB_324_mem_prot['% c'].sum(),
        '% check sum': PeaksDB_324_mem_prot['% check'].sum()
       }

PeaksDB_324_mem_prot_totals = pd.DataFrame(data, columns=['% C total', '% H total', '% E total', '% T total', \
                                                      '% B total',  '% S total', '% c total', \
                                                      '% check sum'], index=index)

PeaksDB_324_mem_prot_totals['overall % sum'] = PeaksDB_324_mem_prot_totals['% C total'] \
                                            + PeaksDB_324_mem_prot_totals['% H total'] \
                                            + PeaksDB_324_mem_prot_totals['% E total'] \
                                            + PeaksDB_324_mem_prot_totals['% T total'] \
                                            + PeaksDB_324_mem_prot_totals['% B total'] \
                                            + PeaksDB_324_mem_prot_totals['% S total'] \
                                            + PeaksDB_324_mem_prot_totals['% c total'] 


PeaksDB_324_mem_prot_totals['overall % C'] = PeaksDB_324_mem_prot_totals['% C total'] / PeaksDB_324_mem_prot_totals['overall % sum']

PeaksDB_324_mem_prot_totals['overall % H'] = PeaksDB_324_mem_prot_totals['% H total'] / PeaksDB_324_mem_prot_totals['overall % sum']

PeaksDB_324_mem_prot_totals['overall % E'] = PeaksDB_324_mem_prot_totals['% E total'] / PeaksDB_324_mem_prot_totals['overall % sum']

PeaksDB_324_mem_prot_totals['overall % T'] = PeaksDB_324_mem_prot_totals['% T total'] / PeaksDB_324_mem_prot_totals['overall % sum']

PeaksDB_324_mem_prot_totals['overall % B'] = PeaksDB_324_mem_prot_totals['% B total'] / PeaksDB_324_mem_prot_totals['overall % sum']

PeaksDB_324_mem_prot_totals['overall % S'] = PeaksDB_324_mem_prot_totals['% S total'] / PeaksDB_324_mem_prot_totals['overall % sum']

PeaksDB_324_mem_prot_totals['overall % c'] = PeaksDB_324_mem_prot_totals['% c total'] / PeaksDB_324_mem_prot_totals['overall % sum']

# write to csv

PeaksDB_324_mem_prot_totals.to_csv("/home/millieginty/Documents/git-repos/rot-mayer/analyses/proteus2/Proteins-Proteus2/Day0_324 PeaksDB_dia_mem_prot_trypsin_totals")

PeaksDB_324_mem_prot_totals.head()

Unnamed: 0,% C total,% H total,% E total,% T total,% B total,% S total,% c total,% check sum,overall % sum,overall % C,overall % H,overall % E,overall % T,overall % B,overall % S,overall % c
324 total,72.906111,39.827542,17.271244,23.995103,0.0,0.0,0.0,154.0,154.0,0.473416,0.25862,0.112151,0.155812,0.0,0.0,0.0
