In [16]:
import pandas as pd
import numpy as np
import pyopenms as oms
from urllib.request import urlretrieve
from Bio import SeqIO

In [None]:
### fasta to dataframe
def fasta_to_dataframe(fasta):
    """Read a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    fasta : str or handle
        Passed unchanged to ``SeqIO.parse(fasta, "fasta")``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Name``, ``Description`` and ``seq``; ``seq`` holds
        the record's sequence converted to a plain ``str``.
    """
    rows = [
        [record.id, record.name, record.description, str(record.seq)]
        for record in SeqIO.parse(fasta, "fasta")
    ]
    return pd.DataFrame(rows, columns=["ID", "Name", "Description", "seq"])

### Shared in-silico digestion used by the enzyme-specific helpers below
def _digest_protein(sequence, enzyme):
    """Digest one protein sequence with the named enzyme.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence; parsed with ``oms.AASequence.fromString``.
    enzyme : str
        Enzyme name handed to ``oms.ProteaseDigestion.setEnzyme``.

    Returns
    -------
    list
        The peptides that ``ProteaseDigestion.digest`` appended, in the
        order it produced them.
    """
    digestion = oms.ProteaseDigestion()
    digestion.setEnzyme(enzyme)
    peptides = []  # digest() fills this list in place
    digestion.digest(oms.AASequence.fromString(sequence), peptides)
    return peptides


### GET Trypsin PEPTIDES (the caller takes len() to obtain the count)
def get_trypsin_peptide_count(bsa):
    """Return the list of tryptic peptides of the sequence string ``bsa``.

    Despite the name, the peptide list itself is returned, not its length.
    The enzyme is set explicitly instead of relying on the library default.
    """
    return _digest_protein(bsa, "Trypsin")


### GET Lys-C PEPTIDES (the caller takes len() to obtain the count)
def get_lysc_peptide_count(bsa):
    """Return the list of Lys-C peptides of the sequence string ``bsa``.

    Despite the name, the peptide list itself is returned, not its length.
    """
    return _digest_protein(bsa, "Lys-C")

In [None]:
# Directory holding the input FASTA. Keep the trailing slash: the load cell
# builds the path by plain string concatenation (DIR + fasta1).
DIR = "/path/to/data/"
# File name of the FASTA that is read from DIR.
fasta1 = "uniprot_human_reference_isoforms_20180717.fasta"

In [None]:
# Parse the FASTA at DIR + fasta1 into a DataFrame with the columns
# ID, Name, Description and seq (one row per FASTA record).
df_seq = fasta_to_dataframe(DIR + fasta1)
# Bare last expression so the notebook renders the frame.
df_seq

Unnamed: 0,ID,Name,Description,seq
0,sp|Q16827|PTPRO_HUMAN,sp|Q16827|PTPRO_HUMAN,sp|Q16827|PTPRO_HUMAN Receptor-type tyrosine-p...,MGHLPTGIHGARRLLPLLWLFVLFKNATAFHVTVQDDNNIVVSLEA...
1,sp|Q16827-2|PTPRO_HUMAN,sp|Q16827-2|PTPRO_HUMAN,sp|Q16827-2|PTPRO_HUMAN Isoform 2 of Receptor-...,MGHLPTGIHGARRLLPLLWLFVLFKNATAFHVTVQDDNNIVVSLEA...
2,sp|Q16827-3|PTPRO_HUMAN,sp|Q16827-3|PTPRO_HUMAN,sp|Q16827-3|PTPRO_HUMAN Isoform 3 of Receptor-...,MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILRKKHLQMARECGA...
3,sp|Q16827-4|PTPRO_HUMAN,sp|Q16827-4|PTPRO_HUMAN,sp|Q16827-4|PTPRO_HUMAN Isoform 4 of Receptor-...,MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILRKKHLQMARECGA...
4,sp|Q16827-5|PTPRO_HUMAN,sp|Q16827-5|PTPRO_HUMAN,sp|Q16827-5|PTPRO_HUMAN Isoform 5 of Receptor-...,MGHLPTGIHGARRLLPLLWLFVLFKNATAFHVTVQDDNNIVVSLEA...
...,...,...,...,...
95052,tr|E3W988|E3W988_HUMAN,tr|E3W988|E3W988_HUMAN,tr|E3W988|E3W988_HUMAN Disintegrin and metallo...,MFRLWLLLAGLCGLLASRPGFQNSLLQIVIPEKIQTNTNDSSEIEY...
95053,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN TRIM15 OS=Homo ...,MPATPSLKVVHELPACTLCAGPLEDAVTIPCGHTFCRLCLPALSQM...
95054,tr|G3XAP6|G3XAP6_HUMAN,tr|G3XAP6|G3XAP6_HUMAN,tr|G3XAP6|G3XAP6_HUMAN Cartilage oligomeric ma...,MLRELQETNAALQDVRELLRQQVREITFLKNTVMECDACGMQQSVR...
95055,tr|Q9H3A6|Q9H3A6_HUMAN,tr|Q9H3A6|Q9H3A6_HUMAN,tr|Q9H3A6|Q9H3A6_HUMAN PRO2179 OS=Homo sapiens...,MALWETGGGDCGRGQGAEDFHYGLILQSLWLQSDPKYHWEQQCEGL...


In [None]:
# Trypsin: store the peptide list for every sequence, then its length.
df_seq["Trypsin"] = df_seq["seq"].apply(get_trypsin_peptide_count)
df_seq["Trypsin count"] = df_seq["Trypsin"].apply(len)

# Lys-C: same two steps with the Lys-C digestion helper.
df_seq["Lys-C"] = df_seq["seq"].apply(get_lysc_peptide_count)
df_seq["Lys-C count"] = df_seq["Lys-C"].apply(len)

# Display the frame with the four new columns.
df_seq

Unnamed: 0,ID,Name,Description,seq,Trypsin,Trypsin count,Lys-C,Lys-C count
0,sp|Q16827|PTPRO_HUMAN,sp|Q16827|PTPRO_HUMAN,sp|Q16827|PTPRO_HUMAN Receptor-type tyrosine-p...,MGHLPTGIHGARRLLPLLWLFVLFKNATAFHVTVQDDNNIVVSLEA...,"[MGHLPTGIHGAR, R, LLPLLWLFVLFK, NATAFHVTVQDDNN...",99,"[MGHLPTGIHGARRLLPLLWLFVLFK, NATAFHVTVQDDNNIVVS...",59
1,sp|Q16827-2|PTPRO_HUMAN,sp|Q16827-2|PTPRO_HUMAN,sp|Q16827-2|PTPRO_HUMAN Isoform 2 of Receptor-...,MGHLPTGIHGARRLLPLLWLFVLFKNATAFHVTVQDDNNIVVSLEA...,"[MGHLPTGIHGAR, R, LLPLLWLFVLFK, NATAFHVTVQDDNN...",97,"[MGHLPTGIHGARRLLPLLWLFVLFK, NATAFHVTVQDDNNIVVS...",59
2,sp|Q16827-3|PTPRO_HUMAN,sp|Q16827-3|PTPRO_HUMAN,sp|Q16827-3|PTPRO_HUMAN Isoform 3 of Receptor-...,MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILRKKHLQMARECGA...,"[MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILR, K, K, HLQ...",44,"[MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILRK, K, HLQMA...",23
3,sp|Q16827-4|PTPRO_HUMAN,sp|Q16827-4|PTPRO_HUMAN,sp|Q16827-4|PTPRO_HUMAN Isoform 4 of Receptor-...,MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILRKKHLQMARECGA...,"[MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILR, K, K, HLQ...",42,"[MVTEMNPNVVVISVLAILSTLLIGLLLVTLIILRK, K, HLQMA...",23
4,sp|Q16827-5|PTPRO_HUMAN,sp|Q16827-5|PTPRO_HUMAN,sp|Q16827-5|PTPRO_HUMAN Isoform 5 of Receptor-...,MGHLPTGIHGARRLLPLLWLFVLFKNATAFHVTVQDDNNIVVSLEA...,"[MGHLPTGIHGAR, R, LLPLLWLFVLFK, NATAFHVTVQDDNN...",40,"[MGHLPTGIHGARRLLPLLWLFVLFK, NATAFHVTVQDDNNIVVS...",27
...,...,...,...,...,...,...,...,...
95052,tr|E3W988|E3W988_HUMAN,tr|E3W988|E3W988_HUMAN,tr|E3W988|E3W988_HUMAN Disintegrin and metallo...,MFRLWLLLAGLCGLLASRPGFQNSLLQIVIPEKIQTNTNDSSEIEY...,"[MFR, LWLLLAGLCGLLASRPGFQNSLLQIVIPEK, IQTNTNDS...",9,"[MFRLWLLLAGLCGLLASRPGFQNSLLQIVIPEK, IQTNTNDSSE...",6
95053,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN TRIM15 OS=Homo ...,MPATPSLKVVHELPACTLCAGPLEDAVTIPCGHTFCRLCLPALSQM...,"[MPATPSLK, VVHELPACTLCAGPLEDAVTIPCGHTFCR, LCLP...",53,"[MPATPSLK, VVHELPACTLCAGPLEDAVTIPCGHTFCRLCLPAL...",24
95054,tr|G3XAP6|G3XAP6_HUMAN,tr|G3XAP6|G3XAP6_HUMAN,tr|G3XAP6|G3XAP6_HUMAN Cartilage oligomeric ma...,MLRELQETNAALQDVRELLRQQVREITFLKNTVMECDACGMQQSVR...,"[MLR, ELQETNAALQDVR, ELLR, QQVR, EITFLK, NTVME...",66,"[MLRELQETNAALQDVRELLRQQVREITFLK, NTVMECDACGMQQ...",21
95055,tr|Q9H3A6|Q9H3A6_HUMAN,tr|Q9H3A6|Q9H3A6_HUMAN,tr|Q9H3A6|Q9H3A6_HUMAN PRO2179 OS=Homo sapiens...,MALWETGGGDCGRGQGAEDFHYGLILQSLWLQSDPKYHWEQQCEGL...,"[MALWETGGGDCGR, GQGAEDFHYGLILQSLWLQSDPK, YHWEQ...",19,"[MALWETGGGDCGRGQGAEDFHYGLILQSLWLQSDPK, YHWEQQC...",5


In [23]:
# Two-column view for export: protein ID and number of trypsin peptides.
trypsin_count = df_seq[["ID", "Trypsin count"]]
trypsin_count

Unnamed: 0,ID,Trypsin count
0,sp|Q16827|PTPRO_HUMAN,99
1,sp|Q16827-2|PTPRO_HUMAN,97
2,sp|Q16827-3|PTPRO_HUMAN,44
3,sp|Q16827-4|PTPRO_HUMAN,42
4,sp|Q16827-5|PTPRO_HUMAN,40
...,...,...
95052,tr|E3W988|E3W988_HUMAN,9
95053,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN,53
95054,tr|G3XAP6|G3XAP6_HUMAN,66
95055,tr|Q9H3A6|Q9H3A6_HUMAN,19


In [24]:
# Two-column view for export: protein ID and number of Lys-C peptides.
# NOTE: the variable is spelled `lycs_count` (not `lysc_count`); the name is
# kept because the commented-out export lines refer to it.
lycs_count = df_seq[["ID", "Lys-C count"]]
lycs_count

Unnamed: 0,ID,Lys-C count
0,sp|Q16827|PTPRO_HUMAN,59
1,sp|Q16827-2|PTPRO_HUMAN,59
2,sp|Q16827-3|PTPRO_HUMAN,23
3,sp|Q16827-4|PTPRO_HUMAN,23
4,sp|Q16827-5|PTPRO_HUMAN,27
...,...,...
95052,tr|E3W988|E3W988_HUMAN,6
95053,tr|A0A0G2JJP1|A0A0G2JJP1_HUMAN,24
95054,tr|G3XAP6|G3XAP6_HUMAN,21
95055,tr|Q9H3A6|Q9H3A6_HUMAN,5


In [None]:
# Export the DataFrames as tab-separated text files (sep="\t", despite the .csv extension); uncomment to run.
# df_seq.to_csv(DIR + "uniprot_human_reference_isoforms_20180717_Trypsin_LysC.csv", index=False, sep="\t")
# trypsin_count.to_csv(DIR + "uniprot_human_reference_isoforms_20180717_Trypsin_count.csv", index=False, sep="\t")
# lycs_count.to_csv(DIR + "uniprot_human_reference_isoforms_20180717_LysC_count.csv", index=False, sep="\t")