In [1]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
import pandas as pd
from pprint import pprint
import re
import ujson as json
import os

In [2]:
# Input: FASTA file of protein sequences, parsed with SeqIO.parse further down.
fastaFile = '../Sequences/Protein/sequences.fasta'
# Output folder that receives the ProtParam result table.
workdir = '../assets/ProtParam/'

In [3]:
# Header pattern: "<accession> |<protein> |..." -> groups (accession, protein).
# Written as a raw string so `\s` and `\|` reach the regex engine untouched
# (in a plain string they are invalid escape sequences), and with explicit
# `A-Za-z` ranges: `A-z` also covers the ASCII punctuation between 'Z' and
# 'a' ('[', '\', ']', '^', '_', '`').
pat = re.compile(r"([A-Za-z0-9_]+)\s+\|([A-Za-z0-9_'-]+)[\s\|]+")

# Names looked up on each ProteinAnalysis object; every name becomes one
# column of the result table. Callable attributes are invoked without
# arguments, plain attributes are taken as they are.
attr_lyst = (
    'aromaticity',
    'count_amino_acids',
    'flexibility',
    'get_amino_acids_percent',
    'gravy',
    'instability_index',
    'isoelectric_point',
    'length',
    'molar_extinction_coefficient',
    'molecular_weight',
    'monoisotopic',
    'secondary_structure_fraction')

def protParam(seq: str, description: str):
    """Yield ``(name, value)`` pairs describing one protein record.

    The pairs are, in order: one entry per name in ``attr_lyst`` (read from a
    ``ProteinAnalysis`` built on ``seq``), followed by ``'accession'`` and
    ``'protein'`` extracted from ``description`` with ``pat``. The stream is
    meant to be fed to ``dict(...)`` to form one table row.

    Args:
        seq: Protein sequence passed to ``ProteinAnalysis``.
        description: Record header that ``pat`` is searched in.

    Yields:
        ``(str, object)`` tuples of column name and value.

    Raises:
        ValueError: If ``description`` does not match ``pat``.
    """
    # Parse the header before any analysis so that a malformed description
    # fails early and the error names the offending header, instead of an
    # AttributeError on ``None`` after the work has been done.
    matched = pat.search(description)
    if matched is None:
        raise ValueError(
            'Cannot parse accession/protein from description: %r' % (description,))
    accession, protein = matched.groups()

    analysed_seq = ProteinAnalysis(seq)
    for attr in attr_lyst:
        value = getattr(analysed_seq, attr)
        # attr_lyst mixes methods and plain attributes; only methods are called.
        yield attr, (value() if callable(value) else value)

    yield 'accession', accession
    yield 'protein', protein

In [4]:
# Stream the FASTA records and turn each one into a row of ProtParam values.
seqOb = SeqIO.parse(fastaFile, "fasta")
# NOTE(review): every 'X' in a sequence is replaced by 'A' before analysis --
# presumably 'X' marks an unknown residue; confirm this substitution is intended.
dfrm = pd.DataFrame([
    dict(protParam(str(record.seq).replace('X', 'A'), record.description))
    for record in seqOb
])
dfrm

Unnamed: 0,aromaticity,count_amino_acids,flexibility,get_amino_acids_percent,gravy,instability_index,isoelectric_point,length,molar_extinction_coefficient,molecular_weight,monoisotopic,secondary_structure_fraction,accession,protein
0,0.101930,"{'A': 309, 'C': 138, 'D': 211, 'E': 239, 'F': ...","[1.0167738095238095, 0.9989523809523808, 0.985...","{'A': 0.07014755959137343, 'C': 0.031328036322...",-0.023314,34.924048,6.035828,4405,"(543550, 552175)",489983.4465,False,"(0.33371169125993183, 0.21679909194097619, 0.2...",YP_009725295,orf1a
1,0.186047,"{'A': 2, 'C': 2, 'D': 2, 'E': 3, 'F': 6, 'G': ...","[0.9632500000000002, 0.9761666666666665, 0.998...","{'A': 0.046511627906976744, 'C': 0.04651162790...",1.448837,50.963023,4.167297,43,"(6990, 7115)",5180.2131,False,"(0.5813953488372092, 0.06976744186046512, 0.41...",YP_009725296,ORF7b
2,0.061111,"{'A': 7, 'C': 1, 'D': 9, 'E': 18, 'F': 5, 'G':...","[1.0167738095238095, 0.9989523809523808, 0.985...","{'A': 0.03888888888888889, 'C': 0.005555555555...",-0.378333,28.832222,5.363953,180,"(12950, 12950)",19775.0617,False,"(0.3111111111111111, 0.25555555555555554, 0.27...",YP_009725297,leader
3,0.024096,"{'A': 5, 'C': 3, 'D': 5, 'E': 5, 'F': 1, 'G': ...","[0.9924404761904762, 1.0428214285714286, 0.966...","{'A': 0.060240963855421686, 'C': 0.03614457831...",0.198795,51.968675,5.179260,83,"(5500, 5625)",9239.7338,False,"(0.3373493975903615, 0.1686746987951807, 0.349...",YP_009725303,nsp7
4,0.101156,"{'A': 16, 'C': 5, 'D': 23, 'E': 23, 'F': 21, '...","[1.0004642857142858, 0.9671071428571429, 1.002...","{'A': 0.046242774566473986, 'C': 0.01445086705...",-0.075723,36.282399,5.056580,346,"(32890, 33140)",38812.9124,False,"(0.36127167630057805, 0.21098265895953758, 0.2...",YP_009725310,endoRNAse
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,0.132231,"{'A': 5, 'C': 7, 'D': 7, 'E': 6, 'F': 8, 'G': ...","[0.9480595238095239, 0.9510476190476191, 0.964...","{'A': 0.04132231404958678, 'C': 0.057851239669...",0.219008,45.791074,5.416809,121,"(15930, 16305)",13830.8536,False,"(0.396694214876033, 0.19008264462809918, 0.181...",QHD43422,ORF8
1045,0.098361,"{'A': 1, 'C': 0, 'D': 4, 'E': 5, 'F': 3, 'G': ...","[0.9877738095238093, 0.9425357142857143, 0.995...","{'A': 0.01639344262295082, 'C': 0.0, 'D': 0.06...",0.232787,31.162295,4.677063,61,"(8480, 8480)",7272.4566,False,"(0.4426229508196722, 0.14754098360655737, 0.27...",QHD43420,ORF6
1046,0.069212,"{'A': 37, 'C': 0, 'D': 24, 'E': 12, 'F': 13, '...","[1.044154761904762, 1.0403214285714284, 1.0424...","{'A': 0.0883054892601432, 'C': 0.0, 'D': 0.057...",-0.971360,55.083103,10.069031,419,"(43890, 43890)",45625.1383,False,"(0.18615751789976134, 0.31026252983293556, 0.1...",QHD43423,nucleocapsid
1047,0.123967,"{'A': 9, 'C': 6, 'D': 2, 'E': 8, 'F': 10, 'G':...","[0.9447142857142858, 0.9467857142857143, 0.950...","{'A': 0.0743801652892562, 'C': 0.0495867768595...",0.318182,48.655372,8.230774,121,"(7450, 7825)",13744.0136,False,"(0.3801652892561984, 0.15702479338842976, 0.27...",QHD43421,ORF7a


### Get PDB related records

```py
dfrm[dfrm.protein.str.lower().eq('chain')]
```

In [5]:
# Columns that are passed through json.dumps before the table is written,
# so each cell becomes a single JSON text field in the output file.
jsonFocusCols = (
    'count_amino_acids',
    'flexibility',
    'get_amino_acids_percent',
    'molar_extinction_coefficient',
    'secondary_structure_fraction')


def saveResults(dfrm: pd.DataFrame, outputFolder: str, **kwargs):
    """Write ``dfrm`` to ``protein_seq_protparam_result.tsv`` in ``outputFolder``.

    The columns listed in ``jsonFocusCols`` are JSON-encoded in a copy of the
    frame; ``dfrm`` itself is left untouched, so calling this function again
    on the same frame does not encode the values twice.

    Args:
        dfrm: Table to save; must contain every column in ``jsonFocusCols``.
        outputFolder: Target directory. It is created if it does not exist.
        **kwargs: Forwarded unchanged to ``DataFrame.to_csv`` (e.g. ``sep``,
            ``index``).

    Raises:
        KeyError: If a column named in ``jsonFocusCols`` is missing.
    """
    if outputFolder:
        # An empty string means "current directory"; os.makedirs('') would fail.
        os.makedirs(outputFolder, exist_ok=True)
    # assign() returns a new frame, so the caller's data is not mutated.
    serialised = dfrm.assign(
        **{col: dfrm[col].apply(json.dumps) for col in jsonFocusCols})
    serialised.to_csv(
        os.path.join(outputFolder, 'protein_seq_protparam_result.tsv'), **kwargs)

In [6]:
# Save the ProtParam table into workdir as a tab-separated file, without the index column.
saveResults(dfrm, workdir, sep='\t', index=False)