# Generate Protein Sequence Features

In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [5]:
def lerFastaBio(arquivo):
    """Read a FASTA file with Biopython and return its records as a dict.

    Parameters
    ----------
    arquivo : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps each record's ``id`` to its sequence converted to ``str``.
        If the same id occurs more than once, the last record wins.
    """
    # The context manager closes the file handle once parsing is finished;
    # a bare open() passed straight to SeqIO.parse is never closed explicitly.
    with open(arquivo) as handle:
        return {record.id: str(record.seq) for record in SeqIO.parse(handle, 'fasta')}

In [43]:
# One-letter codes used below to name the per-residue 'Percent_<code>' columns.
aminoacids = list('ACDEFGHIKLMNPQRSTVWY')

In [44]:
# Relative path of the proteome FASTA file to load.
file = 'proteomas/cerevisiae.fasta'

# Dict of record id -> sequence string, as returned by lerFastaBio.
proteoma = lerFastaBio(file)

In [45]:
# One row per entry of `proteoma`: the dict key goes to 'Locus' and the
# dict value to 'Sequence'.
df_bio = pd.DataFrame(
    data=[*proteoma.items()],
    columns=['Locus', 'Sequence'],
)
df_bio

Unnamed: 0,Locus,Sequence
0,YPL071C,MSSRFARSNGNPNHIRKRNHSPDPIGIDNYKRKRLIIDLENLSLND...
1,YLL050C,MSRSGVAVADESLTAFNDLKLGKKYKFILFGLNDAKTEIVVKETST...
2,YMR172W,MSGMGIAILCIVRTKIYRITISFDYSTLMSPFFLFLMMPTTLKDGY...
3,YOR185C,MSAPAQNNAEVPTFKLVLVGDGGTGKTTFVKRHLTGEFEKKYIATI...
4,YLL032C,MDNFKIYSTVITTAFLQVPHLYTTNRLWKPIEAPFLVEFLQKRISS...
...,...,...
6595,YBR021W,MPDNLSLHLSGSSKRLNSRQLMESSNETFAPNNVDLEKEYKSSQSN...
6596,YDR320W-B,MRVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR
6597,YBR232C,MFILAEVSDFILDIVAPLCPTISEACLTKHSIRKCTSEGTLSGESW...
6598,YDL245C,MASEQSSPEINADNLNSSAADVHVQPPGEKEWSDGFYDKEVINGNT...


## Protein Features

In [52]:
# Functions

def molecular_weight(protein):
    """Return the molecular weight computed from a protein sequence.

    ``protein`` is passed to Biopython's ``ProteinAnalysis`` and the result
    of its ``molecular_weight()`` method is returned unchanged.
    """
    return ProteinAnalysis(protein).molecular_weight()


def gravy(protein):
    """Return the GRAVY (Grand Average of Hydropathy) of a protein sequence.

    Computed by Biopython's ``ProteinAnalysis.gravy()``, according to
    Kyte and Doolittle, 1982.
    """
    return ProteinAnalysis(protein).gravy()


def aromaticity(protein):
    """Return the aromaticity of a protein sequence (Lobry, 1994).

    The value is the relative frequency of Phe + Trp + Tyr, as computed by
    Biopython's ``ProteinAnalysis.aromaticity()``.
    """
    return ProteinAnalysis(protein).aromaticity()
    

def secondary_structure_fraction(protein):
    """Return the secondary-structure fractions of a protein sequence.

    Calls Biopython's ``ProteinAnalysis.secondary_structure_fraction()`` and
    returns its first three values as a tuple of three floats, which this
    notebook labels (Helix, Turn, Sheet).

    NOTE(review): the residue sets were previously documented here as
    helix: V, I, Y, F, W, L; turn: N, P, G, S; sheet: E, M, A, L.
    Confirm these against the installed Biopython version.
    """
    fractions = ProteinAnalysis(protein).secondary_structure_fraction()

    return fractions[0], fractions[1], fractions[2]


def get_amino_acids_percent(protein, param):
    """Return the relative content of one amino acid in a protein sequence.

    Parameters
    ----------
    protein : str
        Protein sequence handed to Biopython's ``ProteinAnalysis``.
    param : str
        Key looked up in the dict returned by
        ``ProteinAnalysis.get_amino_acids_percent()``; this notebook passes
        one-letter amino-acid codes. An unknown key raises ``KeyError``.

    NOTE(review): despite the name, the values in the saved notebook output
    lie between 0 and 1 (fractions), not 0-100.
    """
    content_by_residue = ProteinAnalysis(protein).get_amino_acids_percent()

    return content_by_residue[param]


In [57]:
# Generate Features
#
# Every feature depends only on the sequence string, so each one is mapped
# over the 'Sequence' column rather than computed with a row-wise
# DataFrame.apply(axis=1).
sequences = df_bio['Sequence']

df_bio['Sequence_Length'] = sequences.map(len)

df_bio['Molecular_weight'] = sequences.map(molecular_weight)

df_bio['Gravy'] = sequences.map(gravy)

df_bio['Aromaticity'] = sequences.map(aromaticity)

# One (helix, turn, sheet) tuple per sequence, split into columns by position.
sec_struct = sequences.map(secondary_structure_fraction)
for pos, column in enumerate(['Sec_Struct_Helix', 'Sec_Struct_Turn', 'Sec_Struct_Sheet']):
    df_bio[column] = sec_struct.map(lambda fractions: fractions[pos])

# Analyse each sequence once and reuse the resulting dict for all residues,
# instead of rebuilding ProteinAnalysis for every (sequence, residue) pair.
aa_content = sequences.map(lambda seq: ProteinAnalysis(seq).get_amino_acids_percent())
for am in aminoacids:
    df_bio['Percent_' + am] = aa_content.map(lambda content: content[am])

In [56]:
# Display the DataFrame with the generated feature columns.
# NOTE(review): the saved output below lists columns in a different order than
# the feature cell creates them, and the execution counts are out of sequence;
# re-run with Restart & Run All to refresh it.
df_bio

Unnamed: 0,Locus,Sequence,Sequence_Length,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,Percent_E,...,Percent_Q,Percent_R,Percent_S,Percent_T,Percent_V,Percent_W,Percent_Y,Gravy,Aromaticity,Molecular_weight
0,YPL071C,MSSRFARSNGNPNHIRKRNHSPDPIGIDNYKRKRLIIDLENLSLND...,156,0.288462,0.211538,0.185897,0.044872,0.006410,0.128205,0.044872,...,0.012821,0.064103,0.057692,0.038462,0.038462,0.032051,0.038462,-0.900641,0.096154,18434.5023
1,YLL050C,MSRSGVAVADESLTAFNDLKLGKKYKFILFGLNDAKTEIVVKETST...,143,0.307692,0.244755,0.237762,0.076923,0.006993,0.083916,0.069930,...,0.006993,0.048951,0.111888,0.055944,0.083916,0.006993,0.048951,-0.379720,0.111888,15900.5720
2,YMR172W,MSGMGIAILCIVRTKIYRITISFDYSTLMSPFFLFLMMPTTLKDGY...,719,0.219750,0.326843,0.214186,0.055633,0.002782,0.058414,0.047288,...,0.050070,0.050070,0.112656,0.083449,0.030598,0.004172,0.018081,-0.801808,0.043115,79414.9726
3,YOR185C,MSAPAQNNAEVPTFKLVLVGDGGTGKTTFVKRHLTGEFEKKYIATI...,220,0.318182,0.195455,0.236364,0.077273,0.013636,0.063636,0.063636,...,0.050000,0.036364,0.027273,0.059091,0.081818,0.013636,0.040909,-0.383636,0.109091,24990.2165
4,YLL032C,MDNFKIYSTVITTAFLQVPHLYTTNRLWKPIEAPFLVEFLQKRISS...,825,0.335758,0.244848,0.225455,0.042424,0.010909,0.042424,0.066667,...,0.046061,0.035152,0.081212,0.059394,0.043636,0.002424,0.043636,-0.297939,0.100606,94596.1505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,YBR021W,MPDNLSLHLSGSSKRLNSRQLMESSNETFAPNNVDLEKEYKSSQSN...,633,0.391785,0.246445,0.227488,0.080569,0.025276,0.034755,0.037915,...,0.023697,0.033175,0.101106,0.039494,0.069510,0.034755,0.052133,0.235229,0.164297,71735.0271
6596,YDR320W-B,MRVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR,45,0.400000,0.111111,0.311111,0.044444,0.066667,0.000000,0.000000,...,0.022222,0.066667,0.066667,0.022222,0.088889,0.000000,0.022222,0.440000,0.088889,5328.7400
6597,YBR232C,MFILAEVSDFILDIVAPLCPTISEACLTKHSIRKCTSEGTLSGESW...,119,0.386555,0.252101,0.268908,0.067227,0.033613,0.050420,0.067227,...,0.000000,0.033613,0.176471,0.067227,0.100840,0.016807,0.000000,0.688235,0.075630,12749.5491
6598,YDL245C,MASEQSSPEINADNLNSSAADVHVQPPGEKEWSDGFYDKEVINGNT...,567,0.391534,0.248677,0.238095,0.068783,0.024691,0.024691,0.049383,...,0.024691,0.029982,0.081129,0.045855,0.081129,0.022928,0.045855,0.357319,0.142857,62930.7446
