# Generate Sequence Protein Features

https://towardsdatascience.com/visualizing-and-analyzing-proteins-in-python-bd99521ccd


Fórum de dúvidas do Biopython: https://www.biostars.org/p/9526952/#9527107


Trabalho relativo à estrutura secundária: https://www.researchgate.net/publication/230589968_Analysis_of_Domain-Swapped_Oligomers_Reveals_Local_Sequence_Preferences_and_Structural_Imprints_at_the_Linker_Regions_and_Swapped_Interfaces

In [2]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def lerFastaBio(arquivo):
    '''
    Read a FASTA file with Biopython and return its sequences keyed by record id.

    Parameters
    ----------
    arquivo : path of the FASTA file to read.

    Returns
    -------
    dict mapping each record's ``id`` to its sequence as a plain ``str``.
    If the same id occurs more than once, the last record wins.
    '''
    # SeqIO.parse is lazy, so the records must be consumed while the file is
    # still open; the context manager then closes the handle (the previous
    # version left the handle returned by open() unclosed).
    with open(arquivo) as handle:
        return {record.id: str(record.seq) for record in SeqIO.parse(handle, 'fasta')}

In [4]:
# One-letter codes of the 20 standard amino acids, in alphabetical order.
aminoacids = list('ACDEFGHIKLMNPQRSTVWY')

In [5]:
# Proteome FASTA to process. Exactly one `file` line should be active; the
# commented-out paths are the alternative inputs.
#file = 'proteomas/cerevisiae.fasta'
#file = 'proteomas/protein_elegans.fasta'
#file = 'proteomas/protein_droso.fasta'
#file = 'proteomas/protein_mansoni.fasta'
file = 'proteomas/protein_musculus.fasta'

# Load the FASTA through lerFastaBio: dict of record id -> sequence string.
proteoma = lerFastaBio(file)

In [6]:
# One row per FASTA record: 'Locus' holds the record id (dict key) and
# 'Sequence' holds the sequence string (dict value).
df_bio = pd.DataFrame(list(proteoma.items()), columns=['Locus', 'Sequence'])
df_bio

Unnamed: 0,Locus,Sequence
0,10090.ENSMUSP00000000001,MGCTLSAEDKAAVERSKMIDRNLREDGEKAAKEVKLLLLGAGESGK...
1,10090.ENSMUSP00000000003,MMRVIILLLTLHVLGVSSVMSLKKKIDGPWQTIYLAASTMEKINEG...
2,10090.ENSMUSP00000000010,MSISGTLSSYYVDSIISHESEDAPPAKFPSGQYANPRQPGHAEHLD...
3,10090.ENSMUSP00000000028,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...
4,10090.ENSMUSP00000000049,MVSPVLALFSAFLCHVAIAGRICPKPDDLPFATVVPLKTSYDPGEQ...
...,...,...
22043,10090.ENSMUSP00000141161,MRRMALKKLKVIPKEGYLLLLDFDDEDDDIKVSEEALSEVKSPAFD...
22044,10090.ENSMUSP00000141166,MDDSEVESTASILASVKEQEAQFEKLTRALEEERRHVSAQLERVRV...
22045,10090.ENSMUSP00000141169,MEQRHSLAGNCDDGEKSEREAQGFEHRTCMDSGDPSFGQNDPPTIL...
22046,10090.ENSMUSP00000141173,MRKMALKKLKVIPKEGYLLLLDFDDEDDDIKVSEEALSEVKSPAFD...


## Protein Features

In [7]:
# Functions


def aromaticity(protein):
    '''
    Return the aromaticity of a protein sequence according to Lobry, 1994.

    The value is the relative frequency of Phe + Trp + Tyr in the sequence,
    as computed by Biopython's ProteinAnalysis.aromaticity().
    '''
    return ProteinAnalysis(protein).aromaticity()



# Rewrite of the Biopython function, because of a discrepancy between the cited works
def secondary_structure_fraction(protein):
    '''
    Sum the amino-acid fractions of a sequence per secondary-structure class.

    Residue groups used:
        helix: A, E, L, M, Q, R
        turn:  D, G, N, K, P, S
        sheet: C, F, H, I, T, W, V, Y

    Returns a tuple of three floats, in the order (helix, turn, sheet).
    Each value is the sum of the per-residue values reported by
    ProteinAnalysis.get_amino_acids_percent() for that group.
    '''
    fractions = ProteinAnalysis(protein).get_amino_acids_percent()

    # Group order here fixes the order of the returned tuple.
    residue_groups = ("AELMQR", "DGNKPS", "CFHITWVY")

    helix, turn, sheet = (
        sum(fractions[residue] for residue in group)
        for group in residue_groups
    )

    return helix, turn, sheet
    


def get_amino_acids_percent(protein, param):
    '''
    Return the content of a single amino acid in a protein sequence.

    protein: the amino-acid sequence.
    param:   one-letter code of the amino acid to look up.

    The value is read from the dict returned by
    ProteinAnalysis.get_amino_acids_percent().
    '''
    composition = ProteinAnalysis(protein).get_amino_acids_percent()
    return composition[param]


def count_amino_acids(protein, param):
    '''
    Return how many times a single amino acid occurs in a protein sequence.

    protein: the amino-acid sequence.
    param:   one-letter code of the amino acid to look up.

    The value is read from the dict returned by
    ProteinAnalysis.count_amino_acids().
    '''
    counts = ProteinAnalysis(protein).count_amino_acids()
    return counts[param]


In [8]:
# Generate Features

# Every feature depends only on the 'Sequence' column, so map over that Series
# instead of running a row-wise DataFrame.apply(axis=1) over the whole frame.
sequences = df_bio['Sequence']

df_bio['Sequence_Length'] = sequences.map(len)

df_bio['Aromaticity'] = sequences.map(aromaticity)

# secondary_structure_fraction returns (helix, turn, sheet) per sequence;
# zip(*...) transposes the per-row tuples into one tuple per output column.
df_bio['Sec_Struct_Helix'], df_bio['Sec_Struct_Turn'], df_bio['Sec_Struct_Sheet'] = zip(
    *sequences.map(secondary_structure_fraction))

# Compute the amino-acid composition once per sequence and read every
# Percent_* column from it. Previously ProteinAnalysis and the full composition
# dict were rebuilt for each (sequence, amino acid) pair, i.e. 20 times per row.
compositions = [ProteinAnalysis(seq).get_amino_acids_percent() for seq in sequences]

for am in aminoacids:
    df_bio['Percent_'+am] = [composition[am] for composition in compositions]
    
#for am in aminoacids:
    #df_bio['Count_'+am] = df_bio.apply(lambda x: count_amino_acids(x.Sequence, am), axis=1)

In [9]:
# Display df_bio with the generated feature columns.
df_bio

Unnamed: 0,Locus,Sequence,Sequence_Length,Aromaticity,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,...,Percent_M,Percent_N,Percent_P,Percent_Q,Percent_R,Percent_S,Percent_T,Percent_V,Percent_W,Percent_Y
0,10090.ENSMUSP00000000001,MGCTLSAEDKAAVERSKMIDRNLREDGEKAAKEVKLLLLGAGESGK...,354,0.098870,0.347458,0.313559,0.338983,0.067797,0.028249,0.070621,...,0.025424,0.039548,0.008475,0.036723,0.056497,0.062147,0.064972,0.062147,0.008475,0.042373
1,10090.ENSMUSP00000000003,MMRVIILLLTLHVLGVSSVMSLKKKIDGPWQTIYLAASTMEKINEG...,174,0.097701,0.362069,0.304598,0.333333,0.034483,0.022989,0.022989,...,0.057471,0.057471,0.022989,0.040230,0.057471,0.040230,0.045977,0.074713,0.005747,0.051724
2,10090.ENSMUSP00000000010,MSISGTLSSYYVDSIISHESEDAPPAKFPSGQYANPRQPGHAEHLD...,250,0.088000,0.412000,0.380000,0.208000,0.100000,0.012000,0.028000,...,0.020000,0.036000,0.092000,0.056000,0.068000,0.096000,0.032000,0.028000,0.016000,0.044000
3,10090.ENSMUSP00000000028,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...,566,0.102473,0.378092,0.314488,0.307420,0.054770,0.019435,0.077739,...,0.030035,0.033569,0.024735,0.049470,0.053004,0.084806,0.051237,0.070671,0.014134,0.033569
4,10090.ENSMUSP00000000049,MVSPVLALFSAFLCHVAIAGRICPKPDDLPFATVVPLKTSYDPGEQ...,345,0.110145,0.260870,0.371014,0.368116,0.055072,0.066667,0.034783,...,0.023188,0.037681,0.098551,0.017391,0.040580,0.057971,0.069565,0.060870,0.014493,0.040580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22043,10090.ENSMUSP00000141161,MRRMALKKLKVIPKEGYLLLLDFDDEDDDIKVSEEALSEVKSPAFD...,222,0.058559,0.355856,0.432432,0.211712,0.036036,0.018018,0.090090,...,0.049550,0.067568,0.022523,0.049550,0.027027,0.076577,0.027027,0.054054,0.004505,0.022523
22044,10090.ENSMUSP00000141166,MDDSEVESTASILASVKEQEAQFEKLTRALEEERRHVSAQLERVRV...,938,0.058635,0.374200,0.378465,0.247335,0.069296,0.010661,0.066098,...,0.020256,0.050107,0.057569,0.039446,0.077825,0.075693,0.051173,0.060768,0.010661,0.031983
22045,10090.ENSMUSP00000141169,MEQRHSLAGNCDDGEKSEREAQGFEHRTCMDSGDPSFGQNDPPTIL...,689,0.056604,0.357039,0.365747,0.277213,0.053701,0.017417,0.052250,...,0.017417,0.034833,0.084180,0.072569,0.044993,0.089985,0.071118,0.066763,0.007257,0.023222
22046,10090.ENSMUSP00000141173,MRKMALKKLKVIPKEGYLLLLDFDDEDDDIKVSEEALSEVKSPAFD...,222,0.058559,0.355856,0.432432,0.211712,0.036036,0.018018,0.085586,...,0.049550,0.067568,0.022523,0.049550,0.031532,0.072072,0.027027,0.049550,0.004505,0.022523


## PPI Dataset 

In [10]:
# Load the per-protein feature table from CSV. Exactly one read_csv line should
# be active; the commented-out lines load the tables of the other organisms.
# NOTE(review): presumably this must be the same organism as the FASTA chosen
# for `file` — confirm when switching datasets.
df_ppi = pd.read_csv('data/proteins_features_musculus.csv')
#df_ppi = pd.read_csv('data/proteins_features_mansoni.csv')
#df_ppi = pd.read_csv('data/proteins_features_elegans.csv')
#df_ppi = pd.read_csv('data/proteins_features_droso.csv')
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential
0,10090.ENSMUSP00000058283,0.001173,0.000027,4.986994e-09,0.337233,0.415020,0
1,10090.ENSMUSP00000096357,0.033762,0.003878,3.027879e-05,0.463799,0.360696,0
2,10090.ENSMUSP00000029876,0.086750,0.016763,6.749329e-04,0.500409,0.221219,0
3,10090.ENSMUSP00000098051,0.002397,0.000031,0.000000e+00,0.328959,1.000000,0
4,10090.ENSMUSP00000024954,0.084455,0.018026,2.011643e-04,0.495821,0.254087,1
...,...,...,...,...,...,...,...
19604,10090.ENSMUSP00000036881,0.034374,0.003978,2.314849e-05,0.452939,0.363883,0
19605,10090.ENSMUSP00000079340,0.032487,0.003888,1.982637e-05,0.451966,0.512801,0
19606,10090.ENSMUSP00000030675,0.048144,0.004925,5.594263e-05,0.465132,0.327001,0
19607,10090.ENSMUSP00000139765,0.000255,0.000002,6.073191e-09,0.304505,0.000000,0


In [11]:
# Build the 'Locus' column that is later used as the merge key with df_bio.

# For cerevisiae: drop the first 5 characters of Protein_key
#df_ppi['Locus'] = df_ppi.apply(lambda x:x.Protein_key[5::], axis=1)

# For the other organisms: Protein_key is used unchanged
df_ppi['Locus'] = df_ppi['Protein_key'] 

In [12]:
# Display df_ppi with the added 'Locus' column.
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential,Locus
0,10090.ENSMUSP00000058283,0.001173,0.000027,4.986994e-09,0.337233,0.415020,0,10090.ENSMUSP00000058283
1,10090.ENSMUSP00000096357,0.033762,0.003878,3.027879e-05,0.463799,0.360696,0,10090.ENSMUSP00000096357
2,10090.ENSMUSP00000029876,0.086750,0.016763,6.749329e-04,0.500409,0.221219,0,10090.ENSMUSP00000029876
3,10090.ENSMUSP00000098051,0.002397,0.000031,0.000000e+00,0.328959,1.000000,0,10090.ENSMUSP00000098051
4,10090.ENSMUSP00000024954,0.084455,0.018026,2.011643e-04,0.495821,0.254087,1,10090.ENSMUSP00000024954
...,...,...,...,...,...,...,...,...
19604,10090.ENSMUSP00000036881,0.034374,0.003978,2.314849e-05,0.452939,0.363883,0,10090.ENSMUSP00000036881
19605,10090.ENSMUSP00000079340,0.032487,0.003888,1.982637e-05,0.451966,0.512801,0,10090.ENSMUSP00000079340
19606,10090.ENSMUSP00000030675,0.048144,0.004925,5.594263e-05,0.465132,0.327001,0,10090.ENSMUSP00000030675
19607,10090.ENSMUSP00000139765,0.000255,0.000002,6.073191e-09,0.304505,0.000000,0,10090.ENSMUSP00000139765


In [13]:
# Show only the rows whose is_essential flag equals 1.
df_ppi[df_ppi['is_essential'] == 1]

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential,Locus
4,10090.ENSMUSP00000024954,0.084455,0.018026,0.000201,0.495821,0.254087,1,10090.ENSMUSP00000024954
14,10090.ENSMUSP00000055225,0.037281,0.004532,0.000103,0.468617,0.322163,1,10090.ENSMUSP00000055225
17,10090.ENSMUSP00000040307,0.155243,0.029671,0.000973,0.525728,0.197779,1,10090.ENSMUSP00000040307
20,10090.ENSMUSP00000033797,0.031620,0.006095,0.000019,0.449879,0.403221,1,10090.ENSMUSP00000033797
52,10090.ENSMUSP00000020171,0.012699,0.003945,0.000001,0.437209,0.605843,1,10090.ENSMUSP00000020171
...,...,...,...,...,...,...,...,...
19547,10090.ENSMUSP00000112532,0.020910,0.003163,0.000015,0.443197,0.272610,1,10090.ENSMUSP00000112532
19554,10090.ENSMUSP00000031723,0.019329,0.005970,0.000006,0.440485,0.629602,1,10090.ENSMUSP00000031723
19575,10090.ENSMUSP00000117496,0.099296,0.021331,0.000172,0.498904,0.260882,1,10090.ENSMUSP00000117496
19593,10090.ENSMUSP00000052283,0.021012,0.003361,0.000031,0.456796,0.203517,1,10090.ENSMUSP00000052283


## Merge

In [14]:
# Merge Datasets
# Inner join on 'Locus': keeps only the proteins present in both the sequence
# features (df_bio) and the PPI feature table (df_ppi).
df = pd.merge(df_bio, df_ppi, how='inner', on='Locus')
df

Unnamed: 0,Locus,Sequence,Sequence_Length,Aromaticity,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,...,Percent_V,Percent_W,Percent_Y,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential
0,10090.ENSMUSP00000000001,MGCTLSAEDKAAVERSKMIDRNLREDGEKAAKEVKLLLLGAGESGK...,354,0.098870,0.347458,0.313559,0.338983,0.067797,0.028249,0.070621,...,0.062147,0.008475,0.042373,10090.ENSMUSP00000000001,0.101132,0.017358,8.738543e-04,0.505683,0.204578,0
1,10090.ENSMUSP00000000010,MSISGTLSSYYVDSIISHESEDAPPAKFPSGQYANPRQPGHAEHLD...,250,0.088000,0.412000,0.380000,0.208000,0.100000,0.012000,0.028000,...,0.028000,0.016000,0.044000,10090.ENSMUSP00000000010,0.013770,0.001888,5.630024e-06,0.419366,0.391794,1
2,10090.ENSMUSP00000000028,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...,566,0.102473,0.378092,0.314488,0.307420,0.054770,0.019435,0.077739,...,0.070671,0.014134,0.033569,10090.ENSMUSP00000000028,0.033201,0.007882,1.619622e-05,0.452290,0.347985,0
3,10090.ENSMUSP00000000049,MVSPVLALFSAFLCHVAIAGRICPKPDDLPFATVVPLKTSYDPGEQ...,345,0.110145,0.260870,0.371014,0.368116,0.055072,0.066667,0.034783,...,0.060870,0.014493,0.040580,10090.ENSMUSP00000000049,0.024633,0.003037,2.041550e-04,0.445842,0.193947,1
4,10090.ENSMUSP00000000058,MGLETEKADVQLFMADDAYSHHSGVDYADPEKYVDSSHDRDPHQLN...,162,0.111111,0.290123,0.314815,0.395062,0.061728,0.024691,0.074074,...,0.086420,0.018519,0.030864,10090.ENSMUSP00000000058,0.036057,0.009224,3.883830e-05,0.474938,0.444407,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19604,10090.ENSMUSP00000141134,MQNQTIVTEFVLLGLSQNPKVEKLLFVIFLLLYLATIGGNMTIVVT...,312,0.125000,0.298077,0.259615,0.442308,0.057692,0.035256,0.025641,...,0.092949,0.003205,0.028846,10090.ENSMUSP00000141134,0.000510,0.000113,0.000000e+00,0.361608,0.844444,0
19605,10090.ENSMUSP00000141137,MASCQVPLTFMDVALEFSKEEWECLDSAQKALYRDVMLENYNSLVS...,697,0.100430,0.197991,0.411765,0.390244,0.007174,0.071736,0.040172,...,0.037303,0.001435,0.057389,10090.ENSMUSP00000141137,0.035598,0.010581,5.526060e-06,0.468303,0.354273,0
19606,10090.ENSMUSP00000141166,MDDSEVESTASILASVKEQEAQFEKLTRALEEERRHVSAQLERVRV...,938,0.058635,0.374200,0.378465,0.247335,0.069296,0.010661,0.066098,...,0.060768,0.010661,0.031983,10090.ENSMUSP00000141166,0.088076,0.018371,7.952839e-05,0.494944,0.204089,0
19607,10090.ENSMUSP00000141169,MEQRHSLAGNCDDGEKSEREAQGFEHRTCMDSGDPSFGQNDPPTIL...,689,0.056604,0.357039,0.365747,0.277213,0.053701,0.017417,0.052250,...,0.066763,0.007257,0.023222,10090.ENSMUSP00000141169,0.000306,0.000053,8.696790e-11,0.344127,0.066667,0


In [15]:
# Number of rows per is_essential value in the merged frame.
df['is_essential'].value_counts()

0    17741
1     1868
Name: is_essential, dtype: int64

In [16]:
# Final dataset

# Write the merged frame to CSV without the index column. Exactly one to_csv
# line should be active; the commented-out lines are the output paths used for
# the other organisms.
df.to_csv('data/base_musculus.csv', index=False)
#df.to_csv('data/base_elegans.csv', index=False)
#df.to_csv('data/base_drosophila.csv', index=False)