# Generate Sequence Protein Features

https://towardsdatascience.com/visualizing-and-analyzing-proteins-in-python-bd99521ccd


Fórum de dúvidas sobre Biopython: https://www.biostars.org/p/9526952/#9527107


Trabalho relativo à estrutura secundária: https://www.researchgate.net/publication/230589968_Analysis_of_Domain-Swapped_Oligomers_Reveals_Local_Sequence_Preferences_and_Structural_Imprints_at_the_Linker_Regions_and_Swapped_Interfaces

In [3]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
def lerFastaBio(arquivo):
    """Read a FASTA file and map each record id to its sequence string.

    Parameters
    ----------
    arquivo : str or path-like
        Path of the FASTA file; it is handed to the built-in ``open``.

    Returns
    -------
    dict
        ``{record.id: str(record.seq)}`` for every record produced by
        ``SeqIO.parse(handle, 'fasta')``. If two records share an id, the
        later one replaces the earlier one.
    """
    # Parse with Biopython inside the with-block so the file handle is
    # closed as soon as every record has been consumed.
    with open(arquivo) as handle:
        return {record.id: str(record.seq) for record in SeqIO.parse(handle, 'fasta')}

In [5]:
# One-letter codes used for the per-residue Percent_* feature columns
aminoacids = list('ACDEFGHIKLMNPQRSTVWY')

In [6]:
# Input proteome FASTA: keep exactly one `file` assignment uncommented.
# NOTE(review): the PPI feature CSV and the output CSV are chosen in separate
# cells further down; presumably they must refer to the same organism — confirm.
#file = 'proteomas/cerevisiae.fasta'
#file = 'proteomas/protein_elegans.fasta'
#file = 'proteomas/protein_droso.fasta'
file = 'proteomas/protein_mansoni.fasta'

# Dict of record id -> sequence string, as returned by lerFastaBio
proteoma = lerFastaBio(file)

In [7]:
# One row per FASTA record: record id in 'Locus', sequence string in 'Sequence'
df_bio = pd.DataFrame({'Locus': list(proteoma.keys()), 'Sequence': list(proteoma.values())})
df_bio

Unnamed: 0,Locus,Sequence
0,6183.Smp_000020.1,MANEDSKTFVNKEFLTGVVEGFYGRPWTYPQRKELFRRMNQMGMNA...
1,6183.Smp_000030.1,MGTDSSMKAVEEQSGPKKEETSKDDLSEEDRQLQDELNMLVDRLEE...
2,6183.Smp_000040.1,MSVQSGKSASVDFWCWDEVVRKTKSTVLVLESLRSEQQQISADLDL...
3,6183.Smp_000050.1,MSAATFLHVAAWYSAHEIIDDILKRHSESLIEKKTIEGFTPLHISA...
4,6183.Smp_000070.1,MFLCMLAYTFILFATIPILYRVPKVQYYVKFSVLCLLILIGSFFYS...
...,...,...
10695,6183.Smp_900070.1,VKKLIVGNLVDLPISIGLNYYWCVGFILGIFMVVQVISGVVLSLFY...
10696,6183.Smp_900080.1,MIGIVLLGGGLFILSLLLCQFRLFNYIIIVESFNVIVLLVSLLDSI...
10697,6183.Smp_900090.1,MFAGFYWFIKGLIYSLLLGLIVWCHQYCIGLSYSSINICLINGLFI...
10698,6183.Smp_900100.1,MSSSLLALLIVAFFFTLIIGSITFYVLGFSFSLDHYISLKEWYSSF...


## Protein Features

In [8]:
# Functions


def aromaticity(protein):
    """Return the aromaticity of a protein sequence (Lobry, 1994).

    The value is the relative frequency of Phe + Trp + Tyr in the sequence,
    as computed by Biopython's ``ProteinAnalysis.aromaticity()``.

    protein: amino-acid sequence passed to ``ProteinAnalysis``.
    """
    return ProteinAnalysis(protein).aromaticity()



# Rewrite of the Biopython function, because of a discrepancy between the cited works
def secondary_structure_fraction(protein):
    """Return the fraction of residues in each secondary-structure class.

    Residue classes used here:
      - helix: A, E, L, M, Q, R
      - turn:  D, G, N, K, P, S
      - sheet: C, F, H, I, T, W, V, Y

    These stand for alpha helices, turns (where the residues change
    direction) and beta sheets.

    Returns a tuple of three values in the order (helix, turn, sheet), each
    being the sum of ``ProteinAnalysis.get_amino_acids_percent()`` entries
    for the residues of that class.
    """
    residue_classes = ("AELMQR", "DGNKPS", "CFHITWVY")

    composition = ProteinAnalysis(protein).get_amino_acids_percent()

    # One sum per class, in the same (helix, turn, sheet) order as above
    helix, turn, sheet = (
        sum(composition[residue] for residue in residues)
        for residues in residue_classes
    )

    return helix, turn, sheet
    


def get_amino_acids_percent(protein, param):
    """Return the content of one amino acid in a protein sequence.

    protein: amino-acid sequence passed to ``ProteinAnalysis``.
    param:   key (one-letter amino-acid code) looked up in the dict returned
             by ``ProteinAnalysis.get_amino_acids_percent()``.
    """
    composition = ProteinAnalysis(protein).get_amino_acids_percent()
    return composition[param]


def count_amino_acids(protein, param):
    """Return how many times one amino acid occurs in a protein sequence.

    protein: amino-acid sequence passed to ``ProteinAnalysis``.
    param:   key (one-letter amino-acid code) looked up in the dict returned
             by ``ProteinAnalysis.count_amino_acids()``.
    """
    counts = ProteinAnalysis(protein).count_amino_acids()
    return counts[param]


In [9]:
# Generate Features
#
# Every derived column depends only on 'Sequence', so the Series is mapped
# directly instead of applying a lambda row by row over the whole frame.

df_bio['Sequence_Length'] = df_bio['Sequence'].map(len)

df_bio['Aromaticity'] = df_bio['Sequence'].map(aromaticity)

# secondary_structure_fraction returns (helix, turn, sheet) per sequence;
# zip(*...) transposes those tuples into three column-wise sequences.
df_bio['Sec_Struct_Helix'], df_bio['Sec_Struct_Turn'], df_bio['Sec_Struct_Sheet'] = zip(
    *df_bio['Sequence'].map(secondary_structure_fraction))

# Build the amino-acid composition dict once per sequence, then read each
# residue from it, rather than re-analysing the sequence for every residue.
aa_composition = df_bio['Sequence'].map(
    lambda seq: ProteinAnalysis(seq).get_amino_acids_percent())

for am in aminoacids:
    # am=am binds the current residue code to the lambda
    df_bio['Percent_'+am] = aa_composition.map(lambda composition, am=am: composition[am])

# Optional raw per-residue counts (disabled):
#for am in aminoacids:
    #df_bio['Count_'+am] = df_bio.apply(lambda x: count_amino_acids(x.Sequence, am), axis=1)

In [10]:
# Inspect df_bio now that the sequence-derived feature columns were added
df_bio

Unnamed: 0,Locus,Sequence,Sequence_Length,Aromaticity,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,...,Percent_M,Percent_N,Percent_P,Percent_Q,Percent_R,Percent_S,Percent_T,Percent_V,Percent_W,Percent_Y
0,6183.Smp_000020.1,MANEDSKTFVNKEFLTGVVEGFYGRPWTYPQRKELFRRMNQMGMNA...,1092,0.079670,0.336081,0.353480,0.310440,0.069597,0.025641,0.054945,...,0.021978,0.047619,0.083333,0.031136,0.051282,0.087912,0.065018,0.065018,0.012821,0.027473
1,6183.Smp_000030.1,MGTDSSMKAVEEQSGPKKEETSKDDLSEEDRQLQDELNMLVDRLEE...,995,0.063317,0.384925,0.328643,0.286432,0.074372,0.021106,0.057286,...,0.031156,0.040201,0.042211,0.033166,0.045226,0.069347,0.072362,0.055276,0.008040,0.031156
2,6183.Smp_000040.1,MSVQSGKSASVDFWCWDEVVRKTKSTVLVLESLRSEQQQISADLDL...,413,0.065375,0.426150,0.322034,0.251816,0.087167,0.014528,0.060533,...,0.016949,0.050847,0.024213,0.062954,0.055690,0.065375,0.041162,0.060533,0.012107,0.043584
3,6183.Smp_000050.1,MSAATFLHVAAWYSAHEIIDDILKRHSESLIEKKTIEGFTPLHISA...,1376,0.092297,0.337209,0.331395,0.331395,0.057413,0.013081,0.045058,...,0.025436,0.035610,0.052326,0.035610,0.044331,0.077035,0.067587,0.056686,0.015988,0.032703
4,6183.Smp_000070.1,MFLCMLAYTFILFATIPILYRVPKVQYYVKFSVLCLLILIGSFFYS...,263,0.121673,0.315589,0.319392,0.365019,0.076046,0.022814,0.034221,...,0.045627,0.030418,0.049430,0.026616,0.038023,0.064639,0.038023,0.076046,0.011407,0.038023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10695,6183.Smp_900070.1,VKKLIVGNLVDLPISIGLNYYWCVGFILGIFMVVQVISGVVLSLFY...,364,0.203297,0.255495,0.258242,0.486264,0.030220,0.021978,0.019231,...,0.024725,0.027473,0.030220,0.010989,0.013736,0.101648,0.024725,0.120879,0.027473,0.085165
10696,6183.Smp_900080.1,MIGIVLLGGGLFILSLLLCQFRLFNYIIIVESFNVIVLLVSLLDSI...,86,0.081395,0.337209,0.232558,0.430233,0.000000,0.034884,0.011628,...,0.046512,0.034884,0.000000,0.011628,0.034884,0.081395,0.011628,0.116279,0.000000,0.011628
10697,6183.Smp_900090.1,MFAGFYWFIKGLIYSLLLGLIVWCHQYCIGLSYSSINICLINGLFI...,419,0.150358,0.291169,0.264916,0.443914,0.038186,0.028640,0.014320,...,0.047733,0.035800,0.019093,0.009547,0.011933,0.114558,0.011933,0.136038,0.031026,0.050119
10698,6183.Smp_900100.1,MSSSLLALLIVAFFFTLIIGSITFYVLGFSFSLDHYISLKEWYSSF...,120,0.316667,0.225000,0.241667,0.533333,0.016667,0.025000,0.025000,...,0.016667,0.033333,0.000000,0.000000,0.000000,0.108333,0.025000,0.083333,0.033333,0.083333


## PPI Dataset 

In [11]:
# Load the per-protein feature table (centrality and clustering columns)
# for one organism; keep exactly one read_csv line uncommented.
df_ppi = pd.read_csv('data/proteins_features_mansoni.csv')
#df_ppi = pd.read_csv('data/proteins_features_elegans.csv')
#df_ppi = pd.read_csv('data/proteins_features_droso.csv')
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering
0,6183.Smp_167870.1,0.000158,0.000024,0.000000,0.337394,0.000000
1,6183.Smp_067260.1,0.026001,0.003761,0.000008,0.461707,0.572949
2,6183.Smp_080210.1,0.016861,0.001683,0.000026,0.455833,0.206665
3,6183.Smp_066740.1,0.053735,0.005257,0.000227,0.474275,0.373814
4,6183.Smp_144890.1,0.002364,0.000245,0.000002,0.403127,0.161905
...,...,...,...,...,...,...
6342,6183.Smp_002190.1,0.028049,0.003872,0.000006,0.453940,0.633784
6343,6183.Smp_030150.1,0.024582,0.005005,0.000008,0.464071,0.524069
6344,6183.Smp_095130.1,0.007564,0.000514,0.000007,0.406565,0.156028
6345,6183.Smp_194820.1,0.161204,0.025531,0.000452,0.526788,0.346183


In [12]:
# For cerevisiae: Locus is Protein_key without its first five characters
#df_ppi['Locus'] = df_ppi.apply(lambda x:x.Protein_key[5::], axis=1)

# For the other organisms: Locus is Protein_key unchanged
df_ppi['Locus'] = df_ppi['Protein_key'] 

In [13]:
# Inspect df_ppi after adding the 'Locus' join key
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,Locus
0,6183.Smp_167870.1,0.000158,0.000024,0.000000,0.337394,0.000000,6183.Smp_167870.1
1,6183.Smp_067260.1,0.026001,0.003761,0.000008,0.461707,0.572949,6183.Smp_067260.1
2,6183.Smp_080210.1,0.016861,0.001683,0.000026,0.455833,0.206665,6183.Smp_080210.1
3,6183.Smp_066740.1,0.053735,0.005257,0.000227,0.474275,0.373814,6183.Smp_066740.1
4,6183.Smp_144890.1,0.002364,0.000245,0.000002,0.403127,0.161905,6183.Smp_144890.1
...,...,...,...,...,...,...,...
6342,6183.Smp_002190.1,0.028049,0.003872,0.000006,0.453940,0.633784,6183.Smp_002190.1
6343,6183.Smp_030150.1,0.024582,0.005005,0.000008,0.464071,0.524069,6183.Smp_030150.1
6344,6183.Smp_095130.1,0.007564,0.000514,0.000007,0.406565,0.156028,6183.Smp_095130.1
6345,6183.Smp_194820.1,0.161204,0.025531,0.000452,0.526788,0.346183,6183.Smp_194820.1


In [56]:
# Rows flagged as essential (is_essential == 1).
# The mansoni feature table displayed above has no 'is_essential' column
# (the saved output of this cell shows 4932.* keys from another run), so an
# unguarded lookup raises KeyError on a fresh Restart & Run All.
if 'is_essential' in df_ppi.columns:
    essential_ppi = df_ppi[df_ppi['is_essential'] == 1]
else:
    # No label column for this organism: show an empty frame with the same columns
    essential_ppi = df_ppi.iloc[0:0]
essential_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential,Locus
4,4932.YKL078W,0.081978,0.028308,0.000319,0.504730,0.297116,1,YKL078W
17,4932.YGL116W,0.100214,0.029536,0.000572,0.504771,0.247926,1,YGL116W
27,4932.YNL263C,0.008707,0.001341,0.000018,0.437657,0.272859,1,YNL263C
35,4932.YBL030C,0.067192,0.019700,0.000489,0.491993,0.213445,1,YBL030C
39,4932.YMR211W,0.004436,0.001568,0.000001,0.404482,0.467236,1,YMR211W
...,...,...,...,...,...,...,...,...
6067,4932.YGL120C,0.102185,0.034531,0.000570,0.509166,0.281001,1,YGL120C
6076,4932.YLR153C,0.034993,0.008481,0.000478,0.469114,0.183542,1,YLR153C
6077,4932.YDR376W,0.007886,0.001370,0.000012,0.460659,0.263298,1,YDR376W
6081,4932.YLL035W,0.007393,0.002633,0.000002,0.428803,0.390909,1,YLL035W


## Merge

In [14]:
# Merge Datasets
# Inner join on 'Locus': only loci present in both df_bio and df_ppi are kept.
df = df_bio.merge(df_ppi, how='inner', on='Locus')
df

Unnamed: 0,Locus,Sequence,Sequence_Length,Aromaticity,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,...,Percent_T,Percent_V,Percent_W,Percent_Y,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering
0,6183.Smp_000020.1,MANEDSKTFVNKEFLTGVVEGFYGRPWTYPQRKELFRRMNQMGMNA...,1092,0.079670,0.336081,0.353480,0.310440,0.069597,0.025641,0.054945,...,0.065018,0.065018,0.012821,0.027473,6183.Smp_000020.1,0.010243,0.002098,0.000003,0.435368,0.322596
1,6183.Smp_000030.1,MGTDSSMKAVEEQSGPKKEETSKDDLSEEDRQLQDELNMLVDRLEE...,995,0.063317,0.384925,0.328643,0.286432,0.074372,0.021106,0.057286,...,0.072362,0.055276,0.008040,0.031156,6183.Smp_000030.1,0.129215,0.022459,0.000269,0.520135,0.361267
2,6183.Smp_000040.1,MSVQSGKSASVDFWCWDEVVRKTKSTVLVLESLRSEQQQISADLDL...,413,0.065375,0.426150,0.322034,0.251816,0.087167,0.014528,0.060533,...,0.041162,0.060533,0.012107,0.043584,6183.Smp_000040.1,0.141822,0.023353,0.000384,0.526089,0.308607
3,6183.Smp_000050.1,MSAATFLHVAAWYSAHEIIDDILKRHSESLIEKKTIEGFTPLHISA...,1376,0.092297,0.337209,0.331395,0.331395,0.057413,0.013081,0.045058,...,0.067587,0.056686,0.015988,0.032703,6183.Smp_000050.1,0.014970,0.001933,0.000017,0.440294,0.323180
4,6183.Smp_000070.1,MFLCMLAYTFILFATIPILYRVPKVQYYVKFSVLCLLILIGSFFYS...,263,0.121673,0.315589,0.319392,0.365019,0.076046,0.022814,0.034221,...,0.038023,0.076046,0.011407,0.038023,6183.Smp_000070.1,0.047431,0.003481,0.000489,0.489314,0.152558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6342,6183.Smp_900030.1,MFIGLWYNKLVEWISLRLVKVMISEWWYSFVMISVFCGLVMTRCPY...,173,0.213873,0.219653,0.225434,0.554913,0.011561,0.028902,0.005780,...,0.017341,0.138728,0.046243,0.052023,6183.Smp_900030.1,0.000158,0.000006,0.000000,0.326000,0.000000
6343,6183.Smp_900050.1,MLIGVVLLSLIFILSVLVFQGGVVSYSLGSAYLFFGDISLHFSSGI...,527,0.161290,0.290323,0.280835,0.428843,0.030361,0.034156,0.018975,...,0.024668,0.125237,0.015180,0.064516,6183.Smp_900050.1,0.011976,0.000391,0.000008,0.408476,0.635439
6344,6183.Smp_900070.1,VKKLIVGNLVDLPISIGLNYYWCVGFILGIFMVVQVISGVVLSLFY...,364,0.203297,0.255495,0.258242,0.486264,0.030220,0.021978,0.019231,...,0.024725,0.120879,0.027473,0.085165,6183.Smp_900070.1,0.013709,0.000407,0.000022,0.404464,0.483828
6345,6183.Smp_900090.1,MFAGFYWFIKGLIYSLLLGLIVWCHQYCIGLSYSSINICLINGLFI...,419,0.150358,0.291169,0.264916,0.443914,0.038186,0.028640,0.014320,...,0.011933,0.136038,0.031026,0.050119,6183.Smp_900090.1,0.012134,0.000324,0.000013,0.388032,0.597744


In [58]:
# Class balance of the essentiality label in the merged frame.
# The merged frame only has 'is_essential' when the loaded feature table
# provides it; guard the lookup so a fresh run does not stop with KeyError.
if 'is_essential' in df.columns:
    essential_counts = df['is_essential'].value_counts()
else:
    # No label column for this organism: show an empty count Series
    essential_counts = pd.Series(dtype='int64', name='is_essential')
essential_counts

0    4895
1    1100
Name: is_essential, dtype: int64

In [59]:
# Final dataset: write the merged frame to CSV without the index column.
# Keep exactly one to_csv line uncommented.
# NOTE(review): the target name presumably has to match the organism chosen
# in the FASTA / feature-table cells — confirm before running.

df.to_csv('data/base_mansoni.csv', index=False)
#df.to_csv('data/base_elegans.csv', index=False)
#df.to_csv('data/base_drosophila.csv', index=False)