# Generate Protein Sequence Features

Background reference (Towards Data Science, "Visualizing and Analyzing Proteins in Python"): https://towardsdatascience.com/visualizing-and-analyzing-proteins-in-python-bd99521ccd

In [28]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
def lerFastaBio(arquivo):
    """Read a FASTA file with Biopython and return its records as a dict.

    Parameters
    ----------
    arquivo : str or path-like
        Path of the FASTA file; it is passed to ``open``.

    Returns
    -------
    dict
        Maps each record's ``id`` to its sequence converted to ``str``.
        If the same id appears more than once, the last record wins.
    """
    # The context manager closes the file handle once parsing is finished.
    # SeqIO.parse yields records lazily, so they must be consumed while the
    # handle is still open, i.e. inside the ``with`` block.
    with open(arquivo) as handle:
        return {record.id: str(record.seq) for record in SeqIO.parse(handle, 'fasta')}

In [30]:
# One-letter codes of the 20 standard amino acids, in alphabetical order.
aminoacids = list('ACDEFGHIKLMNPQRSTVWY')

In [31]:
# FASTA file to load (path is relative to the notebook's working directory).
file = 'proteomas/protein_droso.fasta'

# Record id -> sequence string for every entry of the FASTA file.
proteoma = lerFastaBio(arquivo=file)

In [32]:
# One row per FASTA record: the record id goes to 'Locus' and the
# sequence string to 'Sequence'.
df_bio = pd.DataFrame({
    'Locus': list(proteoma.keys()),
    'Sequence': list(proteoma.values()),
})
df_bio

Unnamed: 0,Locus,Sequence
0,7227.FBpp0070001,MTCTLVLLIASVLHFRMRGSCLLDIERFPVIPGTIYAGHIAYCAIL...
1,7227.FBpp0070002,MDISKVDSTRALVNHWRIFRIMGIHPPGKRTFWGRHYTAYSMVWNV...
2,7227.FBpp0070005,MQQLLVVALVAVAIASLPQLAAGKKLGKRCVNCTYRTYYTYGDGRS...
3,7227.FBpp0070006,MSLSRLNTLMKLTPKPTAIGSLKMQRNLSALPESGPTGSFKTLAVS...
4,7227.FBpp0070007,MSMEECFVPRLPVDSEFIQFHDWAIKYEKSHILKSSCQLGTAKCCP...
...,...,...
13927,7227.FBpp0306226,MKAASLPVYQQQLRKNNFLAQHKWVKDGKMEGRKIEKRKYKASKST...
13928,7227.FBpp0306228,MILLRFVLSLVLLINTTECSFLKGCNYGGSCVDEKWRNDTHWNIIC...
13929,7227.FBpp0306231,METSEAVNTCEEGDQLNGSFRLQCSMSSVDMESESDLSLAFEREEQ...
13930,7227.FBpp0306232,MVRENKAAWKAQYFIKVVELFDEFPKCFIVGADNVGSKQMQNIRTS...


## Protein Features

In [33]:
# Functions


def aromaticity(protein):
    """Return the aromaticity of a protein sequence (Lobry, 1994).

    The value is the relative frequency of Phe + Trp + Tyr in the
    sequence, as computed by Biopython's ``ProteinAnalysis.aromaticity``.

    Parameters
    ----------
    protein : str
        Amino-acid sequence in one-letter code.
    """
    return ProteinAnalysis(protein).aromaticity()
    

def secondary_structure_fraction(protein):
    """Return the secondary-structure fractions of a protein sequence.

    Delegates to Biopython's ``ProteinAnalysis.secondary_structure_fraction``
    and returns its three values as a tuple. This notebook labels them, in
    order, (Helix, Turn, Sheet): alpha helices, turns (where the residues
    change direction) and beta sheets.

    Residue sets as listed in the older Biopython documentation:
    helix: V, I, Y, F, W, L; turn: N, P, G, S; sheet: E, M, A, L.

    NOTE(review): Biopython 1.82 reportedly revised these residue sets and
    states that earlier releases actually returned (Sheet, Turn, Helix).
    Confirm the Helix/Sheet labels against the installed Biopython version.

    Parameters
    ----------
    protein : str
        Amino-acid sequence in one-letter code.
    """
    helix, turn, sheet = ProteinAnalysis(protein).secondary_structure_fraction()
    return helix, turn, sheet


def get_amino_acids_percent(protein, param):
    """Return the relative content of a single amino acid in a sequence.

    Parameters
    ----------
    protein : str
        Amino-acid sequence in one-letter code.
    param : str
        One-letter code of the amino acid to look up.

    Returns
    -------
    The entry for ``param`` in the mapping produced by Biopython's
    ``ProteinAnalysis.get_amino_acids_percent``. The values displayed in
    this notebook are fractions between 0 and 1, not 0-100 percentages.
    """
    composition = ProteinAnalysis(protein).get_amino_acids_percent()
    return composition[param]


def count_amino_acids(protein, param):
    """Return how many times a single amino acid occurs in a sequence.

    Parameters
    ----------
    protein : str
        Amino-acid sequence in one-letter code.
    param : str
        One-letter code of the amino acid to look up.

    Returns
    -------
    The entry for ``param`` in the mapping produced by Biopython's
    ``ProteinAnalysis.count_amino_acids`` (one value, not the whole dict).
    """
    counts = ProteinAnalysis(protein).count_amino_acids()
    return counts[param]


In [34]:
# Generate Features
#
# Every feature depends only on the sequence string, so the feature
# functions are mapped over the 'Sequence' column rather than applied
# row-wise to the whole frame.
sequences = df_bio['Sequence']

df_bio['Sequence_Length'] = sequences.map(len)

df_bio['Aromaticity'] = sequences.map(aromaticity)

# Each call yields a 3-tuple; zip(*...) transposes the per-row tuples into
# one sequence of values per output column.
df_bio['Sec_Struct_Helix'], df_bio['Sec_Struct_Turn'], df_bio['Sec_Struct_Sheet'] = zip(
    *sequences.map(secondary_structure_fraction))

# Build the composition mapping once per sequence and reuse it for all
# amino acids, instead of re-analysing every sequence once per amino acid.
composition = sequences.map(lambda seq: ProteinAnalysis(seq).get_amino_acids_percent())

for am in aminoacids:
    # am=am binds the current letter to the lambda when it is defined.
    df_bio['Percent_' + am] = composition.map(lambda percents, am=am: percents[am])

# Raw per-residue counts (see count_amino_acids) are currently not generated.

In [35]:
# Display the sequence frame together with the feature columns added above.
df_bio

Unnamed: 0,Locus,Sequence,Sequence_Length,Aromaticity,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,...,Percent_M,Percent_N,Percent_P,Percent_Q,Percent_R,Percent_S,Percent_T,Percent_V,Percent_W,Percent_Y
0,7227.FBpp0070001,MTCTLVLLIASVLHFRMRGSCLLDIERFPVIPGTIYAGHIAYCAIL...,210,0.090476,0.361905,0.128571,0.342857,0.071429,0.028571,0.038095,...,0.038095,0.014286,0.038095,0.038095,0.057143,0.038095,0.076190,0.066667,0.028571,0.028571
1,7227.FBpp0070002,MDISKVDSTRALVNHWRIFRIMGIHPPGKRTFWGRHYTAYSMVWNV...,387,0.105943,0.395349,0.186047,0.299742,0.062016,0.033592,0.015504,...,0.054264,0.033592,0.031008,0.020672,0.059432,0.077519,0.080103,0.062016,0.023256,0.036176
2,7227.FBpp0070005,MQQLLVVALVAVAIASLPQLAAGKKLGKRCVNCTYRTYYTYGDGRS...,441,0.049887,0.188209,0.306122,0.156463,0.070295,0.136054,0.054422,...,0.006803,0.061224,0.081633,0.029478,0.063492,0.058957,0.049887,0.068027,0.000000,0.036281
3,7227.FBpp0070006,MSLSRLNTLMKLTPKPTAIGSLKMQRNLSALPESGPTGSFKTLAVS...,312,0.057692,0.291667,0.214744,0.304487,0.105769,0.019231,0.057692,...,0.025641,0.028846,0.044872,0.032051,0.035256,0.086538,0.060897,0.067308,0.003205,0.012821
4,7227.FBpp0070007,MSMEECFVPRLPVDSEFIQFHDWAIKYEKSHILKSSCQLGTAKCCP...,272,0.106618,0.301471,0.150735,0.257353,0.036765,0.036765,0.062500,...,0.040441,0.036765,0.044118,0.051471,0.051471,0.047794,0.055147,0.055147,0.014706,0.018382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13927,7227.FBpp0306226,MKAASLPVYQQQLRKNNFLAQHKWVKDGKMEGRKIEKRKYKASKST...,61,0.065574,0.180328,0.163934,0.245902,0.098361,0.000000,0.016393,...,0.049180,0.032787,0.016393,0.098361,0.049180,0.065574,0.081967,0.049180,0.016393,0.032787
13928,7227.FBpp0306228,MILLRFVLSLVLLINTTECSFLKGCNYGGSCVDEKWRNDTHWNIIC...,81,0.111111,0.320988,0.185185,0.197531,0.000000,0.086420,0.061728,...,0.037037,0.049383,0.037037,0.012346,0.024691,0.049383,0.074074,0.037037,0.049383,0.037037
13929,7227.FBpp0306231,METSEAVNTCEEGDQLNGSFRLQCSMSSVDMESESDLSLAFEREEQ...,194,0.041237,0.195876,0.324742,0.268041,0.056701,0.015464,0.082474,...,0.020619,0.051546,0.103093,0.051546,0.056701,0.113402,0.041237,0.025773,0.000000,0.005155
13930,7227.FBpp0306232,MVRENKAAWKAQYFIKVVELFDEFPKCFIVGADNVGSKQMQNIRTS...,317,0.069401,0.290221,0.242902,0.296530,0.116719,0.006309,0.044164,...,0.022082,0.047319,0.059937,0.028391,0.028391,0.063091,0.044164,0.063091,0.003155,0.015773


## PPI (Protein–Protein Interaction) Dataset

In [36]:
# Per-protein centrality/clustering features plus the is_essential flag,
# read from a CSV file relative to the notebook's working directory.
df_ppi = pd.read_csv(filepath_or_buffer='data/proteins_features_droso.csv')
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential
0,7227.FBpp0112228,0.050906,0.019763,0.000150,0.444818,0.208164,0
1,7227.FBpp0292805,0.000619,0.000076,0.000000,0.307274,1.000000,0
2,7227.FBpp0070949,0.001061,0.000230,0.000005,0.341165,0.106061,0
3,7227.FBpp0077151,0.007247,0.001527,0.000017,0.376307,0.386932,0
4,7227.FBpp0077011,0.102519,0.038830,0.001129,0.488365,0.197406,0
...,...,...,...,...,...,...,...
11311,7227.FBpp0076790,0.012108,0.005630,0.000039,0.419425,0.422606,1
11312,7227.FBpp0075257,0.003182,0.000908,0.000009,0.362155,0.706349,0
11313,7227.FBpp0291347,0.061511,0.025098,0.000483,0.467915,0.244997,0
11314,7227.FBpp0100048,0.030490,0.005293,0.000770,0.439263,0.111139,0


In [37]:
# Protein_key is copied unchanged into 'Locus', the column name used as the
# join key in df_bio. An earlier variant stripped the first five characters
# of the key:
#   df_ppi['Locus'] = df_ppi.apply(lambda x: x.Protein_key[5::], axis=1)
df_ppi['Locus'] = df_ppi['Protein_key'] 

In [38]:
# Display the PPI frame, now including the 'Locus' column.
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential,Locus
0,7227.FBpp0112228,0.050906,0.019763,0.000150,0.444818,0.208164,0,7227.FBpp0112228
1,7227.FBpp0292805,0.000619,0.000076,0.000000,0.307274,1.000000,0,7227.FBpp0292805
2,7227.FBpp0070949,0.001061,0.000230,0.000005,0.341165,0.106061,0,7227.FBpp0070949
3,7227.FBpp0077151,0.007247,0.001527,0.000017,0.376307,0.386932,0,7227.FBpp0077151
4,7227.FBpp0077011,0.102519,0.038830,0.001129,0.488365,0.197406,0,7227.FBpp0077011
...,...,...,...,...,...,...,...,...
11311,7227.FBpp0076790,0.012108,0.005630,0.000039,0.419425,0.422606,1,7227.FBpp0076790
11312,7227.FBpp0075257,0.003182,0.000908,0.000009,0.362155,0.706349,0,7227.FBpp0075257
11313,7227.FBpp0291347,0.061511,0.025098,0.000483,0.467915,0.244997,0,7227.FBpp0291347
11314,7227.FBpp0100048,0.030490,0.005293,0.000770,0.439263,0.111139,0,7227.FBpp0100048


In [39]:
# Show only the rows whose is_essential flag equals 1.
df_ppi.loc[df_ppi['is_essential'].eq(1)]

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential,Locus
32,7227.FBpp0088658,0.037296,0.014157,0.000199,0.443833,0.236640,1,7227.FBpp0088658
59,7227.FBpp0072164,0.037649,0.014198,0.000227,0.446945,0.198608,1,7227.FBpp0072164
68,7227.FBpp0078336,0.019885,0.008289,0.000087,0.435207,0.313492,1,7227.FBpp0078336
122,7227.FBpp0099825,0.005479,0.002154,0.000004,0.394043,0.356425,1,7227.FBpp0099825
127,7227.FBpp0079648,0.034026,0.012406,0.000198,0.444853,0.223810,1,7227.FBpp0079648
...,...,...,...,...,...,...,...,...
11228,7227.FBpp0083801,0.035440,0.010177,0.000696,0.445666,0.174015,1,7227.FBpp0083801
11235,7227.FBpp0076320,0.042863,0.016233,0.000647,0.458132,0.250567,1,7227.FBpp0076320
11259,7227.FBpp0297103,0.016615,0.008599,0.000197,0.425002,0.499204,1,7227.FBpp0297103
11307,7227.FBpp0111799,0.018825,0.005665,0.000181,0.420431,0.157853,1,7227.FBpp0111799


## Merge

In [40]:
# Merge Datasets
# Inner join on 'Locus': only loci present in both frames are kept.
df = pd.merge(df_bio, df_ppi, on='Locus', how='inner')
df

Unnamed: 0,Locus,Sequence,Sequence_Length,Aromaticity,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,...,Percent_V,Percent_W,Percent_Y,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential
0,7227.FBpp0070005,MQQLLVVALVAVAIASLPQLAAGKKLGKRCVNCTYRTYYTYGDGRS...,441,0.049887,0.188209,0.306122,0.156463,0.070295,0.136054,0.054422,...,0.068027,0.000000,0.036281,7227.FBpp0070005,0.018029,0.006546,2.121380e-04,0.425099,0.210422,0
1,7227.FBpp0070006,MSLSRLNTLMKLTPKPTAIGSLKMQRNLSALPESGPTGSFKTLAVS...,312,0.057692,0.291667,0.214744,0.304487,0.105769,0.019231,0.057692,...,0.067308,0.003205,0.012821,7227.FBpp0070006,0.011843,0.001449,1.621972e-04,0.394320,0.318819,0
2,7227.FBpp0070007,MSMEECFVPRLPVDSEFIQFHDWAIKYEKSHILKSSCQLGTAKCCP...,272,0.106618,0.301471,0.150735,0.257353,0.036765,0.036765,0.062500,...,0.055147,0.014706,0.018382,7227.FBpp0070007,0.004507,0.001906,2.038849e-06,0.377497,0.421176,0
3,7227.FBpp0070025,MNILRRLDRLIAPTVRRSAAISTSLWPHRWMSQSTTISPGSPSSKS...,545,0.075229,0.286239,0.240367,0.271560,0.071560,0.018349,0.044037,...,0.062385,0.009174,0.034862,7227.FBpp0070025,0.015289,0.002002,1.127532e-03,0.402916,0.210042,0
4,7227.FBpp0070031,MDTTPIFQSSFSIRSLLSVDKKEESPISKHNSGSSFSSCSSSSSNS...,260,0.107692,0.234615,0.288462,0.176923,0.061538,0.007692,0.034615,...,0.026923,0.019231,0.053846,7227.FBpp0070031,0.029342,0.013983,5.248949e-05,0.441718,0.361792,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11311,7227.FBpp0306211,MARLISGVRNLFHRYPFVTNSAIYGSLYVGAEYSQQFASKRWLATA...,204,0.147059,0.392157,0.196078,0.264706,0.102941,0.014706,0.029412,...,0.073529,0.029412,0.058824,7227.FBpp0306211,0.000177,0.000006,1.872049e-08,0.290879,0.000000,0
11312,7227.FBpp0306213,MVKILQAYNFARQQTYALNGDILAASLIGNNRIAISSAEQFIEIYD...,1536,0.083984,0.311198,0.236979,0.262370,0.074870,0.019531,0.052734,...,0.065104,0.005208,0.035807,7227.FBpp0306213,0.001237,0.000074,6.935191e-06,0.324301,0.648352,0
11313,7227.FBpp0306214,MSGGDYDSGDYFMRSRKQRDKPSLWDSFQDPPSKKTSGSDADWKKL...,1393,0.117014,0.366116,0.183058,0.289304,0.083274,0.022254,0.055994,...,0.071070,0.023690,0.035176,7227.FBpp0306214,0.014759,0.005067,1.719578e-04,0.421110,0.268595,0
11314,7227.FBpp0306223,MEREIAHSLAGGEERSSDVAPGQVKTFEELRLYRNLLNGLKRNNFV...,1028,0.071012,0.259728,0.250000,0.245136,0.057393,0.007782,0.052529,...,0.057393,0.003891,0.034047,7227.FBpp0306223,0.055590,0.016066,4.047679e-04,0.451477,0.227401,1


In [41]:
# Count how many rows of the merged frame fall into each is_essential class.
df['is_essential'].value_counts()

0    11026
1      290
Name: is_essential, dtype: int64

In [42]:
# Final dataset: write the merged frame to CSV, leaving out the row index.
df.to_csv(path_or_buf='data/base_droso.csv', index=False)