# Generate Sequence Protein Features

Reference: [Visualizing and Analyzing Proteins in Python](https://towardsdatascience.com/visualizing-and-analyzing-proteins-in-python-bd99521ccd)

In [70]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [71]:
def lerFastaBio(arquivo):
    """Read a FASTA file into a dictionary of record id -> sequence string.

    Parameters
    ----------
    arquivo : str or path-like
        Location of the FASTA file; it is handed to the built-in ``open``.

    Returns
    -------
    dict
        ``{record.id: str(record.seq)}`` for every record parsed by Biopython.
        If two records share an id, the later one replaces the earlier one.
    """
    # SeqIO.parse yields records lazily, so the whole file is consumed inside
    # the ``with`` block; the handle is then closed instead of being leaked.
    with open(arquivo) as handle:
        return {record.id: str(record.seq) for record in SeqIO.parse(handle, 'fasta')}

In [72]:
# One-letter codes used to name the per-amino-acid feature columns.
aminoacids = list('ACDEFGHIKLMNPQRSTVWY')

In [73]:
# Relative path of the FASTA proteome to load.
file = 'proteomas/cerevisiae.fasta'

# Dictionary of record id -> sequence string, as returned by lerFastaBio.
proteoma = lerFastaBio(file)

In [74]:
# One row per FASTA record: the dictionary key becomes 'Locus' and the
# sequence string becomes 'Sequence'.
df_bio = pd.DataFrame(list(proteoma.items()), columns=['Locus', 'Sequence'])
df_bio

Unnamed: 0,Locus,Sequence
0,YPL071C,MSSRFARSNGNPNHIRKRNHSPDPIGIDNYKRKRLIIDLENLSLND...
1,YLL050C,MSRSGVAVADESLTAFNDLKLGKKYKFILFGLNDAKTEIVVKETST...
2,YMR172W,MSGMGIAILCIVRTKIYRITISFDYSTLMSPFFLFLMMPTTLKDGY...
3,YOR185C,MSAPAQNNAEVPTFKLVLVGDGGTGKTTFVKRHLTGEFEKKYIATI...
4,YLL032C,MDNFKIYSTVITTAFLQVPHLYTTNRLWKPIEAPFLVEFLQKRISS...
...,...,...
6595,YBR021W,MPDNLSLHLSGSSKRLNSRQLMESSNETFAPNNVDLEKEYKSSQSN...
6596,YDR320W-B,MRVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR
6597,YBR232C,MFILAEVSDFILDIVAPLCPTISEACLTKHSIRKCTSEGTLSGESW...
6598,YDL245C,MASEQSSPEINADNLNSSAADVHVQPPGEKEWSDGFYDKEVINGNT...


## Protein Features

In [75]:
# Functions
    

def secondary_structure_fraction(protein):
    '''
    Secondary-structure fractions of a protein sequence.

    Wraps Biopython's ``ProteinAnalysis.secondary_structure_fraction`` and
    returns the first three values of its result as a tuple, which this
    notebook labels (Helix, Turn, Sheet), i.e. alpha helices, turns (where
    the residues change direction) and beta sheets.

    Residue groupings as recorded by the original author:
        helix: V, I, Y, F, W, L
        turn:  N, P, G, S
        sheet: E, M, A, L
    NOTE(review): the residues counted for each class are decided by the
    installed Biopython release - confirm against its documentation.
    '''
    fractions = ProteinAnalysis(protein).secondary_structure_fraction()

    # Index explicitly so the result is always a plain 3-tuple.
    return tuple(fractions[position] for position in range(3))


def get_amino_acids_percent(protein, param):
    """Return the relative content of a single amino acid in a sequence.

    ``protein`` is the sequence passed to ``ProteinAnalysis``; ``param`` is
    the key looked up in the mapping returned by its
    ``get_amino_acids_percent()`` method (this notebook passes one-letter
    amino-acid codes).

    The values recorded in this notebook's outputs are fractions between
    0 and 1 rather than 0-100 percentages.
    """
    content_by_residue = ProteinAnalysis(protein).get_amino_acids_percent()
    return content_by_residue[param]


def count_amino_acids(protein, param):
    """Return the count stored for a single amino acid in a sequence.

    ``ProteinAnalysis.count_amino_acids()`` produces a mapping for the whole
    sequence; only the entry stored under ``param`` is returned, so the
    result is one value rather than the full mapping.
    """
    counts_by_residue = ProteinAnalysis(protein).count_amino_acids()
    return counts_by_residue[param]


In [76]:
# Generate Features

# Sequence length: len() is mapped over the column directly instead of going
# through a row-wise DataFrame.apply.
df_bio['Sequence_Length'] = df_bio['Sequence'].map(len)

# secondary_structure_fraction returns one 3-tuple per sequence; zip(*...)
# transposes those tuples into three column-length sequences.
df_bio['Sec_Struct_Helix'], df_bio['Sec_Struct_Turn'], df_bio['Sec_Struct_Sheet'] = zip(
    *df_bio['Sequence'].map(secondary_structure_fraction))

# Analyse every sequence once and reuse the resulting composition mapping for
# all amino-acid columns. The previous loop rebuilt a ProteinAnalysis object
# for every (sequence, amino acid) pair.
aa_composition = df_bio['Sequence'].map(
    lambda sequence: ProteinAnalysis(sequence).get_amino_acids_percent())

for am in aminoacids:
    # ``am=am`` binds the current code so the lambda does not rely on the
    # loop variable's later value.
    df_bio['Percent_' + am] = aa_composition.map(lambda content, am=am: content[am])

# Optional absolute-count features (disabled in the original notebook):
#for am in aminoacids:
    #df_bio['Count_'+am] = df_bio.apply(lambda x: count_amino_acids(x.Sequence, am), axis=1)

In [77]:
# Display the table with the length, secondary-structure and Percent_* columns.
df_bio

Unnamed: 0,Locus,Sequence,Sequence_Length,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,Percent_E,...,Percent_M,Percent_N,Percent_P,Percent_Q,Percent_R,Percent_S,Percent_T,Percent_V,Percent_W,Percent_Y
0,YPL071C,MSSRFARSNGNPNHIRKRNHSPDPIGIDNYKRKRLIIDLENLSLND...,156,0.288462,0.211538,0.185897,0.044872,0.006410,0.128205,0.044872,...,0.032051,0.089744,0.025641,0.012821,0.064103,0.057692,0.038462,0.038462,0.032051,0.038462
1,YLL050C,MSRSGVAVADESLTAFNDLKLGKKYKFILFGLNDAKTEIVVKETST...,143,0.307692,0.244755,0.237762,0.076923,0.006993,0.083916,0.069930,...,0.013986,0.041958,0.027972,0.006993,0.048951,0.111888,0.055944,0.083916,0.006993,0.048951
2,YMR172W,MSGMGIAILCIVRTKIYRITISFDYSTLMSPFFLFLMMPTTLKDGY...,719,0.219750,0.326843,0.214186,0.055633,0.002782,0.058414,0.047288,...,0.026426,0.118220,0.043115,0.050070,0.050070,0.112656,0.083449,0.030598,0.004172,0.018081
3,YOR185C,MSAPAQNNAEVPTFKLVLVGDGGTGKTTFVKRHLTGEFEKKYIATI...,220,0.318182,0.195455,0.236364,0.077273,0.013636,0.063636,0.063636,...,0.018182,0.054545,0.054545,0.050000,0.036364,0.027273,0.059091,0.081818,0.013636,0.040909
4,YLL032C,MDNFKIYSTVITTAFLQVPHLYTTNRLWKPIEAPFLVEFLQKRISS...,825,0.335758,0.244848,0.225455,0.042424,0.010909,0.042424,0.066667,...,0.025455,0.084848,0.040000,0.046061,0.035152,0.081212,0.059394,0.043636,0.002424,0.043636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,YBR021W,MPDNLSLHLSGSSKRLNSRQLMESSNETFAPNNVDLEKEYKSSQSN...,633,0.391785,0.246445,0.227488,0.080569,0.025276,0.034755,0.037915,...,0.023697,0.045814,0.041074,0.023697,0.033175,0.101106,0.039494,0.069510,0.034755,0.052133
6596,YDR320W-B,MRVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR,45,0.400000,0.111111,0.311111,0.044444,0.066667,0.000000,0.000000,...,0.066667,0.022222,0.022222,0.022222,0.066667,0.066667,0.022222,0.088889,0.000000,0.022222
6597,YBR232C,MFILAEVSDFILDIVAPLCPTISEACLTKHSIRKCTSEGTLSGESW...,119,0.386555,0.252101,0.268908,0.067227,0.033613,0.050420,0.067227,...,0.008403,0.008403,0.025210,0.000000,0.033613,0.176471,0.067227,0.100840,0.016807,0.000000
6598,YDL245C,MASEQSSPEINADNLNSSAADVHVQPPGEKEWSDGFYDKEVINGNT...,567,0.391534,0.248677,0.238095,0.068783,0.024691,0.024691,0.049383,...,0.028219,0.033510,0.042328,0.024691,0.029982,0.081129,0.045855,0.081129,0.022928,0.045855


## PPI Dataset 

In [78]:
# Load the per-protein table (centrality measures, clustering and the
# is_essential label, per the columns displayed below).
df_ppi = pd.read_csv('data/proteins_features_cerevisiae.csv')
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential
0,4932.YKL128C,0.002300,0.000629,0.000001,0.395334,0.197802,0
1,4932.YML066C,0.004764,0.000823,0.000015,0.401571,0.275862,0
2,4932.YGR086C,0.031050,0.007486,0.000208,0.488516,0.146178,0
3,4932.YCR089W,0.004436,0.001015,0.000023,0.452066,0.202279,0
4,4932.YKL078W,0.081978,0.028308,0.000319,0.504730,0.297116,1
...,...,...,...,...,...,...,...
6083,4932.YNR054C,0.022178,0.007235,0.000021,0.434904,0.361747,1
6084,4932.YGR142W,0.007229,0.001963,0.000012,0.468247,0.207188,0
6085,4932.YBR165W,0.003121,0.001269,0.000006,0.445576,0.380117,0
6086,4932.YDL112W,0.032035,0.011030,0.000076,0.470857,0.353159,0


In [85]:
# Protein_key values look like '4932.YKL128C'. Dropping the first five
# characters (the '4932.' prefix - presumably a taxon id, TODO confirm) leaves
# the locus name that df_bio uses, so both tables can be merged on 'Locus'.
# The vectorised .str slice replaces a row-wise DataFrame.apply.
df_ppi['Locus'] = df_ppi['Protein_key'].str[5:]
# Alternative kept from the original notebook: use the key unchanged.
#df_ppi['Locus'] = df_ppi['Protein_key']

In [86]:
# Display df_ppi again to check the derived 'Locus' column.
df_ppi

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential,Locus
0,4932.YKL128C,0.002300,0.000629,0.000001,0.395334,0.197802,0,YKL128C
1,4932.YML066C,0.004764,0.000823,0.000015,0.401571,0.275862,0,YML066C
2,4932.YGR086C,0.031050,0.007486,0.000208,0.488516,0.146178,0,YGR086C
3,4932.YCR089W,0.004436,0.001015,0.000023,0.452066,0.202279,0,YCR089W
4,4932.YKL078W,0.081978,0.028308,0.000319,0.504730,0.297116,1,YKL078W
...,...,...,...,...,...,...,...,...
6083,4932.YNR054C,0.022178,0.007235,0.000021,0.434904,0.361747,1,YNR054C
6084,4932.YGR142W,0.007229,0.001963,0.000012,0.468247,0.207188,0,YGR142W
6085,4932.YBR165W,0.003121,0.001269,0.000006,0.445576,0.380117,0,YBR165W
6086,4932.YDL112W,0.032035,0.011030,0.000076,0.470857,0.353159,0,YDL112W


In [87]:
# Rows whose is_essential flag equals 1.
df_ppi.loc[df_ppi['is_essential'].eq(1)]

Unnamed: 0,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential,Locus
4,4932.YKL078W,0.081978,0.028308,0.000319,0.504730,0.297116,1,YKL078W
17,4932.YGL116W,0.100214,0.029536,0.000572,0.504771,0.247926,1,YGL116W
27,4932.YNL263C,0.008707,0.001341,0.000018,0.437657,0.272859,1,YNL263C
35,4932.YBL030C,0.067192,0.019700,0.000489,0.491993,0.213445,1,YBL030C
39,4932.YMR211W,0.004436,0.001568,0.000001,0.404482,0.467236,1,YMR211W
...,...,...,...,...,...,...,...,...
6067,4932.YGL120C,0.102185,0.034531,0.000570,0.509166,0.281001,1,YGL120C
6076,4932.YLR153C,0.034993,0.008481,0.000478,0.469114,0.183542,1,YLR153C
6077,4932.YDR376W,0.007886,0.001370,0.000012,0.460659,0.263298,1,YDR376W
6081,4932.YLL035W,0.007393,0.002633,0.000002,0.428803,0.390909,1,YLL035W


## Merge

In [88]:
# Combine sequence features with the PPI table; the inner join keeps only
# loci that appear in both frames.
df = pd.merge(df_bio, df_ppi, on='Locus', how='inner')
df

Unnamed: 0,Locus,Sequence,Sequence_Length,Sec_Struct_Helix,Sec_Struct_Turn,Sec_Struct_Sheet,Percent_A,Percent_C,Percent_D,Percent_E,...,Percent_V,Percent_W,Percent_Y,Protein_key,DegreeCentrality,EigenvectorCentrality,BetweennessCentrality,ClosenessCentrality,Clustering,is_essential
0,YPL071C,MSSRFARSNGNPNHIRKRNHSPDPIGIDNYKRKRLIIDLENLSLND...,156,0.288462,0.211538,0.185897,0.044872,0.006410,0.128205,0.044872,...,0.038462,0.032051,0.038462,4932.YPL071C,0.000986,0.000512,0.000003,0.426787,0.266667,0
1,YLL050C,MSRSGVAVADESLTAFNDLKLGKKYKFILFGLNDAKTEIVVKETST...,143,0.307692,0.244755,0.237762,0.076923,0.006993,0.083916,0.069930,...,0.083916,0.006993,0.048951,4932.YLL050C,0.053392,0.017135,0.000200,0.497262,0.302355,1
2,YMR172W,MSGMGIAILCIVRTKIYRITISFDYSTLMSPFFLFLMMPTTLKDGY...,719,0.219750,0.326843,0.214186,0.055633,0.002782,0.058414,0.047288,...,0.030598,0.004172,0.018081,4932.YMR172W,0.007557,0.002314,0.000003,0.438952,0.289855,0
3,YOR185C,MSAPAQNNAEVPTFKLVLVGDGGTGKTTFVKRHLTGEFEKKYIATI...,220,0.318182,0.195455,0.236364,0.077273,0.013636,0.063636,0.063636,...,0.081818,0.013636,0.040909,4932.YOR185C,0.046164,0.017683,0.000104,0.491754,0.307905,0
4,YLL032C,MDNFKIYSTVITTAFLQVPHLYTTNRLWKPIEAPFLVEFLQKRISS...,825,0.335758,0.244848,0.225455,0.042424,0.010909,0.042424,0.066667,...,0.043636,0.002424,0.043636,4932.YLL032C,0.021028,0.006141,0.000127,0.473128,0.236713,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5990,YKL119C,MFEIKLNDRITEFLRKFKNSAKSNEGIDEDIDLFLKRHAIPMQSLL...,215,0.395349,0.153488,0.260465,0.027907,0.009302,0.046512,0.093023,...,0.102326,0.009302,0.037209,4932.YKL119C,0.007721,0.001813,0.000009,0.461148,0.238668,0
5991,YGR014W,MQFPFACLLSTLVISGSLARASPFDFIFGNGTQQAQSQSESQGQVS...,1306,0.222818,0.411179,0.178407,0.078867,0.001531,0.033691,0.033691,...,0.068913,0.004594,0.025268,4932.YGR014W,0.009364,0.002101,0.000028,0.460519,0.244361,0
5992,YBR021W,MPDNLSLHLSGSSKRLNSRQLMESSNETFAPNNVDLEKEYKSSQSN...,633,0.391785,0.246445,0.227488,0.080569,0.025276,0.034755,0.037915,...,0.069510,0.034755,0.052133,4932.YBR021W,0.007229,0.002346,0.000009,0.448334,0.238901,0
5993,YDL245C,MASEQSSPEINADNLNSSAADVHVQPPGEKEWSDGFYDKEVINGNT...,567,0.391534,0.248677,0.238095,0.068783,0.024691,0.024691,0.049383,...,0.081129,0.022928,0.045855,4932.YDL245C,0.004600,0.000906,0.000002,0.400540,0.224868,0


In [89]:
# Number of rows per is_essential value in the merged dataset.
df['is_essential'].value_counts()

0    4895
1    1100
Name: is_essential, dtype: int64

In [90]:
# Final dataset: write the merged table to CSV without the row index.

df.to_csv('data/base_cerevisiae.csv', index=False)