# DEG-Integration-String-Essentiality

Integração dos dados da base de eucariotos e bactérias do DEG database e do STRING com Biopython e Pandas.

Link do banco DEG: http://origin.tubic.org/deg/public/index.php

Link do banco STRING: https://string-db.org/cgi/download


In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# List of organisms (one FASTA file name per organism)

organismos = [
    'danio.fasta',
    'drosophila.fasta',
    'saccharomyces.fasta',
    'elegans.fasta',
    'musculus.fasta',
]

## Gerando Dicionários de Genes e Aminoácidos

In [14]:
# Extraindo sequências com Biopython

from Bio import SeqIO

def lerFastaBio(arquivo):
    """Read a FASTA file with Biopython and return its sequences as a dict.

    Args:
        arquivo: Path of the FASTA file to read.

    Returns:
        A dict mapping each record's ``id`` to its sequence as a plain
        string. If an id occurs more than once, the last record wins.
    """
    # The context manager closes the handle once parsing is finished; the
    # file object was previously left open for the garbage collector.
    with open(arquivo) as handle:
        return {registro.id: str(registro.seq)
                for registro in SeqIO.parse(handle, 'fasta')}

In [15]:
# Nucleotide dict: FASTA record id -> sequence string
dict_nucl = lerFastaBio('data/DEG20_nt.fasta')


In [17]:
# Protein dict: FASTA record id -> sequence string
dict_prot = lerFastaBio('data/DEG20_aa.fasta')

## Pré-Processamento de Dados

In [18]:
# Load the DEG eukaryote table. Column names are supplied explicitly, so no
# line of the file is treated as a header; '1'..'5' are placeholder labels
# for columns that are dropped right below.
csv_eukarioto = pd.read_csv(
    'data/deg_eukaryotes.csv',
    sep=';',
    header=None,
    names=[
        'Organism', 'Reference', 'Pubmed', 'Name', 'Essential_Genes',
        '1', '2', '3', '4', '5',
        'Method', 'Medium', 'Code_Organism', 'Date',
    ],
)

# Keep only the columns used in the rest of the notebook
eukarioto = csv_eukarioto[[
    'Organism', 'Reference', 'Name', 'Essential_Genes',
    'Method', 'Code_Organism', 'Date',
]]

eukarioto

Unnamed: 0,Organism,Reference,Name,Essential_Genes,Method,Code_Organism,Date
0,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11
1,Caenorhabditis elegans,"Kamath RS, et al (2003). Systematic functional...",worm,294,RNA interference,DEG2002,2018-11-11
2,Arabidopsis thaliana,"Meinke D, et al(2008). Identifying essential g...",thale cress,356,Single-gene knockout,DEG2003,2018-11-11
3,Danio rerio,"Amsterdam A, et al (2004). Identification of 3...",zebrafish,315,Insertional mutagenesis,DEG2004,2018-11-11
4,Mus musculus,"Liao BY, Zhang J (2007). Mouse duplicate genes...",mouse,2114,MGI annotation,DEG2005,2018-11-11
5,Homo sapiens,"Liao BY, Zhang J (2008). Null mutations in hum...",Uncertain,118,OMIM annotation,DEG2006,2018-11-11
6,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11
7,Aspergillus fumigatus,"Hu W, et al (2007). Essential gene identificat...",Uncertain,35,Conditional promoter replacement,DEG2008,2018-11-11
8,Schizosaccharomyces pombe 972h-,"Kim, Dong-Uk, et al. ""Analysis of a genome-wid...",Schizosaccharomyces,1260,Single-gene knockout,DEG2009,2018-11-11
9,Homo sapiens,"Georgi, B., Voight, B. F., & Bućan, M. (2013)....",Uncertain,2472,-,DEG2010,2018-11-11


In [22]:
# Load the DEG gene annotation table. Column names are supplied explicitly,
# so no line of the file is treated as a header; '1'..'6' are placeholder
# labels for columns that are dropped right below.
csv_annotations = pd.read_csv(
    'data/deg_annotation_e.csv',
    sep=';',
    header=None,
    names=[
        'Code_Organism', 'Code_Gene_DEG', 'Gene', 'Genbank', '1', '2',
        'Function', 'Organism', 'RefSeq', 'Medium', '3', '4', '5', '6',
    ],
)

# Keep only the columns used in the rest of the notebook
annotations = csv_annotations[[
    'Code_Organism', 'Code_Gene_DEG', 'Gene', 'Function', 'RefSeq',
]]

annotations

Unnamed: 0,Code_Organism,Code_Gene_DEG,Gene,Function,RefSeq
0,DEG2001,DEG20010001,TFC3,Largest of six subunits of the RNA polymerase ...,NC_001133
1,DEG2001,DEG20010002,EFB1,Translation elongation factor 1 beta; stimulat...,NC_001133
2,DEG2001,DEG20010003,MAK16,"Essential nuclear protein, constituent of 66S ...",NC_001133
3,DEG2001,DEG20010004,PRP45,Protein required for pre-mRNA splicing; associ...,NC_001133
4,DEG2001,DEG20010005,POP5,"Subunit of both RNase MRP, which cleaves pre-r...",NC_001133
...,...,...,...,...,...
43289,DEG2033,DEG20331589,CKAP5,cytoskeleton associated protein 5,11p11.2
43290,DEG2033,DEG20331590,BRD8,bromodomain containing 8,5q31.2
43291,DEG2033,DEG20331591,VPS72,vacuolar protein sorting 72 homolog,1q21.3
43292,DEG2033,DEG20331592,HTATSF1,HIV-1 Tat specific factor 1,Xq26.3


In [23]:
# Inner-join the organism table with the gene annotations on the organism code

essential_genes_temp = pd.merge(eukarioto, annotations, how='inner', on='Code_Organism')
essential_genes_temp

Unnamed: 0,Organism,Reference,Name,Essential_Genes,Method,Code_Organism,Date,Code_Gene_DEG,Gene,Function,RefSeq
0,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010001,TFC3,Largest of six subunits of the RNA polymerase ...,NC_001133
1,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010002,EFB1,Translation elongation factor 1 beta; stimulat...,NC_001133
2,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010003,MAK16,"Essential nuclear protein, constituent of 66S ...",NC_001133
3,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010004,PRP45,Protein required for pre-mRNA splicing; associ...,NC_001133
4,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010005,POP5,"Subunit of both RNase MRP, which cleaves pre-r...",NC_001133
...,...,...,...,...,...,...,...,...,...,...,...
43289,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321558,CUL1,cullin 1,7q36.1
43290,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321559,GTF2E1,general transcription factor IIE subunit 1,3q13.33
43291,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321560,OSCP1,organic solute carrier partner 1,1p34.3
43292,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321561,IRF5,interferon regulatory factor 5,7q32.1


In [26]:
# DEG organism codes of the model organisms (names as listed in the eukaryote table)

model_organism = [
    'DEG2001',  # Saccharomyces cerevisiae
    'DEG2002',  # Caenorhabditis elegans
    'DEG2004',  # Danio rerio
    'DEG2005',  # Mus musculus
    'DEG2007',  # Drosophila melanogaster
]

In [27]:
# Dataset restricted to the model organisms.
# .copy() makes the result an independent DataFrame, so the columns assigned
# to it further down are not written to a slice of essential_genes_temp
# (the situation pandas reports as SettingWithCopyWarning).

essential_genes = essential_genes_temp[essential_genes_temp['Code_Organism'].isin(model_organism)].copy()
essential_genes

Unnamed: 0,Organism,Reference,Name,Essential_Genes,Method,Code_Organism,Date,Code_Gene_DEG,Gene,Function,RefSeq
0,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010001,TFC3,Largest of six subunits of the RNA polymerase ...,NC_001133
1,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010002,EFB1,Translation elongation factor 1 beta; stimulat...,NC_001133
2,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010003,MAK16,"Essential nuclear protein, constituent of 66S ...",NC_001133
3,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010004,PRP45,Protein required for pre-mRNA splicing; associ...,NC_001133
4,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010005,POP5,"Subunit of both RNase MRP, which cleaves pre-r...",NC_001133
...,...,...,...,...,...,...,...,...,...,...,...
4641,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070335,Doa,"Darkener_of_apricot_CG33553-PA,_isoform_A",3R
4642,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070336,Lip,-,3R
4643,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070337,noi,noisette_CG2925-PA,3R
4644,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070338,pnt,Ets_at_58AB,3R


In [None]:
# Attach the nucleotide and protein sequence of every gene, looked up by its
# DEG gene code. Iterating over the key column avoids building a Series per
# row as apply(axis=1) does; a code missing from either dict still raises
# KeyError.

essential_genes['Seq_Gene'] = [dict_nucl[codigo] for codigo in essential_genes['Code_Gene_DEG']]
essential_genes['Seq_Prot'] = [dict_prot[codigo] for codigo in essential_genes['Code_Gene_DEG']]

In [18]:
# Final dataset (display)

essential_genes

Unnamed: 0,Organism,Reference,Name,Essential_Genes,Method,Code_Organism,Date,Code_Gene_DEG,Gene,Function,RefSeq,Seq_Gene,Seq_Prot,Locus
0,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010001,TFC3,Largest of six subunits of the RNA polymerase ...,NC_001133,ATGGTACTGACGATTTATCCTGACGAACTCGTACAAATAGTGTCTG...,MVLTIYPDELVQIVSDKIASNKGKITLNQLWDISGKYFDLSDKKVK...,4932.YAL001C
1,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010002,EFB1,Translation elongation factor 1 beta; stimulat...,NC_001133,ATGGCATCCACCGATTTCTCCAAGATTGAAACTTTGAAACAATTAA...,MASTDFSKIETLKQLNASLADKSYIEGTAVSQADVTVFKAFQSAYP...,4932.YAL003W
2,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010003,MAK16,"Essential nuclear protein, constituent of 66S ...",NC_001133,ATGTCCGACGAAATTGTTTGGCAAGTGATTAATCAAAGTTTCTGCT...,MSDEIVWQVINQSFCSHRIKAPNGQNFCRNEYNVTGLCTRQSCPLA...,4932.YAL025C
3,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010004,PRP45,Protein required for pre-mRNA splicing; associ...,NC_001133,ATGTTTAGTAACAGACTACCACCTCCAAAACATTCTCAAGGACGAG...,MFSNRLPPPKHSQGRVSTALSSDRVEPAILTDQIAKNVKLDDFIPK...,4932.YAL032C
4,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010005,POP5,"Subunit of both RNase MRP, which cleaves pre-r...",NC_001133,ATGGTACGTTTAAAAAGTAGATATATCCTTTTTGAAATTATATTCC...,MVRLKSRYILFEIIFPPTDTNVEESVSKADILLSHHRASPADVSIK...,4932.YAL033W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070335,Doa,"Darkener_of_apricot_CG33553-PA,_isoform_A",3R,ATGTCCAACGAACTTAGCGAACTGATCGCCCTTGGCTGTCCAGACC...,MSNELSELIALGCPDLAAQKPQGGSTQVELILNTERRLSIKQMTAT...,7227.FBpp0289029
4642,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070336,Lip,-,3R,ATGTCACATGATAAGAAGATGTTGGATCGCGAGGCAGTACGCTCAG...,MSHDKKMLDREAVRSVIQQWNANRLDLFALSEPDENLLFHGVMRFY...,7227.FBpp0289752
4643,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070337,noi,noisette_CG2925-PA,3R,ATGGAGACGCTCCTGGAGCAACAGCGGCGCCTACACGAGGAGCGCG...,METLLEQQRRLHEERERLVKLMVDEHATKKPGEKERIHSEHRLKYL...,7227.FBpp0078400
4644,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",fruit fly,339,P-element insertion,DEG2007,2018-11-11,DEG20070338,pnt,Ets_at_58AB,3R,ATGGAATTGGCGATTTGTAAAACAGATCTGTCTGCCACGAAATTTA...,MELAICKTDLSATKFMLPPALPSSAAIGSSSAVASTASHFLDKAAH...,7227.FBpp0088658


## Web Scraping

In [28]:
# Importando as libs
from urllib.request import urlopen

# Importando a BeautifulSoup
from bs4 import BeautifulSoup

# Biblioteca de Regex
import re

# Base URL of the DEG eukaryote gene information pages
url = 'http://origin.tubic.org/deg/public/index.php/information/eukaryotes/'

In [31]:
# Função para realizar o scraping do locus id

def webScrapingLocus(url, chave):
    """Scrape the locus identifier of one gene from its DEG information page.

    Args:
        url: Base URL of the gene information pages.
        chave: Gene code; ``chave + '.html'`` is appended to ``url``.

    Returns:
        Every identifier matched in the page's ``<td>`` cells, concatenated
        into a single string (an empty string when nothing matches).
    """
    url = url + chave + '.html'

    # Close the HTTP response as soon as the page has been parsed
    with urlopen(url) as html:
        bs = BeautifulSoup(html, "html.parser")

    # Flatten all table cells into one string to search
    texto = str(bs.find_all('td')).strip('[]')

    # Raw string: "\." in a normal literal is an invalid escape sequence.
    # {4,} rather than {4}: a numeric prefix longer than four digits (the
    # NCBI taxon id of Mus musculus is 10090) is captured whole instead of
    # losing its leading digit.
    # NOTE(review): confirm against a mouse gene page.
    locus = re.findall(r'[0-9]{4,}\.[a-zA-Z]+[0-9]*[a-zA-Z]*', texto)

    return "".join(locus)

In [None]:
import time

# perf_counter() is a monotonic clock intended for measuring elapsed time,
# so the result is not affected by system clock adjustments
seconds_ini = time.perf_counter()

# Run the scraper on the model-organism dataset (one request per row)
essential_genes['Locus'] = essential_genes.apply(lambda x: webScrapingLocus(url, x.Code_Gene_DEG), axis=1)

seconds_fini = time.perf_counter()

# The printed value is a duration, not a timestamp, so label it as such
print("Elapsed seconds =", seconds_fini - seconds_ini)

In [20]:
# Save the final dataset in Parquet format
essential_genes.to_parquet('essential_genes.parquet', engine='fastparquet')

##  Trabalhando com Parquet

In [37]:
# Conversion to Parquet

import pandas as pd

# Saccharomyces cerevisiae (STRING file)

string_df_4932 = pd.read_csv("string_df_4932.txt", sep=" ")

string_df_4932.to_parquet('string_df_4932.parquet', engine='fastparquet')

## Demonstração com Spark SQL

Através do uso do Spark SQL, é possível juntar a base de Essencialidade com as bases de PPI do STRING através da chave Locus, extraída via Web Scraping, e de qualquer chave de proteína da base STRING.

Como a base STRING é grande, o teste abaixo foi realizado com a base da Saccharomyces cerevisiae.

In [None]:
from pyspark.sql import SparkSession

# Get the running Spark session or create one named 'deg-blast'
spark = SparkSession.builder.appName('deg-blast').getOrCreate()

# Essentiality dataset saved to Parquet earlier in this notebook
dfEssentialDEG = spark.read.parquet("essential_genes.parquet")

# STRING table for Saccharomyces cerevisiae saved to Parquet earlier in this notebook
dfPPI = spark.read.parquet("string_df_4932.parquet")

In [41]:
# Display the column names and types of the essentiality DataFrame
dfEssentialDEG

DataFrame[index: bigint, Organism: string, Reference: string, Name: string, Essential_Genes: bigint, Method: string, Code_Organism: string, Date: string, Code_Gene_DEG: string, Gene: string, Function: string, RefSeq: string, Seq_Gene: string, Seq_Prot: string, Locus: string]

In [42]:
# Display the column names and types of the STRING DataFrame
dfPPI

DataFrame[protein1: string, protein2: string, neighborhood: bigint, fusion: bigint, cooccurence: bigint, coexpression: bigint, experimental: bigint, database: bigint, textmining: bigint, combined_score: bigint]

In [43]:
# Join the two datasets (Locus == protein1) and list the proteins
# linked to protein 4932.YAL025C

dfEssentialDEG.join(dfPPI, dfEssentialDEG.Locus == dfPPI.protein1, 
                    'inner').select(dfEssentialDEG.Gene, dfEssentialDEG.Locus,dfPPI.protein1, 
                                    dfPPI.protein2).where(dfEssentialDEG.Locus == '4932.YAL025C').collect()   

                                                                                

[Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YHR147C'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YPR190C'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YGL076C'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YJL177W'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YOR091W'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YNL137C'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YDR324C'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YOR361C'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YDL150W'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932.YGL105W'),
 Row(Gene='MAK16', Locus='4932.YAL025C', protein1='4932.YAL025C', protein2='4932

In [44]:
# Join the two datasets (Locus == protein1) and list the proteins
# linked to the protein of gene TFC3

dfEssentialDEG.join(dfPPI, dfEssentialDEG.Locus == dfPPI.protein1, 
                    'inner').select(dfEssentialDEG.Gene, dfEssentialDEG.Locus,dfPPI.protein1, 
                                    dfPPI.protein2).where(dfEssentialDEG.Gene == 'TFC3').collect()   

                                                                                

[Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YPR101W'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YPR114W'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YLR211C'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YGL248W'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YOL139C'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YLR050C'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YKL072W'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YOR373W'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YBR154C'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YPL033C'),
 Row(Gene='TFC3', Locus='4932.YAL001C', protein1='4932.YAL001C', protein2='4932.YLR291C'),