# DEG-Data-Essentiality

Integração dos dados da base de eucariotos e bactérias do DEG database com Biopython e Pandas.

Link do banco: http://origin.tubic.org/deg/public/index.php


## Gerando Dicionários de Genes e Aminoácidos

In [29]:
# Extraindo sequências com Biopython

from Bio import SeqIO

def lerFastaBio(arquivo):
    """Read a FASTA file into a dictionary of plain-string sequences.

    Args:
        arquivo: Path of the FASTA file to read.

    Returns:
        dict mapping each record's ``id`` to its sequence converted to
        ``str``. If an id occurs more than once, the last record wins.
    """
    # The context manager guarantees the file is closed once parsing is
    # finished (or fails); SeqIO.parse is lazy, so it must be fully consumed
    # inside the ``with`` body.
    with open(arquivo) as handle:
        return {
            registro.id: str(registro.seq)
            for registro in SeqIO.parse(handle, 'fasta')
        }

In [None]:
# Nucleotide dictionary: record id -> sequence, built by lerFastaBio
dict_nucl = lerFastaBio("eucarioto/DEG20_nt.fasta")
dict_nucl

In [None]:
# Protein dictionary: record id -> sequence, built by lerFastaBio
dict_prot = lerFastaBio("eucarioto/DEG20_aa.fasta")
dict_prot

In [34]:
def sequence(chave, dicionario):
    """Look up the value stored under ``chave`` in ``dicionario``.

    Args:
        chave: Key to look up.
        dicionario: Mapping to read from.

    Returns:
        The value associated with ``chave``.

    Raises:
        KeyError: If ``chave`` is not present in ``dicionario``.
    """
    encontrada = dicionario[chave]
    return encontrada

## Leitura dos Arquivos de Eucariotos 

In [35]:
import pandas as pd

In [36]:
# Load the eukaryote organism table. The file is ';'-separated and the
# column names are supplied explicitly; '1'-'5' and 'Medium' are not kept
# in the selection below.
csv_eukarioto = pd.read_csv(
    'eucarioto/deg_eukaryotes.csv',
    sep=';',
    names=[
        'Organism', 'Reference', 'Pubmed', 'Name', 'Essential_Genes',
        '1', '2', '3', '4', '5',
        'Method', 'Medium', 'Code_Organism', 'Date',
    ],
)

# Keep only the columns used in the rest of the notebook.
eukarioto = csv_eukarioto.loc[:, [
    'Organism', 'Reference', 'Pubmed', 'Name', 'Essential_Genes',
    'Method', 'Code_Organism', 'Date',
]]

eukarioto

Unnamed: 0,Organism,Reference,Pubmed,Name,Essential_Genes,Method,Code_Organism,Date
0,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11
1,Caenorhabditis elegans,"Kamath RS, et al (2003). Systematic functional...",12529635,worm,294,RNA interference,DEG2002,2018-11-11
2,Arabidopsis thaliana,"Meinke D, et al(2008). Identifying essential g...",18684657,thale cress,356,Single-gene knockout,DEG2003,2018-11-11
3,Danio rerio,"Amsterdam A, et al (2004). Identification of 3...",15256591,zebrafish,315,Insertional mutagenesis,DEG2004,2018-11-11
4,Mus musculus,"Liao BY, Zhang J (2007). Mouse duplicate genes...",17559966,mouse,2114,MGI annotation,DEG2005,2018-11-11
5,Homo sapiens,"Liao BY, Zhang J (2008). Null mutations in hum...",18458337,Uncertain,118,OMIM annotation,DEG2006,2018-11-11
6,Drosophila melanogaster,"Spradling AC, et al (1999). The Berkeley Droso...",10471706,fruit fly,339,P-element insertion,DEG2007,2018-11-11
7,Aspergillus fumigatus,"Hu W, et al (2007). Essential gene identificat...",17352532,Uncertain,35,Conditional promoter replacement,DEG2008,2018-11-11
8,Schizosaccharomyces pombe 972h-,"Kim, Dong-Uk, et al. ""Analysis of a genome-wid...",20473289,Schizosaccharomyces,1260,Single-gene knockout,DEG2009,2018-11-11
9,Homo sapiens,"Georgi, B., Voight, B. F., & Bućan, M. (2013)....",23675308,Uncertain,2472,-,DEG2010,2018-11-11


In [37]:
# Load the per-gene annotation table for eukaryotes. The file is
# ';'-separated and the column names are supplied explicitly; '1'-'6' and
# 'Medium' are not kept in the selection below.
csv_annotations = pd.read_csv(
    'eucarioto/deg_annotation_e.csv',
    sep=';',
    names=[
        'Code_Organism', 'Code_Gene_DEG', 'Gene', 'Genbank',
        '1', '2',
        'Function', 'Organism', 'RefSeq', 'Medium',
        '3', '4', '5', '6',
    ],
)

# Keep only the columns used in the rest of the notebook.
annotations = csv_annotations.loc[:, [
    'Code_Organism', 'Code_Gene_DEG', 'Gene', 'Genbank',
    'Function', 'Organism', 'RefSeq',
]]

annotations

Unnamed: 0,Code_Organism,Code_Gene_DEG,Gene,Genbank,Function,Organism,RefSeq
0,DEG2001,DEG20010001,TFC3,GI:6319317,Largest of six subunits of the RNA polymerase ...,Saccharomyces cerevisiae,NC_001133
1,DEG2001,DEG20010002,EFB1,GI:6319315,Translation elongation factor 1 beta; stimulat...,Saccharomyces cerevisiae,NC_001133
2,DEG2001,DEG20010003,MAK16,GI:6319294,"Essential nuclear protein, constituent of 66S ...",Saccharomyces cerevisiae,NC_001133
3,DEG2001,DEG20010004,PRP45,GI:6319287,Protein required for pre-mRNA splicing; associ...,Saccharomyces cerevisiae,NC_001133
4,DEG2001,DEG20010005,POP5,GI:6319286,"Subunit of both RNase MRP, which cleaves pre-r...",Saccharomyces cerevisiae,NC_001133
...,...,...,...,...,...,...,...
43289,DEG2033,DEG20331589,CKAP5,HGNC:28959,cytoskeleton associated protein 5,Homo sapiens,11p11.2
43290,DEG2033,DEG20331590,BRD8,HGNC:19874,bromodomain containing 8,Homo sapiens,5q31.2
43291,DEG2033,DEG20331591,VPS72,HGNC:11644,vacuolar protein sorting 72 homolog,Homo sapiens,1q21.3
43292,DEG2033,DEG20331592,HTATSF1,HGNC:5276,HIV-1 Tat specific factor 1,Homo sapiens,Xq26.3


In [38]:
# Join every organism row with its annotated genes on the shared organism
# code (pandas' default inner join; column names present on both sides get
# the _x/_y suffixes).
essential_genes = pd.merge(eukarioto, annotations, on='Code_Organism')
essential_genes

Unnamed: 0,Organism_x,Reference,Pubmed,Name,Essential_Genes,Method,Code_Organism,Date,Code_Gene_DEG,Gene,Genbank,Function,Organism_y,RefSeq
0,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010001,TFC3,GI:6319317,Largest of six subunits of the RNA polymerase ...,Saccharomyces cerevisiae,NC_001133
1,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010002,EFB1,GI:6319315,Translation elongation factor 1 beta; stimulat...,Saccharomyces cerevisiae,NC_001133
2,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010003,MAK16,GI:6319294,"Essential nuclear protein, constituent of 66S ...",Saccharomyces cerevisiae,NC_001133
3,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010004,PRP45,GI:6319287,Protein required for pre-mRNA splicing; associ...,Saccharomyces cerevisiae,NC_001133
4,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010005,POP5,GI:6319286,"Subunit of both RNase MRP, which cleaves pre-r...",Saccharomyces cerevisiae,NC_001133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43289,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321558,CUL1,HGNC:2551,cullin 1,Homo sapiens,7q36.1
43290,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321559,GTF2E1,HGNC:4650,general transcription factor IIE subunit 1,Homo sapiens,3q13.33
43291,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321560,OSCP1,HGNC:29971,organic solute carrier partner 1,Homo sapiens,1p34.3
43292,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321561,IRF5,HGNC:6120,interferon regulatory factor 5,Homo sapiens,7q32.1


In [39]:
# Attach the nucleotide and protein sequence of every gene.
#
# The lookup only needs the DEG gene code, so map over that single column
# instead of building a Series for every row with apply(axis=1); this also
# behaves correctly on an empty frame. sequence() still raises KeyError for
# a code that is missing from the dictionary.
essential_genes['Seq_Gene'] = essential_genes['Code_Gene_DEG'].map(
    lambda codigo: sequence(codigo, dict_nucl)
)
essential_genes['Seq_Prot'] = essential_genes['Code_Gene_DEG'].map(
    lambda codigo: sequence(codigo, dict_prot)
)

In [40]:
essential_genes

Unnamed: 0,Organism_x,Reference,Pubmed,Name,Essential_Genes,Method,Code_Organism,Date,Code_Gene_DEG,Gene,Genbank,Function,Organism_y,RefSeq,Seq_Gene,Seq_Prot
0,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010001,TFC3,GI:6319317,Largest of six subunits of the RNA polymerase ...,Saccharomyces cerevisiae,NC_001133,ATGGTACTGACGATTTATCCTGACGAACTCGTACAAATAGTGTCTG...,MVLTIYPDELVQIVSDKIASNKGKITLNQLWDISGKYFDLSDKKVK...
1,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010002,EFB1,GI:6319315,Translation elongation factor 1 beta; stimulat...,Saccharomyces cerevisiae,NC_001133,ATGGCATCCACCGATTTCTCCAAGATTGAAACTTTGAAACAATTAA...,MASTDFSKIETLKQLNASLADKSYIEGTAVSQADVTVFKAFQSAYP...
2,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010003,MAK16,GI:6319294,"Essential nuclear protein, constituent of 66S ...",Saccharomyces cerevisiae,NC_001133,ATGTCCGACGAAATTGTTTGGCAAGTGATTAATCAAAGTTTCTGCT...,MSDEIVWQVINQSFCSHRIKAPNGQNFCRNEYNVTGLCTRQSCPLA...
3,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010004,PRP45,GI:6319287,Protein required for pre-mRNA splicing; associ...,Saccharomyces cerevisiae,NC_001133,ATGTTTAGTAACAGACTACCACCTCCAAAACATTCTCAAGGACGAG...,MFSNRLPPPKHSQGRVSTALSSDRVEPAILTDQIAKNVKLDDFIPK...
4,Saccharomyces cerevisiae,"Giaever G, et al (2002). Functional profiling ...",12140549,yeast,1110,Single-gene knockout,DEG2001,2018-11-11,DEG20010005,POP5,GI:6319286,"Subunit of both RNase MRP, which cleaves pre-r...",Saccharomyces cerevisiae,NC_001133,ATGGTACGTTTAAAAAGTAGATATATCCTTTTTGAAATTATATTCC...,MVRLKSRYILFEIIFPPTDTNVEESVSKADILLSHHRASPADVSIK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43289,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321558,CUL1,HGNC:2551,cullin 1,Homo sapiens,7q36.1,ATGTCGTCAACCCGGAGCCAGAACCCCCACGGCCTGAAGCAGATTG...,MSSTRSQNPHGLKQIGLDQIWDDLRAGIQQVYTRQSMAKSRYMELY...
43290,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321559,GTF2E1,HGNC:4650,general transcription factor IIE subunit 1,Homo sapiens,3q13.33,ATGGCAGACCCAGATGTCCTCACTGAAGTTCCAGCAGCATTGAAGC...,MALETVPKDLRHLRACLLCSLVKTIDQFEYDGCDNCDAYLQMKGNR...
43291,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321560,OSCP1,HGNC:29971,organic solute carrier partner 1,Homo sapiens,1p34.3,ATGTCGGTGCGGACGCTACCGCTGCTCTTCTTGAACTTGGGCGGGG...,MSVRTLPLLFLNLGGEMLYILDQRLRAQNIPGDKARKDEWTEVDRK...
43292,Homo sapiens,"Mair, B., Tomic, J., Masud, S. N., Tonge, P., ...",30970261,MEF,1562,CRISPR,DEG2032,2020-08-31,DEG20321561,IRF5,HGNC:6120,interferon regulatory factor 5,Homo sapiens,7q32.1,ATGGAGGGCGACGGGGTGCCATGGGGCAGCGAGCCCGTCTCGGGTC...,MNQSIPVAPTPPRRVRLKPWLVAQVNSCQYPGLQWVNGEKKLFCIP...


In [41]:
# Write the final eukaryote essential-genes table to CSV, without the index column
essential_genes.to_csv('eucarioto/essential_genes_euk.csv', index=False)

## Leitura dos Arquivos de Bactérias

In [None]:
# Nucleotide dictionary for bacteria: record id -> sequence, built by lerFastaBio
dict_nucl_bac = lerFastaBio("bacteria/DEG10_nt.fasta")
dict_nucl_bac

In [None]:
# Protein dictionary for bacteria: record id -> sequence, built by lerFastaBio
dict_prot_bac = lerFastaBio("bacteria/DEG10_aa.fasta")
dict_prot_bac

In [51]:
# Load the bacteria organism table. The file is ';'-separated and the
# column names are supplied explicitly; '1'-'5' and 'Medium' are not kept
# in the selection below.
csv_bacteria = pd.read_csv(
    'bacteria/deg_bacteria.csv',
    sep=';',
    names=[
        'Organism', 'Reference', 'Pubmed', 'RefSeq', 'Essential_Genes',
        '1', '2', '3', '4', '5',
        'Method', 'Medium', 'Code_Organism', 'Date',
    ],
)

# Keep only the columns used in the rest of the notebook.
bacteria = csv_bacteria.loc[:, [
    'Organism', 'Reference', 'Pubmed', 'RefSeq', 'Essential_Genes',
    'Method', 'Code_Organism', 'Date',
]]

bacteria

Unnamed: 0,Organism,Reference,Pubmed,RefSeq,Essential_Genes,Method,Code_Organism,Date
0,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11
1,Staphylococcus aureus N315,"Ji Y, et al (2001). Identification of critical...",11567142,NC_002745,302,Antisense RNA,DEG1002,2018-11-11
2,Vibrio cholerae N16961,"Cameron DE, Urbach MJ,Mekalanos JJ(2008). A de...",18574146,"NC_002505,NC_002506",779,Tn-seq,DEG1003,2018-11-11
3,Haemophilus influenzae Rd KW20,"Akerley BJ, et al (2002). A genome-scale analy...",11805338,NC_000907,642,Genetic footprinting,DEG1005,2018-11-11
4,Mycoplasma genitalium G37,"Glass JI, et al (2006). Essential genes of a m...",16407165,NC_000908,381,Tn-seq,DEG1006,2018-11-11
...,...,...,...,...,...,...,...,...
61,Staphylococcus aureus subsp. aureus MSSA476,"Coe, K. A., Lee, W., Stone, M. C., Komazin-Mer...",31738809,NC_002953.3,305,Tn-seq,DEG1064,2020-06-29
62,Staphylococcus aureus subsp. aureus MRSA252,"Coe, K. A., Lee, W., Stone, M. C., Komazin-Mer...",31738809,NC_002952.2,295,Tn-seq,DEG1065,2020-06-29
63,Burkholderia cenocepacia J2315,"Wong, Y. C., Abd El Ghany, M., Naeem, R., Lee,...",27597847,AM747720 AM747723 AM747721 AM747722,383,TraDIS,DEG1066,2020-08-05
64,Vibrio cholerae C6706,"Chao, M. C., et al. (2013). High-resolution de...",23901011,NC_002505.1; NC_002506.1,343,Tn-seq,DEG1067,2020-07-10


In [52]:
# Load the per-gene annotation table for bacteria. The file is
# ';'-separated and the column names are supplied explicitly; '1'-'6' and
# 'Medium' are not kept in the selection below.
csv_annotations_bac = pd.read_csv(
    'bacteria/deg_annotation_p.csv',
    sep=';',
    names=[
        'Code_Organism', 'Code_Gene_DEG', 'Gene', 'Genbank',
        '1', '2',
        'Function', 'Organism', 'RefSeq', 'Medium',
        '3', '4', '5', '6',
    ],
)

# Keep only the columns used in the rest of the notebook.
annotations_bac = csv_annotations_bac.loc[:, [
    'Code_Organism', 'Code_Gene_DEG', 'Gene', 'Genbank',
    'Function', 'Organism', 'RefSeq',
]]

annotations_bac

Unnamed: 0,Code_Organism,Code_Gene_DEG,Gene,Genbank,Function,Organism,RefSeq
0,DEG1001,DEG10010001,dnaA,16077069,initiation of chromosome replication,Bacillus subtilis 168,NC_000964
1,DEG1001,DEG10010002,dnaN,16077070,DNA polymerase III (beta subunit),Bacillus subtilis 168,NC_000964
2,DEG1001,DEG10010003,gyrB,16077074,DNA gyrase (subunit B),Bacillus subtilis 168,NC_000964
3,DEG1001,DEG10010004,gyrA,16077075,DNA gyrase (subunit A),Bacillus subtilis 168,NC_000964
4,DEG1001,DEG10010005,guaB,16077077,inosine-monophosphate dehydrogenase,Bacillus subtilis 168,NC_000964
...,...,...,...,...,...,...,...
26614,DEG1068,DEG10680338,rpmH,GI:13508421; Protein_ID: NP_110371.1,50S ribosomal protein L34,Mycoplasma pneumoniae M129,NC_000912
26615,DEG1068,DEG10680339,-,GI:13508423; Protein_ID: NP_110373.1,transporter,Mycoplasma pneumoniae M129,NC_000912
26616,DEG1068,DEG10680340,cysA,GI:13508424; Protein_ID: NP_110374.1,sulfate ABC transporter ATP-binding protein,Mycoplasma pneumoniae M129,NC_000912
26617,DEG1068,DEG10680341,dnaA,GI:13508425; Protein_ID: NP_110375.1,chromosome replication initiator DnaA,Mycoplasma pneumoniae M129,NC_000912


In [53]:
# Join every organism row with its annotated genes on the shared organism
# code (pandas' default inner join; column names present on both sides get
# the _x/_y suffixes).
essential_genes_bac = pd.merge(bacteria, annotations_bac, on='Code_Organism')
essential_genes_bac

Unnamed: 0,Organism_x,Reference,Pubmed,RefSeq_x,Essential_Genes,Method,Code_Organism,Date,Code_Gene_DEG,Gene,Genbank,Function,Organism_y,RefSeq_y
0,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010001,dnaA,16077069,initiation of chromosome replication,Bacillus subtilis 168,NC_000964
1,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010002,dnaN,16077070,DNA polymerase III (beta subunit),Bacillus subtilis 168,NC_000964
2,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010003,gyrB,16077074,DNA gyrase (subunit B),Bacillus subtilis 168,NC_000964
3,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010004,gyrA,16077075,DNA gyrase (subunit A),Bacillus subtilis 168,NC_000964
4,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010005,guaB,16077077,inosine-monophosphate dehydrogenase,Bacillus subtilis 168,NC_000964
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26614,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680338,rpmH,GI:13508421; Protein_ID: NP_110371.1,50S ribosomal protein L34,Mycoplasma pneumoniae M129,NC_000912
26615,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680339,-,GI:13508423; Protein_ID: NP_110373.1,transporter,Mycoplasma pneumoniae M129,NC_000912
26616,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680340,cysA,GI:13508424; Protein_ID: NP_110374.1,sulfate ABC transporter ATP-binding protein,Mycoplasma pneumoniae M129,NC_000912
26617,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680341,dnaA,GI:13508425; Protein_ID: NP_110375.1,chromosome replication initiator DnaA,Mycoplasma pneumoniae M129,NC_000912


In [54]:
# Attach the nucleotide and protein sequence of every gene.
#
# The lookup only needs the DEG gene code, so map over that single column
# instead of building a Series for every row with apply(axis=1); this also
# behaves correctly on an empty frame. sequence() still raises KeyError for
# a code that is missing from the dictionary.
essential_genes_bac['Seq_Gene'] = essential_genes_bac['Code_Gene_DEG'].map(
    lambda codigo: sequence(codigo, dict_nucl_bac)
)
essential_genes_bac['Seq_Prot'] = essential_genes_bac['Code_Gene_DEG'].map(
    lambda codigo: sequence(codigo, dict_prot_bac)
)

In [55]:
essential_genes_bac

Unnamed: 0,Organism_x,Reference,Pubmed,RefSeq_x,Essential_Genes,Method,Code_Organism,Date,Code_Gene_DEG,Gene,Genbank,Function,Organism_y,RefSeq_y,Seq_Gene,Seq_Prot
0,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010001,dnaA,16077069,initiation of chromosome replication,Bacillus subtilis 168,NC_000964,ATGGAAAATATATTAGACCTGTGGAACCAAGCCCTTGCTCAAATCG...,MENILDLWNQALAQIEKKLSKPSFETWMKSTKAHSLQGDTLTITAP...
1,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010002,dnaN,16077070,DNA polymerase III (beta subunit),Bacillus subtilis 168,NC_000964,ATGAAATTCACGATTCAAAAAGATCGTCTTGTTGAAAGTGTCCAAG...,MKFTIQKDRLVESVQDVLKAVSSRTTIPILTGIKIVASDDGVSFTG...
2,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010003,gyrB,16077074,DNA gyrase (subunit B),Bacillus subtilis 168,NC_000964,ATGGAACAGCAGCAAAACAGTTATGATGAAAATCAGATACAGGTAC...,MEQQQNSYDENQIQVLEGLEAVRKRPGMYIGSTNSKGLHHLVWEIV...
3,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010004,gyrA,16077075,DNA gyrase (subunit A),Bacillus subtilis 168,NC_000964,ATGAGTGAACAAAACACACCACAAGTTCGTGAAATAAATATCAGTC...,MSEQNTPQVREINISQEMRTSFLDYAMSVIVSRALPDVRDGLKPVH...
4,Bacillus subtilis 168,"Kobayashi K, et al (2003). Essential Bacillus ...",12682299,NC_000964,271,Single-gene knockout,DEG1001,2018-11-11,DEG10010005,guaB,16077077,inosine-monophosphate dehydrogenase,Bacillus subtilis 168,NC_000964,ATGTGGGAAAGTAAATTTTCAAAAGAAGGCTTAACGTTCGACGATG...,MWESKFSKEGLTFDDVLLVPAKSEVLPRDVDLSVELTKTLKLNIPV...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26614,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680338,rpmH,GI:13508421; Protein_ID: NP_110371.1,50S ribosomal protein L34,Mycoplasma pneumoniae M129,NC_000912,ATGAAACGTACATATCAACCAAGTAAATTAAAGCGAGCAAAAACCC...,MKRTYQPSKLKRAKTHGFLARMATASGRKVLKLRRKKQRAQLTVSSER
26615,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680339,-,GI:13508423; Protein_ID: NP_110373.1,transporter,Mycoplasma pneumoniae M129,NC_000912,ATGTTTTCGTTTTTTAAACAGATCTTTAAGTCACTCAAAAAGTTCT...,MFSFFKQIFKSLKKFFFLLFGIIFVLFSIIFLETSILQLSNNLVNT...
26616,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680340,cysA,GI:13508424; Protein_ID: NP_110374.1,sulfate ABC transporter ATP-binding protein,Mycoplasma pneumoniae M129,NC_000912,ATGTCTTTAAATGCTAAAAATAAACGAAGCCTCGATTATTGTTTGC...,MSLNAKNKRSLDYCLQWPDFCQSKKASKLIVKLNKKHPKRRHYKDP...
26617,Mycoplasma pneumoniae,"Lluch‐Senar, M., et al. (2015). Defining a min...",25609650,NC_000912,342,Tn-seq,DEG1068,2020-07-10,DEG10680341,dnaA,GI:13508425; Protein_ID: NP_110375.1,chromosome replication initiator DnaA,Mycoplasma pneumoniae M129,NC_000912,ATGGAACAATTTAGTGCTTTTAAACTTCTTTTAAAAAAGCAATACG...,MEQFSAFKLLLKKQYETTLGFYDKYIKNLKRFALKNNVLFVIVDNE...


In [56]:
# Write the final bacteria essential-genes table to CSV, without the index column
essential_genes_bac.to_csv('bacteria/essential_genes_bac.csv', index=False)

## Extração de dados do Genbank pelo Entrez

In [33]:
from Bio import Entrez, SeqIO
# NCBI asks Entrez users to identify themselves; uncomment and set a real
# address before running this against the live service.
#Entrez.email = "A.N.Other@example.com"
handle = Entrez.efetch(db="nucleotide", id="NC_000908", rettype="gb", retmode="text")
try:
    # SeqIO.read expects the handle to contain exactly one record.
    record = SeqIO.read(handle, "genbank")
finally:
    # Release the network handle even when parsing fails.
    handle.close()
record

SeqRecord(seq=Seq(None, length=580076), id='NC_000908.2', name='NC_000908', description='Mycoplasma genitalium G37, complete sequence', dbxrefs=['BioProject:PRJNA224116', 'BioSample:SAMN02603983', 'Assembly:GCF_000027325.1'])

In [27]:
# Print the human-readable summary of the fetched record
print(record)

ID: NC_000908.2
Name: NC_000908
Description: Mycoplasma genitalium G37, complete sequence
Database cross-references: BioProject:PRJNA224116, BioSample:SAMN02603983, Assembly:GCF_000027325.1
Number of features: 1127
/molecule_type=DNA
/topology=circular
/data_file_division=CON
/date=20-APR-2021
/accessions=['NC_000908', 'NZ_U39679-NZ_U39729']
/sequence_version=2
/keywords=['RefSeq']
/source=Mycoplasma genitalium G37
/organism=Mycoplasma genitalium G37
/taxonomy=['Bacteria', 'Tenericutes', 'Mollicutes', 'Mycoplasmataceae', 'Mycoplasma']
/references=[Reference(title='Essential genes of a minimal bacterium', ...), Reference(title='The minimal gene complement of Mycoplasma genitalium', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REFSEQ INFORMATION: The reference sequence is identical to
L43967.2.
On Jun 13, 2006 this sequence version replaced NC_000908.1.
The annotation was added by the NCBI Prokaryotic Genome Annotation
Pipeline (PGA

In [29]:
from Bio import SeqIO

def get_cds_feature(seq_record):
    """Return the ``locus_tag`` qualifier of the first CDS feature.

    Args:
        seq_record: Object exposing a ``features`` iterable whose items have
            ``type`` and ``qualifiers`` attributes (e.g. a Biopython
            ``SeqRecord``).

    Returns:
        The value stored under ``'locus_tag'`` in the qualifiers of the
        first feature whose type is ``"CDS"``; ``None`` if that feature has
        no such qualifier or if there is no CDS feature at all.
    """
    # Iterate the record that was passed in rather than the notebook-level
    # ``record`` global, so the function works for any record.
    for feature in seq_record.features:
        if feature.type == "CDS":
            return feature.qualifiers.get('locus_tag')
    # No CDS feature found.
    return None

cds_feature = get_cds_feature(record)
# get_cds_feature returns None when no locus tag is found; str.join would
# raise TypeError on None, so print None itself in that case.
print("".join(cds_feature) if cds_feature is not None else None)

MG_RS00005


In [22]:
# If the notebook cannot handle the output volume, start Jupyter with a higher IOPub data rate limit:
#jupyter notebook --NotebookApp.iopub_data_rate_limit=100000000