## Análise da sequência e das features presentes no NCBI

In [38]:
from Bio import SeqIO
from Bio import Entrez

Entrez.email = 'lucianacmartins8@gmail.com'
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND NEUROG3[Gene]")

search_records = Entrez.read(search_handler)
indices_CDS = []

for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    fetch_records = SeqIO.parse(fetch_handler, 'genbank')

    for record in fetch_records:
        if record.id == "NM_020999.4":
            print('Record accession: ', record.id)
            print('Record sequence length: ', len(record.seq))
            print('Record description: ', record.description)
            print('Record annotations: ', record.annotations)
            print('Record external references: ', record.dbxrefs)
            print('Record features count: ', len(record.features))
            print('Record features: ', record.features)
        
            indices_CDS = []
            for n, feature in enumerate(record.features):
                if feature.type == "CDS":
                    indices_CDS.append(n)
                    for a in indices_CDS:
                        if len(indices_CDS) != 0:
                            print(record.features[a].qualifiers['product'])
                            print(record.features[a].qualifiers['translation'])

Record accession:  NM_020999.4
Record sequence length:  1560
Record description:  Homo sapiens neurogenin 3 (NEUROG3), mRNA
Record annotations:  {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '16-APR-2022', 'accessions': ['NM_020999'], 'sequence_version': 4, 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Protein Production and Purification of a Codon-Optimized Human NGN3 Transcription Factor from E. coli', ...), Reference(title='Extensive NEUROG3 occupancy in the human pancreatic endocrine gene regulatory network', ...), Reference(title='Gene Signatures of NEUROGENIN3+ Endocrine Progenitor Cells in the Human Pancreas', ...), Reference(title='Effect of NEUROG3 polymorphi

In [40]:
Entrez.email = 'analisboasan@gmail.com'
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND NOTCH2[Gene]") 

search_records = Entrez.read(search_handler)
indices_CDS = []

for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    fetch_records = SeqIO.parse(fetch_handler, 'genbank')

    for record in fetch_records:
        if record.id == "NM_024408.4":
            print('Record accession: ', record.id)
            print('Record sequence length: ', len(record.seq))
            print('Record description: ', record.description)
            print('Record annotations: ', record.annotations)
            print('Record external references: ', record.dbxrefs)
            print('Record features count: ', len(record.features))
            print('Record features: ', record.features)
        
            indices_CDS = []
            for n, feature in enumerate(record.features):
                if feature.type == "CDS":
                    indices_CDS.append(n)
                    for a in indices_CDS:
                        if len(indices_CDS) != 0:
                            print(record.features[a].qualifiers['product'])
                            print(record.features[a].qualifiers['translation'])

Record accession:  NM_024408.4
Record sequence length:  11425
Record description:  Homo sapiens notch receptor 2 (NOTCH2), transcript variant 1, mRNA
Record annotations:  {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '09-OCT-2022', 'accessions': ['NM_024408', 'XM_941698', 'XM_945379'], 'sequence_version': 4, 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Patients with biallelic GGC repeat expansions in NOTCH2NLC exhibiting a typical neuronal intranuclear inclusion disease phenotype', ...), Reference(title='Defining pathogenicity of NOTCH2 variants for diagnosis of Alagille syndrome type 2 using a large cohort of patients', ...), Reference(title='gamma-Secretase inhibit

In [42]:
Entrez.email = 'bruna_dfa@hotmail.com'
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND DSTYK[Gene]") 

search_records = Entrez.read(search_handler)
indices_CDS = []

for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    fetch_records = SeqIO.parse(fetch_handler, 'genbank')

    for record in fetch_records:
        if record.id == "NM_015375.3":
            print('Record accession: ', record.id)
            print('Record sequence length: ', len(record.seq))
            print('Record description: ', record.description)
            print('Record annotations: ', record.annotations)
            print('Record external references: ', record.dbxrefs)
            print('Record features count: ', len(record.features))
            print('Record features: ', record.features)
        
            indices_CDS = []
            for n, feature in enumerate(record.features):
                if feature.type == "CDS":
                    indices_CDS.append(n)
                    for a in indices_CDS:
                        if len(indices_CDS) != 0:
                            print(record.features[a].qualifiers['product'])
                            print(record.features[a].qualifiers['translation'])

Record accession:  NM_015375.3
Record sequence length:  8010
Record description:  Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 1, mRNA
Record annotations:  {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '09-OCT-2022', 'accessions': ['NM_015375'], 'sequence_version': 3, 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='DSTYK inhibition increases the sensitivity of lung cancer cells to T cell-mediated cytotoxicity', ...), Reference(title='DSTYK Enhances Chemoresistance in Triple-Negative Breast Cancer Cells', ...), Reference(title='RIPK3 promotes adenovirus type 5 activity', ...), Reference(title='Large Intragenic Deletion in DST

## Análise de homologias por BLAST ou Diamond

In [2]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

record1 = SeqIO.read(open("rs177045_sequence.fa"), format="fasta")

In [3]:
result_handle1 = NCBIWWW.qblast("blastn", "nt", record1.format("fasta"))

In [4]:
save_file1 = open("blast_variante_NEUROG3.xml", "w")
save_file1.write(result_handle1.read()) 
save_file1.close()
result_handle1.close()

In [5]:
result_handle1 = open("blast_variante_NEUROG3.xml")

In [6]:
blast_records1 = NCBIXML.parse(result_handle1)
for blast_record in blast_records1:
    print("*** Parâmetros Globais da Pesquisa ***")
    print("Base de Dados usada: " , blast_record.database)
    print("Matriz de Substituição: ", blast_record.matrix)
    print("Parâmetros de Espaçamento: ", blast_record.gap_penalties)

*** Parâmetros Globais da Pesquisa ***
Base de Dados usada:  nt
Matriz de Substituição:  
Parâmetros de Espaçamento:  (5, 2)


In [10]:
result_handle1 = open("blast_variante_NEUROG3.xml")
resultado1 = NCBIXML.read(result_handle1)
print(f"Existem {len(resultado1.alignments)} alinhamentos no registo.")

Existem 50 alinhamentos no registo.


In [11]:
print(resultado1.alignments[0])

gi|14626972|emb|AL450311.11| Human DNA sequence from clone RP11-343J3 on chromosome 10, complete sequence
           Length = 165110



In [12]:
for a in resultado1.alignments[0:5]:
    print(a.accession)
    print(a.hit_def)
    for b in a.hsps:
        print(b.expect)

AL450311
Human DNA sequence from clone RP11-343J3 on chromosome 10, complete sequence
0.0
OW443374
Orcinus orca genome assembly, chromosome: 14
9.11219e-126
CP050625
Canis lupus familiaris breed Labrador retriever chromosome 04b
2.14677e-89
CP050572
Canis lupus familiaris breed Labrador retriever chromosome 04a
2.6153e-88
HG994386
Canis lupus genome assembly, chromosome: 4
2.6153e-88


In [14]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

record2 = SeqIO.read(open("rs1493694_sequence.fa"), format="fasta")

In [15]:
result_handle2 = NCBIWWW.qblast("blastn", "nt", record2.format("fasta"))

In [26]:
save_file2 = open("blast_variante_NOTCH2.xml", "w")
save_file2.write(result_handle2.read()) 
save_file2.close()
result_handle2.close()

In [27]:
result_handle2 = open("blast_variante_NOTCH2.xml")

In [28]:
blast_records2 = NCBIXML.parse(result_handle2)
for blast_record in blast_records2:
    print("*** Parâmetros Globais da Pesquisa ***")
    print("Base de Dados usada: " , blast_record.database)
    print("Matriz de Substituição: ", blast_record.matrix)
    print("Parâmetros de Espaçamento: ", blast_record.gap_penalties)

*** Parâmetros Globais da Pesquisa ***
Base de Dados usada:  nt
Matriz de Substituição:  
Parâmetros de Espaçamento:  (5, 2)


In [29]:
result_handle2 = open("blast_variante_NOTCH2.xml")
resultado2 = NCBIXML.read(result_handle2)
print(f"Existem {len(resultado2.alignments)} alinhamentos no registo.")

Existem 50 alinhamentos no registo.


In [30]:
print(resultado2.alignments[0])

gi|1601833237|gb|AC278627.1| Homo sapiens chromosome 1 clone VMRC62-112J14, complete sequence
           Length = 176732



In [35]:
for a in resultado2.alignments[0:10]:
    print(a.accession)
    print(a.hit_def)
    for b in a.hsps:
        print(b.expect)

AC278627
Homo sapiens chromosome 1 clone VMRC62-112J14, complete sequence
0.0
AC278623
Homo sapiens chromosome 1 clone VMRC53-156D10, complete sequence
0.0
AC278619
Homo sapiens chromosome 1 clone VMRC62-388P04, complete sequence
0.0
AC278445
Homo sapiens chromosome 1 clone VMRC59-343K02, complete sequence
0.0
AC278334
Homo sapiens chromosome 1 clone VMRC62-166D15, complete sequence
0.0
NG_008163
Homo sapiens notch receptor 2 (NOTCH2), RefSeqGene on chromosome 1
0.0
AC245008
Homo sapiens BAC clone CH17-77K15 from chromosome 1, complete sequence
0.0
AL512503
Human DNA sequence from clone RP11-323K8 on chromosome 1, complete sequence
0.0
AC278816
Homo sapiens chromosome 1 clone VMRC64-514A21, complete sequence
0.0
AC278785
Homo sapiens chromosome 1 clone VMRC66-236N07, complete sequence
0.0


In [50]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

record3 = SeqIO.read(open("rs12048743_sequence.fa"), format="fasta")

In [51]:
result_handle3 = NCBIWWW.qblast("blastn", "nt", record3.format("fasta"))

In [52]:
save_file3 = open("blast_variante_DSTYK.xml", "w")
save_file3.write(result_handle3.read()) 
save_file3.close()
result_handle3.close()

In [53]:
result_handle3 = open("blast_variante_DSTYK.xml")

In [54]:
blast_records3 = NCBIXML.parse(result_handle3)
for blast_record in blast_records3:
    print("*** Parâmetros Globais da Pesquisa ***")
    print("Base de Dados usada: " , blast_record.database)
    print("Matriz de Substituição: ", blast_record.matrix)
    print("Parâmetros de Espaçamento: ", blast_record.gap_penalties)

*** Parâmetros Globais da Pesquisa ***
Base de Dados usada:  nt
Matriz de Substituição:  
Parâmetros de Espaçamento:  (5, 2)


In [59]:
result_handle3 = open("blast_variante_DSTYK.xml")
resultado3 = NCBIXML.read(result_handle3)
print(f"Existem {len(resultado3.alignments)} alinhamentos no registo.")

Existem 50 alinhamentos no registo.


In [60]:
print(resultado3.alignments[0])

gi|1675076430|ref|NM_199462.3| Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 2, mRNA
           Length = 7875



In [62]:
for a in resultado3.alignments[0:5]:
    print(a.accession)
    print(a.hit_def)
    for b in a.hsps:
        print(b.expect)

NM_199462
Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 2, mRNA
0.0
6.95827e-45
2.95873e-43
0.0153537
NM_015375
Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 1, mRNA
0.0
6.95827e-45
2.95873e-43
0.0153537
XM_047417152
PREDICTED: Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant X6, mRNA
0.0
6.95827e-45
2.95873e-43
0.0153537
XM_047417151
PREDICTED: Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant X5, mRNA
0.0
6.95827e-45
2.95873e-43
0.0153537
XM_011509394
PREDICTED: Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant X4, mRNA
0.0
6.95827e-45
2.95873e-43
0.0153537


## Ferramentas de análise das propriedades da proteína

### PROTEIN NEUROG3

In [17]:
import Bio.SwissProt as sp

with open("Q9Y4Z2.txt") as handle:
    record = sp.read(handle)
    print(record.entry_name, "\n")
    print(", ".join(record.accessions), "\n")
    print(record.keywords, "\n")
    print(record.organism, "\n")
    print(len(record.sequence), "aa", "\n")
    print(record.sequence)

NGN3_HUMAN 

Q9Y4Z2, Q5VVI0, Q6DJX6, Q9BY24 

['Activator', 'Developmental protein', 'Differentiation', 'Disease variant', 'DNA-binding', 'Neurogenesis', 'Nucleus', 'Reference proteome', 'Transcription', 'Transcription regulation'] 

Homo sapiens (Human). 

214 aa 

MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRGNCAEAEEGGCRGAPRKLRARRGGRSRPKSELALSKQRRSRRKKANDRERNRMHNLNSALDALRGVLPTFPDDAKLTKIETLRFAHNYIWALTQTLRIADHSLYALEPPAPHCGELGSPGGSPGDWGSLYSPVSQAGSLSPAASLEERPGLLGATFSACLSPGSLAFSDFL


### PROTEIN NOTCH2

In [35]:
with open("Q04721.txt") as handle:
    record = sp.read(handle)
    print(record.entry_name, "\n")
    print(", ".join(record.accessions), "\n")
    print(record.keywords, "\n")
    print(record.organism, "\n")
    print(len(record.sequence), "aa", "\n")
    print(record.sequence)

NOTC2_HUMAN 

Q04721, Q5T3X7, Q99734, Q9H240 

['3D-structure', 'Activator', 'ANK repeat', 'Cell membrane', 'Cytoplasm', 'Developmental protein', 'Differentiation', 'Disease variant', 'Disulfide bond', 'EGF-like domain', 'Glycoprotein', 'Membrane', 'Notch signaling pathway', 'Nucleus', 'Phosphoprotein', 'Receptor', 'Reference proteome', 'Repeat', 'Signal', 'Transcription', 'Transcription regulation', 'Transmembrane', 'Transmembrane helix', 'Ubl conjugation'] 

Homo sapiens (Human). 

2471 aa 

MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTYHNGTGYCKCPEGFLGEYCQHRDPCEKNRCQNGGTCVAQAMLGKATCRCASGFTGEDCQYSTSHPCFVSRPCLNGGTCHMLSRDTYECTCQVGFTGKECQWTDACLSHPCANGSTCTTVANQFSCKCLTGFTGQKCETDVNECDIPGHCQHGGTCLNLPGSYQCQCPQGFTGQYCDSLYVPCAPSPCVNGGTCRQTGDFTFECNCLPGFEGSTCERNIDDCPNHRCQNGGVCVDGVNTYNCRCPPQWTGQFCTEDVDECLLQPNACQNGGTCANRNGGYGCVCVNGWSGDDCSENIDDCAFASCTPGSTCIDRVASFSCMCPEGKAGLLCHLDDACISNPCHKGALCDTNPLNGQYICTCPQGYKGADCTEDVDECAMANSNPCEHAGKCVNTDGAFHCECLKGYAGPRCEMDINECHSDPCQNDATCLDKIGGFTCLCMPGFKGVHCELEINECQSN

### PROTEIN DSTYK

In [22]:
with open("Q6XUX3.txt") as handle:
    record = sp.parse(handle)
    print(str(record))
    #print(", ".join(record.accessions), "\n")
    #print(record.keywords, "\n")
    #print(record.organism, "\n")
    #print(len(record.sequence), "aa", "\n")
    #print(record.sequence)

<generator object parse at 0x00000293ABB31DD0>


In [25]:
with open("Q6XUX3.txt") as handle:
    records = sp.parse(handle)
    for record in records:
        print(record.entry_name)
        print(",".join(record.accessions))
        print(record.keywords)
        print(repr(record.organism))
        print(record.sequence[:20] + "...")

IndexError: list index out of range