## Análise da sequência e das features presentes no NCBI

In [None]:
from Bio import SeqIO
from Bio import Entrez

# Contact address attached to the Entrez requests made below.
Entrez.email = 'lucianacmartins8@gmail.com'

# Search the nucleotide database for human NEUROG3 entries.
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND NEUROG3[Gene]")
try:
    search_records = Entrez.read(search_handler)
finally:
    # Close the handle even if parsing the reply fails.
    search_handler.close()
indices_CDS = []

# Download every hit as GenBank text and save it to disk.
# NOTE: each pass writes to the same path, so "NEUROG3_file" ends up holding
# only the record of the last id in the list.
for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        # One reader per handle: SeqIO.read expects exactly one record.
        record = SeqIO.read(fetch_handler, 'gb')
    finally:
        fetch_handler.close()
    SeqIO.write(record, "NEUROG3_file", 'gb')

In [None]:
# Fetch every search hit again and report on the NM_020999.4 record only.
for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        # Parse eagerly so the handle can be closed before reporting.
        fetch_records = list(SeqIO.parse(fetch_handler, 'genbank'))
    finally:
        fetch_handler.close()

    for record in fetch_records:
        if record.id != "NM_020999.4":
            continue
        print('Record accession: ', record.id, "\n")
        print('Record sequence length: ', len(record.seq), "\n")
        print('Record description: ', record.description, "\n")
        print('Record annotations: ', record.annotations, "\n")
        print('Record external references: ', record.dbxrefs, "\n")
        print('Record features count: ', len(record.features), "\n")
        print('Record features: ', record.features)

        # Gather the positions of all CDS features first, then print each one
        # exactly once. Printing from inside the collecting loop would repeat
        # the earlier CDS entries every time another CDS is found.
        indices_CDS = [n for n, feature in enumerate(record.features) if feature.type == "CDS"]
        for a in indices_CDS:
            # Product name followed by the annotated protein translation.
            print(record.features[a].qualifiers['product'], "\n")
            print(record.features[a].qualifiers['translation'])

In [None]:
# Contact address attached to the Entrez requests made below.
Entrez.email = 'analisboasan@gmail.com'

# Search the nucleotide database for human NOTCH2 entries.
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND NOTCH2[Gene]")
try:
    search_records = Entrez.read(search_handler)
finally:
    # Close the handle even if parsing the reply fails.
    search_handler.close()
indices_CDS = []

# Download every hit as GenBank text and save it to disk.
# NOTE: each pass writes to the same path, so "NOTCH2_file" ends up holding
# only the record of the last id in the list.
for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        # One reader per handle: SeqIO.read expects exactly one record.
        record = SeqIO.read(fetch_handler, 'gb')
    finally:
        fetch_handler.close()
    SeqIO.write(record, "NOTCH2_file", 'gb')

In [None]:
# Fetch every search hit again and report on the NM_024408.4 record only.
for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        # Parse eagerly so the handle can be closed before reporting.
        fetch_records = list(SeqIO.parse(fetch_handler, 'genbank'))
    finally:
        fetch_handler.close()

    for record in fetch_records:
        if record.id != "NM_024408.4":
            continue
        print('Record accession: ', record.id, "\n")
        print('Record sequence length: ', len(record.seq), "\n")
        print('Record description: ', record.description, "\n")
        print('Record annotations: ', record.annotations, "\n")
        print('Record external references: ', record.dbxrefs, "\n")
        print('Record features count: ', len(record.features), "\n")
        print('Record features: ', record.features, "\n")

        # Gather the positions of all CDS features first, then print each one
        # exactly once. Printing from inside the collecting loop would repeat
        # the earlier CDS entries every time another CDS is found.
        indices_CDS = [n for n, feature in enumerate(record.features) if feature.type == "CDS"]
        for a in indices_CDS:
            # Product name followed by the annotated protein translation.
            print(record.features[a].qualifiers['product'], "\n")
            print(record.features[a].qualifiers['translation'])

In [6]:
from Bio import SeqIO
from Bio import Entrez

# Contact address attached to the Entrez requests made below.
Entrez.email = 'bruna_dfa@hotmail.com'

# Search the nucleotide database for human DSTYK entries.
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND DSTYK[Gene]")
try:
    search_records = Entrez.read(search_handler)
finally:
    # Close the handle even if parsing the reply fails.
    search_handler.close()
indices_CDS = []

# Download every hit as GenBank text and save it to disk.
# NOTE: each pass writes to the same path, so "DSTYK_file" ends up holding
# only the record of the last id in the list.
for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        # One reader per handle: SeqIO.read expects exactly one record.
        record = SeqIO.read(fetch_handler, 'gb')
    finally:
        fetch_handler.close()
    SeqIO.write(record, "DSTYK_file", 'gb')

In [7]:
# Fetch every search hit again and report on the BC048204.1 record only.
for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        # Parse eagerly so the handle can be closed before reporting.
        fetch_records = list(SeqIO.parse(fetch_handler, 'genbank'))
    finally:
        fetch_handler.close()

    for record in fetch_records:
        if record.id != "BC048204.1":
            continue
        print('Record accession: ', record.id, "\n")
        print('Record sequence length: ', len(record.seq), "\n")
        print('Record description: ', record.description, "\n")
        print('Record annotations: ', record.annotations, "\n")
        print('Record external references: ', record.dbxrefs, "\n")
        print('Record features count: ', len(record.features), "\n")
        print('Record features: ', record.features, "\n")

        # Gather the positions of all CDS features first, then print each one
        # exactly once. Printing from inside the collecting loop would repeat
        # the earlier CDS entries every time another CDS is found.
        indices_CDS = [n for n, feature in enumerate(record.features) if feature.type == "CDS"]
        for a in indices_CDS:
            # Product name followed by the annotated protein translation.
            print(record.features[a].qualifiers['product'], "\n")
            print(record.features[a].qualifiers['translation'])

Record accession:  BC048204.1 

Record sequence length:  1362 

Record description:  Homo sapiens dual serine/threonine and tyrosine protein kinase, mRNA (cDNA clone IMAGE:4137417), partial cds 

Record annotations:  {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '18-MAR-2009', 'accessions': ['BC048204'], 'sequence_version': 1, 'keywords': [''], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences', ...), Reference(title='Direct Submission', ...)], 'comment': 'Contact: MGC help desk\nEmail: cgapbs-r@mail.nih.gov\nTissue Procurement: ATCC\ncDNA Library Preparation: Rubin Laboratory\ncDNA Library Arrayed by: The I.M.A.G.E.

## Análise de homologias por BLAST ou Diamond

### NEUROG3

In [None]:
# Convert the saved GenBank file into FASTA for use as the BLAST query.
SeqIO.convert("NEUROG3_file", "genbank", "Sequence_NEUROG3.fna", "fasta")

In [None]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Load the single FASTA record used as the BLAST query. Passing the path lets
# SeqIO open and close the file itself instead of leaving a handle open.
record1 = SeqIO.read("Sequence_NEUROG3.fna", format="fasta")

In [None]:
# Submit the query to NCBI's online BLAST: blastn program against the "nt" database.
result_handle1 = NCBIWWW.qblast("blastn", "nt", record1.format("fasta"))

In [None]:
# Store the raw BLAST XML on disk so the search does not have to be repeated.
# The context manager closes the file even if the write fails, and the
# result handle is closed in every case.
try:
    with open("Blast_NEUROG3.xml", "w") as save_file1:
        save_file1.write(result_handle1.read())
finally:
    result_handle1.close()

In [None]:
# Parse the saved BLAST XML into a single result record; the file is closed
# as soon as parsing is done.
with open("Blast_NEUROG3.xml") as result_handle1:
    resultado1 = NCBIXML.read(result_handle1)

In [None]:
# Show the first alignment, guarding against a search that returned no hits
# (indexing an empty list would raise IndexError).
if resultado1.alignments:
    print(resultado1.alignments[0])
else:
    print("No alignments found in Blast_NEUROG3.xml")

In [None]:
# Summarise the first five alignments: accession, hit definition and the
# expect value of each HSP.
for alignment in resultado1.alignments[:5]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

### NOTCH2

In [None]:
# Convert the saved GenBank file into FASTA for use as the BLAST query.
SeqIO.convert("NOTCH2_file", "genbank", "Sequence_NOTCH2.fna", "fasta")

In [None]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Load the single FASTA record used as the BLAST query. Passing the path lets
# SeqIO open and close the file itself instead of leaving a handle open.
record2 = SeqIO.read("Sequence_NOTCH2.fna", format="fasta")

In [None]:
# Submit the query to NCBI's online BLAST: blastn program against the "nt" database.
result_handle2 = NCBIWWW.qblast("blastn", "nt", record2.format("fasta"))

In [None]:
# Store the raw BLAST XML on disk so the search does not have to be repeated.
# The context manager closes the file even if the write fails, and the
# result handle is closed in every case.
try:
    with open("Blast_NOTCH2.xml", "w") as save_file2:
        save_file2.write(result_handle2.read())
finally:
    result_handle2.close()

In [None]:
# Parse the saved BLAST XML into a single result record; the file is closed
# as soon as parsing is done.
with open("Blast_NOTCH2.xml") as result_handle2:
    resultado2 = NCBIXML.read(result_handle2)

In [None]:
# Show the first alignment, guarding against a search that returned no hits
# (indexing an empty list would raise IndexError).
if resultado2.alignments:
    print(resultado2.alignments[0])
else:
    print("No alignments found in Blast_NOTCH2.xml")

In [None]:
# Summarise the first ten alignments: accession, hit definition and the
# expect value of each HSP.
for alignment in resultado2.alignments[:10]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

### DSTYK

In [None]:
# Convert the saved GenBank file into FASTA for use as the BLAST query.
SeqIO.convert("DSTYK_file", "genbank", "Sequence_DSTYK.fna", "fasta")

In [None]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Load the single FASTA record used as the BLAST query. Passing the path lets
# SeqIO open and close the file itself instead of leaving a handle open.
record3 = SeqIO.read("Sequence_DSTYK.fna", format="fasta")

In [None]:
# Submit the query to NCBI's online BLAST: blastn program against the "nt" database.
result_handle3 = NCBIWWW.qblast("blastn", "nt", record3.format("fasta"))

In [None]:
# Store the raw BLAST XML on disk so the search does not have to be repeated.
# The context manager closes the file even if the write fails, and the
# result handle is closed in every case.
try:
    with open("Blast_DSTYK.xml", "w") as save_file3:
        save_file3.write(result_handle3.read())
finally:
    result_handle3.close()

In [None]:
# Parse the saved BLAST XML into a single result record; the file is closed
# as soon as parsing is done.
with open("Blast_DSTYK.xml") as result_handle3:
    resultado3 = NCBIXML.read(result_handle3)

In [None]:
# Show the first alignment, guarding against a search that returned no hits
# (indexing an empty list would raise IndexError).
if resultado3.alignments:
    print(resultado3.alignments[0])
else:
    print("No alignments found in Blast_DSTYK.xml")

In [None]:
# Summarise the first five alignments: accession, hit definition and the
# expect value of each HSP.
for alignment in resultado3.alignments[:5]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

## Ferramentas de análise das propriedades da proteína

### PROTEIN NEUROG3

In [None]:
from Bio import SeqIO
from Bio import Entrez

# Contact address attached to the Entrez requests made below.
Entrez.email = 'lucianacmartins8@gmail.com'

# Search the protein database for human NEUROG3 entries.
handle = Entrez.esearch(db="protein", term="Homo sapiens[Orgn] AND NEUROG3[Gene]")
try:
    search_records = Entrez.read(handle)
finally:
    # Close the handle even if parsing the reply fails.
    handle.close()

# Download every hit as GenBank text and save it to disk.
# NOTE: each pass writes to the same path, so "NEUROG3_proteinfile" ends up
# holding only the record of the last id in the list.
for record_id in search_records['IdList']:
    handle2 = Entrez.efetch(db="protein", id=record_id, rettype="gb", retmode="text")
    try:
        # One reader per handle: SeqIO.read expects exactly one record.
        record = SeqIO.read(handle2, 'gb')
    finally:
        handle2.close()
    SeqIO.write(record, "NEUROG3_proteinfile", 'gb')

In [None]:
# Fetch every search hit again and report on the NP_066279.2 record only.
for record_id in search_records['IdList']:
    handle2 = Entrez.efetch(db="protein", id=record_id, rettype="gb", retmode="text")
    try:
        # Parse eagerly so the handle can be closed before reporting.
        fetch_records = list(SeqIO.parse(handle2, 'genbank'))
    finally:
        handle2.close()
    for record in fetch_records:
        if record.id != "NP_066279.2":
            continue
        print('Record accession: ', record.id, "\n")
        print('Record sequence length: ', len(record.seq), "\n")
        print('Record sequence: ', record.seq, "\n")
        print('Record description: ', record.description, "\n")
        print('Record taxonomy: ', record.annotations["taxonomy"], "\n")
        print('Record organism: ', record.annotations["organism"], "\n")
        print('Record number of features: ', len(record.features), "\n")
        print('Record features: ', record.features)

### PROTEIN NOTCH2

In [None]:
from Bio import SeqIO
from Bio import Entrez

# Contact address attached to the Entrez requests made below.
Entrez.email = 'analisboasan@gmail.com'

# Search the protein database for human NOTCH2 entries.
handle = Entrez.esearch(db = "protein", term = "Homo sapiens[Orgn] AND NOTCH2[Gene]")
try:
    search_records = Entrez.read(handle)
finally:
    # Close the handle even if parsing the reply fails.
    handle.close()

# Download every hit as GenBank text and save it to disk.
# NOTE: each pass writes to the same path, so "NOTCH2_proteinfile" ends up
# holding only the record of the last id in the list.
for record_id in search_records['IdList']:
    handle2 = Entrez.efetch(db="protein", id=record_id, rettype="gb", retmode="text")
    try:
        # One reader per handle: SeqIO.read expects exactly one record.
        record = SeqIO.read(handle2, 'gb')
    finally:
        handle2.close()
    SeqIO.write(record, "NOTCH2_proteinfile", 'gb')

In [None]:
# Fetch every search hit again and report on the Q04721.3 record only.
for record_id in search_records['IdList']:
    handle2 = Entrez.efetch(db="protein", id=record_id, rettype="gb", retmode="text")
    try:
        # Parse eagerly so the handle can be closed before reporting.
        fetch_records = list(SeqIO.parse(handle2, 'genbank'))
    finally:
        handle2.close()
    for record in fetch_records:
        if record.id != "Q04721.3":
            continue
        print('Record accession: ', record.id, "\n")
        print('Record sequence length: ', len(record.seq), "\n")
        print('Record sequence: ', record.seq, "\n")
        print('Record description: ', record.description, "\n")
        print('Record taxonomy: ', record.annotations["taxonomy"], "\n")
        print('Record organism: ', record.annotations["organism"], "\n")
        print('Record number of features: ', len(record.features), "\n")
        print('Record features: ', record.features)

### PROTEIN DSTYK

In [None]:
from Bio import SeqIO
from Bio import Entrez

# Contact address attached to the Entrez requests made below.
Entrez.email = 'bruna_dfa@hotmail.com'

# Search the protein database for human DSTYK entries.
handle = Entrez.esearch(db = "protein", term = "Homo sapiens[Orgn] AND DSTYK[Gene]")
try:
    search_records = Entrez.read(handle)
finally:
    # Close the handle even if parsing the reply fails.
    handle.close()

# Download every hit as GenBank text and save it to disk.
# NOTE: each pass writes to the same path, so "DSTYK_proteinfile" ends up
# holding only the record of the last id in the list.
for record_id in search_records['IdList']:
    handle2 = Entrez.efetch(db="protein", id=record_id, rettype="gb", retmode="text")
    try:
        # One reader per handle: SeqIO.read expects exactly one record.
        record = SeqIO.read(handle2, 'gb')
    finally:
        handle2.close()
    SeqIO.write(record, "DSTYK_proteinfile", 'gb')

In [None]:
# Fetch every search hit again and report on the AAH48204.1 record only.
for record_id in search_records['IdList']:
    handle2 = Entrez.efetch(db="protein", id=record_id, rettype="gb", retmode="text")
    try:
        # Parse eagerly so the handle can be closed before reporting.
        fetch_records = list(SeqIO.parse(handle2, 'genbank'))
    finally:
        handle2.close()
    for record in fetch_records:
        if record.id != "AAH48204.1":
            continue
        print('Record accession: ', record.id, "\n")
        print('Record sequence length: ', len(record.seq), "\n")
        print('Record sequence: ', record.seq, "\n")
        print('Record description: ', record.description, "\n")
        print('Record taxonomy: ', record.annotations["taxonomy"], "\n")
        print('Record organism: ', record.annotations["organism"], "\n")
        print('Record number of features: ', len(record.features), "\n")
        print('Record features: ', record.features)

## BLAST PROTEIN

### NEUROG3

In [None]:
# Convert the saved protein GenBank file into FASTA for use as the BLAST query.
# NOTE(review): ".fna" usually denotes nucleotide FASTA; this file holds a
# protein sequence — consider a ".faa" name if the file names can be changed.
SeqIO.convert("NEUROG3_proteinfile", "genbank", "ProteinSequence_NEUROG3.fna", "fasta")

In [None]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Load the single FASTA record used as the BLAST query. Passing the path lets
# SeqIO open and close the file itself instead of leaving a handle open.
record1 = SeqIO.read("ProteinSequence_NEUROG3.fna", format="fasta")

In [None]:
# Submit the query to NCBI's online BLAST: blastp program against the "nr" database.
result_handle1 = NCBIWWW.qblast("blastp", "nr", record1.format("fasta"))

In [None]:
# Store the raw BLAST XML on disk so the search does not have to be repeated.
# The context manager closes the file even if the write fails, and the
# result handle is closed in every case.
try:
    with open("ProteinBlast_NEUROG3.xml", "w") as save_file1:
        save_file1.write(result_handle1.read())
finally:
    result_handle1.close()

In [None]:
# Parse the saved BLAST XML into a single result record; the file is closed
# as soon as parsing is done.
with open("ProteinBlast_NEUROG3.xml") as result_handle1:
    resultado1 = NCBIXML.read(result_handle1)

In [None]:
# Show the first alignment, guarding against a search that returned no hits
# (indexing an empty list would raise IndexError).
if resultado1.alignments:
    print(resultado1.alignments[0])
else:
    print("No alignments found in ProteinBlast_NEUROG3.xml")

In [None]:
# Summarise the first five alignments: accession, hit definition and the
# expect value of each HSP.
for alignment in resultado1.alignments[:5]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

### NOTCH2 - MUDAR

In [105]:
# Convert the saved protein GenBank file into FASTA for use as the BLAST query.
# NOTE(review): ".fna" usually denotes nucleotide FASTA; this file holds a
# protein sequence — consider a ".faa" name if the file names can be changed.
SeqIO.convert("NOTCH2_proteinfile", "genbank", "ProteinSequence_NOTCH2.fna", "fasta")

1

In [None]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Load the single FASTA record used as the BLAST query. Passing the path lets
# SeqIO open and close the file itself instead of leaving a handle open.
record2 = SeqIO.read("ProteinSequence_NOTCH2.fna", format="fasta")

In [None]:
# Submit the query to NCBI's online BLAST: blastp program against the "nr" database.
# NOTE(review): hitlist_size is passed as the string "10" — confirm whether an
# int is expected, and whether this limit relates to the empty alignment list
# printed further down.
result_handle2 = NCBIWWW.qblast("blastp", "nr", record2.format("fasta"), hitlist_size = "10")

In [98]:
# Store the raw BLAST XML on disk so the search does not have to be repeated.
# The context manager closes the file even if the write fails, and the
# result handle is closed in every case.
try:
    with open("ProteinBlast_NOTCH2.xml", "w") as save_file2:
        save_file2.write(result_handle2.read())
finally:
    result_handle2.close()

In [87]:
# Parse the saved BLAST XML into a single result record; the file is closed
# as soon as parsing is done.
with open("ProteinBlast_NOTCH2.xml") as result_handle2:
    resultado2 = NCBIXML.read(result_handle2)

In [93]:
# Print the whole alignment list of the parsed BLAST result.
print(resultado2.alignments)

[]


In [89]:
# Summarise the first five alignments: accession, hit definition and the
# expect value of each HSP.
for alignment in resultado2.alignments[:5]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

### DSTYK

In [97]:
# Convert the saved protein GenBank file into FASTA for use as the BLAST query.
# NOTE(review): ".fna" usually denotes nucleotide FASTA; this file holds a
# protein sequence — consider a ".faa" name if the file names can be changed.
SeqIO.convert("DSTYK_proteinfile", "genbank", "ProteinSequence_DSTYK.fna", "fasta")

1

In [99]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Load the single FASTA record used as the BLAST query. Passing the path lets
# SeqIO open and close the file itself instead of leaving a handle open.
record3 = SeqIO.read("ProteinSequence_DSTYK.fna", format="fasta")

In [100]:
# Submit the query to NCBI's online BLAST: blastp program against the "nr" database.
result_handle3 = NCBIWWW.qblast("blastp", "nr", record3.format("fasta"))

In [101]:
# Store the raw BLAST XML on disk so the search does not have to be repeated.
# The context manager closes the file even if the write fails, and the
# result handle is closed in every case.
try:
    with open("ProteinBlast_DSTYK.xml", "w") as save_file3:
        save_file3.write(result_handle3.read())
finally:
    result_handle3.close()

In [102]:
# Parse the saved BLAST XML into a single result record; the file is closed
# as soon as parsing is done.
with open("ProteinBlast_DSTYK.xml") as result_handle3:
    resultado3 = NCBIXML.read(result_handle3)

In [103]:
# Show the first alignment, guarding against a search that returned no hits
# (indexing an empty list would raise IndexError).
if resultado3.alignments:
    print(resultado3.alignments[0])
else:
    print("No alignments found in ProteinBlast_DSTYK.xml")

gb|AAH48204.1| DSTYK protein, partial [Homo sapiens]
           Length = 223



In [104]:
# Summarise the first five alignments: accession, hit definition and the
# expect value of each HSP.
for alignment in resultado3.alignments[:5]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

AAH48204
DSTYK protein, partial [Homo sapiens]
1.17158e-164
XP_047273107
dual serine/threonine and tyrosine protein kinase isoform X5 [Homo sapiens]
3.03464e-162
XP_047273108
dual serine/threonine and tyrosine protein kinase isoform X6 [Homo sapiens]
4.82162e-162
XP_047273101
dual serine/threonine and tyrosine protein kinase isoform X2 [Homo sapiens]
8.26022e-159
AAH53627
Dual serine/threonine and tyrosine protein kinase [Homo sapiens] >gb|ACE87798.1| receptor interacting protein kinase 5 protein [synthetic construct] >gb|ACT64414.1| receptor interacting protein kinase 5 protein, partial [synthetic construct] >gb|AIC62721.1| DSTYK, partial [synthetic construct] >gb|AIC63348.1| DSTYK, partial [synthetic construct]
1.21322e-158


## Alinhamento Múltiplo e Filogenia

### NOTCH2

In [None]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [None]:
# Lazily iterate over the FASTA records of the NOTCH2 sequence file. Passing
# the path lets SeqIO manage the file instead of leaving an open() handle
# that is never closed.
record1 = SeqIO.parse("blast_seqs_NOTCH2 (1).txt", format="fasta")
record1

In [None]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO

In [None]:
# Drain the record iterator into a list and wrap it as an alignment object.
# NOTE(review): MultipleSeqAlignment presumably requires equal-length
# sequences — confirm the input file is already aligned.
seqs = list(record1)

alin1 = MultipleSeqAlignment(seqs)
print(alin1)
print(alin1.get_alignment_length())

In [None]:
# Write the alignment to disk in FASTA format; the return value of
# AlignIO.write is kept in ficheirofasta1.
ficheirofasta1 = AlignIO.write(alin1, "resultados_alin_NOTCH2", "fasta")

In [None]:
# Re-read the FASTA alignment and write it back out in Stockholm format.
converter_stock = AlignIO.parse("resultados_alin_NOTCH2", "fasta")
record_seq = AlignIO.parse("resultados_alin_NOTCH2", "fasta")

alinhamentos_stock = list(converter_stock)
ficheirostock = AlignIO.write(alinhamentos_stock, "resultados_align_NOTCH2_stock.sth", "stockholm")

In [None]:
# Load the Stockholm file back as a single alignment and display it.
alignment1 = AlignIO.read("resultados_align_NOTCH2_stock.sth", "stockholm")
print(alignment1)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceCalculator

In [None]:
# Compute the pairwise distance matrix of the alignment using the 'blosum62'
# model, then print it.
calculator = DistanceCalculator('blosum62')
dm1 = calculator.get_distance(alignment1)
print(dm1)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [None]:
# Build a UPGMA tree from the distance matrix.
constructor = DistanceTreeConstructor()
upgmatree_NOTCH2 = constructor.upgma(dm1)

In [None]:
# Print the textual representation of the UPGMA tree.
print(upgmatree_NOTCH2)

In [None]:
from Bio import Phylo
# Render the UPGMA tree as an ASCII dendrogram.
Phylo.draw_ascii(upgmatree_NOTCH2)

In [None]:
# Build a neighbor-joining tree from the same distance matrix (reuses the
# `constructor` created in an earlier cell) and print it.
njtree_NOTCH2 = constructor.nj(dm1)
print(njtree_NOTCH2)

In [None]:
# Render the neighbor-joining tree as an ASCII dendrogram.
Phylo.draw_ascii(njtree_NOTCH2)

### DSTYK

In [None]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [None]:
# Lazily iterate over the FASTA records of the DSTYK sequence file. Passing
# the path lets SeqIO manage the file instead of leaving an open() handle
# that is never closed.
record2 = SeqIO.parse("blast_seqs_DSTYK.txt", format="fasta")
record2

In [None]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO

In [None]:
# Drain the record iterator into a list and wrap it as an alignment object.
# NOTE(review): MultipleSeqAlignment presumably requires equal-length
# sequences — confirm the input file is already aligned.
seqs = list(record2)

alin2 = MultipleSeqAlignment(seqs)
print(alin2)
print(alin2.get_alignment_length())

In [None]:
# Write the alignment to disk in FASTA format; the return value of
# AlignIO.write is kept in ficheirofasta2.
ficheirofasta2 = AlignIO.write(alin2, "resultados_alin_DSTYK", "fasta")

In [None]:
# Re-read the FASTA alignment and write it back out in Stockholm format.
converter_stock = AlignIO.parse("resultados_alin_DSTYK", "fasta")
record_seq = AlignIO.parse("resultados_alin_DSTYK", "fasta")

alinhamentos_stock = list(converter_stock)
ficheirostock = AlignIO.write(alinhamentos_stock, "resultados_align_DSTYK_stock.sth", "stockholm")

In [None]:
# Load the Stockholm file back as a single alignment and display it.
alignment2 = AlignIO.read("resultados_align_DSTYK_stock.sth", "stockholm")
print(alignment2)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceCalculator

In [None]:
# Compute the pairwise distance matrix of the alignment using the 'blosum62'
# model, then print it.
calculator = DistanceCalculator('blosum62')
dm2 = calculator.get_distance(alignment2)
print(dm2)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [None]:
# Build a UPGMA tree from the distance matrix.
constructor = DistanceTreeConstructor()
upgmatree_DSTYK = constructor.upgma(dm2)

In [None]:
# Print the textual representation of the UPGMA tree.
print(upgmatree_DSTYK)

In [None]:
from Bio import Phylo
# Render the UPGMA tree as an ASCII dendrogram.
Phylo.draw_ascii(upgmatree_DSTYK)

In [None]:
# Build a neighbor-joining tree from the same distance matrix (reuses the
# `constructor` created in an earlier cell) and print it.
njtree_DSTYK = constructor.nj(dm2)
print(njtree_DSTYK)

In [None]:
# Render the neighbor-joining tree as an ASCII dendrogram.
Phylo.draw_ascii(njtree_DSTYK)

### NEUROG3

In [None]:
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [None]:
# Lazily iterate over the FASTA records of the NEUROG3 sequence file. Passing
# the path lets SeqIO manage the file instead of leaving an open() handle
# that is never closed.
record3 = SeqIO.parse("blast_seqs_NEUROG3.txt", format="fasta")
record3

In [None]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO

In [None]:
# Drain the record iterator into a list and wrap it as an alignment object.
# NOTE(review): MultipleSeqAlignment presumably requires equal-length
# sequences — confirm the input file is already aligned.
seqs = list(record3)

alin3 = MultipleSeqAlignment(seqs)
print(alin3)
print(alin3.get_alignment_length())

In [None]:
# Write the alignment to disk in FASTA format; the return value of
# AlignIO.write is kept in ficheirofasta3.
ficheirofasta3 = AlignIO.write(alin3, "resultados_alin_NEUROG3", "fasta")

In [None]:
# Re-read the FASTA alignment and write it back out in Stockholm format.
converter_stock = AlignIO.parse("resultados_alin_NEUROG3", "fasta")
record_seq = AlignIO.parse("resultados_alin_NEUROG3", "fasta")

alinhamentos_stock = list(converter_stock)
ficheirostock = AlignIO.write(alinhamentos_stock, "resultados_align_NEUROG3_stock.sth", "stockholm")

In [None]:
# Load the Stockholm file back as a single alignment and display it.
alignment3 = AlignIO.read("resultados_align_NEUROG3_stock.sth", "stockholm")
print(alignment3)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceCalculator

In [None]:
# Compute the pairwise distance matrix of the alignment using the 'blosum62'
# model, then print it.
calculator = DistanceCalculator('blosum62')
dm3 = calculator.get_distance(alignment3)
print(dm3)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [None]:
# Build a UPGMA tree from the distance matrix.
constructor = DistanceTreeConstructor()
upgmatree_NEUROG3 = constructor.upgma(dm3)

In [None]:
# Print the textual representation of the UPGMA tree.
print(upgmatree_NEUROG3)

In [None]:
from Bio import Phylo
# Render the UPGMA tree as an ASCII dendrogram.
Phylo.draw_ascii(upgmatree_NEUROG3)

In [None]:
# Build a neighbor-joining tree from the same distance matrix (reuses the
# `constructor` created in an earlier cell) and print it.
njtree_NEUROG3 = constructor.nj(dm3)
print(njtree_NEUROG3)

In [None]:
# Render the neighbor-joining tree as an ASCII dendrogram.
Phylo.draw_ascii(njtree_NEUROG3)

In [None]:
# TODO: finish the domain-conservation part and more (worksheet 9)