## Análise da sequência e das features presentes no NCBI

In [1]:
from Bio import SeqIO
from Bio import Entrez

# NCBI asks for a contact e-mail on every E-utilities request.
Entrez.email = 'lucianacmartins8@gmail.com'

# Look up the human NEUROG3 nucleotide records; the handle is closed as soon as it is parsed.
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND NEUROG3[Gene]")
try:
    search_records = Entrez.read(search_handler)
finally:
    search_handler.close()

# Feature indices of the CDS entries of the target record (stays empty if it is never found).
indices_CDS = []
record_found = False

for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        fetch_records = SeqIO.parse(fetch_handler, 'genbank')
        for record in fetch_records:
            # Only the curated RefSeq transcript is of interest; skip every other hit.
            if record.id != "NM_020999.4":
                continue
            record_found = True

            print('Record accession: ', record.id)
            print('Record sequence length: ', len(record.seq))
            print('Record description: ', record.description)
            print('Record annotations: ', record.annotations)
            print('Record external references: ', record.dbxrefs)
            print('Record features count: ', len(record.features))
            print('Record features: ', record.features)

            # Collect every CDS index first, then print each CDS exactly once.
            # Printing inside the collecting loop would repeat earlier CDS
            # entries whenever a record carries more than one.
            for n, feature in enumerate(record.features):
                if feature.type == "CDS":
                    indices_CDS.append(n)
            for a in indices_CDS:
                print(record.features[a].qualifiers['product'])
                print(record.features[a].qualifiers['translation'])
            break
    finally:
        # Release the HTTP connection even if parsing fails midway.
        fetch_handler.close()

    if record_found:
        # The target accession has been reported; no need to download the remaining hits.
        break

if not record_found:
    print('Record NM_020999.4 was not found among the search results.')

Record accession:  NM_020999.4
Record sequence length:  1560
Record description:  Homo sapiens neurogenin 3 (NEUROG3), mRNA
Record annotations:  {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '25-DEC-2022', 'accessions': ['NM_020999'], 'sequence_version': 4, 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Protein Production and Purification of a Codon-Optimized Human NGN3 Transcription Factor from E. coli', ...), Reference(title='Extensive NEUROG3 occupancy in the human pancreatic endocrine gene regulatory network', ...), Reference(title='Gene Signatures of NEUROGENIN3+ Endocrine Progenitor Cells in the Human Pancreas', ...), Reference(title='Effect of NEUROG3 polymorphi

In [2]:
# NCBI asks for a contact e-mail on every E-utilities request.
Entrez.email = 'analisboasan@gmail.com'

# Look up the human NOTCH2 nucleotide records; the handle is closed as soon as it is parsed.
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND NOTCH2[Gene]")
try:
    search_records = Entrez.read(search_handler)
finally:
    search_handler.close()

# Feature indices of the CDS entries of the target record (stays empty if it is never found).
indices_CDS = []
record_found = False

for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        fetch_records = SeqIO.parse(fetch_handler, 'genbank')
        for record in fetch_records:
            # Only the curated RefSeq transcript is of interest; skip every other hit.
            if record.id != "NM_024408.4":
                continue
            record_found = True

            print('Record accession: ', record.id)
            print('Record sequence length: ', len(record.seq))
            print('Record description: ', record.description)
            print('Record annotations: ', record.annotations)
            print('Record external references: ', record.dbxrefs)
            print('Record features count: ', len(record.features))
            print('Record features: ', record.features)

            # Collect every CDS index first, then print each CDS exactly once.
            # Printing inside the collecting loop would repeat earlier CDS
            # entries whenever a record carries more than one.
            for n, feature in enumerate(record.features):
                if feature.type == "CDS":
                    indices_CDS.append(n)
            for a in indices_CDS:
                print(record.features[a].qualifiers['product'])
                print(record.features[a].qualifiers['translation'])
            break
    finally:
        # Release the HTTP connection even if parsing fails midway.
        fetch_handler.close()

    if record_found:
        # The target accession has been reported; no need to download the remaining hits.
        break

if not record_found:
    print('Record NM_024408.4 was not found among the search results.')

Record accession:  NM_024408.4
Record sequence length:  11425
Record description:  Homo sapiens notch receptor 2 (NOTCH2), transcript variant 1, mRNA
Record annotations:  {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '09-OCT-2022', 'accessions': ['NM_024408', 'XM_941698', 'XM_945379'], 'sequence_version': 4, 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Patients with biallelic GGC repeat expansions in NOTCH2NLC exhibiting a typical neuronal intranuclear inclusion disease phenotype', ...), Reference(title='Defining pathogenicity of NOTCH2 variants for diagnosis of Alagille syndrome type 2 using a large cohort of patients', ...), Reference(title='gamma-Secretase inhibit

In [3]:
# NCBI asks for a contact e-mail on every E-utilities request.
Entrez.email = 'bruna_dfa@hotmail.com'

# Look up the human DSTYK nucleotide records; the handle is closed as soon as it is parsed.
search_handler = Entrez.esearch(db = "nucleotide", term = "Homo sapiens[Orgn] AND DSTYK[Gene]")
try:
    search_records = Entrez.read(search_handler)
finally:
    search_handler.close()

# Feature indices of the CDS entries of the target record (stays empty if it is never found).
indices_CDS = []
record_found = False

for record_id in search_records['IdList']:
    fetch_handler = Entrez.efetch(db="nucleotide", id=record_id, rettype="gb", retmode="text")
    try:
        fetch_records = SeqIO.parse(fetch_handler, 'genbank')
        for record in fetch_records:
            # Only the curated RefSeq transcript is of interest; skip every other hit.
            if record.id != "NM_015375.3":
                continue
            record_found = True

            print('Record accession: ', record.id)
            print('Record sequence length: ', len(record.seq))
            print('Record description: ', record.description)
            print('Record annotations: ', record.annotations)
            print('Record external references: ', record.dbxrefs)
            print('Record features count: ', len(record.features))
            print('Record features: ', record.features)

            # Collect every CDS index first, then print each CDS exactly once.
            # Printing inside the collecting loop would repeat earlier CDS
            # entries whenever a record carries more than one.
            for n, feature in enumerate(record.features):
                if feature.type == "CDS":
                    indices_CDS.append(n)
            for a in indices_CDS:
                print(record.features[a].qualifiers['product'])
                print(record.features[a].qualifiers['translation'])
            break
    finally:
        # Release the HTTP connection even if parsing fails midway.
        fetch_handler.close()

    if record_found:
        # The target accession has been reported; no need to download the remaining hits.
        break

if not record_found:
    print('Record NM_015375.3 was not found among the search results.')

Record accession:  NM_015375.3
Record sequence length:  8010
Record description:  Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 1, mRNA
Record annotations:  {'molecule_type': 'mRNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '24-DEC-2022', 'accessions': ['NM_015375'], 'sequence_version': 3, 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='DSTYK inhibition increases the sensitivity of lung cancer cells to T cell-mediated cytotoxicity', ...), Reference(title='DSTYK Enhances Chemoresistance in Triple-Negative Breast Cancer Cells', ...), Reference(title='RIPK3 promotes adenovirus type 5 activity', ...), Reference(title='Large Intragenic Deletion in DST

## Análise de homologias por BLAST ou Diamond

### NEUROG3

In [4]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Pass the path so SeqIO opens and closes the FASTA file itself (no leaked open() handle).
# SeqIO.read expects exactly one record in the file.
record1 = SeqIO.read("rs177045_sequence.fa", format="fasta")

In [5]:
# Submit the FASTA-formatted query to NCBI's online BLAST service (blastn against "nt").
result_handle1 = NCBIWWW.qblast("blastn", "nt", record1.format("fasta"))

In [7]:
# Cache the raw BLAST XML on disk so later cells can re-parse it without repeating the query.
# Both the output file and the BLAST result handle are released even if the write fails.
try:
    with open("blast_variante_NEUROG3.xml", "w") as save_file1:
        save_file1.write(result_handle1.read())
finally:
    result_handle1.close()

In [8]:
# Reopen the cached BLAST XML for reading; the handle is consumed by the parsing loop in the next cell.
result_handle1 = open("blast_variante_NEUROG3.xml")

In [9]:
# Print the global search parameters of every BLAST record in the cached XML,
# then close the file handle opened in the previous cell so it is not leaked.
try:
    blast_records1 = NCBIXML.parse(result_handle1)
    for blast_record in blast_records1:
        print("*** Parâmetros Globais da Pesquisa ***")
        print("Base de Dados usada: " , blast_record.database)
        print("Matriz de Substituição: ", blast_record.matrix)
        print("Parâmetros de Espaçamento: ", blast_record.gap_penalties)
finally:
    result_handle1.close()

*** Parâmetros Globais da Pesquisa ***
Base de Dados usada:  nt
Matriz de Substituição:  
Parâmetros de Espaçamento:  (5, 2)


In [10]:
# Parse the single BLAST record from the cached XML; the file is closed as soon as parsing ends.
with open("blast_variante_NEUROG3.xml") as result_handle1:
    resultado1 = NCBIXML.read(result_handle1)
print(f"Existem {len(resultado1.alignments)} alinhamentos no registo.")

Existem 50 alinhamentos no registo.


In [15]:
# Show the first alignment in the result list (raises IndexError if there were no hits).
print(resultado1.alignments[0])

gi|14626972|emb|AL450311.11| Human DNA sequence from clone RP11-343J3 on chromosome 10, complete sequence
           Length = 165110



In [12]:
# For the first five alignments, report the accession, the hit description
# and the e-value of every HSP belonging to that hit.
for alignment in resultado1.alignments[:5]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

AL450311
Human DNA sequence from clone RP11-343J3 on chromosome 10, complete sequence
0.0
OW443374
Orcinus orca genome assembly, chromosome: 14
9.24774e-126
CP050625
Canis lupus familiaris breed Labrador retriever chromosome 04b
2.17871e-89
CP050572
Canis lupus familiaris breed Labrador retriever chromosome 04a
2.65421e-88
HG994386
Canis lupus genome assembly, chromosome: 4
2.65421e-88


### NOTCH2

In [16]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Pass the path so SeqIO opens and closes the FASTA file itself (no leaked open() handle).
# SeqIO.read expects exactly one record in the file.
record2 = SeqIO.read("rs1493694_sequence.fa", format="fasta")

In [17]:
# Submit the FASTA-formatted query to NCBI's online BLAST service (blastn against "nt").
result_handle2 = NCBIWWW.qblast("blastn", "nt", record2.format("fasta"))

In [18]:
# Cache the raw BLAST XML on disk so later cells can re-parse it without repeating the query.
# Both the output file and the BLAST result handle are released even if the write fails.
try:
    with open("blast_variante_NOTCH2.xml", "w") as save_file2:
        save_file2.write(result_handle2.read())
finally:
    result_handle2.close()

In [19]:
# Reopen the cached BLAST XML for reading; the handle is consumed by the parsing loop in the next cell.
result_handle2 = open("blast_variante_NOTCH2.xml")

In [20]:
# Print the global search parameters of every BLAST record in the cached XML,
# then close the file handle opened in the previous cell so it is not leaked.
try:
    blast_records2 = NCBIXML.parse(result_handle2)
    for blast_record in blast_records2:
        print("*** Parâmetros Globais da Pesquisa ***")
        print("Base de Dados usada: " , blast_record.database)
        print("Matriz de Substituição: ", blast_record.matrix)
        print("Parâmetros de Espaçamento: ", blast_record.gap_penalties)
finally:
    result_handle2.close()

*** Parâmetros Globais da Pesquisa ***
Base de Dados usada:  nt
Matriz de Substituição:  
Parâmetros de Espaçamento:  (5, 2)


In [21]:
# Parse the single BLAST record from the cached XML; the file is closed as soon as parsing ends.
with open("blast_variante_NOTCH2.xml") as result_handle2:
    resultado2 = NCBIXML.read(result_handle2)
print(f"Existem {len(resultado2.alignments)} alinhamentos no registo.")

Existem 50 alinhamentos no registo.


In [22]:
# Show the first alignment in the result list (raises IndexError if there were no hits).
print(resultado2.alignments[0])

gi|1601833237|gb|AC278627.1| Homo sapiens chromosome 1 clone VMRC62-112J14, complete sequence
           Length = 176732



In [23]:
# For the first ten alignments, report the accession, the hit description
# and the e-value of every HSP belonging to that hit.
for alignment in resultado2.alignments[:10]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

AC278627
Homo sapiens chromosome 1 clone VMRC62-112J14, complete sequence
0.0
AC278623
Homo sapiens chromosome 1 clone VMRC53-156D10, complete sequence
0.0
AC278619
Homo sapiens chromosome 1 clone VMRC62-388P04, complete sequence
0.0
AC278445
Homo sapiens chromosome 1 clone VMRC59-343K02, complete sequence
0.0
AC278334
Homo sapiens chromosome 1 clone VMRC62-166D15, complete sequence
0.0
NG_008163
Homo sapiens notch receptor 2 (NOTCH2), RefSeqGene on chromosome 1
0.0
AC245008
Homo sapiens BAC clone CH17-77K15 from chromosome 1, complete sequence
0.0
AL512503
Human DNA sequence from clone RP11-323K8 on chromosome 1, complete sequence
0.0
AC278816
Homo sapiens chromosome 1 clone VMRC64-514A21, complete sequence
0.0
AC278785
Homo sapiens chromosome 1 clone VMRC66-236N07, complete sequence
0.0


### DSTYK

In [24]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

# Pass the path so SeqIO opens and closes the FASTA file itself (no leaked open() handle).
# SeqIO.read expects exactly one record in the file.
record3 = SeqIO.read("rs12048743_sequence.fa", format="fasta")

In [25]:
# Submit the FASTA-formatted query to NCBI's online BLAST service (blastn against "nt").
result_handle3 = NCBIWWW.qblast("blastn", "nt", record3.format("fasta"))

In [26]:
# Cache the raw BLAST XML on disk so later cells can re-parse it without repeating the query.
# Both the output file and the BLAST result handle are released even if the write fails.
try:
    with open("blast_variante_DSTYK.xml", "w") as save_file3:
        save_file3.write(result_handle3.read())
finally:
    result_handle3.close()

In [27]:
# Reopen the cached BLAST XML for reading; the handle is consumed by the parsing loop in the next cell.
result_handle3 = open("blast_variante_DSTYK.xml")

In [28]:
# Print the global search parameters of every BLAST record in the cached XML,
# then close the file handle opened in the previous cell so it is not leaked.
try:
    blast_records3 = NCBIXML.parse(result_handle3)
    for blast_record in blast_records3:
        print("*** Parâmetros Globais da Pesquisa ***")
        print("Base de Dados usada: " , blast_record.database)
        print("Matriz de Substituição: ", blast_record.matrix)
        print("Parâmetros de Espaçamento: ", blast_record.gap_penalties)
finally:
    result_handle3.close()

*** Parâmetros Globais da Pesquisa ***
Base de Dados usada:  nt
Matriz de Substituição:  
Parâmetros de Espaçamento:  (5, 2)


In [29]:
# Parse the single BLAST record from the cached XML; the file is closed as soon as parsing ends.
with open("blast_variante_DSTYK.xml") as result_handle3:
    resultado3 = NCBIXML.read(result_handle3)
print(f"Existem {len(resultado3.alignments)} alinhamentos no registo.")

Existem 49 alinhamentos no registo.


In [30]:
# Show the first alignment in the result list (raises IndexError if there were no hits).
print(resultado3.alignments[0])

gi|1675076430|ref|NM_199462.3| Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 2, mRNA
           Length = 7875



In [36]:
# For the first five alignments, report the accession, the hit description
# and the e-value of every HSP belonging to that hit.
for alignment in resultado3.alignments[:5]:
    print(alignment.accession)
    print(alignment.hit_def)
    for hsp in alignment.hsps:
        print(hsp.expect)

NM_199462
Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 2, mRNA
0.0
7.13482e-45
3.0338e-43
0.0157433
NM_015375
Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant 1, mRNA
0.0
7.13482e-45
3.0338e-43
0.0157433
XM_047417152
PREDICTED: Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant X6, mRNA
0.0
7.13482e-45
3.0338e-43
0.0157433
XM_047417151
PREDICTED: Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant X5, mRNA
0.0
7.13482e-45
3.0338e-43
0.0157433
XM_011509394
PREDICTED: Homo sapiens dual serine/threonine and tyrosine protein kinase (DSTYK), transcript variant X4, mRNA
0.0
7.13482e-45
3.0338e-43
0.0157433


## Ferramentas de análise das propriedades da proteína

### PROTEIN NEUROG3

In [37]:
import Bio.SwissProt as sp

# Parse the single-entry Swiss-Prot flat file; the file is only needed while parsing.
with open("Q9Y4Z2.txt") as handle:
    record = sp.read(handle)

# Entry name, accessions, keywords and organism, each followed by a blank line.
for field in (record.entry_name, ", ".join(record.accessions), record.keywords, record.organism):
    print(field, "\n")

# Sequence length in amino acids, then the sequence itself.
print(len(record.sequence), "aa", "\n")
print(record.sequence)

NGN3_HUMAN 

Q9Y4Z2, Q5VVI0, Q6DJX6, Q9BY24 

['Activator', 'Developmental protein', 'Differentiation', 'Disease variant', 'DNA-binding', 'Neurogenesis', 'Nucleus', 'Reference proteome', 'Transcription', 'Transcription regulation'] 

Homo sapiens (Human). 

214 aa 

MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRGNCAEAEEGGCRGAPRKLRARRGGRSRPKSELALSKQRRSRRKKANDRERNRMHNLNSALDALRGVLPTFPDDAKLTKIETLRFAHNYIWALTQTLRIADHSLYALEPPAPHCGELGSPGGSPGDWGSLYSPVSQAGSLSPAASLEERPGLLGATFSACLSPGSLAFSDFL


### PROTEIN NOTCH2

In [38]:
# Parse the single-entry Swiss-Prot flat file; the file is only needed while parsing.
with open("Q04721.txt") as handle:
    record = sp.read(handle)

# Entry name, accessions, keywords and organism, each followed by a blank line.
for field in (record.entry_name, ", ".join(record.accessions), record.keywords, record.organism):
    print(field, "\n")

# Sequence length in amino acids, then the sequence itself.
print(len(record.sequence), "aa", "\n")
print(record.sequence)

NOTC2_HUMAN 

Q04721, Q5T3X7, Q99734, Q9H240 

['3D-structure', 'Activator', 'ANK repeat', 'Cell membrane', 'Cytoplasm', 'Developmental protein', 'Differentiation', 'Disease variant', 'Disulfide bond', 'EGF-like domain', 'Glycoprotein', 'Membrane', 'Notch signaling pathway', 'Nucleus', 'Phosphoprotein', 'Receptor', 'Reference proteome', 'Repeat', 'Signal', 'Transcription', 'Transcription regulation', 'Transmembrane', 'Transmembrane helix', 'Ubl conjugation'] 

Homo sapiens (Human). 

2471 aa 

MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTYHNGTGYCKCPEGFLGEYCQHRDPCEKNRCQNGGTCVAQAMLGKATCRCASGFTGEDCQYSTSHPCFVSRPCLNGGTCHMLSRDTYECTCQVGFTGKECQWTDACLSHPCANGSTCTTVANQFSCKCLTGFTGQKCETDVNECDIPGHCQHGGTCLNLPGSYQCQCPQGFTGQYCDSLYVPCAPSPCVNGGTCRQTGDFTFECNCLPGFEGSTCERNIDDCPNHRCQNGGVCVDGVNTYNCRCPPQWTGQFCTEDVDECLLQPNACQNGGTCANRNGGYGCVCVNGWSGDDCSENIDDCAFASCTPGSTCIDRVASFSCMCPEGKAGLLCHLDDACISNPCHKGALCDTNPLNGQYICTCPQGYKGADCTEDVDECAMANSNPCEHAGKCVNTDGAFHCECLKGYAGPRCEMDINECHSDPCQNDATCLDKIGGFTCLCMPGFKGVHCELEINECQSN

### PROTEIN DSTYK

In [None]:
# Read the Swiss-Prot entry and print the same summary as the NEUROG3 and NOTCH2 cells.
# sp.parse() returns a generator, so printing it shows only the generator object;
# sp.read() returns the record itself.
# NOTE(review): assumes Q6XUX3.txt holds exactly one entry - sp.read raises ValueError otherwise.
with open("Q6XUX3.txt") as handle:
    record = sp.read(handle)
    print(record.entry_name, "\n")
    print(", ".join(record.accessions), "\n")
    print(record.keywords, "\n")
    print(record.organism, "\n")
    print(len(record.sequence), "aa", "\n")
    print(record.sequence)

In [None]:
# Iterate over every Swiss-Prot entry in the file and print a short summary of each.
# The parser is lazy, so the loop must run while the file is still open.
with open("Q6XUX3.txt") as handle:
    records = sp.parse(handle)
    for record in records:
        summary_lines = (
            record.entry_name,
            ",".join(record.accessions),
            record.keywords,
            repr(record.organism),
            record.sequence[:20] + "...",  # first 20 residues only
        )
        for line in summary_lines:
            print(line)

## Alinhamento Múltiplo e Filogenia

### NOTCH2

In [39]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [40]:
# Pass the path so SeqIO manages the file itself instead of leaking an open() handle.
# The result is a lazy, single-use iterator; it is consumed when the alignment is built.
# NOTE: this rebinds record1, which earlier held the NEUROG3 variant SeqRecord.
record1 = SeqIO.parse("blast_seqs_NOTCH2 (1).txt", format="fasta")
record1

<Bio.SeqIO.FastaIO.FastaIterator at 0x29963346f10>

In [41]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO

In [42]:
# Materialise the lazy FASTA iterator into a list of SeqRecords.
# (The iterator is single-use: re-running this cell without recreating record1 yields no rows.)
seqs = list(record1)

# NOTE(review): MultipleSeqAlignment only wraps the records; it performs no alignment
# and presumably requires them to already share one length - confirm upstream.
alin1 = MultipleSeqAlignment(seqs)
print(alin1)
print(alin1.get_alignment_length())

Alignment with 10 rows and 2471 columns
MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTY...VYA NP_077719.2
MPALRPALPWALLALWLCCAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_009243797.2
MPALRPALHWALLALWLCCAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_030680572.1
MPALRPALHWALLALWLCCAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_032007484.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA NP_001247591.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_023078536.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA EHH50225.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_033054071.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_025230885.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_010358948.2
2471


In [43]:
# Write the alignment to a FASTA file. Per Biopython's AlignIO docs, write() returns the
# number of alignments written, so ficheirofasta1 holds a count rather than a file object.
ficheirofasta1 = AlignIO.write(alin1, "resultados_alin_NOTCH2", "fasta")

In [44]:
# Not consumed in this cell; the binding is kept in case it is referenced elsewhere.
record_seq = AlignIO.parse("resultados_alin_NOTCH2", "fasta")

# Re-read the FASTA alignment file and write its contents back out in Stockholm format.
converter_stock = AlignIO.parse("resultados_alin_NOTCH2", "fasta")
ficheirostock = AlignIO.write(list(converter_stock), "resultados_align_NOTCH2_stock.sth", "stockholm")

In [45]:
# Load the single alignment stored in the Stockholm file and display it.
alignment1 = AlignIO.read("resultados_align_NOTCH2_stock.sth", "stockholm")
print(alignment1)

Alignment with 10 rows and 2471 columns
MPALRPALLWALLALWLCCAAPAHALQCRDGYEPCVNEGMCVTY...VYA NP_077719.2
MPALRPALPWALLALWLCCAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_009243797.2
MPALRPALHWALLALWLCCAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_030680572.1
MPALRPALHWALLALWLCCAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_032007484.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA NP_001247591.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_023078536.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA EHH50225.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_033054071.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_025230885.1
MPALRPALLWALLALWLCRAAPARALQCRDGYEPCVNEGMCVTY...VYA XP_010358948.2


In [46]:
from Bio.Phylo.TreeConstruction import DistanceCalculator

In [47]:
# Compute the pairwise distance matrix of the alignment with the 'blosum62' model and print it.
calculator = DistanceCalculator('blosum62')
dm1 = calculator.get_distance(alignment1)
print(dm1)

NP_077719.2	0
XP_009243797.2	0.005280810675800995	0
XP_030680572.1	0.005921381179995744	0.007348220018548957	0
XP_032007484.1	0.006707099536211181	0.008134141990724264	0.002354284083612712	0
NP_001247591.1	0.008703716915174464	0.011486052650353185	0.011343368766497774	0.011700078476136078	0
XP_023078536.1	0.008420151277294119	0.011203082631654038	0.01127202682457018	0.012058508740635077	0.001997574373974409	0
EHH50225.1	0.008988443429875925	0.011770580681980336	0.011627906976744207	0.01198459123983453	0.00028534741047225864	0.002282779283777958	0
XP_033054071.1	0.008848294562580294	0.011631225916940213	0.011700078476136078	0.01248662147698898	0.002425626025540417	0.001355787070072778	0.002710800399486346	0
XP_025230885.1	0.009275114155251174	0.01170091324200917	0.011985446243846787	0.012842465753424626	0.001926232432046815	0.0025684931506849695	0.0022114424311598935	0.0029965753424657793	0
XP_010358948.2	0.010847070577321039	0.013207681873349086	0.013768994792038192	0.01455583303603280

In [48]:
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [49]:
# Build a UPGMA tree from the NOTCH2 distance matrix.
constructor = DistanceTreeConstructor()
upgmatree_NOTCH2 = constructor.upgma(dm1)

In [50]:
# Print the clade hierarchy of the UPGMA tree as text.
print(upgmatree_NOTCH2)

Tree(rooted=True)
    Clade(branch_length=0, name='Inner9')
        Clade(name='Inner6')
            Clade(name='XP_010358948.2')
            Clade(name='Inner5')
                Clade(name='Inner2')
                    Clade(name='XP_033054071.1')
                    Clade(name='XP_023078536.1')
                Clade(name='Inner3')
                    Clade(name='XP_025230885.1')
                    Clade(name='Inner1')
                        Clade(name='EHH50225.1')
                        Clade(name='NP_001247591.1')
        Clade(name='Inner8')
            Clade(name='Inner4')
                Clade(name='XP_032007484.1')
                Clade(name='XP_030680572.1')
            Clade(name='Inner7')
                Clade(name='XP_009243797.2')
                Clade(name='NP_077719.2')


In [51]:
from Bio import Phylo
# Render the UPGMA tree as an ASCII dendrogram.
Phylo.draw_ascii(upgmatree_NOTCH2)

                                      ________________ XP_010358948.2
                                     |
  ___________________________________|             _____ XP_033054071.1
 |                                   |        ____|
 |                                   |       |    |_____ XP_023078536.1
 |                                   |_______|
 |                                           |  ________ XP_025230885.1
 |                                           |_|
_|                                             |       , EHH50225.1
 |                                             |_______|
 |                                                     | NP_001247591.1
 |
 |                                                  __________ XP_032007484.1
 |                              ___________________|
 |                             |                   |__________ XP_030680572.1
 |_____________________________|
                               |       _______________________ XP_009243797.2
       

In [52]:
# Build a neighbor-joining tree from the same distance matrix, reusing the constructor
# created in an earlier cell, and print its clade hierarchy.
njtree_NOTCH2 = constructor.nj(dm1)
print(njtree_NOTCH2)

Tree(rooted=False)
    Clade(branch_length=0, name='Inner8')
        Clade(name='Inner7')
            Clade(name='Inner6')
                Clade(name='Inner4')
                    Clade(name='XP_010358948.2')
                    Clade(name='XP_033054071.1')
                Clade(name='XP_023078536.1')
            Clade(name='Inner3')
                Clade(name='NP_077719.2')
                Clade(name='Inner2')
                    Clade(name='Inner1')
                        Clade(name='XP_032007484.1')
                        Clade(name='XP_030680572.1')
                    Clade(name='XP_009243797.2')
        Clade(name='Inner5')
            Clade(name='EHH50225.1')
            Clade(name='NP_001247591.1')
        Clade(name='XP_025230885.1')


In [53]:
# Render the neighbor-joining tree as an ASCII dendrogram.
Phylo.draw_ascii(njtree_NOTCH2)

           __________ XP_010358948.2
      ____|
    _|    | XP_033054071.1
   | |
   | |__ XP_023078536.1
  _|
 | |                                _______ NP_077719.2
 | |                               |
 | |_______________________________|                  ________ XP_032007484.1
 |                                 |   ______________|
 |                                 |__|              |____ XP_030680572.1
_|                                    |
 |                                    |__________________ XP_009243797.2
 |
 |   _ EHH50225.1
 |__|
 |  | NP_001247591.1
 |
 |_____ XP_025230885.1



### DSTYK

In [54]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [55]:
# Pass the path so SeqIO manages the file itself instead of leaking an open() handle.
# The result is a lazy, single-use iterator; it is consumed when the alignment is built.
# NOTE: this rebinds record2, which earlier held the NOTCH2 variant SeqRecord.
record2 = SeqIO.parse("blast_seqs_DSTYK.txt", format="fasta")
record2

<Bio.SeqIO.FastaIO.FastaIterator at 0x29963619cd0>

In [56]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO

In [57]:
# Materialise the lazy FASTA iterator into a list of SeqRecords.
# (The iterator is single-use: re-running this cell without recreating record2 yields no rows.)
seqs = list(record2)

# NOTE(review): MultipleSeqAlignment only wraps the records; it performs no alignment
# and presumably requires them to already share one length - confirm upstream.
alin2 = MultipleSeqAlignment(seqs)
print(alin2)
print(alin2.get_alignment_length())

Alignment with 10 rows and 929 columns
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST NP_056190.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_004028290.1
MEGDGVPWGSEPVSGPGPGGGGTIRELCRGFGRYRRYLGRLRQN...DST XP_003823003.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST AIC62720.1
MEGDGVPWGGEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_031992657.1
MEGDGVPWGSEPVSGPGRGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_030668989.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_012314775.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_017380922.1
MEGDGVPWRSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_003938375.1
MEGNEVPWGSEPVSGPGPGPGGMIRELCRGFGRYRRYLGRLRQN...DST XP_023042634.1
929


In [58]:
# Write the alignment to a FASTA file. Per Biopython's AlignIO docs, write() returns the
# number of alignments written, so ficheirofasta2 holds a count rather than a file object.
ficheirofasta2 = AlignIO.write(alin2, "resultados_alin_DSTYK", "fasta")

In [59]:
# Not consumed in this cell; the binding is kept in case it is referenced elsewhere.
record_seq = AlignIO.parse("resultados_alin_DSTYK", "fasta")

# Re-read the FASTA alignment file and write its contents back out in Stockholm format.
converter_stock = AlignIO.parse("resultados_alin_DSTYK", "fasta")
ficheirostock = AlignIO.write(list(converter_stock), "resultados_align_DSTYK_stock.sth", "stockholm")

In [60]:
# Load the single alignment stored in the Stockholm file and display it.
alignment2 = AlignIO.read("resultados_align_DSTYK_stock.sth", "stockholm")
print(alignment2)

Alignment with 10 rows and 929 columns
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST NP_056190.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_004028290.1
MEGDGVPWGSEPVSGPGPGGGGTIRELCRGFGRYRRYLGRLRQN...DST XP_003823003.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST AIC62720.1
MEGDGVPWGGEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_031992657.1
MEGDGVPWGSEPVSGPGRGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_030668989.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_012314775.1
MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_017380922.1
MEGDGVPWRSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQN...DST XP_003938375.1
MEGNEVPWGSEPVSGPGPGPGGMIRELCRGFGRYRRYLGRLRQN...DST XP_023042634.1


In [61]:
from Bio.Phylo.TreeConstruction import DistanceCalculator

In [62]:
# Compute the pairwise distance matrix of the alignment with the 'blosum62' model and print it.
calculator = DistanceCalculator('blosum62')
dm2 = calculator.get_distance(alignment2)
print(dm2)

NP_056190.1	0
XP_004028290.1	0.004512820512820537	0
XP_003823003.1	0.005333333333333301	0.002874153151303682	0
AIC62720.1	0.005948717948717985	0.0055430096489427205	0.006364196263600852	0
XP_031992657.1	0.009219422249538978	0.007580413849620937	0.008399918049579957	0.011063306699446884	0
XP_030668989.1	0.009432027885995509	0.0077916752101702125	0.00861185154808286	0.011277424646298995	0.003073140749846326	0
XP_012314775.1	0.015993438589296693	0.014353085913471397	0.015173262251384045	0.01783883534960018	0.01167793484941615	0.011892556899733453	0
XP_017380922.1	0.01680672268907568	0.01516704242672684	0.01598688255790126	0.0186513629842181	0.01208768694939566	0.012707522033203578	0.0034843205574912606	0
XP_003938375.1	0.018446402951424523	0.01680672268907568	0.017626562820250102	0.020291043246566942	0.0137266953493137	0.01434720229555242	0.005124000819840102	0.0030744004919041057	0
XP_023042634.1	0.023379819524200207	0.021739130434782594	0.0225594749794914	0.02522559474979491	0.019258348

In [63]:
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [64]:
# Build a UPGMA tree from the DSTYK distance matrix.
constructor = DistanceTreeConstructor()
upgmatree_DSTYK = constructor.upgma(dm2)

In [65]:
# Print the clade hierarchy of the UPGMA tree as text.
print(upgmatree_DSTYK)

Tree(rooted=True)
    Clade(branch_length=0, name='Inner9')
        Clade(name='XP_023042634.1')
        Clade(name='Inner8')
            Clade(name='Inner4')
                Clade(name='Inner3')
                    Clade(name='XP_003938375.1')
                    Clade(name='XP_017380922.1')
                Clade(name='XP_012314775.1')
            Clade(name='Inner7')
                Clade(name='Inner2')
                    Clade(name='XP_030668989.1')
                    Clade(name='XP_031992657.1')
                Clade(name='Inner6')
                    Clade(name='AIC62720.1')
                    Clade(name='Inner5')
                        Clade(name='Inner1')
                            Clade(name='XP_003823003.1')
                            Clade(name='XP_004028290.1')
                        Clade(name='NP_056190.1')


In [66]:
from Bio import Phylo
# Render the UPGMA tree as an ASCII dendrogram.
Phylo.draw_ascii(upgmatree_DSTYK)

  _____________________________________ XP_023042634.1
 |
 |                                                 ____ XP_003938375.1
_|                                              __|
 |                            _________________|  |____ XP_017380922.1
 |                           |                 |
 |                           |                 |_______ XP_012314775.1
 |___________________________|
                             |                           _____ XP_030668989.1
                             |                __________|
                             |               |          |_____ XP_031992657.1
                             |_______________|
                                             |      __________ AIC62720.1
                                             |     |
                                             |_____|      ____ XP_003823003.1
                                                   |  ___|
                                                   |_|   |____ XP_004028

In [67]:
# Build a neighbor-joining tree from the same distance matrix, reusing the constructor
# created in an earlier cell, and print its clade hierarchy.
njtree_DSTYK = constructor.nj(dm2)
print(njtree_DSTYK)

Tree(rooted=False)
    Clade(branch_length=0, name='Inner8')
        Clade(name='Inner6')
            Clade(name='AIC62720.1')
            Clade(name='NP_056190.1')
        Clade(name='Inner7')
            Clade(name='XP_003823003.1')
            Clade(name='XP_004028290.1')
        Clade(name='Inner5')
            Clade(name='Inner3')
                Clade(name='XP_023042634.1')
                Clade(name='Inner2')
                    Clade(name='Inner1')
                        Clade(name='XP_003938375.1')
                        Clade(name='XP_017380922.1')
                    Clade(name='XP_012314775.1')
            Clade(name='Inner4')
                Clade(name='XP_030668989.1')
                Clade(name='XP_031992657.1')


In [68]:
# Render the neighbor-joining tree as an ASCII dendrogram.
Phylo.draw_ascii(njtree_DSTYK)

    __________ AIC62720.1
  _|
 | |______ NP_056190.1
 |
 |_____ XP_003823003.1
 |
 |__ XP_004028290.1
_|
 |              ______________________________________________ XP_023042634.1
 |             |
 |           __|                        ______ XP_003938375.1
 |          |  |                    ___|
 |          |  |___________________|   |_ XP_017380922.1
 |__________|                      |
            |                      |____ XP_012314775.1
            |
            |   ____ XP_030668989.1
            |__|
               |___ XP_031992657.1



### NEUROG3

In [69]:
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [70]:
# Pass the path so SeqIO manages the file itself instead of leaking an open() handle.
# The result is a lazy, single-use iterator; it is consumed when the alignment is built.
# NOTE: this rebinds record3, which earlier held the DSTYK variant SeqRecord.
record3 = SeqIO.parse("blast_seqs_NEUROG3.txt", format="fasta")
record3

<Bio.SeqIO.FastaIO.FastaIterator at 0x29963779f40>

In [71]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO

In [72]:
# Materialise the lazy FASTA iterator into a list of SeqRecords.
# (The iterator is single-use: re-running this cell without recreating record3 yields no rows.)
seqs = list(record3)

# NOTE(review): MultipleSeqAlignment only wraps the records; it performs no alignment
# and presumably requires them to already share one length - confirm upstream.
alin3 = MultipleSeqAlignment(seqs)
print(alin3)
print(alin3.get_alignment_length())

Alignment with 8 rows and 214 columns
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL NP_066279.2
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL AKI72019.1
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL XP_003312656.3
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL XP_004049566.3
MTPQPSGAPTVQVTRETERSFPRASDDEVTCPTSAPPSPTRTRG...DFL XP_002820920.1
MTPQPSGAPTVQLTRETEQSFPRASDDEVTCPTSAAPSPTRTRG...DFL XP_031994867.1
MTPQPWGAPTVQVTRETEQSFPRASDDEVTCPTSAAPSPTRTRG...DFL XP_030653989.1
MAPHPSCAPAVQVTHQTEQPFPSAPEDKVTCVASAPPSPTRVPG...DFL XP_020928083.1
214


In [73]:
# Write the alignment to a FASTA file. Per Biopython's AlignIO docs, write() returns the
# number of alignments written, so ficheirofasta3 holds a count rather than a file object.
ficheirofasta3 = AlignIO.write(alin3, "resultados_alin_NEUROG3", "fasta")

In [74]:
# Not consumed in this cell; the binding is kept in case it is referenced elsewhere.
record_seq = AlignIO.parse("resultados_alin_NEUROG3", "fasta")

# Re-read the FASTA alignment file and write its contents back out in Stockholm format.
converter_stock = AlignIO.parse("resultados_alin_NEUROG3", "fasta")
ficheirostock = AlignIO.write(list(converter_stock), "resultados_align_NEUROG3_stock.sth", "stockholm")

In [75]:
# Load the single alignment stored in the Stockholm file and display it.
alignment3 = AlignIO.read("resultados_align_NEUROG3_stock.sth", "stockholm")
print(alignment3)

Alignment with 8 rows and 214 columns
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL NP_066279.2
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL AKI72019.1
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL XP_003312656.3
MTPQPSGAPTVQVTRETERSFPRASEDEVTCPTSAPPSPTRTRG...DFL XP_004049566.3
MTPQPSGAPTVQVTRETERSFPRASDDEVTCPTSAPPSPTRTRG...DFL XP_002820920.1
MTPQPSGAPTVQLTRETEQSFPRASDDEVTCPTSAAPSPTRTRG...DFL XP_031994867.1
MTPQPWGAPTVQVTRETEQSFPRASDDEVTCPTSAAPSPTRTRG...DFL XP_030653989.1
MAPHPSCAPAVQVTHQTEQPFPSAPEDKVTCVASAPPSPTRVPG...DFL XP_020928083.1


In [76]:
from Bio.Phylo.TreeConstruction import DistanceCalculator

In [77]:
# Build the pairwise distance matrix for the alignment, scoring with the
# 'blosum62' model, and print it.
calculator = DistanceCalculator(model='blosum62')
dm3 = calculator.get_distance(alignment3)
print(dm3)

NP_066279.2	0
AKI72019.1	0.005338078291814985	0
XP_003312656.3	0.01690391459074736	0.021371326803205748	0
XP_004049566.3	0.020462633451957313	0.024933214603739984	0.0035714285714285587	0
XP_002820920.1	0.03113879003558717	0.0356188780053428	0.02049910873440286	0.024064171122994638	0
XP_031994867.1	0.05427046263345192	0.058771148708815724	0.038358608385370196	0.041926851025869794	0.0267379679144385	0
XP_030653989.1	0.06554472984942428	0.07085916740478304	0.05225863596102742	0.055801594331266635	0.039858281665190454	0.019486271036315284	0
XP_020928083.1	0.1777777777777778	0.1831111111111111	0.1804444444444444	0.17955555555555558	0.1697777777777778	0.18933333333333335	0.19574844995571306	0
	NP_066279.2	AKI72019.1	XP_003312656.3	XP_004049566.3	XP_002820920.1	XP_031994867.1	XP_030653989.1	XP_020928083.1


In [78]:
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [79]:
# Distance-based tree builder; the same instance is reused later for the
# neighbour-joining tree.
constructor = DistanceTreeConstructor()

# UPGMA tree computed from the pairwise distance matrix.
upgmatree_NEUROG3 = constructor.upgma(distance_matrix=dm3)

In [80]:
# Print the textual (nested clade) representation of the UPGMA tree.
print(upgmatree_NEUROG3)

Tree(rooted=True)
    Clade(branch_length=0, name='Inner7')
        Clade(name='XP_020928083.1')
        Clade(name='Inner6')
            Clade(name='Inner3')
                Clade(name='XP_030653989.1')
                Clade(name='XP_031994867.1')
            Clade(name='Inner5')
                Clade(name='XP_002820920.1')
                Clade(name='Inner4')
                    Clade(name='Inner1')
                        Clade(name='XP_004049566.3')
                        Clade(name='XP_003312656.3')
                    Clade(name='Inner2')
                        Clade(name='AKI72019.1')
                        Clade(name='NP_066279.2')


In [81]:
from Bio import Phylo
# Render the UPGMA tree as an ASCII-art dendrogram.
Phylo.draw_ascii(upgmatree_NEUROG3)

  ___________________________________________________ XP_020928083.1
 |
_|                                                   _____ XP_030653989.1
 |                                            ______|
 |                                           |      |_____ XP_031994867.1
 |___________________________________________|
                                             |     _______ XP_002820920.1
                                             |    |
                                             |____|           _ XP_004049566.3
                                                  |      ____|
                                                  |     |    |_ XP_003312656.3
                                                  |_____|
                                                        |     _ AKI72019.1
                                                        |____|
                                                             |_ NP_066279.2



In [82]:
# Neighbour-joining tree built from the same distance matrix with the
# previously created constructor, then printed as nested clades.
njtree_NEUROG3 = constructor.nj(distance_matrix=dm3)
print(njtree_NEUROG3)

Tree(rooted=False)
    Clade(branch_length=0, name='Inner6')
        Clade(name='Inner4')
            Clade(name='Inner2')
                Clade(name='NP_066279.2')
                Clade(name='AKI72019.1')
            Clade(name='Inner3')
                Clade(name='XP_004049566.3')
                Clade(name='XP_003312656.3')
        Clade(name='Inner5')
            Clade(name='Inner1')
                Clade(name='XP_030653989.1')
                Clade(name='XP_031994867.1')
            Clade(name='XP_002820920.1')
        Clade(name='XP_020928083.1')


In [83]:
# Render the neighbour-joining tree as an ASCII-art dendrogram.
Phylo.draw_ascii(njtree_NEUROG3)

         , NP_066279.2
     ____|
    |    |_ AKI72019.1
  __|
 |  | , XP_004049566.3
 |  |_|
 |    | XP_003312656.3
 |
_|         _____ XP_030653989.1
 | _______|
 ||       | XP_031994867.1
 ||
 ||_ XP_002820920.1
 |
 |_____________________________________________________________ XP_020928083.1



In [84]:
# TODO: finish the domain-conservation part of the analysis (worksheet 9)

## Regulação