## Análise da literatura

In [2]:
from Bio import Entrez

In [3]:
Entrez.email = "pg49837@uminho.pt"
handle = Entrez.einfo()

In [4]:
result = Entrez.read(handle)
#handle.close()

In [5]:
result["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

In [6]:
handle = Entrez.einfo(db="pubmed")
record = Entrez.read(handle)
record["DbInfo"]["Description"]

'PubMed bibliographic record'

In [7]:
for field in record["DbInfo"]["FieldList"]:
    print("%(Name)s, %(FullName)s, %(Description)s" %field)

ALL, All Fields, All terms from all searchable fields
UID, UID, Unique number assigned to publication
FILT, Filter, Limits the records
TITL, Title, Words in title of publication
WORD, Text Word, Free text associated with publication
MESH, MeSH Terms, Medical Subject Headings assigned to publication
MAJR, MeSH Major Topic, MeSH terms of major importance to publication
AUTH, Author, Author(s) of publication
JOUR, Journal, Journal abbreviation of publication
AFFL, Affiliation, Author's institutional affiliation and address
ECNO, EC/RN Number, EC number for enzyme or CAS registry number
SUBS, Supplementary Concept, CAS chemical name or MEDLINE Substance Name
PDAT, Date - Publication, Date of publication
EDAT, Date - Entrez, Date publication first accessible through Entrez
VOL, Volume, Volume number of publication
PAGE, Pagination, Page number(s) of publication
PTYP, Publication Type, Type of publication (e.g., review)
LANG, Language, Language of publication
ISS, Issue, Issue number of publ

In [8]:
handle = Entrez.esearch(db = "pubmed", term = "MTNR1B[title]", retmax ="40")
record = Entrez.read(handle)

In [9]:
id_List = record["IdList"]

In [10]:
record["Count"]

'104'

In [11]:
handle = Entrez.esearch(db="nucleotide", term = "Homo sapiens[Orgn] AND MTNR1B[Gene]", idtype='acc')

In [12]:
record = Entrez.read(handle)

In [13]:
record["IdList"]
#record["IdList"][2] refere-se ao genoma humano

['NM_005959.5', 'NG_028160.1', 'NC_060935.1', 'NC_000011.10', 'XM_011542839.3', 'XM_017017777.2', 'AF467654.1', 'AY114100.1', 'CM000262.1', 'CH471065.1', 'BC069163.1', 'AY408030.1', 'U25341.1', 'AY521019.1']

In [15]:
from Bio import Medline
handle = Entrez.efetch(db = "pubmed", id= id_List, rettype = "medline", retmode = "text")
records = Medline.parse(handle)
print()

for record in records :
    print("Titulo:", record.get("TI", "Vazio"))                                     
    print()
    print("Abstract:", record.get("AB", "Vazio"))
    print("Autores:", record.get("AU", "Vazio"))
    print("Fonte:", record.get("SO", "Vazio"))
    print("="*140)
    print()


Titulo: The rs10830963 Polymorphism of the MTNR1B Gene: Association With Abnormal Glucose, Insulin and C-peptide Kinetics.

Abstract: BACKGROUND: The MTNR1B gene encodes a receptor for melatonin, a hormone regulating biorhythms. Disruptions in biorhythms contribute to the development of type 2 diabetes mellitus (T2DM). Genetic studies suggest that variability in the MTNR1B gene affects T2DM development. Our aim was to compare the distribution of the genetic variant rs10830963 between persons differing in glucose tolerance in a sample of the Czech population (N=1206). We also evaluated possible associations of the polymorphism with insulin sensitivity, beta cell function, with the shape of glucose, insulin and C-peptide trajectories measured 7 times during a 3-hour oral glucose tolerance test (OGTT) and with glucagon response. In a subgroup of 268 volunteers we also evaluated sleep patterns and biorhythm. RESULTS: 13 persons were diagnosed with T2DM, 119 had impaired fasting blood gluc

In [16]:
handle = Entrez.esearch(db="nucleotide", term = "Homo sapiens[Orgn] AND MTNR1B[Gene]", idtype='acc')

In [17]:
record = Entrez.read(handle)

In [18]:
record["IdList"]

['NM_005959.5', 'NG_028160.1', 'NC_060935.1', 'NC_000011.10', 'XM_011542839.3', 'XM_017017777.2', 'AF467654.1', 'AY114100.1', 'CM000262.1', 'CH471065.1', 'BC069163.1', 'AY408030.1', 'U25341.1', 'AY521019.1']

## Análise da sequência e das features presentes no NCBI

a) Aceder ao NCBI e guardar os ficheiros correspondentes aos genes escolhidos, podendo explorar possíveis variantes

In [19]:
from Bio import Seq
from Bio import SeqIO

In [34]:
record = SeqIO.read("MTNR1B.gb", "genbank")
record

SeqRecord(seq=Seq('CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAG...GAA'), id='NC_000011.10', name='NC_000011', description='Homo sapiens chromosome 11, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [35]:
record.seq

Seq('CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAG...GAA')

In [36]:
print(record.id)
print(record.name)
print(record.description)
print(record.dbxrefs)
print(len(record.annotations) )
print(record.annotations["source"] )
print(len(record.features))

NC_000011.10
NC_000011
Homo sapiens chromosome 11, GRCh38.p14 Primary Assembly
['BioProject:PRJNA168', 'Assembly:GCF_000001405.40']
13
Homo sapiens (human)
16


In [37]:
len(record)

15310

b) Verificar as anotações dos genes de interesse

In [38]:
record.annotations

{'molecule_type': 'DNA',
 'topology': 'linear',
 'data_file_division': 'CON',
 'date': '06-APR-2022',
 'accessions': ['NC_000011', 'REGION:', '92969651..92984960'],
 'sequence_version': 10,
 'keywords': ['RefSeq'],
 'source': 'Homo sapiens (human)',
 'organism': 'Homo sapiens',
 'taxonomy': ['Eukaryota',
  'Metazoa',
  'Chordata',
  'Craniata',
  'Vertebrata',
  'Euteleostomi',
  'Mammalia',
  'Eutheria',
  'Euarchontoglires',
  'Primates',
  'Haplorrhini',
  'Catarrhini',
  'Hominidae',
  'Homo'],
 'references': [Reference(title='Human chromosome 11 DNA sequence and analysis including novel gene identification', ...),
  Reference(title='Finishing the euchromatic sequence of the human genome', ...),
  Reference(title='Initial sequencing and analysis of the human genome', ...)],
 'comment': 'REFSEQ INFORMATION: The reference sequence is identical to\nCM000673.2.\nOn Feb 3, 2014 this sequence version replaced NC_000011.9.\nAssembly Name: GRCh38.p14 Primary Assembly\nThe DNA sequence is c

c) Verificar e analisar a informação complementar fornecida pela lista de features e seus 
qualifiers

In [39]:
# print(record.features)
for feat in record.features:
    print("-->" , feat)
print(f"Número de features: {len(record.features)}")

--> type: source
location: [0:15310](+)
qualifiers:
    Key: chromosome, Value: ['11']
    Key: db_xref, Value: ['taxon:9606']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Homo sapiens']

--> type: gene
location: [0:15310](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:4544', 'HGNC:HGNC:7464', 'MIM:600804']
    Key: gene, Value: ['MTNR1B']
    Key: gene_synonym, Value: ['FGQTL2; MEL-1B-R; MT2']
    Key: note, Value: ['melatonin receptor 1B; Derived by automated computational analysis using gene prediction method: BestRefSeq,Gnomon.']

--> type: mRNA
location: join{[0:298](+), [11796:13132](+)}
qualifiers:
    Key: db_xref, Value: ['Ensembl:ENST00000257068.3', 'GeneID:4544', 'HGNC:HGNC:7464', 'MIM:600804']
    Key: gene, Value: ['MTNR1B']
    Key: gene_synonym, Value: ['FGQTL2; MEL-1B-R; MT2']
    Key: note, Value: ['Derived by automated computational analysis using gene prediction method: BestRefSeq.']
    Key: product, Value: ['melatonin receptor 1B']
    Ke

i) Localização e tipo

In [40]:
from Bio.SeqFeature import SeqFeature, FeatureLocation
for feat in record.features:
    print(feat.type)
    print(feat.location)

source
[0:15310](+)
gene
[0:15310](+)
mRNA
join{[0:298](+), [11796:13132](+)}
mRNA
join{[75:298](+), [11796:13073](+), [15156:15310](+)}
CDS
join{[75:298](+), [11796:12662](+)}
misc_feature
[84:87](+)
misc_feature
[201:264](+)
misc_feature
[11801:11864](+)
misc_feature
[11918:11981](+)
misc_feature
[12038:12101](+)
misc_feature
[12173:12236](+)
misc_feature
[12332:12395](+)
misc_feature
[12434:12497](+)
CDS
join{[75:298](+), [11796:12662](+)}
mRNA
join{[7508:8137](+), [11796:13073](+), [15156:15310](+)}
CDS
join{[8040:8137](+), [11796:12662](+)}


ii ) Regiões codificantes

In [41]:
featcds = [ ]
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        featcds.append(i)
for k in featcds:
    print (record.features[k].location)
for k in featcds:
    print (record.features[k].extract(record.seq))

join{[75:298](+), [11796:12662](+)}
join{[75:298](+), [11796:12662](+)}
join{[8040:8137](+), [11796:12662](+)}
ATGTCAGAGAACGGCTCCTTCGCCAACTGCTGCGAGGCGGGCGGGTGGGCAGTGCGCCCGGGCTGGTCGGGGGCTGGCAGCGCGCGGCCCTCCAGGACCCCTCGACCTCCCTGGGTGGCTCCAGCGCTGTCCGCGGTGCTCATCGTCACCACCGCCGTGGACGTCGTGGGCAACCTCCTGGTGATCCTCTCCGTGCTCAGGAACCGCAAGCTCCGGAACGCAGGTAATTTGTTCTTGGTGAGTCTGGCATTGGCTGACCTGGTGGTGGCCTTCTACCCCTACCCGCTAATCCTCGTGGCCATCTTCTATGACGGCTGGGCCCTGGGGGAGGAGCACTGCAAGGCCAGCGCCTTTGTGATGGGCCTGAGCGTCATCGGCTCTGTCTTCAATATCACTGCCATCGCCATTAACCGCTACTGCTACATCTGCCACAGCATGGCCTACCACCGAATCTACCGGCGCTGGCACACCCCTCTGCACATCTGCCTCATCTGGCTCCTCACCGTGGTGGCCTTGCTGCCCAACTTCTTTGTGGGGTCCCTGGAGTACGACCCACGCATCTATTCCTGCACCTTCATCCAGACCGCCAGCACCCAGTACACGGCGGCAGTGGTGGTCATCCACTTCCTCCTCCCTATCGCTGTCGTGTCCTTCTGCTACCTGCGCATCTGGGTGCTGGTGCTTCAGGCCCGCAGGAAAGCCAAGCCAGAGAGCAGGCTGTGCCTGAAGCCCAGCGACTTGCGGAGCTTTCTAACCATGTTTGTGGTGTTTGTGATCTTTGCCATCTGCTGGGCTCCACTTAACTGCATCGGCCTCGCTGTGGCCATCAACCCCCAAGAAATGGCTCCCCAGATCCCTGAGGGGCTATTTGTCACTAGCTACTTACTGG

In [42]:
featcds

[4, 13, 15]

iii ) Proteína codificada e seu significado biológico (anotações do gene)

In [43]:
from Bio.SeqFeature import SeqFeature, FeatureLocation
for feat in record.features:
    if feat.type == 'CDS':
        print(feat.qualifiers['product'])

['melatonin receptor type 1B']
['melatonin receptor type 1B isoform X2']
['melatonin receptor type 1B isoform X1']


In [44]:
for feat in record.features:
    if feat.type == 'gene':
        print(feat.qualifiers["note"])

['melatonin receptor 1B; Derived by automated computational analysis using gene prediction method: BestRefSeq,Gnomon.']


In [45]:
for feat in record.features:
    if feat.type == "CDS":
        print(feat)

type: CDS
location: join{[75:298](+), [11796:12662](+)}
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['CCDS:CCDS8290.1', 'Ensembl:ENSP00000257068.2', 'GeneID:4544', 'HGNC:HGNC:7464', 'MIM:600804']
    Key: gene, Value: ['MTNR1B']
    Key: gene_synonym, Value: ['FGQTL2; MEL-1B-R; MT2']
    Key: note, Value: ['Derived by automated computational analysis using gene prediction method: BestRefSeq.']
    Key: product, Value: ['melatonin receptor type 1B']
    Key: protein_id, Value: ['NP_005950.1']
    Key: translation, Value: ['MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSAVLIVTTAVDVVGNLLVILSVLRNRKLRNAGNLFLVSLALADLVVAFYPYPLILVAIFYDGWALGEEHCKASAFVMGLSVIGSVFNITAIAINRYCYICHSMAYHRIYRRWHTPLHICLIWLLTVVALLPNFFVGSLEYDPRIYSCTFIQTASTQYTAAVVVIHFLLPIAVVSFCYLRIWVLVLQARRKAKPESRLCLKPSDLRSFLTMFVVFVIFAICWAPLNCIGLAVAINPQEMAPQIPEGLFVTSYLLAYFNSCLNAIVYGLLNQNFRREYKRILLALWNPRHCIQDASKGSHAEGLQSPAPPIIGVQHQADAL']

type: CDS
location: join{[75:298](+), [11796:12662](+)}
qualifiers:
    Key: c

iv) Converter a FASTA

In [47]:
from Bio import SeqIO
records = SeqIO.parse("MTNR1B.gb","genbank")
count = SeqIO.write(records, "MTNR1B.fasta","fasta")
print(f'Foi convertido {count} registo.')

Foi convertido 1 registo.


## Análise de homologias por BLAST

##### Nesta parte também devemos ter que ir à base de dados NCBI para confirmar os resultados (rever aula 5)

As ferramentas de procura de homologias serão de especial relevo, nomeadamente para a 
procura de genes homólogos, bem como para a caracterização funcional dos genes 
selecionados. No primeiro caso, deverá configurar adequadamente as suas pesquisas ao nível 
da base de dados e desenvolver código para automatizar a decisão de existência de homologias 
significativas. No segundo caso, poderá analisar a lista de sequências homólogas e identificar 
padrões consistentes ao nível da função desempenhada por estas

In [48]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [49]:
record = SeqIO.read(open("MTNR1B.fasta"), format= "fasta")
print(len(record.seq))

15310


In [50]:
result_handle = NCBIWWW.qblast("blastn","nt", record.seq)

In [51]:
save_file = open("MTNR1B_b.xml","w")
save_file.write(result_handle.read())
save_file.close()
result_handle.close()

In [52]:
result_handle = open("MTNR1B_b.xml")
blast_record = NCBIXML.parse(result_handle)
for br in blast_record:
    print(f"Database: {br.database}")
    print(f"Gap penalty:{br.gap_penalties}")

Database: nt
Gap penalty:(5, 2)


Número de alinhamentos do registo

Acession number, ID do hit, definicao

nºs de HSP (high scoring pair) do alinhamento, e-value, score, tamanho do alinhamento, numero de caracteres iguais

In [53]:
print(len(br.alignments))

0


In [54]:
for br_x in br.alignments:
    print(f"Acession number: {br_x.accession}")
    print(f"ID do hit: {br_x.hit_id}")
    print(f"Definição: {br_x.hit_def}")
    print(f"HSP: {br_x.hsps}")
    break
    #???

In [55]:
for alignment in br.alignments:
    for hsp in alignment.hsps:
        print("        ***ALINHAMENTO***")
        print(f"E-value: {hsp.expect}")
        print(f"Score: {hsp.score}")
        print(f"Tamanho: {hsp.align_length}")
        print(f"Caracteres iguais: {len(hsp.match)}")
        print("Query " + hsp.query[100:200] + "...")
        print("Match " + hsp.match[100:200] + "...")
        print("Sbjct " + hsp.sbjct[100:200] + "...")
        print()

In [56]:
from Bio import SearchIO

In [57]:
blastq_result = SearchIO.read("MTNR1B_b.xml", "blast-xml")
print(blastq_result)

Program: blastn (2.13.0+)
  Query: No (15310)
         definition line
 Target: nt
   Hits: 0


In [58]:
#Primeiro hit, primeiro HSP
blast_hsp = blastq_result[0][0]    
print(blast_hsp)

IndexError: list index out of range

In [59]:
blast_hsp.query_range

NameError: name 'blast_hsp' is not defined

In [None]:
blast_hsp.evalue

In [None]:
#Três primeiros hsp
blast_slice = blastq_result[:3]
print(blast_slice)

## Ferramentas de análise das propriedades da proteína

##### Ver bases de dados curadas, UniProt, SwissProt pelo Biopyhton, PDB, CDD

A base de dados UniProt permite aceder a toda a informação de um conjunto alargado de
proteínas. Os ficheiros da SwissProt podem ser tratados automaticamente pelo BioPython (ver 
exemplos na secção 10.1 do tutorial).
Note que os registos UniProt podem ter diferentes graus de revisão por parte dos curadores da 
base de dados, sendo nos casos em que o registo tenha sido manualmente curado uma fonte 
importante de informação.

In [None]:
from Bio import ExPASy
from Bio import SeqIO

In [None]:
#Transcrição

mtnr1b_mrna = record.seq.transcribe()
mtnr1b_mrna

In [None]:
#Tradução
mtnr1b_prot = mtnr1b_mrna.translate()
mtnr1b_prot

In [None]:
from collections import Counter
common_amino = Counter(mtnr1b_prot)
common_amino

In [None]:
#Codões STOP
protein = mtnr1b_prot.split('*')
protein

In [None]:
handle = ExPASy.get_sprot_raw("P49286")
seq_record = SeqIO.read(handle, "swiss")
id = seq_record.id
seq = seq_record.seq
tam = len(seq_record.seq)
name = seq_record.name
desc = seq_record.description
com = seq_record.annotations["comment"]
taxon = seq_record.annotations["taxonomy"]
organism = seq_record.annotations["organism"]
key = seq_record.annotations["keywords"]
print(f"ID {id} \n Sequência: {seq} \n Tamanho da sequência: {tam} bp")
print(f"Nome: {name} \n Descrição: {desc} \n Taxonomia: {taxon} \n Organismo: {organism} \n Keywords: {key}")

In [None]:
seq_prot = seq_record.seq
result_handle = NCBIWWW.qblast('blastp','swissprot', seq_prot)
save_file = open("prot_blast_swiss.xml","w")
save_file.write(result_handle.read())
save_file.close()

blast_record = NCBIXML.parse(result_handle)

In [None]:
#Protein BLAST swissprot
from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastp", "swissprot", seq_record.seq)

In [None]:
from Bio import SearchIO
blast_records = SearchIO.read(result_handle, "blast-xml")

In [None]:
print(blast_records[:])

In [None]:
for br in blast_records:
    print(f'Sequence ID: {br.id}')
    print(f'Description: {br.description}')
    print(f'E-value: {br[0].evalue}')
    print(f'Bit Score: {br[0].bitscore}')
    print(f'Alignment:\n{br[0].aln}')
    print()

In [None]:
print(blast_records[:])

In [None]:
protein_seq = SeqIO.read("protein_seq.fasta","fasta")

In [None]:

protein_seq.seq

Por outro lado, a base de dados PDB contém informação sobre a estrutura das proteínas. Poderá 
efetuar pesquisas nesta base de dados no sentido de identificar proteínas de interesse que 
estejam presentes nesta base de dados. As proteínas de interesse podem ser analisadas 
identificando zonas de possível ligação de compostos que possam regular o seu funcionamento.
Complementarmente, foram estudadas ferramentas que permitem inferir características da 
proteína com base na sua sequência, como sejam a sua localização celular, a existência de 
domínios transmembranares ou alterações pós-tradução relevantes. Todas estas ferramentas 
permitem dar pistas sobre as proteínas de interesse.

Foram ainda abordadas bases de dados de domínios de proteínas, das quais se destaca a NCBI 
CDD (conserved domain database) do NCBI. Esta base de dados, ou outras similares, pode ser 
usada para confirmar a anotação de proteínas de interesse, sendo de particular utilidade quando 
subsistem dúvidas sobre a anotação, quer esta provenha da anotação original, quer provenha 
de resultados de homologia (e.g. BLAST). Por outro lado, permite a análise dos domínios 
presentes na proteína, de forma a poder caracterizar potenciais pontos de ligação de compostos 
e outras proteínas que possam inibir o funcionamento da proteína

In [None]:
from Bio.PDB.PDBParser import PDBParser

In [None]:
#Protein BLAST cdd
from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastp", "CDD", seq_record.seq)

In [None]:
blast_records = SearchIO.read(result_handle, "blast-xml")

In [None]:
print(blast_records[:])

In [None]:
for br in blast_records:
    print(f'Sequence ID: {br.id}')
    print(f'Description: {br.description}')
    print(f'E-value: {br[0].evalue}')
    print(f'Bit Score: {br[0].bitscore}')
    print(f'Alignment:\n{br[0].aln}')
    print()

In [None]:
prots = []
for p in protein:
    if len(p) > 20:
        prots.append(p)
prots

In [None]:
with open ("protein_seq.fasta","w") as file:
    file.write(f">Protein: \n {prots[:]}")

In [None]:
from Bio import SeqIO
protein_seq = SeqIO.read("protein_seq.fasta","fasta")

In [None]:
protein_seq.seq

In [None]:
#Protein BLAST
from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastp", "pdb", protein_seq)

In [None]:
from Bio import SearchIO
blast_records = SearchIO.read(result_handle, "blast-xml")

In [None]:
print(blast_records[:])

## Alinhamentos múltiplos e filogenia

selecionar-se a sequência de interesse do organismo e um conjunto de
sequências homólogas (e.g. provenientes de um processo de BLAST) de organismos
selecionados, realizar o seu alinhamento múltiplo e complementarmente determinar a árvore
filogenética correspondente

In [None]:
from Bio import Phylo
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Align import AlignInfo

In [None]:
result_blast = open("MTNR1B_b.xml")
blast_records = NCBIXML.read(result_blast)
for alignment in blast_records.alignments:
    for hsp in alignment.hsps:
        print(">", alignment.title, "\n", hsp.query[0:45])
        print()

In [None]:
#guardou se manualmente o alinhamento anterior no ficheiro mtnr1b_blast.txt
alignments = AlignIO.parse("MTNR1B_BLAST.txt",format = "fasta")
for alignment in alignments:
    print(alignment)

In [None]:
AlignIO.write(alignment, "align_results_mtnr1b", "fasta")

In [None]:
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus()
consensus

In [None]:
alignment = AlignIO.read(open("align_results_mtnr1b.sth"),"stockholm")
print(alignment)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
calculator = DistanceCalculator('blosum62')
dm = calculator.get_distance(alignment)
print(dm)

In [None]:
constructor = DistanceTreeConstructor()
upgmatree = constructor.upgma(dm)
print(upgmatree)

In [None]:
seq1 = "MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW"
seq2 = "MH--IFIYQIGYALKSGYIQSIRSPEY-NW"

seqr1 = SeqRecord(Seq(seq1),id="seq1")
seqr2 = SeqRecord(Seq(seq2),id="seq2")
alin = MultipleSeqAlignment([seqr1, seqr2])
print (alin)


In [None]:
align1 = MultipleSeqAlignment([SeqRecord(Seq("ACTGCTAGC"), id="A"), 
    SeqRecord(Seq("ACT-CTAGC"), id="B"), 
    SeqRecord(Seq("ACTGCTAGD"), id="C"), ]) 
align2 = MultipleSeqAlignment([ SeqRecord(Seq("TCAGC-AG"), id="D"), 
    SeqRecord(Seq("ACAGCTAG"), id="E"), 
    SeqRecord(Seq("TCAGCTAG"), id="F"), ])

my_alignments = [align1, align2]

AlignIO.write(my_alignments, "my_example.phy", "phylip") 
AlignIO.write(my_alignments, "my_exampl.sth", "stockholm")
AlignIO.write(my_alignments, "my_examp.faa", "fasta")

In [None]:
alignment = AlignIO.read("my_exampl.sth", "stockholm")
print (alignment)
 
print ("tamanho alinhamento %i" % alignment.get_alignment_length() )
 
for record in alignment:
    print ("%s - %s" % (record.seq, record.id) )
for record in alignment:
    if record.dbxrefs: 
        print (record.id, record.dbxrefs)

In [None]:
alignment = AlignIO.read("MTNR1B.faa", "fasta")
print (alignment )

print ("tam. alinhamento %i" % alignment.get_alignment_length() )
 
for record in alignment:
    print ("%s - %s" % (record.seq, record.id) )

Multiplos alinhamentos (formato phylip)

In [None]:
alignments = AlignIO.parse("MTNR1B.phy", "phylip") 
for alignment in alignments: 
    print (alignment)

In [None]:
lalignments = list(AlignIO.parse("MTNR1B.phy", "phylip")) 
print (lalignments[-1])
print (lalignments[0])

arvore filogeni

In [None]:
tree = Phylo.read("example.dnd", "newick")

In [None]:
Phylo.draw_ascii(tree)

In [None]:
Phylo.convert("int_node_labels.nwk", "newick", "tree.xml", "phyloxml")