## Análise da literatura

In [1]:
from Bio import Entrez

In [2]:
Entrez.email = "pg49837@uminho.pt"
handle = Entrez.einfo()

In [3]:
result = Entrez.read(handle)
#handle.close()

In [4]:
result["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

In [5]:
handle = Entrez.einfo(db="pubmed")
record = Entrez.read(handle)
record["DbInfo"]["Description"]

'PubMed bibliographic record'

In [6]:
for field in record["DbInfo"]["FieldList"]:
    print("%(Name)s, %(FullName)s, %(Description)s" %field)

ALL, All Fields, All terms from all searchable fields
UID, UID, Unique number assigned to publication
FILT, Filter, Limits the records
TITL, Title, Words in title of publication
WORD, Text Word, Free text associated with publication
MESH, MeSH Terms, Medical Subject Headings assigned to publication
MAJR, MeSH Major Topic, MeSH terms of major importance to publication
AUTH, Author, Author(s) of publication
JOUR, Journal, Journal abbreviation of publication
AFFL, Affiliation, Author's institutional affiliation and address
ECNO, EC/RN Number, EC number for enzyme or CAS registry number
SUBS, Supplementary Concept, CAS chemical name or MEDLINE Substance Name
PDAT, Date - Publication, Date of publication
EDAT, Date - Entrez, Date publication first accessible through Entrez
VOL, Volume, Volume number of publication
PAGE, Pagination, Page number(s) of publication
PTYP, Publication Type, Type of publication (e.g., review)
LANG, Language, Language of publication
ISS, Issue, Issue number of publ

In [17]:
handle = Entrez.esearch(db = "pubmed", term = "MTNR1B[title]", retmax ="40")
record = Entrez.read(handle)

In [18]:
record["IdList"]

['35733780', '35467761', '35342351', '35064058', '35017578', '35015083', '34967052', '34669935', '34629798', '34118937', '34117605', '34020621', '33958070', '33525391', '33119394', '33095446', '32656703', '32616615', '32378356', '32373162', '32057567', '31884106', '31826236', '31787898', '31757795', '31623012', '31580701', '31563132', '31140197', '30991439', '30981681', '30834760', '30811895', '30477160', '30063936', '29871606', '29707428', '29691896', '29674279', '29590381']

In [19]:
record["Count"]

'104'

In [20]:
handle = Entrez.esearch(db="nucleotide", term = "Homo sapiens[Orgn] AND MTNR1B[Gene]", idtype='acc')

In [21]:
record = Entrez.read(handle)

In [22]:
record["IdList"]
#record["IdList"][2] refere-se ao genoma humano

['NM_005959.5', 'NG_028160.1', 'NC_060935.1', 'NC_000011.10', 'XM_011542839.3', 'XM_017017777.2', 'AF467654.1', 'AY114100.1', 'CM000262.1', 'CH471065.1', 'BC069163.1', 'AY408030.1', 'U25341.1', 'AY521019.1']

## Análise da sequência e das features presentes no NCBI

a) Aceder ao NCBI e guardar os ficheiros correspondentes aos genes escolhidos, podendo explorar possíveis variantes

In [13]:
from Bio import Seq
from Bio import SeqIO

In [25]:
record = SeqIO.read("MTNR1B.gb", "genbank")
record

SeqRecord(seq=Seq('CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAG...GAA'), id='NC_000011.10', name='NC_000011', description='Homo sapiens chromosome 11, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [26]:
record.seq

Seq('CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAG...GAA')

In [27]:
print(record.id)
print(record.name)
print(record.description)
print(record.dbxrefs)
print(len(record.annotations) )
print(record.annotations["source"] )
print(len(record.features))

NC_000011.10
NC_000011
Homo sapiens chromosome 11, GRCh38.p14 Primary Assembly
['BioProject:PRJNA168', 'Assembly:GCF_000001405.40']
13
Homo sapiens (human)
16


In [28]:
len(record)

15310

b) Verificar as anotações dos genes de interesse

In [29]:
record.annotations

{'molecule_type': 'DNA',
 'topology': 'linear',
 'data_file_division': 'CON',
 'date': '06-APR-2022',
 'accessions': ['NC_000011', 'REGION:', '92969651..92984960'],
 'sequence_version': 10,
 'keywords': ['RefSeq'],
 'source': 'Homo sapiens (human)',
 'organism': 'Homo sapiens',
 'taxonomy': ['Eukaryota',
  'Metazoa',
  'Chordata',
  'Craniata',
  'Vertebrata',
  'Euteleostomi',
  'Mammalia',
  'Eutheria',
  'Euarchontoglires',
  'Primates',
  'Haplorrhini',
  'Catarrhini',
  'Hominidae',
  'Homo'],
 'references': [Reference(title='Human chromosome 11 DNA sequence and analysis including novel gene identification', ...),
  Reference(title='Finishing the euchromatic sequence of the human genome', ...),
  Reference(title='Initial sequencing and analysis of the human genome', ...)],
 'comment': 'REFSEQ INFORMATION: The reference sequence is identical to\nCM000673.2.\nOn Feb 3, 2014 this sequence version replaced NC_000011.9.\nAssembly Name: GRCh38.p14 Primary Assembly\nThe DNA sequence is c

c) Verificar e analisar a informação complementar fornecida pela lista de features e seus 
qualifiers

In [30]:
# print(record.features)
for feat in record.features:
    print("-->" , feat)
print(f"Número de features: {len(record.features)}")

--> type: source
location: [0:15310](+)
qualifiers:
    Key: chromosome, Value: ['11']
    Key: db_xref, Value: ['taxon:9606']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Homo sapiens']

--> type: gene
location: [0:15310](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:4544', 'HGNC:HGNC:7464', 'MIM:600804']
    Key: gene, Value: ['MTNR1B']
    Key: gene_synonym, Value: ['FGQTL2; MEL-1B-R; MT2']
    Key: note, Value: ['melatonin receptor 1B; Derived by automated computational analysis using gene prediction method: BestRefSeq,Gnomon.']

--> type: mRNA
location: join{[0:298](+), [11796:13132](+)}
qualifiers:
    Key: db_xref, Value: ['Ensembl:ENST00000257068.3', 'GeneID:4544', 'HGNC:HGNC:7464', 'MIM:600804']
    Key: gene, Value: ['MTNR1B']
    Key: gene_synonym, Value: ['FGQTL2; MEL-1B-R; MT2']
    Key: note, Value: ['Derived by automated computational analysis using gene prediction method: BestRefSeq.']
    Key: product, Value: ['melatonin receptor 1B']
    Ke

i) Localização e tipo

In [31]:
from Bio.SeqFeature import SeqFeature, FeatureLocation
for feat in record.features:
    print(feat.type)
    print(feat.location)

source
[0:15310](+)
gene
[0:15310](+)
mRNA
join{[0:298](+), [11796:13132](+)}
mRNA
join{[75:298](+), [11796:13073](+), [15156:15310](+)}
CDS
join{[75:298](+), [11796:12662](+)}
misc_feature
[84:87](+)
misc_feature
[201:264](+)
misc_feature
[11801:11864](+)
misc_feature
[11918:11981](+)
misc_feature
[12038:12101](+)
misc_feature
[12173:12236](+)
misc_feature
[12332:12395](+)
misc_feature
[12434:12497](+)
CDS
join{[75:298](+), [11796:12662](+)}
mRNA
join{[7508:8137](+), [11796:13073](+), [15156:15310](+)}
CDS
join{[8040:8137](+), [11796:12662](+)}


ii ) Regiões codificantes

In [32]:
featcds = [ ]
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        featcds.append(i)
for k in featcds:
    print (record.features[k].location)
for k in featcds:
    print (record.features[k].extract(record.seq))

join{[75:298](+), [11796:12662](+)}
join{[75:298](+), [11796:12662](+)}
join{[8040:8137](+), [11796:12662](+)}
ATGTCAGAGAACGGCTCCTTCGCCAACTGCTGCGAGGCGGGCGGGTGGGCAGTGCGCCCGGGCTGGTCGGGGGCTGGCAGCGCGCGGCCCTCCAGGACCCCTCGACCTCCCTGGGTGGCTCCAGCGCTGTCCGCGGTGCTCATCGTCACCACCGCCGTGGACGTCGTGGGCAACCTCCTGGTGATCCTCTCCGTGCTCAGGAACCGCAAGCTCCGGAACGCAGGTAATTTGTTCTTGGTGAGTCTGGCATTGGCTGACCTGGTGGTGGCCTTCTACCCCTACCCGCTAATCCTCGTGGCCATCTTCTATGACGGCTGGGCCCTGGGGGAGGAGCACTGCAAGGCCAGCGCCTTTGTGATGGGCCTGAGCGTCATCGGCTCTGTCTTCAATATCACTGCCATCGCCATTAACCGCTACTGCTACATCTGCCACAGCATGGCCTACCACCGAATCTACCGGCGCTGGCACACCCCTCTGCACATCTGCCTCATCTGGCTCCTCACCGTGGTGGCCTTGCTGCCCAACTTCTTTGTGGGGTCCCTGGAGTACGACCCACGCATCTATTCCTGCACCTTCATCCAGACCGCCAGCACCCAGTACACGGCGGCAGTGGTGGTCATCCACTTCCTCCTCCCTATCGCTGTCGTGTCCTTCTGCTACCTGCGCATCTGGGTGCTGGTGCTTCAGGCCCGCAGGAAAGCCAAGCCAGAGAGCAGGCTGTGCCTGAAGCCCAGCGACTTGCGGAGCTTTCTAACCATGTTTGTGGTGTTTGTGATCTTTGCCATCTGCTGGGCTCCACTTAACTGCATCGGCCTCGCTGTGGCCATCAACCCCCAAGAAATGGCTCCCCAGATCCCTGAGGGGCTATTTGTCACTAGCTACTTACTGG

In [33]:
featcds

[4, 13, 15]

iii ) Proteína codificada e seu significado biológico (anotações do gene)

In [34]:
from Bio.SeqFeature import SeqFeature, FeatureLocation
for feat in record.features:
    if feat.type == 'CDS':
        print(feat.qualifiers['product'])

['melatonin receptor type 1B']
['melatonin receptor type 1B isoform X2']
['melatonin receptor type 1B isoform X1']


In [35]:
for feat in record.features:
    if feat.type == 'gene':
        print(feat.qualifiers["note"])

['melatonin receptor 1B; Derived by automated computational analysis using gene prediction method: BestRefSeq,Gnomon.']


iv) Converter a FASTA

In [37]:
from Bio import SeqIO
records = SeqIO.parse("MTNR1B.gb","genbank")
count = SeqIO.write(records, "MTNR1B.fasta","fasta")
print(f'Foi convertido {count} registo.')

Foi convertido 1 registo.


## Análise de homologias por BLAST

##### Nesta parte também devemos ter que ir à base de dados NCBI para confirmar os resultados (rever aula 5)

As ferramentas de procura de homologias serão de especial relevo, nomeadamente para a 
procura de genes homólogos, bem como para a caracterização funcional dos genes 
selecionados. No primeiro caso, deverá configurar adequadamente as suas pesquisas ao nível 
da base de dados e desenvolver código para automatizar a decisão de existência de homologias 
significativas. No segundo caso, poderá analisar a lista de sequências homólogas e identificar 
padrões consistentes ao nível da função desempenhada por estas

In [40]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [38]:
record = SeqIO.read(open("MTNR1B.fasta"), format= "fasta")
print(len(record.seq))

15310


In [41]:
result_handle = NCBIWWW.qblast("blastn","nt", record.seq)

In [42]:
save_file = open("MTNR1B_b.xml","w")
save_file.write(result_handle.read())
save_file.close()
result_handle.close()

In [90]:
result_handle = open("MTNR1B_b.xml")
blast_record = NCBIXML.parse(result_handle)
for br in blast_record:
    print(f"Matrix (?): {br.matrix}")
    print(f"Database: {br.database}")
    print(f"Gap penalty:{br.gap_penalties}")

Matrix (?): 
Database: nt
Gap penalty:(5, 2)


Número de alinhamentos do registo

Acession number, ID do hit, definicao

nºs de HSP (high scoring pair) do alinhamento, e-value, score, tamanho do alinhamento, numero de caracteres iguais

In [92]:
print(len(br.alignments))#???

0


In [93]:
for br_x in br.alignments:
    print(f"Acession number: {br_x.accession}")
    print(f"ID do hit: {br_x.hit_id}")
    print(f"Definição: {br_x.hit_def}")
    print(f"HSP: {br_x.hsps}")
    break
    #???

In [56]:
for alignment in br.alignments:
    for hsp in alignment.hsps:
        print("        ***ALINHAMENTO***")
        print(f"E-value: {hsp.expect}")
        print(f"Score: {hsp.score}")
        print(f"Tamanho: {hsp.align_length}")
        print(f"Caracteres iguais: {len(hsp.match)}")
        print("Query " + hsp.query[100:200] + "...")
        print("Match " + hsp.match[100:200] + "...")
        print("Sbjct " + hsp.sbjct[100:200] + "...")
        print()

In [52]:
from Bio import SearchIO

In [57]:
blastq_result = SearchIO.read("MTNR1B_b.xml", "blast-xml")
print(blastq_result)

Program: blastn (2.13.0+)
  Query: No (15310)
         definition line
 Target: nt
   Hits: 0


In [59]:
#Primeiro hit, primeiro HSP
blast_hsp = blastq_result[0][0]    
print(blast_hsp)

IndexError: list index out of range

In [60]:
blast_hsp.query_range

NameError: name 'blast_hsp' is not defined

In [None]:
blast_hsp.evalue

0.0

In [None]:
#Três primeiros hsp
blast_slice = blastq_result[:3]
print(blast_slice)

Program: blastn (2.13.0+)
  Query: No (5693)
         definition line
 Target: nt
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  gi|21211761|emb|AL590080.25|  Human DNA sequence from c...
            1      2  gi|2063803000|emb|OU343092.1|  Cervus elaphus genome as...
            2      1  gi|2156558396|emb|LR962757.1|  Bos taurus genome assemb...


In [None]:
#Para conhecer se um hit específico está nos resultados - inserir ID + description #???
"gi|21211761|emb|AL590080.25|" in blastq_result

True

## Ferramentas de análise das propriedades da proteína

##### Ver bases de dados curadas, UniProt, SwissProt pelo Biopyhton, PDB, CDD

A base de dados UniProt permite aceder a toda a informação de um conjunto alargado de
proteínas. Os ficheiros da SwissProt podem ser tratados automaticamente pelo BioPython (ver 
exemplos na secção 10.1 do tutorial).
Note que os registos UniProt podem ter diferentes graus de revisão por parte dos curadores da 
base de dados, sendo nos casos em que o registo tenha sido manualmente curado uma fonte 
importante de informação.

In [None]:
from Bio import ExPASy
from Bio import SeqIO

In [61]:
#Transcrição

mtnr1b_mrna = record.seq.transcribe()
mtnr1b_mrna

Seq('CGGCUCAGUACUGCGCGCGCCCUGCGGCUGUCCGGGGCCGCGCGGUGGCCAAAG...GAA')

In [62]:
#Tradução
mtnr1b_prot = mtnr1b_mrna.translate()
mtnr1b_prot



Seq('RLSTARALRLSGAARWPKHSAGESAMSENGSFANCCEAGGWAVRPGWSGAGSAR...IWG')

In [63]:
from collections import Counter
common_amino = Counter(mtnr1b_prot)
common_amino

Counter({'R': 233,
         'L': 604,
         'S': 573,
         'T': 276,
         'A': 293,
         'G': 302,
         'W': 135,
         'P': 365,
         'K': 210,
         'H': 192,
         'E': 184,
         'M': 100,
         'N': 136,
         'F': 219,
         'C': 189,
         'V': 255,
         'I': 235,
         'D': 108,
         '*': 213,
         'Q': 188,
         'Y': 93})

In [64]:
#Codões STOP
protein = mtnr1b_prot.split('*')
protein

[Seq('RLSTARALRLSGAARWPKHSAGESAMSENGSFANCCEAGGWAVRPGWSGAGSAR...HHS'),
 Seq('CWVLASGPSLSSDLVPDPGICAAFSLLSPGSPFPSL'),
 Seq('PIP'),
 Seq('V'),
 Seq('LCTQKLG'),
 Seq('VLPLRSAAAQDALGTEGIAESFTGALFGRAPALWEPTLLDAIATPSRSNACLSR...QAL'),
 Seq('LYVVVA'),
 Seq('KILAEKMS'),
 Seq('SMEAFQ'),
 Seq('VLCQKLISRSSEDPIPVDGE'),
 Seq('MQKLQILLGLN'),
 Seq('IALKYCGLFRRLGIPGINCEN'),
 Seq('SNDLAKSCCNPENIFPISIHSHFNIPLVRSSKKQNQSGGQYLNSSPHRSWGMMTTSA'),
 Seq('EGAQSPVC'),
 Seq('FCSFFKLGLGSKFKY'),
 Seq('SELKWKDSSSKTTI'),
 Seq('LKNAFLRSQTIFSK'),
 Seq(''),
 Seq('YVFGVACDSRNFFFLRWSLSLCHPGWSAVAWSRLTATSASQVQGILLPQPPE'),
 Seq('LGLQALATTPR'),
 Seq('FFIFLVDTVFHHVGQAGLELLTSSDPPASASQSAGITGMSHCARPTLGFLAKKT...LQL'),
 Seq('VSSHHMGNITLLRRSKLSNSLQ'),
 Seq('WWLFSGIHCSKYCVL'),
 Seq('RKAYILGKNRL'),
 Seq('LENCI'),
 Seq('YF'),
 Seq('HIVPCSHHKHSFPAQGSKRETTPGSMGEKWVLLVLTLAPSVPDLLSI'),
 Seq('SRSVRPWLPAQ'),
 Seq('FGLPTMRKVVEGSRSCFLLLEGEKHPSPPASYRGLAFSVILAGSAET'),
 Seq('PV'),
 Seq('EHHWGGS'),
 Seq('AC'),
 Seq('IAGISSVLSFLPPDVSATAQHPNKKK'),
 Se

Por outro lado, a base de dados PDB contém informação sobre a estrutura das proteínas. Poderá 
efetuar pesquisas nesta base de dados no sentido de identificar proteínas de interesse que 
estejam presentes nesta base de dados. As proteínas de interesse podem ser analisadas 
identificando zonas de possível ligação de compostos que possam regular o seu funcionamento.
Complementarmente, foram estudadas ferramentas que permitem inferir características da 
proteína com base na sua sequência, como sejam a sua localização celular, a existência de 
domínios transmembranares ou alterações pós-tradução relevantes. Todas estas ferramentas 
permitem dar pistas sobre as proteínas de interesse.

In [65]:
prots = []
for p in protein:
    if len(p) > 20:
        prots.append(p)
prots

[Seq('RLSTARALRLSGAARWPKHSAGESAMSENGSFANCCEAGGWAVRPGWSGAGSAR...HHS'),
 Seq('CWVLASGPSLSSDLVPDPGICAAFSLLSPGSPFPSL'),
 Seq('VLPLRSAAAQDALGTEGIAESFTGALFGRAPALWEPTLLDAIATPSRSNACLSR...QAL'),
 Seq('IALKYCGLFRRLGIPGINCEN'),
 Seq('SNDLAKSCCNPENIFPISIHSHFNIPLVRSSKKQNQSGGQYLNSSPHRSWGMMTTSA'),
 Seq('YVFGVACDSRNFFFLRWSLSLCHPGWSAVAWSRLTATSASQVQGILLPQPPE'),
 Seq('FFIFLVDTVFHHVGQAGLELLTSSDPPASASQSAGITGMSHCARPTLGFLAKKT...LQL'),
 Seq('VSSHHMGNITLLRRSKLSNSLQ'),
 Seq('HIVPCSHHKHSFPAQGSKRETTPGSMGEKWVLLVLTLAPSVPDLLSI'),
 Seq('FGLPTMRKVVEGSRSCFLLLEGEKHPSPPASYRGLAFSVILAGSAET'),
 Seq('IAGISSVLSFLPPDVSATAQHPNKKK'),
 Seq('MTSIPATAAVLQFNQFLCDLWLYW'),
 Seq('LAHILLLIRPCLTEEFGFNLRNASSDLSVLKSGSLNHWY'),
 Seq('KIKSVTMRIHIFPITKIHSPY'),
 Seq('GHSMFPRRHQYPLLAYHKPSSSMEQICIIPNSAAPDGRVSG'),
 Seq('INCLFFFRDILSVVSVAACGGLTSEIAPEKGRQGGVEARWEVGSGHRPNLKGKA...ESR'),
 Seq('PLSYTSLIFISCIFELFLSLFWKLPPCYTHTRYTKSPVLKEQKTLFLTSCPPNF...CCV'),
 Seq('FYLPLFLNFSLSCLLILTAPLHAFIIS'),
 Seq('ELFLLPPPATVTTAKSFPVFSGCPPSPEQCPTPEHSSHGHLRLA'),
 Seq('

In [66]:
with open ("protein_seq.fasta","w") as file:
    file.write(f">Protein: \n {prots[:]}")

In [67]:
from Bio import SeqIO
protein_seq = SeqIO.read("protein_seq.fasta","fasta")

In [68]:
protein_seq.seq

Seq('[Seq('RLSTARALRLSGAARWPKHSAGESAMSENGSFANCCEAGGWAVRPGWS...')]')

In [69]:
#Protein BLAST
from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastp", "pdb", protein_seq)

In [70]:
from Bio import SearchIO
blast_records = SearchIO.read(result_handle, "blast-xml")

In [71]:
print(blast_records[:])

Program: blastp (2.13.0+)
  Query: unnamed (108)
         protein product
 Target: pdb
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  pdb|7VH0|A  Chain A, Melatonin receptor type 1B [Homo s...
            1      1  pdb|6ZJ3|LR  Chain LR, Ribosomal protein uL4 [Euglena g...
            2      1  pdb|6ZU5|LC0  Chain LC0, uL4 [Paranosema locustae]
            3      1  pdb|5TQB|A  Chain A, 60S ribosomal protein L4-like prot...
            4      1  pdb|7K58|C  Chain C, gamma heavy chain [Tetrahymena the...
            5      1  pdb|7K5B|C  Chain C, gamma heavy chain [Tetrahymena the...
            6      1  pdb|7OLC|LC  Chain LC, 60S ribosomal protein L4-like pr...
            7      1  pdb|4V8M|Br  Chain Br, 60S RIBOSOMAL PROTEIN L4 [Trypan...
            8      1  pdb|3JBN|AF  Chain AF, 60S ribosomal protein uL4 [

Foram ainda abordadas bases de dados de domínios de proteínas, das quais se destaca a NCBI 
CDD (conserved domain database) do NCBI. Esta base de dados, ou outras similares, pode ser 
usada para confirmar a anotação de proteínas de interesse, sendo de particular utilidade quando 
subsistem dúvidas sobre a anotação, quer esta provenha da anotação original, quer provenha 
de resultados de homologia (e.g. BLAST). Por outro lado, permite a análise dos domínios 
presentes na proteína, de forma a poder caracterizar potenciais pontos de ligação de compostos 
e outras proteínas que possam inibir o funcionamento da proteína

## Alinhamentos múltiplos e filogenia

selecionar-se a sequência de interesse do organismo e um conjunto de
sequências homólogas (e.g. provenientes de um processo de BLAST) de organismos
selecionados, realizar o seu alinhamento múltiplo e complementarmente determinar a árvore
filogenética correspondente

In [79]:
from Bio import Phylo
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq

In [84]:
seq1 = "MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW"
seq2 = "MH--IFIYQIGYALKSGYIQSIRSPEY-NW"

seqr1 = SeqRecord(Seq(seq1),id="seq1")
seqr2 = SeqRecord(Seq(seq2),id="seq2")
alin = MultipleSeqAlignment([seqr1, seqr2])
print (alin)


Alignment with 2 rows and 30 columns
MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW seq1
MH--IFIYQIGYALKSGYIQSIRSPEY-NW seq2


In [85]:
align1 = MultipleSeqAlignment([SeqRecord(Seq("ACTGCTAGC"), id="A"), 
    SeqRecord(Seq("ACT-CTAGC"), id="B"), 
    SeqRecord(Seq("ACTGCTAGD"), id="C"), ]) 
align2 = MultipleSeqAlignment([ SeqRecord(Seq("TCAGC-AG"), id="D"), 
    SeqRecord(Seq("ACAGCTAG"), id="E"), 
    SeqRecord(Seq("TCAGCTAG"), id="F"), ])

my_alignments = [align1, align2]

AlignIO.write(my_alignments, "my_example.phy", "phylip") 
AlignIO.write(my_alignments, "my_exampl.sth", "stockholm")
AlignIO.write(my_alignments, "my_examp.faa", "fasta")

2

In [83]:
alignment = AlignIO.read("my_exampl.sth", "stockholm")
print (alignment)
 
print ("tamanho alinhamento %i" % alignment.get_alignment_length() )
 
for record in alignment:
    print ("%s - %s" % (record.seq, record.id) )
for record in alignment:
    if record.dbxrefs: 
        print (record.id, record.dbxrefs)

ValueError: More than one record found in handle

In [None]:
alignment = AlignIO.read("MTNR1B.faa", "fasta")
print (alignment )

print ("tam. alinhamento %i" % alignment.get_alignment_length() )
 
for record in alignment:
    print ("%s - %s" % (record.seq, record.id) )

Multiplos alinhamentos (formato phylip)

In [None]:
alignments = AlignIO.parse("MTNR1B.phy", "phylip") 
for alignment in alignments: 
    print (alignment)

In [None]:
lalignments = list(AlignIO.parse("MTNR1B.phy", "phylip")) 
print (lalignments[-1])
print (lalignments[0])

arvore filo

In [None]:
tree = Phylo.read("example.dnd", "newick")

In [None]:
Phylo.draw_ascii(tree)

In [None]:
Phylo.convert("int_node_labels.nwk", "newick", "tree.xml", "phyloxml")