## Análise da literatura

In [1]:
from Bio import Entrez
from Bio import Medline

In [2]:
Entrez.email = "a91006@alunos.uminho.pt"
handle = Entrez.einfo()

In [3]:
result = Entrez.read(handle)
#handle.close()

In [4]:
result["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

In [5]:
handle = Entrez.einfo(db="pubmed")
record = Entrez.read(handle)
record["DbInfo"]["Description"]

'PubMed bibliographic record'

In [6]:
for field in record["DbInfo"]["FieldList"]:
    print("%(Name)s, %(FullName)s, %(Description)s" %field)

ALL, All Fields, All terms from all searchable fields
UID, UID, Unique number assigned to publication
FILT, Filter, Limits the records
TITL, Title, Words in title of publication
WORD, Text Word, Free text associated with publication
MESH, MeSH Terms, Medical Subject Headings assigned to publication
MAJR, MeSH Major Topic, MeSH terms of major importance to publication
AUTH, Author, Author(s) of publication
JOUR, Journal, Journal abbreviation of publication
AFFL, Affiliation, Author's institutional affiliation and address
ECNO, EC/RN Number, EC number for enzyme or CAS registry number
SUBS, Supplementary Concept, CAS chemical name or MEDLINE Substance Name
PDAT, Date - Publication, Date of publication
EDAT, Date - Entrez, Date publication first accessible through Entrez
VOL, Volume, Volume number of publication
PAGE, Pagination, Page number(s) of publication
PTYP, Publication Type, Type of publication (e.g., review)
LANG, Language, Language of publication
ISS, Issue, Issue number of publ

In [8]:
handle = Entrez.esearch(db = "pubmed", term = "HHEX[title]", retmax ="40")
record = Entrez.read(handle)

In [9]:
id_list = record["IdList"]
id_list

['36008411', '35887298', '35787684', '35641029', '35434453', '35385752', '35248949', '35197747', '34849419', '34399652', '34321041', '34011403', '33677034', '33447064', '33442279', '33425911', '33086067', '33002120', '32922661', '32770145', '32601467', '32492700', '32195947', '32090320', '32019914', '31843982', '31792183', '31697936', '31237015', '30428031', '30207601', '30121926', '30006544', '29871606', '29720110', '29453249', '29263042', '28604763', '28577303', '28189604']

In [10]:
record["Count"]

'128'

In [11]:
handle = Entrez.efetch(db = "pubmed", id = id_list, rettype = "medline", retmode = "text")
records = Medline.parse(handle)
print()
for record in records:
    print("Titulo:", record.get("TI", "Vazio"))                                     
    print()
    print("Abstract:", record.get("AB", "Vazio"))
    print("Autores:", record.get("AU", "Vazio"))
    print("Fonte:", record.get("SO", "Vazio"))
    print("="*140)
    print()


Titulo: CK2-induced cooperation of HHEX with the YAP-TEAD4 complex promotes colorectal tumorigenesis.

Abstract: Dysregulation of Hippo pathway leads to hyperactivation of YAP-TEAD transcriptional complex in various cancers, including colorectal cancer (CRC). In this study, we observed that HHEX (Hematopoietically expressed homeobox) may enhance transcription activity of the YAP-TEAD complex. HHEX associates with and stabilizes the YAP-TEAD complex on the regulatory genomic loci to coregulate the expression of a group of YAP/TEAD target genes. Also, HHEX may indirectly regulate these target genes by controlling YAP/TAZ expression. Importantly, HHEX is required for the pro-tumorigenic effects of YAP during CRC progression. In response to serum stimulation, CK2 (Casein Kinase 2) phosphorylates HHEX and enhances its interaction with TEAD4. A CK2 inhibitor CX-4945 diminishes the interaction between HHEX and TEAD4, leading to decreased expression of YAP/TEAD target genes. CX-4945 synergize

In [12]:
handle = Entrez.esearch(db="nucleotide", term = "Homo sapiens[Orgn] AND HHEX[Gene]", idtype='acc')

In [13]:
record = Entrez.read(handle)

In [14]:
record["IdList"]


['NM_002729.5', 'NC_060934.1', 'NC_000010.11', 'EU446478.1', 'CM000261.1', 'CH471066.2', 'BC050638.1', 'BC015110.1', 'BC014336.1', 'AY404673.1', 'AB528006.1']

## Análise da sequência e das features presentes no NCBI

a) Aceder ao NCBI e guardar os ficheiros correspondentes aos genes escolhidos, podendo explorar possíveis variantes

In [15]:
from Bio import Seq
from Bio import SeqIO


In [16]:
record = SeqIO.read("mRNA_seq_HHEX.gb", "genbank")
record

SeqRecord(seq=Seq('AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGCACCCCGGGC...GTT'), id='NM_002729.5', name='NM_002729', description='Homo sapiens hematopoietically expressed homeobox (HHEX), mRNA', dbxrefs=[])

In [17]:
record.seq

Seq('AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGCACCCCGGGC...GTT')

In [19]:
print(record.id)
print(record.name)
print(record.description)
print(len(record.annotations))
print(record.annotations["source"] )
print(len(record.features))

NM_002729.5
NM_002729
Homo sapiens hematopoietically expressed homeobox (HHEX), mRNA
13
Homo sapiens (human)
13


In [20]:
len(record)

1724

b) Verificar as anotações dos genes de interesse

In [21]:
record.annotations

{'molecule_type': 'mRNA',
 'topology': 'linear',
 'data_file_division': 'PRI',
 'date': '21-OCT-2022',
 'accessions': ['NM_002729', 'NM_001529'],
 'sequence_version': 5,
 'keywords': ['RefSeq', 'MANE Select'],
 'source': 'Homo sapiens (human)',
 'organism': 'Homo sapiens',
 'taxonomy': ['Eukaryota',
  'Metazoa',
  'Chordata',
  'Craniata',
  'Vertebrata',
  'Euteleostomi',
  'Mammalia',
  'Eutheria',
  'Euarchontoglires',
  'Primates',
  'Haplorrhini',
  'Catarrhini',
  'Hominidae',
  'Homo'],
 'references': [Reference(title='CK2-induced cooperation of HHEX with the YAP-TEAD4 complex promotes colorectal tumorigenesis', ...),
  Reference(title='Unraveling the Influence of HHEX Risk Polymorphism rs7923837 on Multiple Sclerosis Pathogenesis', ...),
  Reference(title='Integrated single-cell transcriptomics and epigenomics reveals strong germinal center-associated etiology of autoimmune risk loci', ...),
  Reference(title='Hhex inhibits cell migration via regulating RHOA/CDC42-CFL1 axis in 

In [22]:
print(record.annotations["comment"] )

REVIEWED REFSEQ: This record has been curated by NCBI staff. The
reference sequence was derived from BC015110.1 and BM543214.1.
On Nov 22, 2018 this sequence version replaced NM_002729.4.
Summary: This gene encodes a member of the homeobox family of
transcription factors, many of which are involved in developmental
processes. Expression in specific hematopoietic lineages suggests
that this protein may play a role in hematopoietic differentiation.
[provided by RefSeq, Jul 2008].
Publication Note:  This RefSeq record includes a subset of the
publications that are available for this gene. Please see the Gene
record to access additional publications.
COMPLETENESS: full length.


c) Verificar e analisar a informação complementar fornecida pela lista de features e seus 
qualifiers

In [23]:
# print(record.features)
for feat in record.features:
    print("-->" , feat)
print(f"Número de features: {len(record.features)}")

--> type: source
location: [0:1724](+)
qualifiers:
    Key: chromosome, Value: ['10']
    Key: db_xref, Value: ['taxon:9606']
    Key: map, Value: ['10q23.33']
    Key: mol_type, Value: ['mRNA']
    Key: organism, Value: ['Homo sapiens']

--> type: gene
location: [0:1724](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:3087', 'HGNC:HGNC:4901', 'MIM:604420']
    Key: gene, Value: ['HHEX']
    Key: gene_synonym, Value: ['HEX; HMPH; HOX11L-PEN; PRH; PRHX']
    Key: note, Value: ['hematopoietically expressed homeobox']

--> type: exon
location: [0:393](+)
qualifiers:
    Key: gene, Value: ['HHEX']
    Key: gene_synonym, Value: ['HEX; HMPH; HOX11L-PEN; PRH; PRHX']
    Key: inference, Value: ['alignment:Splign:2.1.0']

--> type: CDS
location: [32:845](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['CCDS:CCDS7423.1', 'GeneID:3087', 'HGNC:HGNC:4901', 'MIM:604420']
    Key: gene, Value: ['HHEX']
    Key: gene_synonym, Value: ['HEX; HMPH; HOX11L-PEN; PRH; PRHX']
   

i) Localização e tipo

In [24]:
from Bio.SeqFeature import SeqFeature, FeatureLocation
for feat in record.features:
    print(feat.type)
    print(feat.location)

source
[0:1724](+)
gene
[0:1724](+)
exon
[0:393](+)
CDS
[32:845](+)
misc_feature
[32:443](+)
misc_feature
[188:191](+)
misc_feature
[440:842](+)
misc_feature
[611:842](+)
exon
[393:572](+)
exon
[572:623](+)
exon
[623:1724](+)
regulatory
[1691:1697](+)
polyA_site
[1723:1724](+)


ii ) Regiões codificantes

In [25]:
featcds = [ ]
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        featcds.append(i)
for k in featcds:
    print (record.features[k].location)
for k in featcds:
    print (record.features[k].extract(record.seq))

[32:845](+)
ATGCAGTACCCGCACCCCGGGCCGGCGGCGGGCGCCGTGGGGGTGCCGCTGTACGCGCCCACGCCGCTGCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCGCCGCGCCCACGCCCGCCCCCACGCTGCCGTCCCCCAACTCCTCCTTCACCAGCCTCGTGTCCCCCTACCGGACCCCGGTGTACGAGCCCACGCCGATCCATCCAGCCTTCTCGCACCACTCCGCCGCCGCGCTGGCCGCTGCCTACGGACCCGGCGGCTTCGGGGGCCCTCTGTACCCCTTCCCGCGGACGGTGAACGACTACACGCACGCCCTGCTCCGCCACGACCCCCTGGGCAAACCTCTACTCTGGAGCCCCTTCTTGCAGAGGCCTCTGCATAAAAGGAAAGGCGGCCAGGTGAGATTCTCCAACGACCAGACCATCGAGCTGGAGAAGAAATTCGAGACGCAGAAATATCTCTCTCCGCCCGAGAGGAAGCGTCTGGCCAAGATGCTGCAGCTCAGCGAGAGACAGGTCAAAACCTGGTTTCAGAATCGACGCGCTAAATGGAGGAGACTAAAACAGGAGAACCCTCAAAGCAATAAAAAAGAAGAACTGGAAAGTTTGGACAGTTCCTGTGATCAGAGGCAAGATTTGCCCAGTGAACAGAATAAAGGTGCTTCTTTGGATAGCTCTCAATGTTCGCCCTCCCCTGCCTCCCAGGAAGACCTTGAATCAGAGATTTCAGAGGATTCTGATCAGGAAGTGGACATTGAGGGCGATAAAAGCTATTTTAATGCTGGATGA


In [26]:
featcds

[3]

iii ) Proteína codificada e seu significado biológico (anotações do gene)

In [27]:
from Bio.SeqFeature import SeqFeature, FeatureLocation
for feat in record.features:
    if feat.type == 'CDS':
        print(feat.qualifiers['product'])

['hematopoietically-expressed homeobox protein HHEX']


In [28]:
for feat in record.features:
    if feat.type == 'gene':
        print(feat.qualifiers["note"])

['hematopoietically expressed homeobox']


In [29]:
for feat in record.features:
    if feat.type == "CDS":
        print(feat)

type: CDS
location: [32:845](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['CCDS:CCDS7423.1', 'GeneID:3087', 'HGNC:HGNC:4901', 'MIM:604420']
    Key: gene, Value: ['HHEX']
    Key: gene_synonym, Value: ['HEX; HMPH; HOX11L-PEN; PRH; PRHX']
    Key: note, Value: ['homeobox, hematopoietically expressed; proline-rich homeodomain-containing transcription factor; homeobox protein HEX; homeobox protein PRH']
    Key: product, Value: ['hematopoietically-expressed homeobox protein HHEX']
    Key: protein_id, Value: ['NP_002720.1']
    Key: translation, Value: ['MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPIHPAFSHHSAAALAAAYGPGGFGGPLYPFPRTVNDYTHALLRHDPLGKPLLWSPFLQRPLHKRKGGQVRFSNDQTIELEKKFETQKYLSPPERKRLAKMLQLSERQVKTWFQNRRAKWRRLKQENPQSNKKEELESLDSSCDQRQDLPSEQNKGASLDSSQCSPSPASQEDLESEISEDSDQEVDIEGDKSYFNAG']



iv) Converter a FASTA

In [30]:
from Bio import SeqIO
records = SeqIO.parse("mRNA_seq_HHEX.gb","genbank")
count = SeqIO.write(records, "HHEX.fasta","fasta")
print(f'Foi convertido {count} registo.')

Foi convertido 1 registo.


## Análise de homologias por BLAST

##### Nesta parte também devemos ter que ir à base de dados NCBI para confirmar os resultados (rever aula 5)

As ferramentas de procura de homologias serão de especial relevo, nomeadamente para a 
procura de genes homólogos, bem como para a caracterização funcional dos genes 
selecionados. No primeiro caso, deverá configurar adequadamente as suas pesquisas ao nível 
da base de dados e desenvolver código para automatizar a decisão de existência de homologias 
significativas. No segundo caso, poderá analisar a lista de sequências homólogas e identificar 
padrões consistentes ao nível da função desempenhada por estas

In [31]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [32]:
record = SeqIO.read(open("HHEX.fasta"), format= "fasta")
print(len(record.seq))

1724


In [33]:
result_handle = NCBIWWW.qblast("blastn","nt", record.seq)

In [34]:
save_file = open("HHEX_b.xml","w")
save_file.write(result_handle.read())
save_file.close()
result_handle.close()

In [36]:
result_handle = open("HHEX_b.xml")
blast_record = NCBIXML.parse(result_handle)
for br in blast_record:
    print(f"Database: {br.database}")
    print(f"Gap penalty: {br.gap_penalties}")

Database: nt
Gap penalty: (5, 2)


Número de alinhamentos do registo

Acession number, ID do hit, definicao

nºs de HSP (high scoring pair) do alinhamento, e-value, score, tamanho do alinhamento, numero de caracteres iguais

In [37]:
print(len(br.alignments))

50


In [38]:
for br_x in br.alignments:
    print(f"Acession number: {br_x.accession}")
    print(f"ID do hit: {br_x.hit_id}")
    print(f"Definição: {br_x.hit_def}")
    print(f"HSP: {br_x.hsps}")
    break
    #apenas escolhi o primeiro que é o que se refere ao organismo Human

Acession number: NM_002729
ID do hit: gi|1519245767|ref|NM_002729.5|
Definição: Homo sapiens hematopoietically expressed homeobox (HHEX), mRNA
HSP: [<Bio.Blast.Record.HSP object at 0x0000019DCFD44070>]


In [39]:
evalue_tresh = 0.05
for alignment in br.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < evalue_tresh:
            print("        ***ALINHAMENTO***")
            print(f"E-value: {hsp.expect}")
            print(f"Score: {hsp.score}")
            print(f"Tamanho: {hsp.align_length}")
            print(f"Caracteres iguais: {len(hsp.match)}")
            print("Query " + hsp.query[100:200] + "...")
            print("Match " + hsp.match[100:200] + "...")
            print("Sbjct " + hsp.sbjct[100:200] + "...")
            print()

        ***ALINHAMENTO***
E-value: 0.0
Score: 3448.0
Tamanho: 1724
Caracteres iguais: 1724
Query GCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCGCCGCGCCCACGCCCGCCCCCACGCTGCCGTCCCCCAACTCC...
Match ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
Sbjct GCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCGCCGCGCCCACGCCCGCCCCCACGCTGCCGTCCCCCAACTCC...

        ***ALINHAMENTO***
E-value: 0.0
Score: 3446.0
Tamanho: 1723
Caracteres iguais: 1723
Query GCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCGCCGCGCCCACGCCCGCCCCCACGCTGCCGTCCCCCAACTCC...
Match ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
Sbjct GCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCGCCGCGCCCACGCCCGCCCCCACGCTGCCGTCCCCCAACTCC...

        ***ALINHAMENTO***
E-value: 0.0
Score: 3429.0
Tamanho: 1717
Caracteres iguais: 1717
Query GCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCG

In [40]:
from Bio import SearchIO

In [41]:
blastq_result = SearchIO.read("HHEX_b.xml", "blast-xml")
print(blastq_result)

Program: blastn (2.13.0+)
  Query: No (1724)
         definition line
 Target: nt
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  gi|1519245767|ref|NM_002729.5|  Homo sapiens hematopoie...
            1      1  gi|15929354|gb|BC015110.1|  Homo sapiens hematopoietica...
            2      1  gi|15680040|gb|BC014336.1|  Homo sapiens hematopoietica...
            3      1  gi|1367123331|ref|XM_507925.5|  PREDICTED: Pan troglody...
            4      1  gi|32547|emb|X67235.1|  H.sapiens mRNA for proline rich...
            5      1  gi|30048158|gb|BC050638.1|  Homo sapiens hematopoietica...
            6      1  gi|1849002271|ref|XM_003825674.3|  PREDICTED: Pan panis...
            7      1  gi|1753031712|ref|XM_031015733.1|  PREDICTED: Gorilla g...
            8      1  gi|1351383818|ref|XM_024253783.1|  PREDICTED: P



In [42]:
#Primeiro hit, primeiro HSP
blast_hsp = blastq_result[0][0]    
print(blast_hsp)

      Query: No definition line
        Hit: gi|1519245767|ref|NM_002729.5| Homo sapiens hematopoietically ex...
Query range: [0:1724] (1)
  Hit range: [0:1724] (1)
Quick stats: evalue 0; bitscore 3110.29
  Fragments: 1 (1724 columns)
     Query - AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGCACCCCGGGCCGGCG~~~TCGTT
             |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||~~~|||||
       Hit - AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGCACCCCGGGCCGGCG~~~TCGTT


In [43]:
blast_hsp.query_range

(0, 1724)

In [44]:
blast_hsp.evalue

0.0

In [45]:
#Três primeiros hsp
blast_slice = blastq_result[:3]
print(blast_slice)

Program: blastn (2.13.0+)
  Query: No (1724)
         definition line
 Target: nt
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  gi|1519245767|ref|NM_002729.5|  Homo sapiens hematopoie...
            1      1  gi|15929354|gb|BC015110.1|  Homo sapiens hematopoietica...
            2      1  gi|15680040|gb|BC014336.1|  Homo sapiens hematopoietica...


In [46]:
#Para conhecer se um hit específico está nos resultados - inserir ID + description
"gi|21211761|emb|AL590080.25|" in blastq_result

True

## Ferramentas de análise das propriedades da proteína

##### Ver bases de dados curadas, UniProt, SwissProt pelo Biopyhton, PDB, CDD

A base de dados UniProt permite aceder a toda a informação de um conjunto alargado de
proteínas. Os ficheiros da SwissProt podem ser tratados automaticamente pelo BioPython (ver 
exemplos na secção 10.1 do tutorial).
Note que os registos UniProt podem ter diferentes graus de revisão por parte dos curadores da 
base de dados, sendo nos casos em que o registo tenha sido manualmente curado uma fonte 
importante de informação.

In [49]:
from Bio import ExPASy 
handle = ExPASy.get_sprot_raw("Q03014")
seq_record = SeqIO.read(handle, "swiss")
id = seq_record.id
seq = seq_record.seq
tam = len(seq_record.seq)
name = seq_record.name
desc = seq_record.description
com = seq_record.annotations["comment"]
taxon = seq_record.annotations["taxonomy"]
organism = seq_record.annotations["organism"]
key = seq_record.annotations["keywords"]
print(f"ID {id} \n Sequência: {seq} \n Tamanho da sequência: {tam} bp")
print(f"Nome: {name} \n Descrição: {desc} \n Taxonomia: {taxon} \n Organismo: {organism} \n Keywords: {key}")

ID Q03014 
 Sequência: MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPIHPAFSHHSAAALAAAYGPGGFGGPLYPFPRTVNDYTHALLRHDPLGKPLLWSPFLQRPLHKRKGGQVRFSNDQTIELEKKFETQKYLSPPERKRLAKMLQLSERQVKTWFQNRRAKWRRLKQENPQSNKKEELESLDSSCDQRQDLPSEQNKGASLDSSQCSPSPASQEDLESEISEDSDQEVDIEGDKSYFNAG 
 Tamanho da sequência: 270 bp
Nome: HHEX_HUMAN 
 Descrição: RecName: Full=Hematopoietically-expressed homeobox protein HHEX; Short=Homeobox protein HEX; AltName: Full=Homeobox protein PRH; 
 Taxonomia: ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'] 
 Organismo: Homo sapiens (Human) 
 Keywords: ['3D-structure', 'Developmental protein', 'Differentiation', 'DNA-binding', 'Homeobox', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repressor', 'Transcription', 'Transcription regulation', 'Wnt signaling pathway']


In [50]:
seq_prot= seq_record.seq

result_handle = NCBIWWW.qblast('blastp', 'swissprot', seq_prot)

save_file = open("prot_blast_swiss.xml","w")
save_file.write(result_handle.read())
save_file.close()

blast_record = NCBIXML.parse(result_handle)

#result_handle.close()

In [54]:
#Protein BLAST swissprot
from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastp", "swissprot", seq_record.seq)

In [55]:
from Bio import SearchIO
blast_records = SearchIO.read(result_handle, "blast-xml")

In [56]:
print(blast_records[:])

Program: blastp (2.13.0+)
  Query: unnamed (270)
         protein product
 Target: swissprot
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  sp|Q03014.1|  RecName: Full=Hematopoietically-expressed...
            1      1  sp|P43120.1|  RecName: Full=Hematopoietically-expressed...
            2      1  sp|Q8AWG6.1|  RecName: Full=Hematopoietically-expressed...
            3      1  sp|O13023.1|  RecName: Full=Hematopoietically-expressed...
            4      1  sp|Q05502.1|  RecName: Full=Hematopoietically-expressed...
            5      1  sp|Q9IAV3.1|  RecName: Full=Hematopoietically-expressed...
            6      1  sp|D2KQB0.1|  RecName: Full=Hematopoietically-expressed...
            7      1  sp|Q21578.5|  RecName: Full=Homeobox protein HEX homolo...
            8      1  sp|Q61670.1|  RecName: Full=H2.0-lik

In [57]:
for br in blast_records:
    print(f'Sequence ID: {br.id}')
    print(f'Description: {br.description}')
    print(f'E-value: {br[0].evalue}')
    print(f'Bit Score: {br[0].bitscore}')
    print(f'Alignment:\n{br[0].aln}')
    print()

Sequence ID: sp|Q03014.1|
Description: RecName: Full=Hematopoietically-expressed homeobox protein HHEX; Short=Homeobox protein HEX; AltName: Full=Homeobox protein PRH [Homo sapiens]
E-value: 0.0
Bit Score: 552.362
Alignment:
Alignment with 2 rows and 270 columns
MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAA...NAG unnamed
MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAA...NAG sp|Q03014.1|

Sequence ID: sp|P43120.1|
Description: RecName: Full=Hematopoietically-expressed homeobox protein Hhex; Short=Homeobox protein HEX; Short=mHex; AltName: Full=Homeobox protein PRH [Mus musculus]
E-value: 6.07441e-160
Bit Score: 447.973
Alignment:
Alignment with 2 rows and 271 columns
MQYPHPGPAAGAVGVP-LYAPTPLLQPAHPTPFYIEDILGRGPA...NAG unnamed
MQFPHPGPAAAPAVGVPLYAPTPLLQPAHPTPFYIDDILGRGPA...NAG sp|P43120.1|

Sequence ID: sp|Q8AWG6.1|
Description: RecName: Full=Hematopoietically-expressed homeobox protein hhex; Short=Homeobox protein hex; Short=tHex [Xenopus tropicalis]
E-value: 4.49319e-139
Bit Score: 395.201


In [62]:
print(blast_records[:])

Program: blastp (2.13.0+)
  Query: unnamed (270)
         protein product
 Target: swissprot
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  sp|Q03014.1|  RecName: Full=Hematopoietically-expressed...
            1      1  sp|P43120.1|  RecName: Full=Hematopoietically-expressed...
            2      1  sp|Q8AWG6.1|  RecName: Full=Hematopoietically-expressed...
            3      1  sp|O13023.1|  RecName: Full=Hematopoietically-expressed...
            4      1  sp|Q05502.1|  RecName: Full=Hematopoietically-expressed...
            5      1  sp|Q9IAV3.1|  RecName: Full=Hematopoietically-expressed...
            6      1  sp|D2KQB0.1|  RecName: Full=Hematopoietically-expressed...
            7      1  sp|Q21578.5|  RecName: Full=Homeobox protein HEX homolo...
            8      1  sp|Q61670.1|  RecName: Full=H2.0-lik

In [59]:
from Bio import SeqIO
protein_seq = SeqIO.read("protein_seq.fasta","fasta")

In [60]:
protein_seq.seq

Seq('[Seq('SSARGRSAAEPCSTRTPGRRRAPWGCRCTRPRRCCNPHTRRPFTSRTS...')]')

Por outro lado, a base de dados PDB contém informação sobre a estrutura das proteínas. Poderá 
efetuar pesquisas nesta base de dados no sentido de identificar proteínas de interesse que 
estejam presentes nesta base de dados. As proteínas de interesse podem ser analisadas 
identificando zonas de possível ligação de compostos que possam regular o seu funcionamento.
Complementarmente, foram estudadas ferramentas que permitem inferir características da 
proteína com base na sua sequência, como sejam a sua localização celular, a existência de 
domínios transmembranares ou alterações pós-tradução relevantes. Todas estas ferramentas 
permitem dar pistas sobre as proteínas de interesse.

In [51]:
from Bio.PDB.PDBParser import PDBParser

In [53]:
p = PDBParser(PERMISSIVE=1)
s = p.get_structure("2E10", "2e1o.pdb")
for chain in s[0]:
    print(f'Chain ID: {chain.id}')
smeth = s.header['structure_method']
keywords = s.header['keywords']
comp = s.header["compound"]
print("Keywords: " , keywords)
print("Structure Method: ", smeth)
print("Composto: ", comp)

import nglview as nv
nv.show_biopython(s, gui=True)

Chain ID: A
Keywords:  dna binding protein, structural genomics, nppsfa, national project on protein structural and functional analyses, riken structural genomics/proteomics initiative, rsgi, unknown function
Structure Method:  solution nmr
Composto:  {'1': {'misc': '', 'molecule': 'homeobox protein prh', 'chain': 'a', 'fragment': 'homeobox domain', 'synonym': 'hematopoietically expressed homeobox, homeobox protein hex', 'engineered': 'yes'}}


AttributeError: 'super' object has no attribute '_ipython_display_'

Foram ainda abordadas bases de dados de domínios de proteínas, das quais se destaca a NCBI 
CDD (conserved domain database) do NCBI. Esta base de dados, ou outras similares, pode ser 
usada para confirmar a anotação de proteínas de interesse, sendo de particular utilidade quando 
subsistem dúvidas sobre a anotação, quer esta provenha da anotação original, quer provenha 
de resultados de homologia (e.g. BLAST). Por outro lado, permite a análise dos domínios 
presentes na proteína, de forma a poder caracterizar potenciais pontos de ligação de compostos 
e outras proteínas que possam inibir o funcionamento da proteína

In [64]:
#Protein BLAST cdd
from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastp", "CDD", seq_record.seq)

In [65]:
from Bio import SearchIO
blast_records = SearchIO.read(result_handle, "blast-xml")

In [66]:
print(blast_records[:])

Program: blastp (2.13.0+)
  Query: unnamed (270)
         protein product
 Target: CDD
   Hits: 0


In [67]:
for br in blast_records:
    print(f'Sequence ID: {br.id}')
    print(f'Description: {br.description}')
    print(f'E-value: {br[0].evalue}')
    print(f'Bit Score: {br[0].bitscore}')
    print(f'Alignment:\n{br[0].aln}')
    print()

## Alinhamentos múltiplos e filogenia

In [68]:
from Bio.Align import MultipleSeqAlignment
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio import SeqIO
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import AlignInfo

In [69]:
result_blast = open("HHEX_b.xml")
blast_records = NCBIXML.read(result_blast)
for alignment in blast_records.alignments:
    for hsp in alignment.hsps:
        print(">", alignment.title, "\n", hsp.query[0:45])
        print()

> gi|1519245767|ref|NM_002729.5| Homo sapiens hematopoietically expressed homeobox (HHEX), mRNA 
 AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC

> gi|15929354|gb|BC015110.1| Homo sapiens hematopoietically expressed homeobox, mRNA (cDNA clone MGC:22885 IMAGE:4048691), complete cds 
 AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC

> gi|15680040|gb|BC014336.1| Homo sapiens hematopoietically expressed homeobox, mRNA (cDNA clone MGC:22882 IMAGE:4046641), complete cds 
 AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC

> gi|1367123331|ref|XM_507925.5| PREDICTED: Pan troglodytes hematopoietically expressed homeobox (HHEX), mRNA 
 AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC

> gi|32547|emb|X67235.1| H.sapiens mRNA for proline rich homeobox (Prh) protein 
 CGGAGCCATGCAGTACCCGCACCCCGGGCCGGCGGCGGGCGCCGT

> gi|30048158|gb|BC050638.1| Homo sapiens hematopoietically expressed homeobox, mRNA (cDNA clone MGC:60128 IMAGE:6174612), complete cds 
 GCCATGCAGTACCCGCACCCCGGGCCGGCGGCGGGCGCCGTGGGG

> gi|1849002

In [72]:
#guardei manualmente o alinhamento anterior no ficheiro hhex_blast.txt
alignments = AlignIO.parse("HHEX_BLAST.txt",format = "fasta")
for alignment in alignments:
    print(alignment)

Alignment with 177 rows and 45 columns
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|21211761|emb|AL590080.25|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|2063803000|emb|OU343092.1|
AGGTCAAAACCTGGTTTCAGAATCGACGCGCTAAATGGAGGAGA- gi|2063803000|emb|OU343092.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|2156558396|emb|LR962757.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|2156558431|emb|LR962882.1|
GTGTGTCTGTCTGTGTGTGTACAAGGCTGTG--TGCATCTGCG-G gi|2156558431|emb|LR962882.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|2280519189|emb|OX258980.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|2323025588|emb|OX344715.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|2233304178|emb|OW443374.1|
TAATATTTCTATGTAGTATTTATATGCATTTTAGACAAGTATAGG gi|2233304178|emb|OW443374.1|
GGGCTCGTTGCAAGTTTTCTTTCTCTCTTTCCTGTAGGTCAAAAC gi|850493841|gb|CP011907.1|
CCGGCCCCCGTGGGGCAGGGATCGCAGCCGGGCCC-GGGTGGCTA gi|850493841|gb|CP011907.1|
TCTGTTATTGGAAGCTTCAGTAGGGACCCTGAAAACAATTAACGT gi|85049

In [73]:
AlignIO.write(alignment, "align_results_hhex", "fasta")

1

In [74]:
form_stock = AlignIO.parse("align_results_hhex","fasta")
AlignIO.convert("align_results_hhex","fasta","align_results_hhex.sth","stockholm")

ValueError: Duplicate record identifier: gi|2063803000|emb|OU343092.1|

In [75]:
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus()
consensus

Seq('XXXXXXXXXXXXXXXXXXAXXXXXXXXXXGXXXXXXXXXXXXXXX')

In [141]:
from Bio import Phylo
from Bio import AlignIO

In [None]:
alignment = AlignIO.read(open("align_results_hhex.sth"),"stockholm")
print(alignment)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio import AlignIO

calculator = DistanceCalculator('blosum62')
dm = calculator.get_distance(alignment)
print(dm)

In [None]:
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
constructor = DistanceTreeConstructor()
upgmatree = constructor.upgma(dm)
print(upgmatree)

In [142]:
form_phyl = AlignIO.parse("align_results_hhex.sth","stockholm")
AlignIO.convert("align_results_hhex.sth","stockholm","align_results_hhex.phy","phylip")

alignments = AlignIO.parse("align_results_hhex.phylip", "phylip")
for alignment in alignments:
    print(alignment)
    print()

FileNotFoundError: [Errno 2] No such file or directory: 'resampled.phy'

In [None]:
njtree = constructor.nj(dm)
print(njtree)

In [None]:
Phylo.write([upgmatree, njtree], "phylo_trees.nhx","newick")

In [None]:
tree_up = Phylo.draw_ascii(upgmatree)

In [None]:
tree_nj = Phylo.draw_ascii(njtree)