In [24]:
from Bio import Entrez
from Bio import SeqIO
from Bio import SearchIO
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import Medline

In [2]:
def articles(term):
    """ 
    Variáveis:
     term: termo que queremos procurar no título dos artigos, que é o nosso gene
    Returns:
        imprime o título, abstract, autores e fonta dos artigos que têm o nosso termo no título
    """
    Entrez.email = "pg49836@alunos.uminho.pt"
    handles = Entrez.einfo()

    handle1 = Entrez.esearch(db = "pubmed", term = term + "[title]", retmax ="40")
    record = Entrez.read(handle1)

    id_list = record["IdList"]
    handle = Entrez.efetch(db = "pubmed", id = id_list, rettype = "medline", retmode = "text")
    records = Medline.parse(handle)
    print()
    for record in records:
        print("Titulo:", record.get("TI", "v"))                                     
        print()
        print("Autores:", record.get("AU", "v"))
        print("Fonte:", record.get("SO", "v"))
        print()
        print("Abstract:", record.get("AB", "v"))
        print()
        print("="*140)
        print()
articles("HHEX")


Titulo: CK2-induced cooperation of HHEX with the YAP-TEAD4 complex promotes colorectal tumorigenesis.

Autores: ['Guo Y', 'Zhu Z', 'Huang Z', 'Cui L', 'Yu W', 'Hong W', 'Zhou Z', 'Du P', 'Liu CY']
Fonte: Nat Commun. 2022 Aug 25;13(1):4995. doi: 10.1038/s41467-022-32674-6.

Abstract: Dysregulation of Hippo pathway leads to hyperactivation of YAP-TEAD transcriptional complex in various cancers, including colorectal cancer (CRC). In this study, we observed that HHEX (Hematopoietically expressed homeobox) may enhance transcription activity of the YAP-TEAD complex. HHEX associates with and stabilizes the YAP-TEAD complex on the regulatory genomic loci to coregulate the expression of a group of YAP/TEAD target genes. Also, HHEX may indirectly regulate these target genes by controlling YAP/TAZ expression. Importantly, HHEX is required for the pro-tumorigenic effects of YAP during CRC progression. In response to serum stimulation, CK2 (Casein Kinase 2) phosphorylates HHEX and enhances its int

ANÁLISE DA SEQUÊNCIA 

In [3]:
def seq_analysis(gb, fasta):
    '''
    Variáveis:
        gb: nome do ficheiro com extensão ".gb"
        fasta: nome do ficheiro convertido para a extensão .fasta
    Returns:
        imprime as informações contidas no ficheiro .gb e retorna um ficheiro .fasta
    '''

    record = SeqIO.read(gb, "genbank")
    id = record.name
    seq = record.seq
    seqlen = len(record.seq)
    source = record.annotations["source"]
    tam = len(record.annotations)
    desc = record.description
    features = len(record.features)
    totannot = record.annotations
    print(f"ID: {id} \n Sequência: {seq} \n Tamanho da sequência: {seqlen} bp \n Source: {source} \n Tamanho das anotações: {tam}")
    print(f"Descrição: {desc} \n Total features: {features}")
    print()
    print(f"Annotations: {totannot}")
    
    print()
    print("FEATURES:")
    for feat in record.features:
        print("-->", feat)
    print(f"Número de features: {features}")
    for feat in record.features:
        print("Type:", feat.type)
        print("Location:", feat.location)

    featcds = [ ]
    for i in range(len(record.features)):
        if record.features[i].type == "CDS":
            featcds.append(i)
    for k in featcds:
        print (record.features[k].location)
    for k in featcds:
        print (record.features[k].extract(record.seq))
    print(featcds)

    for feat in record.features:
        if feat.type == 'CDS':
            print("Proteína codificada: ", feat.qualifiers['product'])

    for feat in record.features:
        if feat.type == 'gene':
            print("Significado biológico: ", feat.qualifiers["note"])

    
    records = SeqIO.parse("HHEX.gb","genbank")
    count = SeqIO.write(records, fasta, "fasta")
    print(f'Foi convertido {count} registo.')

#seq_analysis("HHEX.gb", "HHEX.fasta")

ANÁLISE DE HOMOLOGIAS POR BLAST

In [7]:

def blast(fasta, blast_file):
    '''
    Variáveis:
        fasta: ficheiro fasta convertido anteriormente, com extensão .fasta
        blast_file: ficheiro com o resultado do BLAST, com extensão .xml
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados nucleotide
    '''
    record = SeqIO.read(fasta, format="fasta")
    print(len(record))
    result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)
    save_file = open(blast_file, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        #print(f"Matrix: {br.matrix}")
        print(f"Database: {br.database}")
        print(f"Gap penalty: {br.gap_penalties}")   

blast("HHEX.fasta", "blastn_HHEX.xml")


1724
Database: nt
Gap penalty: (5, 2)


In [5]:
def homologos(blast_file, evalue_thresh=0.01):
    '''
    Variáveis:
        blast_file: nome do ficheiro com o resultado do BLAST, com extensão .xml
        evalue_thresh: recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        retorna uma lista dos resultados do BLAST, de acordo com o valor dado pelo e-value
    '''
    evalue_tresh = evalue_thresh
    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        print("Database: ", br.database)
        print("Gap penalty: " , br.gap_penalties)
    print(len(br.alignments))
    for br_x in br.alignments:
        print(f"Acession number: {br_x.accession}")
        print(f"ID do hit: {br_x.hit_id}")
        print(f"Definição: {br_x.hit_def}")
        print(f"HSP: {br_x.hsps}")
        break #faço break porque só quero o primeiro organismo (HUMAN) -> deve dar pra fazer algo com o entrez e selecionar o Homo sapiens
    #change to be made: entrez_query = "Homo sapiens [organism]"
    print()
    for alignment in br.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("        ***ALINHAMENTO***")
                print(f"Identidade: {hsp.identities}")
                print(f"E-value: {hsp.expect}")
                print(f"Score: {hsp.score}")
                print(f"Tamanho: {hsp.align_length}")
                print(f"Caracteres iguais: {len(hsp.match)}")
                print("Query " + hsp.query[0:90] + "...")
                print("Match " + hsp.match[0:90] + "...")
                print("Sbjct " + hsp.sbjct[0:90] + "...")
                print()
    blastq_result = SearchIO.read(blast_file, "blast-xml")
    print(blastq_result)
    
    result_handle.close()
    
    blast_slice = blastq_result[:10]
    print(blast_slice)
    
#homologos("blastn_HHEX.xml")

In [12]:
def HHEX(gb, fasta, blast_file, gene, do_blast = False, evalue_thresh = 0.01):
    '''
    Variáveis:
        gb: ficheiro genbank obtido online (HHEX.gb), com extensão .gb
        blast_file:  nome do ficheiro com os resultados do BLAST ("blastn_HHEX.xml") #depois cada um alterar para o nome do seu gene
        do_blast: recebe o booleano False por default. Se blast = True, realiza um BLAST.
        evalue_tresh:  recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações dos genes
        '''

    
    seq_analysis(gb, fasta)
    if do_blast == True:
        blast(fasta, blast_file)
    homologos(blast_file, evalue_thresh = evalue_thresh)
    result_handle = open(blast_file, "r")
    #blast_record = NCBIXML.read(result_handle)
    FILE = str("results_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

HHEX("HHEX.gb", "HHEX.fasta", "blastn_HHEX.xml", "HHEX", do_blast = True)

#esta def resume todas as outras acima, por isso meio que não é necessário chamar as outras no final de cada

ID: NM_002729 
 Sequência: AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGCACCCCGGGCCGGCGGCGGGCGCCGTGGGGGTGCCGCTGTACGCGCCCACGCCGCTGCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCGCCGCGCCCACGCCCGCCCCCACGCTGCCGTCCCCCAACTCCTCCTTCACCAGCCTCGTGTCCCCCTACCGGACCCCGGTGTACGAGCCCACGCCGATCCATCCAGCCTTCTCGCACCACTCCGCCGCCGCGCTGGCCGCTGCCTACGGACCCGGCGGCTTCGGGGGCCCTCTGTACCCCTTCCCGCGGACGGTGAACGACTACACGCACGCCCTGCTCCGCCACGACCCCCTGGGCAAACCTCTACTCTGGAGCCCCTTCTTGCAGAGGCCTCTGCATAAAAGGAAAGGCGGCCAGGTGAGATTCTCCAACGACCAGACCATCGAGCTGGAGAAGAAATTCGAGACGCAGAAATATCTCTCTCCGCCCGAGAGGAAGCGTCTGGCCAAGATGCTGCAGCTCAGCGAGAGACAGGTCAAAACCTGGTTTCAGAATCGACGCGCTAAATGGAGGAGACTAAAACAGGAGAACCCTCAAAGCAATAAAAAAGAAGAACTGGAAAGTTTGGACAGTTCCTGTGATCAGAGGCAAGATTTGCCCAGTGAACAGAATAAAGGTGCTTCTTTGGATAGCTCTCAATGTTCGCCCTCCCCTGCCTCCCAGGAAGACCTTGAATCAGAGATTTCAGAGGATTCTGATCAGGAAGTGGACATTGAGGGCGATAAAAGCTATTTTAATGCTGGATGATGACCACTGGCATTGGCATGTTCAGAAAACTGGATTTAGGAATAATGTTTTGCTACAGAAAATCTTCATAGAAGAACTGGAAGGCTATATAAGAAAGGGAATCAATTCTCTGGTATTCTGGAAACCTA

FERRAMENTAS DE ANÁLISE DE PROTEÍNAS

In [13]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SearchIO
from Bio import ExPASy

In [14]:
def get_protein(id):
    """
    Variávies:
        id: identificador da proteína obtido na swissprot
    Returns:
        imprime informações sobre a proteína
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    id = seq_record.id
    seq = seq_record.seq
    tam = len(seq_record.seq)
    name = seq_record.name
    desc = seq_record.description
    com = seq_record.annotations["comment"]
    taxon = seq_record.annotations["taxonomy"]
    organism = seq_record.annotations["organism"]
    key = seq_record.annotations["keywords"]
    print(f"ID {id} \n Sequência: {seq} \n Tamanho da sequência: {tam} aa")
    print(f"Nome: {name} \n Descrição: {desc} \n Taxonomia: {taxon} \n Organismo: {organism} \n Keywords: {key}")
    
get_protein("Q03014")

ID Q03014 
 Sequência: MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPIHPAFSHHSAAALAAAYGPGGFGGPLYPFPRTVNDYTHALLRHDPLGKPLLWSPFLQRPLHKRKGGQVRFSNDQTIELEKKFETQKYLSPPERKRLAKMLQLSERQVKTWFQNRRAKWRRLKQENPQSNKKEELESLDSSCDQRQDLPSEQNKGASLDSSQCSPSPASQEDLESEISEDSDQEVDIEGDKSYFNAG 
 Tamanho da sequência: 270 aa
Nome: HHEX_HUMAN 
 Descrição: RecName: Full=Hematopoietically-expressed homeobox protein HHEX; Short=Homeobox protein HEX; AltName: Full=Homeobox protein PRH; 
 Taxonomia: ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'] 
 Organismo: Homo sapiens (Human) 
 Keywords: ['3D-structure', 'Developmental protein', 'Differentiation', 'DNA-binding', 'Homeobox', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repressor', 'Transcription', 'Transcription regulation', 'Wnt signaling pathway']


In [20]:
id = str(input())
handle = ExPASy.get_sprot_raw(id)
seq_record = SeqIO.read(handle, "swiss")
seq = seq_record.seq
print(seq)

MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPIHPAFSHHSAAALAAAYGPGGFGGPLYPFPRTVNDYTHALLRHDPLGKPLLWSPFLQRPLHKRKGGQVRFSNDQTIELEKKFETQKYLSPPERKRLAKMLQLSERQVKTWFQNRRAKWRRLKQENPQSNKKEELESLDSSCDQRQDLPSEQNKGASLDSSQCSPSPASQEDLESEISEDSDQEVDIEGDKSYFNAG


In [15]:
def prot_blast(blastp_file, id):
    """ 
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
        query: sequência proteica query
        id: identificador da proteína obtido na swissprot
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados (swissprot?)
    """
    #id = str(input())
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    #seq_prot= query
    result_handle = NCBIWWW.qblast('blastp', 'nr', seq_prot)

    save_file = open(blastp_file,"w")
    save_file.write(result_handle.read())
    save_file.close()

    blast_record = NCBIXML.parse(result_handle)
    result_handle.close()

prot_blast("prot_blastp_nr.xml", "Q03014")

#aqui fiz o blast na DB non-redundant pra descobrir os homólogos,
#depois vou fazer na swiss prot. Não sei se faz sentido

In [17]:
def homologos_p (blastp_file, evalue_thresh = None):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
        evalue_tresh: evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLASTp
    Returns:
        retorna uma lista dos resultados do BLASTp, e respetivo valor dado pelo e-value
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    FILE = str("seqshomologas_blastp.fasta")
    save_file = open(FILE, 'w+')
    if evalue_thresh == None:
        evalue_thresh = 0.05
    for alignment in blast_record.alignments:
        for hsp in range(len(alignment.hsps)):
            if alignment.hsps[hsp].expect < evalue_thresh:
                if hsp != 0:
                    pass
                else:
                    save_file.write('>' + alignment.title + '\n' + alignment.hsps[hsp].sbjct + '\n')
    
    result_handle.close()

homologos_p("prot_blastp_nr.xml")

In [18]:
def prot_file(blastp_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
    Returns:
        imprime informações dos alinhamentos das proteínas
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    evalue_tresh = 1e-30
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("      ***ALINHAMENTO***")
                print("Sequência: ", alignment.title)
                print("Tamanho da sequência: ", alignment.length)
                print("E-value:", hsp.expect)
                print("Score: ", hsp.score)
                print(f"Caracteres iguais: {len(hsp.match)}")
                print(hsp.query[0:75] + "...")
                print(hsp.match[0:75] + "...")
                print(hsp.sbjct[0:75] + "...")
                print()
    
    blastq_result = SearchIO.read(blastp_file, "blast-xml")
    print(blastq_result)
    for br in blastq_result:
        print(f'Sequence ID: {br.id}')
        print(f'Description: {br.description}')
        print(f'E-value: {br[0].evalue}')
        print(f'Bit Score: {br[0].bitscore}')
        print(f'Alignment:\n{br[0].aln}')
        print()
    result_handle.close()

prot_file("prot_blastp_nr.xml")

      ***ALINHAMENTO***
Sequência:  ref|NP_002720.1| hematopoietically-expressed homeobox protein HHEX [Homo sapiens] >ref|XP_003825722.2| hematopoietically-expressed homeobox protein HHEX [Pan paniscus] >ref|XP_507925.2| hematopoietically-expressed homeobox protein HHEX [Pan troglodytes] >sp|Q03014.1| RecName: Full=Hematopoietically-expressed homeobox protein HHEX; Short=Homeobox protein HEX; AltName: Full=Homeobox protein PRH [Homo sapiens] >gb|ABZ92007.1| hematopoietically expressed homeobox, partial [synthetic construct] >gb|AAH15110.1| Hematopoietically expressed homeobox [Homo sapiens] >gb|AAH50638.1| Hematopoietically expressed homeobox [Homo sapiens] >gb|EAW50087.1| homeobox, hematopoietically expressed, isoform CRA_a [Homo sapiens] >gb|EAW50088.1| homeobox, hematopoietically expressed, isoform CRA_a [Homo sapiens]
Tamanho da sequência:  270
E-value: 0.0
Score:  1422.0
Caracteres iguais: 270
MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPI...
MQYPHPGP

In [47]:
def PROTEIN(id, blastp_file, gene, blast = False, evalue_thresh = 0.01):
    """ 
    Variáveis:
        id: id da proteína obtido na swissprot
        blastp_file:  nome do ficheiro com os resultados do BLASTp ("blastp_HHEX.xml") #depois cada um alterar para o nome do seu gene
        blast: recebe o booleano False por default. Se blast = True, realiza um BLAST na base de dados swissprot
        evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações das proteínas
    """
    if blast == True:
        prot_blast(blastp_file, id)
    x = get_protein(id)
    print(x)
    homologos_p(blastp_file, evalue_thresh)
    prot_file(blastp_file)
    result_handle = open(blastp_file, "r")
    FILE = str("results_protein_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()
PROTEIN("Q03014", "prot_blastp_swiss.xml", "HHEX", False)

ID Q03014 
 Sequência: MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPIHPAFSHHSAAALAAAYGPGGFGGPLYPFPRTVNDYTHALLRHDPLGKPLLWSPFLQRPLHKRKGGQVRFSNDQTIELEKKFETQKYLSPPERKRLAKMLQLSERQVKTWFQNRRAKWRRLKQENPQSNKKEELESLDSSCDQRQDLPSEQNKGASLDSSQCSPSPASQEDLESEISEDSDQEVDIEGDKSYFNAG 
 Tamanho da sequência: 270 aa
Nome: HHEX_HUMAN 
 Descrição: RecName: Full=Hematopoietically-expressed homeobox protein HHEX; Short=Homeobox protein HEX; AltName: Full=Homeobox protein PRH; 
 Taxonomia: ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'] 
 Organismo: Homo sapiens (Human) 
 Keywords: ['3D-structure', 'Developmental protein', 'Differentiation', 'DNA-binding', 'Homeobox', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repressor', 'Transcription', 'Transcription regulation', 'Wnt signaling pathway']
None
      ***ALINHAMENTO***
Sequência:  sp|Q03014

In [21]:
from Bio.PDB.PDBParser import PDBParser

In [22]:
def PDB(id, pdb_file):
    """ 
    Variáveis:
        id: id da proteína obtido da base de datos de estruturas de proteínas PDB
        pdb_file: nome do ficheiro com as informações sobre a estrutura da proteína obtidas a partir da base de dados PDB ("2e1o.pdb") 
    Returns:
        imprime informações da estrutura da proteína e a sua estrutura 3D
    """
    p = PDBParser(PERMISSIVE=1)
    s = p.get_structure(id, pdb_file)
    for chain in s[0]:
        print(f'Chain ID: {chain.id}')
    smeth = s.header['structure_method']
    keywords = s.header['keywords']
    comp = s.header["compound"]
    print("Keywords: " , keywords)
    print("Structure Method: ", smeth)
    print("Composto: ", comp)

    import nglview as nv
    nv.show_biopython(s, gui=True)
    #isto funciona no jupyter notebook mas aqui dá erro... mas é isto

PDB("2E10", "2e1o.pdb")

Chain ID: A
Keywords:  dna binding protein, structural genomics, nppsfa, national project on protein structural and functional analyses, riken structural genomics/proteomics initiative, rsgi, unknown function
Structure Method:  solution nmr
Composto:  {'1': {'misc': '', 'molecule': 'homeobox protein prh', 'chain': 'a', 'fragment': 'homeobox domain', 'synonym': 'hematopoietically expressed homeobox, homeobox protein hex', 'engineered': 'yes'}}


AttributeError: 'super' object has no attribute '_ipython_display_'

In [48]:
def CDD(id, cdd_file):
    """ 
    Variáveis:
        id: 
        cdd_file: nome do ficheiro com as informações sobre os domínios das proteínas obtidas a partir da base de dados CDD ("cdd_file_HHEX")
    Returns:
        imprime as informações obtidas a partir da base de dados CDD
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    result_handle = NCBIWWW.qblast("blastp", "CDD", seq_prot)
    save_file = open(cdd_file, "w")
    blast_records = SearchIO.read(result_handle, "blast-xml")
    save_file.write(print(blast_records[:]))
    save_file.close()
    result_handle.close()
CDD("Q03014", "cdd_file_HHEX")


Program: blastp (2.13.0+)
  Query: unnamed (270)
         protein product
 Target: CDD
   Hits: 0


TypeError: write() argument must be str, not None

Alinhamento Múltiplo

In [26]:
from Bio.Align import MultipleSeqAlignment
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio import SeqIO
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import AlignInfo

In [32]:
def align(blast_file, align_file):
    """
    Variáveis:
        blast_file: ficheiro com o resultado do blast (ex: blastn_HHEX.xml)
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
    Returns:
        gera um ficheiro, com extensão .fasta (ex: obterseqs.fasta) com as sequências alinhadas e imprime-as
    """
    result_blast = open(blast_file)
    blast_records = NCBIXML.read(result_blast)
    FILE = str("obterseqs.fasta")
    save_file = open(FILE, 'w+')
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            save_file.write('>' + alignment.title + '\n' + hsp.sbjct[0:45] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()
align("blastn_HHEX.xml", "obterseqs.fasta")

> gi|1519245767|ref|NM_002729.5| Homo sapiens hematopoietically expressed homeobox (HHEX), mRNA 
 AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGCACCCCGGGCCGGCGGCGGGCGCCGTGGGGGTGCCGCTGTACGCGCCCACGCCGCTGCTGCAACCCGCACACCCGACGCCCTTTTACATCGAGGACATCCTGGGCCGCGGGCCCGCCGCGCCCACGCCCGCCCCCACGCTGCCGTCCCCCAACTCCTCCTTCACCAGCCTCGTGTCCCCCTACCGGACCCCGGTGTACGAGCCCACGCCGATCCATCCAGCCTTCTCGCACCACTCCGCCGCCGCGCTGGCCGCTGCCTACGGACCCGGCGGCTTCGGGGGCCCTCTGTACCCCTTCCCGCGGACGGTGAACGACTACACGCACGCCCTGCTCCGCCACGACCCCCTGGGCAAACCTCTACTCTGGAGCCCCTTCTTGCAGAGGCCTCTGCATAAAAGGAAAGGCGGCCAGGTGAGATTCTCCAACGACCAGACCATCGAGCTGGAGAAGAAATTCGAGACGCAGAAATATCTCTCTCCGCCCGAGAGGAAGCGTCTGGCCAAGATGCTGCAGCTCAGCGAGAGACAGGTCAAAACCTGGTTTCAGAATCGACGCGCTAAATGGAGGAGACTAAAACAGGAGAACCCTCAAAGCAATAAAAAAGAAGAACTGGAAAGTTTGGACAGTTCCTGTGATCAGAGGCAAGATTTGCCCAGTGAACAGAATAAAGGTGCTTCTTTGGATAGCTCTCAATGTTCGCCCTCCCCTGCCTCCCAGGAAGACCTTGAATCAGAGATTTCAGAGGATTCTGATCAGGAAGTGGACATTGAGGGCGATAAAAGCTATTTTAATGCTGGATGATGACCACTGGCATTGGCATGTTCAGAAAACTGGATTTAGGAATAATGTTTTGCTACA

In [53]:
def align_prot(blastp_file, align_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blast (ex: blastn_HHEX.xml)
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
    Returns:
        gera um ficheiro, com extensão .fasta (ex: obterseqs.fasta) com as sequências alinhadas e imprime-as
    """
    result_blast = open(blastp_file)
    blast_records = NCBIXML.read(result_blast)
    FILE = str("obterprotseqs.fasta")
    save_file = open(FILE, 'w+')
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            save_file.write('>' + alignment.title + '\n' + hsp.sbjct[0:45] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()
align("prot_blastp_swiss.xml", "obterprotseqs.fasta")

> sp|Q03014.1| RecName: Full=Hematopoietically-expressed homeobox protein HHEX; Short=Homeobox protein HEX; AltName: Full=Homeobox protein PRH [Homo sapiens] 
 MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPIHPAFSHHSAAALAAAYGPGGFGGPLYPFPRTVNDYTHALLRHDPLGKPLLWSPFLQRPLHKRKGGQVRFSNDQTIELEKKFETQKYLSPPERKRLAKMLQLSERQVKTWFQNRRAKWRRLKQENPQSNKKEELESLDSSCDQRQDLPSEQNKGASLDSSQCSPSPASQEDLESEISEDSDQEVDIEGDKSYFNAG

> sp|P43120.1| RecName: Full=Hematopoietically-expressed homeobox protein Hhex; Short=Homeobox protein HEX; Short=mHex; AltName: Full=Homeobox protein PRH [Mus musculus] 
 MQYPHPGPAAGAVGVP-LYAPTPLLQPAHPTPFYIEDILGRGPAAPTPAPTLPSPNSSFTSLVSPYRTPVYEPTPIHPAFSHHSAAALAAAYGPGGFGGPLYPFPRTVNDYTHALLRHDPLGKPLLWSPFLQRPLHKRKGGQVRFSNDQTIELEKKFETQKYLSPPERKRLAKMLQLSERQVKTWFQNRRAKWRRLKQENPQSNKKEELESLDSSCDQRQDLPSEQNKGASLDSSQCSPSPASQEDLESEISEDSDQEVDIEGDKSYFNAG

> sp|Q8AWG6.1| RecName: Full=Hematopoietically-expressed homeobox protein hhex; Short=Homeobox protein hex; Short=tHex [Xen

In [50]:
def parse_align(align_file, alinhamento):
    """ 
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file,format = "fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")
parse_align("obterseqs.fasta", "align_HHEX")

Alignment with 54 rows and 45 columns
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|1519245767|ref|NM_002729.5|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|15929354|gb|BC015110.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|15680040|gb|BC014336.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|1367123331|ref|XM_507925.5|
CGGAGCCATGCAGTACCCGCACCCCGGGCCGGCGGCGGGCGCCGT gi|32547|emb|X67235.1|
GCCATGCAGTACCCGCACCCCGGGCCGGCGGCGGGCGCCGTGGGG gi|30048158|gb|BC050638.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|1849002271|ref|XM_003825674.3|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|1753031712|ref|XM_031015733.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|1351383818|ref|XM_024253783.1|
GCGAGGGGCGGG--CGCGGCGGAGCCATGCAGTACCCGCACCCCG gi|292404|gb|L16499.1|HUMPRH
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTATCCGC gi|1743203112|ref|XM_030802514.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC gi|1059150898|ref|XM_017876753.1|
AGCTCTGCGAGGGGCCGGAGCGCGGCGGAGCCATGCAGTACCCGC

In [54]:
def parse_align_prot(align_file, alinhamento):
    """ 
    Variáveis:
        align_file_prot: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file,format = "fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")
parse_align_prot("obterprotseqs.fasta", "align_HHEX_prot")

Alignment with 50 rows and 45 columns
MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAP sp|Q03014.1|
MQFPHPGPAAAPAVGVPLYAPTPLLQPAHPTPFYIDDILGRGPAA sp|P43120.1|
MQYQHPSSSALGLSVPLYAPTPL-QPVHPTPFYIDDILGRSSASN sp|Q8AWG6.1|
MQYQHPSSSALGLSVPLFAPTPL---QHPTPFYIDDILGRNSASN sp|O13023.1|
GVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAAPAPHSLPAPPP sp|Q05502.1|
FYIEDILGRTGSSSGPVVPTP--TLPSPNSSFTSLIPSYRTPIYE sp|Q9IAV3.1|
LHNPHMNHHHGLVGPGLAPLSAPNGIQSLNTLHNGSGPPSH-TPF sp|D2KQB0.1|
PYMSKSPQKRKGGQIRFTNEQTDALEHKFDSHKYLSPQERKKLAK sp|Q21578.5|
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|Q61670.1|
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|Q14774.3|
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|A0JPN1.1|
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|A7MB54.1|
KRKKARTTFSGKQVFELEKQFEAKKYLSSSDRSELAKRLDVTETQ sp|P56407.2|
FTSEQLLELEKEFHCKKYLSLTERSQIAHVLKLSEVQVKIWFQNR sp|Q91907.1|
PTSFFIEDIL------------------------------LHKPK sp|Q810B3.1|
KNRRRRTAFTSEQLLELEKEFHCKKYLSLTERSQIAHALKLSEVQ sp|P52951.3|
KNRRRRTAFTSEQLLELE

In [31]:
def consensus(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format = "fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus
consensus("align_HHEX")




Seq('AGCTCTXCGAGGGGCCGGAGCGCXGCGGAGCCATGCAGTACCCGC')

In [32]:
def consensus_prot(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format = "fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus
consensus("align_HHEX_prot")




Seq('MQYPHPGPAAXXXXXXXXXXXXXLXXXXXXXXXXXXXXXXXXXAX')

In [34]:
def stockholm(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta) 
        stock_file: ficheiro com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX.sth)
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas (ex: align_results_HHEX.sth)
    """
    form_stock = AlignIO.parse(align_file,"fasta")
    AlignIO.convert(align_file,"fasta", stock_file,"stockholm")
    #dá erro de ids duplicados
stockholm("align_HHEX", "align_results_HHEX.sth")

ValueError: Duplicate record identifier: gi|1799962366|ref|XM_032170111.1|

In [55]:
def stockholm_prot(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta) 
        stock_file: ficheio com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX.sth)
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas (ex: align_results_HHEX.sth)
    """
    form_stock = AlignIO.parse(align_file,"fasta")
    AlignIO.convert(align_file,"fasta", stock_file, "stockholm")
stockholm_prot("align_HHEX_prot", "align_results_HHEX_prot.sth")

Árvore filogenética

In [35]:
from Bio import Phylo
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor


In [67]:
def phylo_tree(stockholm_file):
    """ 
    Variáveis:
        stockholm_file: ficheiro com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX_prot.sth)

    Returns:
        imprime a distância entre as espécies na árvore, constrói as árvores UPGMA e Neighbour Joining e cria um ficheiro newick (ex: phylo_trees.nhx) que contém ambas as árvores
    """
    alignment = AlignIO.read(open(stockholm_file),"stockholm")
    #print(alignment)
    print("*" * 140)
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alignment)
    print(dm)
    print("*" * 140)
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    print(upgmatree)
    print("*" * 140)
    njtree = constructor.nj(dm)
    print(njtree)
    print("*" * 140)
    Phylo.write([upgmatree, njtree], "phylo_trees.nhx","newick")
    
    tree_up = Phylo.draw_ascii(upgmatree)
    print("*" * 140)
    tree_nj = Phylo.draw_ascii(njtree)

phylo_tree("align_results_HHEX_prot.sth")

********************************************************************************************************************************************
sp|Q03014.1|	0
sp|P43120.1|	0.9163346613545816	0
sp|Q8AWG6.1|	0.32793522267206476	1.0489795918367346	0
sp|O13023.1|	0.3659574468085106	1.0254237288135593	0.06726457399103136	0
sp|Q05502.1|	1.1952191235059761	1.316	1.1877551020408164	1.2042553191489362	0
sp|Q9IAV3.1|	1.0541666666666667	1.130801687763713	1.09009009009009	1.1132075471698113	1.217573221757322	0
sp|D2KQB0.1|	1.08300395256917	1.150197628458498	1.174089068825911	1.1223628691983123	1.1541501976284585	1.1528925619834711	0
sp|Q21578.5|	1.207171314741036	1.1285140562248996	1.1752136752136753	1.1555555555555554	1.252	1.0308370044052864	1.2173913043478262	0
sp|Q61670.1|	1.2151394422310757	1.176706827309237	1.2103004291845494	1.2466367713004485	1.1360000000000001	1.24	1.1699604743083003	1.1890756302521008	0
sp|Q14774.3|	1.2151394422310757	1.176706827309237	1.2103004291845494	1.2466367713004485	

In [59]:
form_phyl = AlignIO.parse("align_results_HHEX_prot.sth","stockholm")
AlignIO.convert("align_results_HHEX_prot.sth","stockholm","align_results_HHEX_protphy.phy","phylip")

alignments = AlignIO.parse("align_results_HHEX_protphy.phy", "phylip")
for alignment in alignments:
    print(alignment)
    print()

Alignment with 50 rows and 45 columns
MQYPHPGPAAGAVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAP sp|Q03014.
MQFPHPGPAAAPAVGVPLYAPTPLLQPAHPTPFYIDDILGRGPAA sp|P43120.
MQYQHPSSSALGLSVPLYAPTPL-QPVHPTPFYIDDILGRSSASN sp|Q8AWG6.
MQYQHPSSSALGLSVPLFAPTPL---QHPTPFYIDDILGRNSASN sp|O13023.
GVGVPLYAPTPLLQPAHPTPFYIEDILGRGPAAAPAPHSLPAPPP sp|Q05502.
FYIEDILGRTGSSSGPVVPTP--TLPSPNSSFTSLIPSYRTPIYE sp|Q9IAV3.
LHNPHMNHHHGLVGPGLAPLSAPNGIQSLNTLHNGSGPPSH-TPF sp|D2KQB0.
PYMSKSPQKRKGGQIRFTNEQTDALEHKFDSHKYLSPQERKKLAK sp|Q21578.
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|Q61670.
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|Q14774.
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|A0JPN1.
KRSWSRAVFSNLQRKGLEKRFEIQKYVTKPDRKQLAAMLGLTDAQ sp|A7MB54.
KRKKARTTFSGKQVFELEKQFEAKKYLSSSDRSELAKRLDVTETQ sp|P56407.
FTSEQLLELEKEFHCKKYLSLTERSQIAHVLKLSEVQVKIWFQNR sp|Q91907.
PTSFFIEDIL------------------------------LHKPK sp|Q810B3.
KNRRRRTAFTSEQLLELEKEFHCKKYLSLTERSQIAHALKLSEVQ sp|P52951.
KNRRRRTAFTSEQLLELEKEFHCKKYLSLTERSQIAHALKLSEVQ sp|P