In [2]:
from Bio import Entrez
from Bio import SeqIO
from Bio import SearchIO
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import Medline

In [5]:
def articles(term):
    """ 
    Variáveis:
     term: termo que queremos procurar no título dos artigos, que é o nosso gene
    Returns:
        imprime o título, abstract, autores e fonta dos artigos que têm o nosso termo no título
    """
    Entrez.email = "pg45966@alunos.uminho.pt"
    handles = Entrez.einfo()

    handle1 = Entrez.esearch(db = "pubmed", term = term + "[title]", retmax ="40")
    record = Entrez.read(handle1)

    id_list = record["IdList"]
    handle = Entrez.efetch(db = "pubmed", id = id_list, rettype = "medline", retmode = "text")
    records = Medline.parse(handle)
    print()
    for record in records:
        print("Titulo:", record.get("TI", "v"))                                     
        print()
        print("Autores:", record.get("AU", "v"))
        print("Fonte:", record.get("SO", "v"))
        print()
        print("Abstract:", record.get("AB", "v"))
        print()
        print("="*140)
        print()
articles("ide")


Titulo: High-potency nucleos(t)ide analogues alone or plus immunoglobulin for HBV prophylaxis after liver transplantation: a meta-analysis.

Autores: ['Sheng LP', 'Zhang JC', 'Zhong ZQ', 'Sheng XH', 'Ren J', 'Wang GQ']
Fonte: Hepatol Int. 2023 Jan 2. doi: 10.1007/s12072-022-10466-w.

Abstract: BACKGROUND: The optimum prophylactic regimen against hepatitis B virus (HBV) recurrence after liver transplantation (LT) in HBV-infected patients is uncertain but of great clinical relevance. New evidence suggests that hepatitis B immunoglobulin (HBIG)-free approach would become a reasonable choice in the era of high-potency nucleos(t)ide analogues (HPNAs). We aimed to provide robust estimates for long-term survival and HBV recurrence in patients receiving different HBV-prophylaxis strategies after LT. METHODS: We did a systematic review and meta-analysis using both pseudo-individual patient data recovered from included studies (IPDMA) and conventional trial-level aggregate data meta-analysis (A

ANÁLISE DA SEQUÊNCIA 

In [4]:
def seq_analysis(gb, fasta):
    '''
    Variáveis:
        gb: nome do ficheiro com extensão ".gb"
        fasta: nome do ficheiro convertido para a extensão .fasta
    Returns:
        imprime as informações contidas no ficheiro .gb e retorna um ficheiro .fasta
    '''

    record = SeqIO.read(gb, "genbank")
    id = record.name
    seq = record.seq
    seqlen = len(record.seq)
    source = record.annotations["source"]
    tam = len(record.annotations)
    desc = record.description
    features = len(record.features)
    totannot = record.annotations
    print(f"ID: {id} \n Sequência: {seq} \n Tamanho da sequência: {seqlen} bp \n Source: {source} \n Tamanho das anotações: {tam}")
    print(f"Descrição: {desc} \n Total features: {features}")
    print()
    print(f"Annotations: {totannot}")
    
    print()
    print("FEATURES:")
    for feat in record.features:
        print("-->", feat)
    print(f"Número de features: {features}")
    for feat in record.features:
        print("Type:", feat.type)
        print("Location:", feat.location)

    featcds = [ ]
    for i in range(len(record.features)):
        if record.features[i].type == "CDS":
            featcds.append(i)
    for k in featcds:
        print (record.features[k].location)
    for k in featcds:
        print (record.features[k].extract(record.seq))
    print(featcds)

    for feat in record.features:
        if feat.type == 'CDS':
            print("Proteína codificada: ", feat.qualifiers['product'])

    for feat in record.features:
        if feat.type == 'gene':
            print("Significado biológico: ", feat.qualifiers["note"])

    
    records = SeqIO.parse(gb,"genbank")
    count = SeqIO.write(records, fasta, "fasta")
    print(f'Foi convertido {count} registo.')

seq_analysis("ide.gb", "ide.fasta")

ID: NM_004969 
 Sequência: ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTTTGCGGTGATCCCGGCGACTGCGCTGGCTAATGCGGTACCGGCTAGCGTGGCTTCTGCACCCCGCACTGCCCAGCACCTTCCGCTCAGTCCTCGGCGCCCGCCTGCCGCCTCCGGAGCGCCTGTGTGGTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCAAGAGAATAGGAAATCACATTACCAAGTCTCCTGAAGACAAGCGAGAATATCGAGGGCTAGAGCTGGCCAATGGTATCAAAGTACTTCTTATCAGTGATCCCACCACGGATAAGTCATCAGCAGCACTTGATGTGCACATAGGTTCATTGTCGGATCCTCCAAATATTGCTGGCTTAAGTCATTTTTGTGAACATATGCTTTTTTTGGGAACAAAGAAATACCCTAAAGAAAATGAATACAGCCAGTTTCTCAGTGAGCATGCAGGAAGTTCAAATGCCTTTACTAGTGGAGAGCATACCAATTACTATTTTGATGTTTCTCATGAACACCTAGAAGGTGCCCTAGACAGGTTTGCACAGTTTTTTCTGTGCCCCTTGTTCGATGAAAGTTGCAAAGACAGAGAGGTGAATGCAGTTGATTCAGAACATGAGAAGAATGTGATGAATGATGCCTGGAGACTCTTTCAATTGGAAAAAGCTACAGGGAATCCTAAACACCCCTTCAGTAAATTTGGGACAGGTAACAAATATACTCTGGAGACTAGACCAAACCAAGAAGGCATTGATGTAAGACAAGAGCTACTGAAATTCCATTCTGCTTACTATTCATCCAACTTAATGGCTGTTTGTGTTTTAGGTCGAGAATCTTTAGATGACTTGACTAATCTGGTGGTAAAGTTATTTTCTGAAGTAGAGAACAAAAATGTTCCATTGCCAGAATTTCCTGAACACCCTTTCCAAGAAGAACATCTTAAACA

ANÁLISE DE HOMOLOGIAS POR BLAST

In [3]:

def blast(fasta, blast_file):
    '''
    Variáveis:
        fasta: ficheiro fasta convertido anteriormente, com extensão .fasta
        blast_file: ficheiro com o resultado do BLAST, com extensão .xml
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados nucleotide
    '''
    record = SeqIO.read(fasta, format="fasta")
    print(len(record))
    result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)
    save_file = open(blast_file, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        #print(f"Matrix: {br.matrix}")
        print(f"Database: {br.database}")
        print(f"Gap penalty: {br.gap_penalties}")   

blast("ide.fasta", "blastn_ide.xml")


FileNotFoundError: [Errno 2] No such file or directory: 'ide.fasta'

In [71]:
def homologos(blast_file, evalue_thresh=0.01):
    '''
    Variáveis:
        blast_file: nome do ficheiro com o resultado do BLAST, com extensão .xml
        evalue_thresh: recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        retorna uma lista dos resultados do BLAST, de acordo com o valor dado pelo e-value
    '''
    evalue_tresh = evalue_thresh
    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        print("Database: ", br.database)
        print("Gap penalty: " , br.gap_penalties)
    print(len(br.alignments))
    for br_x in br.alignments:
        print(f"Acession number: {br_x.accession}")
        print(f"ID do hit: {br_x.hit_id}")
        print(f"Definição: {br_x.hit_def}")
        print(f"HSP: {br_x.hsps}")
        break #faço break porque só quero o primeiro organismo (HUMAN) -> deve dar pra fazer algo com o entrez e selecionar o Homo sapiens
    #change to be made: entrez_query = "Homo sapiens [organism]"
    print()
    for alignment in br.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("        ***ALINHAMENTO***")
                print(f"Identidade: {hsp.identities}")
                print(f"E-value: {hsp.expect}")
                print(f"Score: {hsp.score}")
                print(f"Tamanho: {hsp.align_length}")
                print(f"Caracteres iguais: {len(hsp.match)}")
                print("Query " + hsp.query[0:90] + "...")
                print("Match " + hsp.match[0:90] + "...")
                print("Sbjct " + hsp.sbjct[0:90] + "...")
                print()
    blastq_result = SearchIO.read(blast_file, "blast-xml")
    print(blastq_result)
    
    result_handle.close()
    
    blast_slice = blastq_result[:10]
    print(blast_slice)
    
homologos("blastn_ide.xml")

Database:  nt
Gap penalty:  (5, 2)
50
Acession number: NM_004969
ID do hit: gi|1653961657|ref|NM_004969.4|
Definição: Homo sapiens insulin degrading enzyme (IDE), transcript variant 1, mRNA
HSP: [<Bio.Blast.Record.HSP object at 0x000001CD5C706F70>]

        ***ALINHAMENTO***
Identidade: 5894
E-value: 0.0
Score: 11788.0
Tamanho: 5894
Caracteres iguais: 5894
Query ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTTTGCGGTGATCCCGGCGACTGCGCTGGCTAATGCGGTACCGGCTAG...
Match ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
Sbjct ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTTTGCGGTGATCCCGGCGACTGCGCTGGCTAATGCGGTACCGGCTAG...

        ***ALINHAMENTO***
Identidade: 5724
E-value: 0.0
Score: 11448.0
Tamanho: 5724
Caracteres iguais: 5724
Query GGTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCAAGAGAATAGGAAATCACATTACCAAGTCTCCTGAAGACAAGCGA...
Match ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
Sbjct GGTTTCCAAAAAAAGACTTACAGCA

In [72]:
def ide(gb, fasta, blast_file, gene, do_blast = False, evalue_thresh = 0.01):
    '''
    Variáveis:
        gb: ficheiro genbank obtido online (ide.gb), com extensão .gb
        blast_file:  nome do ficheiro com os resultados do BLAST ("blastn_ide.xml") #depois cada um alterar para o nome do seu gene
        do_blast: recebe o booleano False por default. Se blast = True, realiza um BLAST.
        evalue_tresh:  recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações dos genes
        '''

    
    seq_analysis(gb, fasta)
    if do_blast == True:
        blast(fasta, blast_file)
    homologos(blast_file, evalue_thresh = evalue_thresh)
    result_handle = open(blast_file, "r")
    #blast_record = NCBIXML.read(result_handle)
    FILE = str("results_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

ide("ide.gb", "ide.fasta", "blastn_ide.xml", "ide", do_blast = True)

#esta def resume todas as outras acima, por isso meio que não é necessário chamar as outras no final de cada

ID: NM_004969 
 Sequência: ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTTTGCGGTGATCCCGGCGACTGCGCTGGCTAATGCGGTACCGGCTAGCGTGGCTTCTGCACCCCGCACTGCCCAGCACCTTCCGCTCAGTCCTCGGCGCCCGCCTGCCGCCTCCGGAGCGCCTGTGTGGTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCAAGAGAATAGGAAATCACATTACCAAGTCTCCTGAAGACAAGCGAGAATATCGAGGGCTAGAGCTGGCCAATGGTATCAAAGTACTTCTTATCAGTGATCCCACCACGGATAAGTCATCAGCAGCACTTGATGTGCACATAGGTTCATTGTCGGATCCTCCAAATATTGCTGGCTTAAGTCATTTTTGTGAACATATGCTTTTTTTGGGAACAAAGAAATACCCTAAAGAAAATGAATACAGCCAGTTTCTCAGTGAGCATGCAGGAAGTTCAAATGCCTTTACTAGTGGAGAGCATACCAATTACTATTTTGATGTTTCTCATGAACACCTAGAAGGTGCCCTAGACAGGTTTGCACAGTTTTTTCTGTGCCCCTTGTTCGATGAAAGTTGCAAAGACAGAGAGGTGAATGCAGTTGATTCAGAACATGAGAAGAATGTGATGAATGATGCCTGGAGACTCTTTCAATTGGAAAAAGCTACAGGGAATCCTAAACACCCCTTCAGTAAATTTGGGACAGGTAACAAATATACTCTGGAGACTAGACCAAACCAAGAAGGCATTGATGTAAGACAAGAGCTACTGAAATTCCATTCTGCTTACTATTCATCCAACTTAATGGCTGTTTGTGTTTTAGGTCGAGAATCTTTAGATGACTTGACTAATCTGGTGGTAAAGTTATTTTCTGAAGTAGAGAACAAAAATGTTCCATTGCCAGAATTTCCTGAACACCCTTTCCAAGAAGAACATCTTAAACA

FERRAMENTAS DE ANÁLISE DE PROTEÍNAS

In [8]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SearchIO
from Bio import ExPASy

In [11]:
def get_protein(id):
    """
    Variávies:
        id: identificador da proteína obtido na swissprot
    Returns:
        imprime informações sobre a proteína
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    id = seq_record.id
    seq = seq_record.seq
    tam = len(seq_record.seq)
    name = seq_record.name
    desc = seq_record.description
    com = seq_record.annotations["comment"]
    taxon = seq_record.annotations["taxonomy"]
    organism = seq_record.annotations["organism"]
    key = seq_record.annotations["keywords"]
    print(f"ID {id} \n Sequência: {seq} \n Tamanho da sequência: {tam} aa")
    print(f"Nome: {name} \n Descrição: {desc} \n Taxonomia: {taxon} \n Organismo: {organism} \n Keywords: {key}")
    
get_protein("P14735.4")

ID P14735 
 Sequência: RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPAIKRIGNHITKSPEDKREYRGLELANGIKVLLMSDPTTDKSSAALDVHIGSLSDPPNIAGLSHFCEHMLFLGTKKYPKENEYSQFLSEHAGSSNAFTSGEHTNYYFDVSHEHLEGALDRFAQFFLCPLFDESCKDREVNAVDSEHEKNVMNDAWRLFQLEKATGNPKHPFSKFGTGNKYTLETRPNQEGIDVRQELLKFHSAYYSSNLMAVCVLGRESLDDLTNLVVKLFSEVENKNVPLPEFPEHPFQEEHLKQLYKIVPIKDIRNLYVTFPIPDLQKYYKSNPGHYLGHLIGHEGPGSLLSELKSKGWVNTLVGGQKEGARGFMFFIINVDLTEEGLLHVEDIILHMFQYIQKLRAEGPQEWVFQECKDLNAVAFRFKDKERPRGYTSKIAGILHYYPLEEVLTAEYLLEEFRPDLIEMVLDKLRPENVRVAIVSKSFEGKTDRTEEWYGTQYKQEAIPDEVIKKWQNADLNGKFKLPTKNEFIPTNFEILPLEKEATPYPALIKDTVMSKLWFKQDDKKKKPKACLNFEFFSPFAYVDPLHCNMAYLYLELLKDSLNEYAYAAELAGLSYDLQNTIYGMYLSVKGYNDKQPILLKKIIEKMATFEIDEKRFEIIKEAYMRSLNNFRAEQPHQHAMYYLRLLMTEVAWTKDELKEALDDVTLPRLKAFIPQLLSRLHIEALLHGNITKQAALGIMQMVEDTLIEHAHTKPLLPSQLVRYREVQLPDRGWFVYQQRNEVHNNCGIEIYYQTDMQSTSENMFLELFCQIISEPCFNTLRTKEQLGYIVFSGPRRANGIQSLRFIIQSEKPPHYLESRVEAFLITMEKSIEDMTEEAFQKHIQALAIRRLDKPKKLSAECAKYWGEIISQQYNFDRDNTEVAYLKTLTKEDIIKFYKEMLAVDAPRRHKVSVHVLAREMDSCPVVGEFPCQNDI

In [53]:
def prot_blast(blastp_file, id):
    """ 
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
        query: sequência proteica query
        id: identificador da proteína obtido na swissprot
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados (swissprot?)
    """
    #id = str(input())
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    #seq_prot= query
    result_handle = NCBIWWW.qblast('blastp', 'nr', seq_prot)

    save_file = open(blastp_file,"w")
    save_file.write(result_handle.read())
    save_file.close()

    blast_record = NCBIXML.parse(result_handle)
    result_handle.close()

prot_blast("prot1_blastp_nr.xml", "P14735.4")

#aqui fiz o blast na DB non-redundant pra descobrir os homólogos,
#depois vou fazer na swiss prot. Não sei se faz sentido

In [54]:
def homologos_p (blastp_file, evalue_thresh = None):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
        evalue_tresh: evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLASTp
    Returns:
        retorna uma lista dos resultados do BLASTp, e respetivo valor dado pelo e-value
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    FILE = str("seqshomologas_blastp.fasta")
    save_file = open(FILE, 'w+')
    if evalue_thresh == None:
        evalue_thresh = 0.05
    for alignment in blast_record.alignments:
        for hsp in range(len(alignment.hsps)):
            if alignment.hsps[hsp].expect < evalue_thresh:
                if hsp != 0:
                    pass
                else:
                    save_file.write('>' + alignment.title + '\n' + alignment.hsps[hsp].sbjct + '\n')
    
    result_handle.close()

homologos_p("prot1_blastp_nr.xml")

In [55]:
def prot_file(blastp_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
    Returns:
        imprime informações dos alinhamentos das proteínas
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    evalue_tresh = 1e-30
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("      ***ALINHAMENTO***")
                print("Sequência: ", alignment.title)
                print("Tamanho da sequência: ", alignment.length)
                print("E-value:", hsp.expect)
                print("Score: ", hsp.score)
                print(f"Caracteres iguais: {len(hsp.match)}")
                print(hsp.query[0:75] + "...")
                print(hsp.match[0:75] + "...")
                print(hsp.sbjct[0:75] + "...")
                print()
    
    blastq_result = SearchIO.read(blastp_file, "blast-xml")
    print(blastq_result)
    for br in blastq_result:
        print(f'Sequence ID: {br.id}')
        print(f'Description: {br.description}')
        print(f'E-value: {br[0].evalue}')
        print(f'Bit Score: {br[0].bitscore}')
        print(f'Alignment:\n{br[0].aln}')
        print()
    result_handle.close()

prot_file("prot1_blastp_nr.xml")

      ***ALINHAMENTO***
Sequência:  gb|AAA52712.1| insulin-degrading enzyme [Homo sapiens]
Tamanho da sequência:  1019
E-value: 0.0
Score:  5490.0
Caracteres iguais: 1018
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPAIKRIGNHITKSPEDKREYRGLELANGIKVL...
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPAIKRIGNHITKSPEDKREYRGLELANGIKVL...
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPAIKRIGNHITKSPEDKREYRGLELANGIKVL...

      ***ALINHAMENTO***
Sequência:  ref|NP_004960.2| insulin-degrading enzyme isoform 1 [Homo sapiens] >sp|P14735.4| RecName: Full=Insulin-degrading enzyme; AltName: Full=Abeta-degrading protease; AltName: Full=Insulin protease; Short=Insulinase; AltName: Full=Insulysin [Homo sapiens] >gb|AIC49009.1| IDE, partial [synthetic construct] >emb|SJX26104.1| unnamed protein product, partial [Human ORFeome Gateway entry vector] >gb|AAH96336.1| Insulin-degrading enzyme [Homo sapiens] >gb|AAH96337.1| Insulin-degrading enzyme [Homo sapiens] >gb|AAH96339.1| Insulin-degrading enzyme [Homo sapiens]

In [57]:
def PROTEIN(id, blastp_file, gene, blast = False, evalue_thresh = 0.01):
    """ 
    Variáveis:
        id: id da proteína obtido na swissprot
        blastp_file:  nome do ficheiro com os resultados do BLASTp ("blastp_HHEX.xml") #depois cada um alterar para o nome do seu gene
        blast: recebe o booleano False por default. Se blast = True, realiza um BLAST na base de dados swissprot
        evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações das proteínas
    """
    if blast == True:
        prot_blast(blastp_file, id)
    x = get_protein(id)
    print(x)
    homologos_p(blastp_file, evalue_thresh)
    prot_file(blastp_file)
    result_handle = open(blastp_file, "r")
    FILE = str("results_protein_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()
PROTEIN("P14735.4", "prot1_blastp_nr.xml", "ide", False)

ID P14735 
 Sequência: RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPAIKRIGNHITKSPEDKREYRGLELANGIKVLLMSDPTTDKSSAALDVHIGSLSDPPNIAGLSHFCEHMLFLGTKKYPKENEYSQFLSEHAGSSNAFTSGEHTNYYFDVSHEHLEGALDRFAQFFLCPLFDESCKDREVNAVDSEHEKNVMNDAWRLFQLEKATGNPKHPFSKFGTGNKYTLETRPNQEGIDVRQELLKFHSAYYSSNLMAVCVLGRESLDDLTNLVVKLFSEVENKNVPLPEFPEHPFQEEHLKQLYKIVPIKDIRNLYVTFPIPDLQKYYKSNPGHYLGHLIGHEGPGSLLSELKSKGWVNTLVGGQKEGARGFMFFIINVDLTEEGLLHVEDIILHMFQYIQKLRAEGPQEWVFQECKDLNAVAFRFKDKERPRGYTSKIAGILHYYPLEEVLTAEYLLEEFRPDLIEMVLDKLRPENVRVAIVSKSFEGKTDRTEEWYGTQYKQEAIPDEVIKKWQNADLNGKFKLPTKNEFIPTNFEILPLEKEATPYPALIKDTVMSKLWFKQDDKKKKPKACLNFEFFSPFAYVDPLHCNMAYLYLELLKDSLNEYAYAAELAGLSYDLQNTIYGMYLSVKGYNDKQPILLKKIIEKMATFEIDEKRFEIIKEAYMRSLNNFRAEQPHQHAMYYLRLLMTEVAWTKDELKEALDDVTLPRLKAFIPQLLSRLHIEALLHGNITKQAALGIMQMVEDTLIEHAHTKPLLPSQLVRYREVQLPDRGWFVYQQRNEVHNNCGIEIYYQTDMQSTSENMFLELFCQIISEPCFNTLRTKEQLGYIVFSGPRRANGIQSLRFIIQSEKPPHYLESRVEAFLITMEKSIEDMTEEAFQKHIQALAIRRLDKPKKLSAECAKYWGEIISQQYNFDRDNTEVAYLKTLTKEDIIKFYKEMLAVDAPRRHKVSVHVLAREMDSCPVVGEFPCQNDI

In [58]:
from Bio.PDB.PDBParser import PDBParser

In [25]:
pip install nglview

Collecting nglviewNote: you may need to restart the kernel to use updated packages.
  Using cached nglview-3.0.3.tar.gz (5.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'

Building wheels for collected packages: nglview
  Building wheel for nglview (PEP 517): started
  Building wheel for nglview (PEP 517): finished with status 'done'
  Created wheel for nglview: filename=nglview-3.0.3-py3-none-any.whl size=8057560 sha256=1842e7b14aed4cdfe628d2b26d1f0f2d607ef36a3805f8831b723cd80c02399b
  Stored in directory: c:\users\maria\appdata\local\pip\cache\wheels\ed\12\ba\4e227d89934c5d7bdf91387286e45dc868613ab32fa6ce36cf
Successfully built nglview
Installing collected packages: nglview
Successfully installed nglview-3.0.3


In [40]:
def PDB(id, pdb_file):
    """ 
    Variáveis:
        id: id da proteína obtido da base de datos de estruturas de proteínas PDB
        pdb_file: nome do ficheiro com as informações sobre a estrutura da proteína obtidas a partir da base de dados PDB ("2e1o.pdb") 
    Returns:
        imprime informações da estrutura da proteína e a sua estrutura 3D
    """
    p = PDBParser(PERMISSIVE=1)
    s = p.get_structure(id, pdb_file)
    for chain in s[0]:
        print(f'Chain ID: {chain.id}')
    smeth = s.header['structure_method']
    keywords = s.header['keywords']
    comp = s.header["compound"]
    print("Keywords: " , keywords)
    print("Structure Method: ", smeth)
    print("Composto: ", comp)

    import nglview as nv
    nv.show_biopython(s, gui=True)
    #isto funciona no jupyter notebook mas aqui dá erro... mas é isto

PDB("2g48", "2g48.pdb")

Chain ID: A
Chain ID: B
Chain ID: C
Chain ID: D
Keywords:  protein-peptide complex, hydrolase
Structure Method:  x-ray diffraction
Composto:  {'1': {'misc': '', 'molecule': 'insulin-degrading enzyme', 'chain': 'a, b', 'synonym': 'insulysin, insulinase, insulin protease', 'ec_number': '3.4.24.56', 'ec': '3.4.24.56', 'engineered': 'yes', 'mutation': 'yes'}, '2': {'misc': '', 'molecule': 'islet amyloid polypeptide', 'chain': 'c, d', 'fragment': 'residues 34-70', 'synonym': 'diabetes-associated peptide, dap, amylin, insulinoma amyloidpeptide ', 'engineered': 'yes'}}




Alinhamento Múltiplo

In [None]:
def CDD(id, cdd_file):
    """ 
    Variáveis:
        id: 
        cdd_file: nome do ficheiro com as informações sobre os domínios das proteínas obtidas a partir da base de dados CDD ("cdd_file_HHEX")
    Returns:
        imprime as informações obtidas a partir da base de dados CDD
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    result_handle = NCBIWWW.qblast("blastp", "CDD", seq_prot)
    save_file = open(cdd_file, "w")
    blast_records = SearchIO.read(result_handle, "blast-xml")
    save_file.write(print(blast_records[:]))
    save_file.close()
    result_handle.close()
CDD("P14735.4", "cdd_file_ide")

In [60]:
from Bio.Align import MultipleSeqAlignment
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio import SeqIO
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import AlignInfo

In [61]:
def align(blast_file, align_file):
    """
    Variáveis:
        blast_file: ficheiro com o resultado do blast (ex: blastn_HHEX.xml)
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
    Returns:
        gera um ficheiro, com extensão .fasta (ex: obterseqs.fasta) com as sequências alinhadas e imprime-as
    """
    result_blast = open(blast_file)
    blast_records = NCBIXML.read(result_blast)
    FILE = str(align_file)
    save_file = open(FILE, 'w+')
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            save_file.write('>' + alignment.title + '\n' + hsp.sbjct[0:45] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()
align("blastn_ide.xml", "obterseqs.fasta")

> gi|1653961657|ref|NM_004969.4| Homo sapiens insulin degrading enzyme (IDE), transcript variant 1, mRNA 
 ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTTTGCGGTGATCCCGGCGACTGCGCTGGCTAATGCGGTACCGGCTAGCGTGGCTTCTGCACCCCGCACTGCCCAGCACCTTCCGCTCAGTCCTCGGCGCCCGCCTGCCGCCTCCGGAGCGCCTGTGTGGTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCAAGAGAATAGGAAATCACATTACCAAGTCTCCTGAAGACAAGCGAGAATATCGAGGGCTAGAGCTGGCCAATGGTATCAAAGTACTTCTTATCAGTGATCCCACCACGGATAAGTCATCAGCAGCACTTGATGTGCACATAGGTTCATTGTCGGATCCTCCAAATATTGCTGGCTTAAGTCATTTTTGTGAACATATGCTTTTTTTGGGAACAAAGAAATACCCTAAAGAAAATGAATACAGCCAGTTTCTCAGTGAGCATGCAGGAAGTTCAAATGCCTTTACTAGTGGAGAGCATACCAATTACTATTTTGATGTTTCTCATGAACACCTAGAAGGTGCCCTAGACAGGTTTGCACAGTTTTTTCTGTGCCCCTTGTTCGATGAAAGTTGCAAAGACAGAGAGGTGAATGCAGTTGATTCAGAACATGAGAAGAATGTGATGAATGATGCCTGGAGACTCTTTCAATTGGAAAAAGCTACAGGGAATCCTAAACACCCCTTCAGTAAATTTGGGACAGGTAACAAATATACTCTGGAGACTAGACCAAACCAAGAAGGCATTGATGTAAGACAAGAGCTACTGAAATTCCATTCTGCTTACTATTCATCCAACTTAATGGCTGTTTGTGTTTTAGGTCGAGAATCTTTAGATGACTTGACTAATCTGGTGGTAAAG

In [62]:
def align_prot(blastp_file, align_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blast (ex: blastn_HHEX.xml)
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
    Returns:
        gera um ficheiro, com extensão .fasta (ex: obterseqs.fasta) com as sequências alinhadas e imprime-as
    """
    result_blast = open(blastp_file)
    blast_records = NCBIXML.read(result_blast)
    FILE = str(align_file)
    save_file = open(FILE, 'w+')
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            save_file.write('>' + alignment.title + '\n' + hsp.sbjct[0:45] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()
align("prot1_blastp_nr.xml", "obterprotseqs.fasta")

> gb|AAA52712.1| insulin-degrading enzyme [Homo sapiens] 
 RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPAIKRIGNHITKSPEDKREYRGLELANGIKVLLMSDPTTDKSSAALDVHIGSLSDPPNIAGLSHFCEHMLFLGTKKYPKENEYSQFLSEHAGSSNAFTSGEHTNYYFDVSHEHLEGALDRFAQFFLCPLFDESCKDREVNAVDSEHEKNVMNDAWRLFQLEKATGNPKHPFSKFGTGNKYTLETRPNQEGIDVRQELLKFHSAYYSSNLMAVCVLGRESLDDLTNLVVKLFSEVENKNVPLPEFPEHPFQEEHLKQLYKIVPIKDIRNLYVTFPIPDLQKYYKSNPGHYLGHLIGHEGPGSLLSELKSKGWVNTLVGGQKEGARGFMFFIINVDLTEEGLLHVEDIILHMFQYIQKLRAEGPQEWVFQECKDLNAVAFRFKDKERPRGYTSKIAGILHYYPLEEVLTAEYLLEEFRPDLIEMVLDKLRPENVRVAIVSKSFEGKTDRTEEWYGTQYKQEAIPDEVIKKWQNADLNGKFKLPTKNEFIPTNFEILPLEKEATPYPALIKDTVMSKLWFKQDDKKKKPKACLNFEFFSPFAYVDPLHCNMAYLYLELLKDSLNEYAYAAELAGLSYDLQNTIYGMYLSVKGYNDKQPILLKKIIEKMATFEIDEKRFEIIKEAYMRSLNNFRAEQPHQHAMYYLRLLMTEVAWTKDELKEALDDVTLPRLKAFIPQLLSRLHIEALLHGNITKQAALGIMQMVEDTLIEHAHTKPLLPSQLVRYREVQLPDRGWFVYQQRNEVHNNCGIEIYYQTDMQSTSENMFLELFCQIISEPCFNTLRTKEQLGYIVFSGPRRANGIQSLRFIIQSEKPPHYLESRVEAFLITMEKSIEDMTEEAFQKHIQALAIRRLDKPKKLSAECAKYWGEIISQQYNFDRDNTEVAYLKTLTKEDIIKFYKE

In [63]:
def parse_align(align_file, alinhamento):
    """ 
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file,format = "fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")
parse_align("obterseqs.fasta", "align_ide")

Alignment with 60 rows and 45 columns
ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTT gi|1653961657|ref|NM_004969.4|
GGTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATC gi|2217277033|ref|XM_017016188.2|
GTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCA gi|2217277031|ref|XM_047425171.1|
ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTT gi|2217277031|ref|XM_047425171.1|
GTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCA gi|2217277030|ref|XM_017016187.2|
GTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCA gi|2217277024|ref|XM_047425169.1|
ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTT gi|2217277024|ref|XM_047425169.1|
TGCTTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCAT gi|1017029490|ref|NM_001322796.1|
TTTCCAAAAAAAGACTTACAGCAAAATGAATAATCCAGCCATCAA gi|1890343265|ref|NM_001322795.2|
ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTT gi|1890343265|ref|NM_001322795.2|
ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTT gi|1849002264|ref|XM_008950689.4|
ATGCGCAGTGCGCAGGGCCGGCTCGAAGCGCAAGCAGGAAGCGTT gi|1367123317|ref|XM_507922.7|
ATGCGCAG

In [64]:
def parse_align_prot(align_file, alinhamento):
    """ 
    Variáveis:
        align_file_prot: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file,format = "fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")
parse_align_prot("obterprotseqs.fasta", "align_ide_prot")

Alignment with 50 rows and 45 columns
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPA gb|AAA52712.1|
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPA ref|NP_004960.2|
RYRLAWLLHPALSSTFRSVLGARLPPPERLCGFQKKTYSKMNNPA ref|XP_008948937.2|
RYRLAWLLHPALPSTFRSVLGARLPPLERLCGFQKKTYSKMNNPA ref|XP_031992582.1|
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPA dbj|BAG35668.1|
RYRLAWLLHPPLPSTFRSVLGARLPPPERLCGCQKKTYSKMNNPA ref|XP_018890632.1|
RYRLAWLLHPALPSTFRSVLGARLPPLERLCGFQKKTYSKMNNPA ref|XP_030665115.1|
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPA ref|XP_023082030.1|
RYRLAWLLHPALPSTFRSVFGARLPPPERLCGFEKKTYSKMNNPA ref|XP_003922440.2|
RYRLAWLRHPALPSTFRSVFGARLPPPERLCGFQKKTYSKMNNPA ref|NP_001245003.1|
RYRLAWLRHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPA ref|XP_007961753.1|
RYRLAWLRHPALPSTFRSVFGARLPPPERLCGFQKKTYSKMNNPA ref|XP_011737146.1|
RYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQRKTYSKMNNPA ref|XP_010379621.1|
RYRLAWLRHPALPTTFRSVFGARLPPPERLCGFQKKTYSKMNNPA ref|XP_005566007.1|
RYRLAWLLHPALPSTFRSIFGARLPPPERLCGFQKKTYSKMNNSA ref|

In [65]:
def consensus(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format = "fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus
consensus("align_ide")




Seq('XTXXXCAXXXXXXAXXXXXXXXXXXAAXXXXAAXXXXXXXXXXXX')

In [66]:
def consensus_prot(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format = "fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus
consensus("align_ide_prot")




Seq('RYRLAWLLHPALPSTFRSVXGARLPPPERLCGFQKKTYSKMNNPA')

In [47]:
def stockholm(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta) 
        stock_file: ficheiro com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX.sth)
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas (ex: align_results_HHEX.sth)
    """
    form_stock = AlignIO.parse(align_file,"fasta")
    AlignIO.convert(align_file,"fasta", stock_file,"stockholm")
    #dá erro de ids duplicados
stockholm("align_ide", "align_results_ide.sth")

ValueError: Duplicate record identifier: gi|2217277031|ref|XM_047425171.1|

In [67]:
def stockholm_prot(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta) 
        stock_file: ficheio com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX.sth)
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas (ex: align_results_HHEX.sth)
    """
    form_stock = AlignIO.parse(align_file,"fasta")
    AlignIO.convert(align_file,"fasta", stock_file, "stockholm")
stockholm_prot("align_ide_prot", "align_results_ide_prot.sth")

Árvore filogenética

In [68]:
from Bio import Phylo
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor


In [70]:
def phylo_tree(stockholm_file):
    """ 
    Variáveis:
        stockholm_file: ficheiro com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX_prot.sth)

    Returns:
        imprime a distância entre as espécies na árvore, constrói as árvores UPGMA e Neighbour Joining e cria um ficheiro newick (ex: phylo_trees.nhx) que contém ambas as árvores
    """
    alignment = AlignIO.read(open(stockholm_file),"stockholm")
    #print(alignment)
    #print("*" * 140)
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alignment)
    print(dm)
    print("*" * 140)
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    print(upgmatree)
    print("*" * 140)
    njtree = constructor.nj(dm)
    print(njtree)
    print("*" * 140)
    Phylo.write([upgmatree, njtree], "phylo_trees.nhx","newick")
    
    tree_up = Phylo.draw_ascii(upgmatree)
    print("*" * 140)
    tree_nj = Phylo.draw_ascii(njtree)

phylo_tree("align_results_ide_prot.sth")

gb|AAA52712.1|	0
ref|NP_004960.2|	0.0	0
ref|XP_008948937.2|	0.03265306122448974	0.03265306122448974	0
ref|XP_031992582.1|	0.04081632653061229	0.04081632653061229	0.06198347107438018	0
dbj|BAG35668.1|	0.0	0.0	0.03265306122448974	0.04081632653061229	0
ref|XP_018890632.1|	0.07569721115537853	0.07569721115537853	0.10756972111553786	0.1155378486055777	0.07569721115537853	0
ref|XP_030665115.1|	0.04081632653061229	0.04081632653061229	0.06198347107438018	0.0	0.04081632653061229	0.1155378486055777	0
ref|XP_023082030.1|	0.0	0.0	0.03265306122448974	0.04081632653061229	0.0	0.07569721115537853	0.04081632653061229	0
ref|XP_003922440.2|	0.036437246963562764	0.036437246963562764	0.06882591093117407	0.07692307692307687	0.036437246963562764	0.10358565737051795	0.07692307692307687	0.036437246963562764	0
ref|NP_001245003.1|	0.05241935483870963	0.05241935483870963	0.08467741935483875	0.092741935483871	0.05241935483870963	0.1155378486055777	0.092741935483871	0.05241935483870963	0.040322580645161255	0
ref|XP

In [51]:
form_phyl = AlignIO.parse("align_results_ide_prot.sth","stockholm")
AlignIO.convert("align_results_HHEX_prot.sth","stockholm","align_results_HHEX_protphy.phy","phylip")

alignments = AlignIO.parse("align_results_ide_protphy.phy", "phylip")
for alignment in alignments:
    print(alignment)
    print()

FileNotFoundError: [Errno 2] No such file or directory: 'align_results_HHEX_prot.sth'