In [1]:
from Bio import Entrez
from Bio import SeqIO
from Bio import SearchIO
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import Medline

In [2]:
def articles(term):
    """ 
    Variáveis:
     term: termo que queremos procurar no título dos artigos, que é o nosso gene
    Returns:
        imprime o título, abstract, autores e fonta dos artigos que têm o nosso termo no título
    """
    Entrez.email = "pg49836@alunos.uminho.pt"
    handles = Entrez.einfo()

    handle1 = Entrez.esearch(db = "pubmed", term = term + "[title]", retmax ="40")
    record = Entrez.read(handle1)

    id_list = record["IdList"]
    handle = Entrez.efetch(db = "pubmed", id = id_list, rettype = "medline", retmode = "text")
    records = Medline.parse(handle)
    print()
    for record in records:
        print("Titulo:", record.get("TI", "v"))                                     
        print()
        print("Autores:", record.get("AU", "v"))
        print("Fonte:", record.get("SO", "v"))
        print()
        print("Abstract:", record.get("AB", "v"))
        print()
        print("="*140)
        print()

In [3]:
articles("FTO")


Titulo: FTO variant RS 1121980 interact with metabolic response after weight loss with a meal replacement hypocaloric diet in Caucasian obese subjects.

Autores: ['de Luis DA', 'Izaola O', 'Primo D', 'Lopez JJ']
Fonte: Eur Rev Med Pharmacol Sci. 2022 Dec;26(24):9336-9344. doi: 10.26355/eurrev_202212_30684.

Abstract: OBJECTIVE: One genetic variant (rs1121980) of FTO gene has been related with body mass index and visceral adiposity. The objective of our study was to investigate the role of rs1121980 genetic variant of FTO gene on weight loss and metabolic changes secondary to a partial meal replacement (pMR) hypocaloric diet. PATIENTS AND METHODS: We conducted an interventional study on 219 obese Caucasian subjects with body mass index (BMI) > 30 kg/m2. The subjects received two intakes per day of a normocaloric hyperproteic formula for 12 weeks. Adiposity and biochemical parameters (lipid profile, insulin, homeostasis model assessment (HOMA-IR) and glucose) were determined. RESULTS: A

ANÁLISE DA SEQUÊNCIA 

In [11]:
def seq_analysis(gb, fasta):
    '''
    Variáveis:
        gb: nome do ficheiro com extensão ".gb"
        fasta: nome do ficheiro convertido para a extensão .fasta
    Returns:
        imprime as informações contidas no ficheiro .gb e retorna um ficheiro .fasta
    '''

    record = SeqIO.read(gb, "genbank")
    id = record.name
    seq = record.seq
    seqlen = len(record.seq)
    source = record.annotations["source"]
    tam = len(record.annotations)
    desc = record.description
    features = len(record.features)
    totannot = record.annotations
    print(f"ID: {id} \n Sequência: {seq} \n Tamanho da sequência: {seqlen} bp \n Source: {source} \n Tamanho das anotações: {tam}")
    print(f"Descrição: {desc} \n Total features: {features}")
    print()
    print(f"Annotations: {totannot}")
    
    print()
    print("FEATURES:")
    for feat in record.features:
        print("-->", feat)
    print(f"Número de features: {features}")
    for feat in record.features:
        print("Type:", feat.type)
        print("Location:", feat.location)

    featcds = [ ]
    for i in range(len(record.features)):
        if record.features[i].type == "CDS":
            featcds.append(i)
    for k in featcds:
        print (record.features[k].location)
    for k in featcds:
        print (record.features[k].extract(record.seq))
    print(featcds)

    for feat in record.features:
        if feat.type == 'CDS':
            print("Proteína codificada: ", feat.qualifiers['product'])

    for feat in record.features:
        if feat.type == 'gene':
            print("Significado biológico: ", feat.qualifiers["note"])

    
    records = SeqIO.parse(gb,"genbank")
    count = SeqIO.write(records, fasta, "fasta")
    print(f'Foi convertido {count} registo.')


ANÁLISE DE HOMOLOGIAS POR BLAST

In [6]:

def blast(fasta, blast_file):
    '''
    Variáveis:
        fasta: ficheiro fasta convertido anteriormente, com extensão .fasta
        blast_file: ficheiro com o resultado do BLAST, com extensão .xml
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados nucleotide
    '''
    record = SeqIO.read(fasta, format="fasta")
    print(len(record))
    result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)
    save_file = open(blast_file, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        #print(f"Matrix: {br.matrix}")
        print(f"Database: {br.database}")
        print(f"Gap penalty: {br.gap_penalties}")   

In [8]:
def homologos(blast_file, evalue_thresh=0.01):
    '''
    Variáveis:
        blast_file: nome do ficheiro com o resultado do BLAST, com extensão .xml
        evalue_thresh: recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        retorna uma lista dos resultados do BLAST, de acordo com o valor dado pelo e-value
    '''
    evalue_tresh = evalue_thresh
    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        print("Database: ", br.database)
        print("Gap penalty: " , br.gap_penalties)
    print(len(br.alignments))
    for br_x in br.alignments:
        print(f"Acession number: {br_x.accession}")
        print(f"ID do hit: {br_x.hit_id}")
        print(f"Definição: {br_x.hit_def}")
        print(f"HSP: {br_x.hsps}")
        break #faço break porque só quero o primeiro organismo (HUMAN) -> deve dar pra fazer algo com o entrez e selecionar o Homo sapiens
    #change to be made: entrez_query = "Homo sapiens [organism]"
    print()
    for alignment in br.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("        ***ALINHAMENTO***")
                print(f"Identidade: {hsp.identities}")
                print(f"E-value: {hsp.expect}")
                print(f"Score: {hsp.score}")
                print(f"Tamanho: {hsp.align_length}")
                print(f"Caracteres iguais: {len(hsp.match)}")
                print("Query " + hsp.query[0:90] + "...")
                print("Match " + hsp.match[0:90] + "...")
                print("Sbjct " + hsp.sbjct[0:90] + "...")
                print()
    blastq_result = SearchIO.read(blast_file, "blast-xml")
    print(blastq_result)
    
    result_handle.close()
    
    blast_slice = blastq_result[:10]
    print(blast_slice)

In [9]:
def gene(gb, fasta, blast_file, gene, do_blast = False, evalue_thresh = 0.01):
    '''
    Variáveis:
        gb: ficheiro genbank obtido online (HHEX.gb), com extensão .gb
        blast_file:  nome do ficheiro com os resultados do BLAST ("blastn_HHEX.xml") #depois cada um alterar para o nome do seu gene
        do_blast: recebe o booleano False por default. Se blast = True, realiza um BLAST.
        evalue_tresh:  recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações dos genes
        '''

    
    seq_analysis(gb, fasta)
    if do_blast == True:
        blast(fasta, blast_file)
    homologos(blast_file, evalue_thresh = evalue_thresh)
    result_handle = open(blast_file, "r")
    #blast_record = NCBIXML.read(result_handle)
    FILE = str("results_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

#esta def resume todas as outras acima, por isso meio que não é necessário chamar as outras no final de cada

In [12]:
gene("FTO.gb", "FTO.fasta", "blastn_FTO.xml", "FTO", do_blast = True)

ID: NM_001363901 
 Sequência: CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCTCCTGTGCTAAATCCCGTGGCGCTCGCGGGTGTCGCCGCGGTGCATCCTGGGAGTTGTAGTTTTTTCTACTCAGAGGGAGAATAGCTCCAGACGGGAGCAGGACGCTGAGAGAACTACATGCAGGAGGCGGGGTCCAGGGCGAGGGATCTACGCAGCTTGCGGTGGCGAAGGCGGCTTTAGTGGCAGCATGAAGCGCACCCCGACTGCCGAGGAACGAGAGCGCGAAGCTAAGAAACTGAGGCTTCTTGAAGAGCTTGAAGACACTTGGCTCCCTTATCTGACCCCCAAAGATGATGAATTCTATCAGCAGTGGCAGCTGAAATATCCTAAACTAATTCTCCGAGAAGCCAGCAGTGTATCTGAGGAGCTCCATAAAGAGGTTCAAGAAGCCTTTCTCACACTGCACAAGCATGGCTGCTTATTTCGGGACCTGGTTAGGATCCAAGGCAAAGATCTGCTCACTCCGGTATCTCGCATCCTCATTGGTAATCCAGGCTGCACCTACAAGTACCTGAACACCAGGCTCTTTACGGTCCCCTGGCCAGTGAAAGGGTCTAATATAAAACACACCGAGGCTGAAATAGCCGCTGCTTGTGAGACCTTCCTCAAGCTCAATGACTACCTGCAGATAGAAACCATCCAGGCTTTGGAAGAACTTGCTGCCAAAGAGAAGGCTAATGAGGATGCTGTGCCATTGTGTATGTCTGCAGATTTCCCCAGGGTTGGGATGGGTTCATCCTACAACGGACAAGATGAAGTGGACATTAAGAGCAGAGCAGCATACAACGTAACTTTGCTGAATTTCATGGATCCTCAGAAAATGCCATACCTGAAAGAGGAACCTTATTTTGGCATGGGGAAAATGGCAGTGAGCTGGCATCATGATGAAAATCTGGTGGACAGGTCAGCGGTGGCAGTGTACAGTTATAGCTGTG



Program: blastn (2.13.0+)
  Query: No (4148)
         definition line
 Target: nt
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  gi|1394533464|ref|NM_001363901.1|  Homo sapiens FTO alp...
            1      1  gi|1394333984|ref|NM_001363899.1|  Homo sapiens FTO alp...
            2      1  gi|1411140281|ref|XM_025370000.1|  PREDICTED: Theropith...
            3      1  gi|1411140275|ref|XM_025369997.1|  PREDICTED: Theropith...
            4      4  gi|1519243379|ref|NM_001080432.3|  Homo sapiens FTO alp...
            5      3  gi|1394533504|ref|NM_001363905.1|  Homo sapiens FTO alp...
            6      2  gi|1394533483|ref|NM_001363891.1|  Homo sapiens FTO alp...
            7      3  gi|1394533462|ref|NM_001363897.1|  Homo sapiens FTO alp...
            8      2  gi|12698048|dbj|AB051539.1|  Homo sapiens KIAA1

FERRAMENTAS DE ANÁLISE DE PROTEÍNAS

In [2]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SearchIO
from Bio import ExPASy

In [20]:
def get_protein(id):
    """
    Variávies:
        id: identificador da proteína obtido na swissprot
    Returns:
        imprime informações sobre a proteína
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    id = seq_record.id
    seq = seq_record.seq
    tam = len(seq_record.seq)
    name = seq_record.name
    desc = seq_record.description
    com = seq_record.annotations["comment"]
    taxon = seq_record.annotations["taxonomy"]
    organism = seq_record.annotations["organism"]
    key = seq_record.annotations["keywords"]
    print(f"ID {id} \n Sequência: {seq} \n Tamanho da sequência: {tam} aa")
    print(f"Nome: {name} \n Descrição: {desc} \n Taxonomia: {taxon} \n Organismo: {organism} \n Keywords: {key}")


In [21]:
def prot_blast(blastp_file, id):
    """ 
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
        query: sequência proteica query
        id: identificador da proteína obtido na swissprot
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados (swissprot?)
    """
    #id = str(input())
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    #seq_prot= query
    result_handle = NCBIWWW.qblast('blastp', 'nr', seq_prot)

    save_file = open(blastp_file,"w")
    save_file.write(result_handle.read())
    save_file.close()

    blast_record = NCBIXML.parse(result_handle)
    result_handle.close()

#aqui fiz o blast na DB non-redundant pra descobrir os homólogos,
#depois vou fazer na swiss prot. Não sei se faz sentido

In [22]:
def homologos_p (blastp_file, evalue_thresh = None):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
        evalue_tresh: evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLASTp
    Returns:
        retorna uma lista dos resultados do BLASTp, e respetivo valor dado pelo e-value
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    FILE = str("seqshomologas_blastp.fasta")
    save_file = open(FILE, 'w+')
    if evalue_thresh == None:
        evalue_thresh = 0.05
    for alignment in blast_record.alignments:
        for hsp in range(len(alignment.hsps)):
            if alignment.hsps[hsp].expect < evalue_thresh:
                if hsp != 0:
                    pass
                else:
                    save_file.write('>' + alignment.title + '\n' + alignment.hsps[hsp].sbjct + '\n')
    
    result_handle.close()

In [23]:
def prot_file(blastp_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
    Returns:
        imprime informações dos alinhamentos das proteínas
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    evalue_tresh = 1e-30
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("      ***ALINHAMENTO***")
                print("Sequência: ", alignment.title)
                print("Tamanho da sequência: ", alignment.length)
                print("E-value:", hsp.expect)
                print("Score: ", hsp.score)
                print(f"Caracteres iguais: {len(hsp.match)}")
                print(hsp.query[0:75] + "...")
                print(hsp.match[0:75] + "...")
                print(hsp.sbjct[0:75] + "...")
                print()
    
    blastq_result = SearchIO.read(blastp_file, "blast-xml")
    print(blastq_result)
    for br in blastq_result:
        print(f'Sequence ID: {br.id}')
        print(f'Description: {br.description}')
        print(f'E-value: {br[0].evalue}')
        print(f'Bit Score: {br[0].bitscore}')
        print(f'Alignment:\n{br[0].aln}')
        print()
    result_handle.close()

In [24]:
def PROTEIN(id, blastp_file, gene, blast = False, evalue_thresh = 0.01):
    """ 
    Variáveis:
        id: id da proteína obtido na swissprot
        blastp_file:  nome do ficheiro com os resultados do BLASTp ("blastp_HHEX.xml") #depois cada um alterar para o nome do seu gene
        blast: recebe o booleano False por default. Se blast = True, realiza um BLAST na base de dados swissprot
        evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações das proteínas
    """
    if blast == True:
        prot_blast(blastp_file, id)
    x = get_protein(id)
    print(x)
    homologos_p(blastp_file, evalue_thresh)
    prot_file(blastp_file)
    result_handle = open(blastp_file, "r")
    FILE = str("results_protein_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()


In [25]:
PROTEIN("Q9C0B1", "prot_blastp_swiss.xml", "FTO", blast = True)

ID Q9C0B1 
 Sequência: MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLKYPKLILREASSVSEELHKEVQEAFLTLHKHGCLFRDLVRIQGKDLLTPVSRILIGNPGCTYKYLNTRLFTVPWPVKGSNIKHTEAEIAAACETFLKLNDYLQIETIQALEELAAKEKANEDAVPLCMSADFPRVGMGSSYNGQDEVDIKSRAAYNVTLLNFMDPQKMPYLKEEPYFGMGKMAVSWHHDENLVDRSAVAVYSYSCEGPEEESEDDSHLEGRDPDIWHVGFKISWDIETPGLAIPLHQGDCYFMLDDLNATHQHCVLAGSQPRFSSTHRVAECSTGTLDYILQRCQLALQNVCDDVDNDDVSLKSFEPAVLKQGEEIHNEVEFEWLRQFWFQGNRYRKCTDWWCQPMAQLEALWKKMEGVTNAVLHEVKREGLPVEQRNEILTAILASLTARQNLRREWHARCQSRIARTLPADQKPECRPYWEKDDASMPLPFDLTDIVSELRGQLLEAKP 
 Tamanho da sequência: 505 aa
Nome: FTO_HUMAN 
 Descrição: RecName: Full=Alpha-ketoglutarate-dependent dioxygenase FTO {ECO:0000305}; AltName: Full=Fat mass and obesity-associated protein {ECO:0000303|PubMed:17496892}; AltName: Full=U6 small nuclear RNA (2'-O-methyladenosine-N(6)-)-demethylase FTO {ECO:0000305}; EC=1.14.11.- {ECO:0000269|PubMed:30197295}; AltName: Full=U6 small nuclear RNA N(6)-methyladenosine-demethylase FTO {ECO:0000305}; EC=1.14.11.- {ECO:0000269

In [1]:
from Bio.PDB.PDBParser import PDBParser

In [17]:
def PDB(id, pdb_file):
    """ 
    Variáveis:
        id: id da proteína obtido da base de datos de estruturas de proteínas PDB
        pdb_file: nome do ficheiro com as informações sobre a estrutura da proteína obtidas a partir da base de dados PDB ("2e1o.pdb") 
    Returns:
        imprime informações da estrutura da proteína e a sua estrutura 3D
    """

    p = PDBParser(PERMISSIVE=1)
    s = p.get_structure(id, pdb_file)
    for chain in s[0]:
        print(f'Chain ID: {chain.id}')
    smeth = s.header['structure_method']
    keywords = s.header['keywords']
    comp = s.header["compound"]
    print("Keywords: " , keywords)
    print("Structure Method: ", smeth)
    print("Composto: ", comp)
    
    import nglview as nv
    view = nv.show_biopython(s, gui=True)
    
    #Não consigo obter imagem da estrutura, erro "I/O operation on a closed file, após pesquisa encontrei que pode ser um erro com o biopython 1.80"

In [18]:
PDB("3LFM", "3lfm.pdb")

Chain ID: A
Keywords:  fat mass and obesity associated (fto) protein, fe2+/2-oxoglutarate (2-og)-dependent oxidative dna/rna demethylases, oxidoreductase
Structure Method:  x-ray diffraction
Composto:  {'1': {'misc': '', 'molecule': 'protein fto', 'chain': 'a', 'fragment': 'unp residues 32-505', 'synonym': 'fat mass and obesity-associated protein', 'ec': '1.14.11.-', 'engineered': 'yes'}}


ValueError: I/O operation on closed file

In [None]:
def CDD(id, cdd_file):
    """ 
    Variáveis:
        id: 
        cdd_file: nome do ficheiro com as informações sobre os domínios das proteínas obtidas a partir da base de dados CDD ("cdd_file_HHEX")
    Returns:
        imprime as informações obtidas a partir da base de dados CDD
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    result_handle = NCBIWWW.qblast("blastp", "CDD", seq_prot)
    save_file = open(cdd_file, "w")
    blast_records = SearchIO.read(result_handle, "blast-xml")
    save_file.write(print(blast_records[:]))
    save_file.close()
    result_handle.close()


In [None]:
CDD("Q9C0B1", "cdd_file_FTO.txt")

Program: blastp (2.13.0+)
  Query: unnamed (505)
         protein product
 Target: CDD
   Hits: 0




TypeError: write() argument must be str, not None

Alinhamento Múltiplo

In [2]:
from Bio.Align import MultipleSeqAlignment
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio import SeqIO
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import AlignInfo

In [8]:
def align(blast_file, align_file):
    """
    Variáveis:
        blast_file: ficheiro com o resultado do blast (ex: blastn_HHEX.xml)
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
    Returns:
        gera um ficheiro, com extensão .fasta (ex: obterseqs.fasta) com as sequências alinhadas e imprime-as
    """
    result_blast = open(blast_file)
    blast_records = NCBIXML.read(result_blast)
    FILE = str(align_file)
    save_file = open(FILE, 'w+')
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
        
            save_file.write('>' + alignment.title + '\n' + hsp.sbjct[0:35] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()

In [9]:
align("blastn_FTO.xml", "obterseqs.fasta")

> gi|1394533464|ref|NM_001363901.1| Homo sapiens FTO alpha-ketoglutarate dependent dioxygenase (FTO), transcript variant 9, mRNA 
 CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCTCCTGTGCTAAATCCCGTGGCGCTCGCGGGTGTCGCCGCGGTGCATCCTGGGAGTTGTAGTTTTTTCTACTCAGAGGGAGAATAGCTCCAGACGGGAGCAGGACGCTGAGAGAACTACATGCAGGAGGCGGGGTCCAGGGCGAGGGATCTACGCAGCTTGCGGTGGCGAAGGCGGCTTTAGTGGCAGCATGAAGCGCACCCCGACTGCCGAGGAACGAGAGCGCGAAGCTAAGAAACTGAGGCTTCTTGAAGAGCTTGAAGACACTTGGCTCCCTTATCTGACCCCCAAAGATGATGAATTCTATCAGCAGTGGCAGCTGAAATATCCTAAACTAATTCTCCGAGAAGCCAGCAGTGTATCTGAGGAGCTCCATAAAGAGGTTCAAGAAGCCTTTCTCACACTGCACAAGCATGGCTGCTTATTTCGGGACCTGGTTAGGATCCAAGGCAAAGATCTGCTCACTCCGGTATCTCGCATCCTCATTGGTAATCCAGGCTGCACCTACAAGTACCTGAACACCAGGCTCTTTACGGTCCCCTGGCCAGTGAAAGGGTCTAATATAAAACACACCGAGGCTGAAATAGCCGCTGCTTGTGAGACCTTCCTCAAGCTCAATGACTACCTGCAGATAGAAACCATCCAGGCTTTGGAAGAACTTGCTGCCAAAGAGAAGGCTAATGAGGATGCTGTGCCATTGTGTATGTCTGCAGATTTCCCCAGGGTTGGGATGGGTTCATCCTACAACGGACAAGATGAAGTGGACATTAAGAGCAGAGCAGCATACAACGTAACTTTGCTGAATTTCATGGATCCTCAGAAAATGCCATACCTGAA

In [3]:
def align_prot(blastp_file, align_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blast (ex: blastn_HHEX.xml)
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
    Returns:
        gera um ficheiro, com extensão .fasta (ex: obterseqs.fasta) com as sequências alinhadas e imprime-as
    """
    result_blast = open(blastp_file)
    blast_records = NCBIXML.read(result_blast)
    
    FILE = str(align_file)

    save_file = open(FILE, 'w+')

    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            save_file.write('>' + alignment.title + '\n' + hsp.sbjct[0:45] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()
            

In [45]:
align_prot("prot_blastp_swiss.xml", "obterprotseqs.fasta")

> ref|NP_001073901.1| alpha-ketoglutarate-dependent dioxygenase FTO isoform 3 [Homo sapiens] >ref|XP_018868158.1| alpha-ketoglutarate-dependent dioxygenase FTO isoform X2 [Gorilla gorilla gorilla] >sp|Q9C0B1.3| RecName: Full=Alpha-ketoglutarate-dependent dioxygenase FTO; AltName: Full=Fat mass and obesity-associated protein; AltName: Full=U6 small nuclear RNA (2'-O-methyladenosine-N(6)-)-demethylase FTO; AltName: Full=U6 small nuclear RNA N(6)-methyladenosine-demethylase FTO; AltName: Full=mRNA (2'-O-methyladenosine-N(6)-)-demethylase FTO; Short=m6A(m)-demethylase FTO; AltName: Full=mRNA N(6)-methyladenosine demethylase FTO; AltName: Full=tRNA N1-methyl adenine demethylase FTO [Homo sapiens] >gb|AAI48443.1| Fat mass and obesity associated, partial [synthetic construct] >gb|AAI53033.1| Fat mass and obesity associated [synthetic construct] >gb|EAW82811.1| hCG23453 [Homo sapiens] >gb|KAI2578512.1| FTO alpha-ketoglutarate dependent dioxygenase [Homo sapiens] >gb|KAI4054901.1| FTO alpha-ket

In [6]:
def parse_align(align_file, alinhamento):
    """ 
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file,format = "fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")

In [10]:
parse_align("obterseqs.fasta", "align_FTO")

Alignment with 2294 rows and 35 columns
CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCT gi|1394533464|ref|NM_001363901.1|
CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCT gi|1394333984|ref|NM_001363899.1|
CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCT gi|1411140281|ref|XM_025370000.1|
CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCT gi|1411140275|ref|XM_025369997.1|
GATGATCTCAATGCCACCCACCAACACTGTGTTTT gi|1519243379|ref|NM_001080432.3|
GCGGTGGCGAAGGCGGCTTTAGTGGCAGCATGAAG gi|1519243379|ref|NM_001080432.3|
CCAGGCTGGAGTGCAGTGGCATGATCTCGGCTCAC gi|1519243379|ref|NM_001080432.3|
TTTTTTTTTTTTCTTTTTTTTTTTCAGACAGGGTC gi|1519243379|ref|NM_001080432.3|
GATGATCTCAATGCCACCCACCAACACTGTGTTTT gi|1394533504|ref|NM_001363905.1|
CAGTGGCAGCTGAAATATCCTAAACTAATTCTCCG gi|1394533504|ref|NM_001363905.1|
CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCT gi|1394533504|ref|NM_001363905.1|
GATGATCTCAATGCCACCCACCAACACTGTGTTTT gi|1394533483|ref|NM_001363891.1|
CTACGCTCTTCCAGCTGTCGGACCTGGGAAATTCT gi|1394533483|ref|NM_001363891.1|
GATGATCTCAATGCCACCCACCAACACTGTGTTTT gi|1394533462|

In [60]:
def parse_align_prot(align_file, alinhamento):
    """ 
    Variáveis:
        align_file_prot: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file,format = "fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")

In [61]:
parse_align_prot("obterprotseqs.fasta", "align_FTO_prot")

Alignment with 50 rows and 45 columns
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|NP_001073901.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|XP_024781684.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK dbj|BAG53322.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|XP_032013254.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK dbj|BAB21843.2|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|XP_510968.2|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|NP_001350820.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|XP_024781683.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|XP_003263101.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|NP_001350823.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK gb|PNJ62567.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|XP_032013253.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|XP_009429081.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK ref|NP_001126250.1|
MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK gb|KAI25

In [13]:
def consensus(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format = "fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus

In [14]:
consensus("align_FTO")




Seq('TTTTTTTTTTTTTTTTXTXXXXXXXXXXXXXXXXX')

In [15]:
def consensus_prot(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format = "fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus

In [16]:
consensus_prot("align_FTO_prot")




Seq('MKRTPTAEEREREAKKLRLLEELEDTWLPYLTPKDDEFYQQWQLK')

In [17]:
def stockholm(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta) 
        stock_file: ficheiro com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX.sth)
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas (ex: align_results_HHEX.sth)
    """
    form_stock = AlignIO.parse(align_file,"fasta")
    AlignIO.convert(align_file,"fasta", stock_file,"stockholm")
    #dá erro de ids duplicados


In [18]:
stockholm("align_FTO", "align_results_FTO.sth")

ValueError: Duplicate record identifier: gi|1519243379|ref|NM_001080432.3|

In [19]:
def stockholm_prot(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta) 
        stock_file: ficheio com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX.sth)
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas (ex: align_results_HHEX.sth)
    """
    form_stock = AlignIO.parse(align_file,"fasta")
    AlignIO.convert(align_file,"fasta", stock_file, "stockholm")


In [20]:
stockholm_prot("align_FTO_prot", "align_results_FTO_prot.sth")

Árvore filogenética

In [74]:
from Bio import Phylo
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor


In [75]:
def phylo_tree(stockholm_file):
    """ 
    Variáveis:
        stockholm_file: ficheiro com as sequências alinhadas, com extensão .sth (ex: align_results_HHEX_prot.sth)

    Returns:
        imprime a distância entre as espécies na árvore, constrói as árvores UPGMA e Neighbour Joining e cria um ficheiro newick (ex: phylo_trees.nhx) que contém ambas as árvores
    """
    alignment = AlignIO.read(open(stockholm_file),"stockholm")
    #print(alignment)
    print("*" * 140)
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alignment)
    print(dm)
    print("*" * 140)
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    print(upgmatree)
    print("*" * 140)
    njtree = constructor.nj(dm)
    print(njtree)
    print("*" * 140)
    Phylo.write([upgmatree, njtree], "phylo_trees.nhx","newick")
    
    tree_up = Phylo.draw_ascii(upgmatree)
    print("*" * 140)
    tree_nj = Phylo.draw_ascii(njtree)


In [76]:
phylo_tree("align_results_FTO_prot.sth")

********************************************************************************************************************************************
ref|NP_001073901.1|	0
ref|XP_024781684.1|	0.0	0
dbj|BAG53322.1|	0.0	0.0	0
ref|XP_032013254.1|	0.0	0.0	0.0	0
dbj|BAB21843.2|	0.0	0.0	0.0	0.0	0
ref|XP_510968.2|	0.0	0.0	0.0	0.0	0.0	0
ref|NP_001350820.1|	0.0	0.0	0.0	0.0	0.0	0.0	0
ref|XP_024781683.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
ref|XP_003263101.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
ref|NP_001350823.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
gb|PNJ62567.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
ref|XP_032013253.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
ref|XP_009429081.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
ref|NP_001126250.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
gb|KAI2578511.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
gb|KAI2578510.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
ref|XP_011921544.1|	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0

In [77]:
form_phyl = AlignIO.parse("align_results_FTO_prot.sth","stockholm")
AlignIO.convert("align_results_FTO_prot.sth","stockholm","align_results_FTO_protphy.phy","phylip")

alignments = AlignIO.parse("align_results_FTO_protphy.phy", "phylip")
for alignment in alignments:
    print(alignment)
    print()

ValueError: Repeated name 'ref|NP_001' (originally 'ref|NP_001350820.1|'), possibly due to truncation