In [5]:
from Bio import Entrez
from Bio import SeqIO
from Bio import SearchIO
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import Medline


In [6]:
def articles(term):
    """ 
    Variáveis:
     term: termo que queremos procurar no título dos artigos, que é o nosso gene
    Returns:
        imprime o título, abstract, autores e fonta dos artigos que têm o nosso termo no título
    """
    Entrez.email = "pg49837@alunos.uminho.pt"
    handles = Entrez.einfo()

    handle1 = Entrez.esearch(db="pubmed", term=term + "[title]", retmax="40")
    record = Entrez.read(handle1)

    id_list = record["IdList"]
    handle = Entrez.efetch(db="pubmed", id=id_list,
                           rettype="medline", retmode="text")
    records = Medline.parse(handle)
    print()
    for record in records:
        print("Titulo:", record.get("TI", "v"))
        print()
        print("Autores:", record.get("AU", "v"))
        print("Fonte:", record.get("SO", "v"))
        print()
        print("Abstract:", record.get("AB", "v"))
        print()
        print("="*140)
        print()


articles("MTNR1B")



Titulo: The rs10830963 Polymorphism of the MTNR1B Gene: Association With Abnormal Glucose, Insulin and C-peptide Kinetics.

Autores: ['Vejrazkova D', 'Vankova M', 'Vcelak J', 'Krejci H', 'Anderlova K', 'Tura A', 'Pacini G', 'Sumova A', 'Sladek M', 'Bendlova B']
Fonte: Front Endocrinol (Lausanne). 2022 Jun 6;13:868364. doi: 10.3389/fendo.2022.868364. eCollection 2022.

Abstract: BACKGROUND: The MTNR1B gene encodes a receptor for melatonin, a hormone regulating biorhythms. Disruptions in biorhythms contribute to the development of type 2 diabetes mellitus (T2DM). Genetic studies suggest that variability in the MTNR1B gene affects T2DM development. Our aim was to compare the distribution of the genetic variant rs10830963 between persons differing in glucose tolerance in a sample of the Czech population (N=1206). We also evaluated possible associations of the polymorphism with insulin sensitivity, beta cell function, with the shape of glucose, insulin and C-peptide trajectories measured 7

ANÁLISE DA SEQUÊNCIA 

In [7]:
def seq_analysis(gb, fasta):
    '''
    Variáveis:
        gb: nome do ficheiro com extensão ".gb"
        fasta: nome do ficheiro convertido para a extensão .fasta
    Returns:
        imprime as informações contidas no ficheiro .gb e retorna um ficheiro .fasta
    '''

    record = SeqIO.read(gb, "genbank")
    id = record.name
    seq = record.seq
    seqlen = len(record.seq)
    source = record.annotations["source"]
    tam = len(record.annotations)
    desc = record.description
    features = len(record.features)
    totannot = record.annotations
    print(f"ID: {id} \n Sequência: {seq} \n Tamanho da sequência: {seqlen} bp \n Source: {source} \n Tamanho das anotações: {tam}")
    print(f"Descrição: {desc} \n Total features: {features}")
    print()
    print(f"Annotations: {totannot}")

    print()
    print("FEATURES:")
    for feat in record.features:
        print("-->", feat)
    print(f"Número de features: {features}")
    for feat in record.features:
        print("Type:", feat.type)
        print("Location:", feat.location)

    featcds = []
    for i in range(len(record.features)):
        if record.features[i].type == "CDS":
            featcds.append(i)
    for k in featcds:
        print(record.features[k].location)
    for k in featcds:
        print(record.features[k].extract(record.seq))
    print(featcds)

    for feat in record.features:
        if feat.type == 'CDS':
            print("Proteína codificada: ", feat.qualifiers['product'])

    for feat in record.features:
        if feat.type == 'gene':
            print("Significado biológico: ", feat.qualifiers["note"])

    records = SeqIO.parse(gb, "genbank")
    count = SeqIO.write(records, fasta, "fasta")
    print(f'Foi convertido {count} registo.')


ANÁLISE DE HOMOLOGIAS POR BLAST

In [8]:

def blast(fasta, blast_file):
    '''
    Variáveis:
        fasta: ficheiro fasta convertido anteriormente, com extensão .fasta
        blast_file: ficheiro com o resultado do BLAST, com extensão .xml
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados nucleotide
    '''
    record = SeqIO.read(fasta, format="fasta")
    print(len(record))
    result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)
    save_file = open(blast_file, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        #print(f"Matrix: {br.matrix}")
        print(f"Database: {br.database}")
        print(f"Gap penalty: {br.gap_penalties}")


blast("MTNR1B.fasta", "blastn_MTNR1B.xml")


1634
Database: nt
Gap penalty: (5, 2)


In [9]:
def homologos(blast_file, evalue_thresh=0.01):
    '''
    Variáveis:
        blast_file: nome do ficheiro com o resultado do BLAST, com extensão .xml
        evalue_thresh: recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        retorna uma lista dos resultados do BLAST, de acordo com o valor dado pelo e-value
    '''
    evalue_tresh = evalue_thresh
    result_handle = open(blast_file)
    blast_record = NCBIXML.parse(result_handle)
    for br in blast_record:
        print("Database: ", br.database)
        print("Gap penalty: ", br.gap_penalties)
    print(len(br.alignments))
    for br_x in br.alignments:
        print(f"Acession number: {br_x.accession}")
        print(f"ID do hit: {br_x.hit_id}")
        print(f"Definição: {br_x.hit_def}")
        print(f"HSP: {br_x.hsps}")
        # faço break porque só quero o primeiro organismo (HUMAN) -> deve dar pra fazer algo com o entrez e selecionar o Homo sapiens
        break
    # change to be made: entrez_query = "Homo sapiens [organism]"
    print()
    for alignment in br.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("        ***ALINHAMENTO***")
                print(f"Identidade: {hsp.identities}")
                print(f"E-value: {hsp.expect}")
                print(f"Score: {hsp.score}")
                print(f"Tamanho: {hsp.align_length}")
                print(f"Caracteres iguais: {len(hsp.match)}")
                print("Query " + hsp.query[0:90] + "...")
                print("Match " + hsp.match[0:90] + "...")
                print("Sbjct " + hsp.sbjct[0:90] + "...")
                print()
    blastq_result = SearchIO.read(blast_file, "blast-xml")
    print(blastq_result)

    result_handle.close()

    blast_slice = blastq_result[:10]
    print(blast_slice)


homologos("blastn_MTNR1B.xml")


Database:  nt
Gap penalty:  (5, 2)
50
Acession number: NM_005959
ID do hit: gi|1789691083|ref|NM_005959.5|
Definição: Homo sapiens melatonin receptor 1B (MTNR1B), mRNA
HSP: [<Bio.Blast.Record.HSP object at 0x7f1854531c90>]

        ***ALINHAMENTO***
Identidade: 1634
E-value: 0.0
Score: 3268.0
Tamanho: 1634
Caracteres iguais: 1634
Query CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAGCACAGCGCGGGAGAGTCTGCGATGTCAGAGAACGGC...
Match ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
Sbjct CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAGCACAGCGCGGGAGAGTCTGCGATGTCAGAGAACGGC...

        ***ALINHAMENTO***
Identidade: 1634
E-value: 0.0
Score: 3268.0
Tamanho: 1634
Caracteres iguais: 1634
Query CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAGCACAGCGCGGGAGAGTCTGCGATGTCAGAGAACGGC...
Match ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
Sbjct CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAA

In [10]:
def MTNR1B(gb, fasta, blast_file, gene, do_blast=False, evalue_thresh=0.01):
    '''
    Variáveis:
        gb: ficheiro genbank obtido online  com extensão .gb
        blast_file:  nome do ficheiro com os resultados do BLAST #depois cada um alterar para o nome do seu gene
        do_blast: recebe o booleano False por default. Se blast = True, realiza um BLAST.
        evalue_tresh:  recebe valor 0.05, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações dos genes
        '''

    seq_analysis(gb, fasta)
    if do_blast == True:
        blast(fasta, blast_file)
    homologos(blast_file, evalue_thresh=evalue_thresh)
    result_handle = open(blast_file, "r")
    #blast_record = NCBIXML.read(result_handle)
    FILE = str("results_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()


MTNR1B("MTNR1B.gb", "MTNR1B.fasta", "blastn_MTNR1B.xml", "MTNR1B", do_blast=True)

# esta def resume todas as outras acima


ID: NM_005959 
 Sequência: CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAGCACAGCGCGGGAGAGTCTGCGATGTCAGAGAACGGCTCCTTCGCCAACTGCTGCGAGGCGGGCGGGTGGGCAGTGCGCCCGGGCTGGTCGGGGGCTGGCAGCGCGCGGCCCTCCAGGACCCCTCGACCTCCCTGGGTGGCTCCAGCGCTGTCCGCGGTGCTCATCGTCACCACCGCCGTGGACGTCGTGGGCAACCTCCTGGTGATCCTCTCCGTGCTCAGGAACCGCAAGCTCCGGAACGCAGGTAATTTGTTCTTGGTGAGTCTGGCATTGGCTGACCTGGTGGTGGCCTTCTACCCCTACCCGCTAATCCTCGTGGCCATCTTCTATGACGGCTGGGCCCTGGGGGAGGAGCACTGCAAGGCCAGCGCCTTTGTGATGGGCCTGAGCGTCATCGGCTCTGTCTTCAATATCACTGCCATCGCCATTAACCGCTACTGCTACATCTGCCACAGCATGGCCTACCACCGAATCTACCGGCGCTGGCACACCCCTCTGCACATCTGCCTCATCTGGCTCCTCACCGTGGTGGCCTTGCTGCCCAACTTCTTTGTGGGGTCCCTGGAGTACGACCCACGCATCTATTCCTGCACCTTCATCCAGACCGCCAGCACCCAGTACACGGCGGCAGTGGTGGTCATCCACTTCCTCCTCCCTATCGCTGTCGTGTCCTTCTGCTACCTGCGCATCTGGGTGCTGGTGCTTCAGGCCCGCAGGAAAGCCAAGCCAGAGAGCAGGCTGTGCCTGAAGCCCAGCGACTTGCGGAGCTTTCTAACCATGTTTGTGGTGTTTGTGATCTTTGCCATCTGCTGGGCTCCACTTAACTGCATCGGCCTCGCTGTGGCCATCAACCCCCAAGAAATGGCTCCCCAGATCCCTGAGGGGCTATTTGTCACTAGCTACTTACTGGCTTATTTCA

FERRAMENTAS DE ANÁLISE DE PROTEÍNAS

In [11]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SearchIO
from Bio.PDB.PDBParser import PDBParser
from Bio import ExPASy


In [13]:
def get_protein(id):
    """
    Variávies:
        id: identificador da proteína obtido na swissprot
    Returns:
        imprime informações sobre a proteína
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    id = seq_record.id
    seq = seq_record.seq
    tam = len(seq_record.seq)
    name = seq_record.name
    desc = seq_record.description
    com = seq_record.annotations["comment"]
    taxon = seq_record.annotations["taxonomy"]
    organism = seq_record.annotations["organism"]
    key = seq_record.annotations["keywords"]
    print(f"ID {id} \n Sequência: {seq} \n Tamanho da sequência: {tam} aa")
    print(f"Nome: {name} \n Descrição: {desc} \n Taxonomia: {taxon} \n Organismo: {organism} \n Keywords: {key}")


get_protein("P49286.1")


ID P49286 
 Sequência: MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSAVLIVTTAVDVVGNLLVILSVLRNRKLRNAGNLFLVSLALADLVVAFYPYPLILVAIFYDGWALGEEHCKASAFVMGLSVIGSVFNITAIAINRYCYICHSMAYHRIYRRWHTPLHICLIWLLTVVALLPNFFVGSLEYDPRIYSCTFIQTASTQYTAAVVVIHFLLPIAVVSFCYLRIWVLVLQARRKAKPESRLCLKPSDLRSFLTMFVVFVIFAICWAPLNCIGLAVAINPQEMAPQIPEGLFVTSYLLAYFNSCLNAIVYGLLNQNFRREYKRILLALWNPRHCIQDASKGSHAEGLQSPAPPIIGVQHQADAL 
 Tamanho da sequência: 362 aa
Nome: ML1B_HUMAN 
 Descrição: MELATONIN RECEPTOR TYPE 1B (MEL-1B-R). 
 Taxonomia: ['EUKARYOTA', 'METAZOA', 'CHORDATA', 'VERTEBRATA', 'TETRAPODA', 'MAMMALIA', 'EUTHERIA', 'PRIMATES'] 
 Organismo: HOMO SAPIENS (HUMAN) 
 Keywords: ['G-PROTEIN COUPLED RECEPTOR', 'TRANSMEMBRANE', 'GLYCOPROTEIN']


In [21]:
id = str(input())
handle = ExPASy.get_sprot_raw(id)
seq_record = SeqIO.read(handle, "swiss")
seq = seq_record.seq
print(seq)

MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSAVLIVTTAVDVVGNLLVILSVLRNRKLRNAGNLFLVSLALADLVVAFYPYPLILVAIFYDGWALGEEHCKASAFVMGLSVIGSVFNITAIAINRYCYICHSMAYHRIYRRWHTPLHICLIWLLTVVALLPNFFVGSLEYDPRIYSCTFIQTASTQYTAAVVVIHFLLPIAVVSFCYLRIWVLVLQARRKAKPESRLCLKPSDLRSFLTMFVVFVIFAICWAPLNCIGLAVAINPQEMAPQIPEGLFVTSYLLAYFNSCLNAIVYGLLNQNFRREYKRILLALWNPRHCIQDASKGSHAEGLQSPAPPIIGVQHQADAL


In [14]:
def prot_blast(blastp_file, id):
    """ 
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
        query: sequência proteica query
        id: identificador da proteína obtido na swissprot
    Returns:
        retorna um ficheiro .xml com o resultado do blastn na base de dados (swissprot?)
    """
    #id = str(input())
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    #seq_prot= query
    result_handle = NCBIWWW.qblast('blastp', 'nr', seq_prot)

    save_file = open(blastp_file, "w")
    save_file.write(result_handle.read())
    save_file.close()

    blast_record = NCBIXML.parse(result_handle)
    result_handle.close()
# blast na DB non-redundant pra descobrir os homólogos,


In [15]:
def homologos_p(blastp_file, evalue_thresh=None):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp 
        evalue_tresh: evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLASTp
    Returns:
        retorna uma lista dos resultados do BLASTp, e respetivo valor dado pelo e-value
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    FILE = str("seqshomologas_blastp.fasta")
    save_file = open(FILE, 'w+')
    if evalue_thresh == None:
        evalue_thresh = 0.05
    for alignment in blast_record.alignments:
        for hsp in range(len(alignment.hsps)):
            if alignment.hsps[hsp].expect < evalue_thresh:
                if hsp != 0:
                    pass
                else:
                    save_file.write('>' + alignment.title +
                                    '\n' + alignment.hsps[hsp].sbjct + '\n')

    result_handle.close()


In [16]:
def prot_file(blastp_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blastp (ex: prot_blastp_nr.xml)
    Returns:
        imprime informações dos alinhamentos das proteínas
    """
    result_handle = open(blastp_file)
    blast_record = NCBIXML.read(result_handle)
    evalue_tresh = 1e-30
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_tresh:
                print("      ***ALINHAMENTO***")
                print("Sequência: ", alignment.title)
                print("Tamanho da sequência: ", alignment.length)
                print("E-value:", hsp.expect)
                print("Score: ", hsp.score)
                print(f"Caracteres iguais: {len(hsp.match)}")
                print(hsp.query[0:75] + "...")
                print(hsp.match[0:75] + "...")
                print(hsp.sbjct[0:75] + "...")
                print()

    blastq_result = SearchIO.read(blastp_file, "blast-xml")
    print(blastq_result)
    for br in blastq_result:
        print(f'Sequence ID: {br.id}')
        print(f'Description: {br.description}')
        print(f'E-value: {br[0].evalue}')
        print(f'Bit Score: {br[0].bitscore}')
        print(f'Alignment:\n{br[0].aln}')
        print()
    result_handle.close()


In [27]:
def PROTEIN(id, blastp_file, gene, blast=False, evalue_thresh=0.01):
    """ 
    Variáveis:
        id: id da proteína obtido na swissprot
        blastp_file:  nome do ficheiro com os resultados do BLASTp  #depois cada um alterar para o nome do seu gene
        blast: recebe o booleano False por default. Se blast = True, realiza um BLAST na base de dados swissprot
        evalue_tresh:  recebe valor 0.01, por default, ou um número inteiro. Este parâmetro descreve o valor de e-value máximo aceitável para o tratamento do output do BLAST
    Returns:
        gera um ficheiro em formato de texto (.txt), que contém as informações das proteínas
    """
    if blast == True:
        prot_blast(blastp_file, id)
    x = get_protein(id)
    print(x)
    homologos_p(blastp_file, evalue_thresh)
    prot_file(blastp_file)
    result_handle = open(blastp_file, "r")
    FILE = str("results_protein_" + gene + ".txt")
    save_file = open(FILE, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()


PROTEIN("P49286.1", "protein_blastp_swiss.xml", "MTNR1B", False)


ID P49286 
 Sequência: MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSAVLIVTTAVDVVGNLLVILSVLRNRKLRNAGNLFLVSLALADLVVAFYPYPLILVAIFYDGWALGEEHCKASAFVMGLSVIGSVFNITAIAINRYCYICHSMAYHRIYRRWHTPLHICLIWLLTVVALLPNFFVGSLEYDPRIYSCTFIQTASTQYTAAVVVIHFLLPIAVVSFCYLRIWVLVLQARRKAKPESRLCLKPSDLRSFLTMFVVFVIFAICWAPLNCIGLAVAINPQEMAPQIPEGLFVTSYLLAYFNSCLNAIVYGLLNQNFRREYKRILLALWNPRHCIQDASKGSHAEGLQSPAPPIIGVQHQADAL 
 Tamanho da sequência: 362 aa
Nome: ML1B_HUMAN 
 Descrição: MELATONIN RECEPTOR TYPE 1B (MEL-1B-R). 
 Taxonomia: ['EUKARYOTA', 'METAZOA', 'CHORDATA', 'VERTEBRATA', 'TETRAPODA', 'MAMMALIA', 'EUTHERIA', 'PRIMATES'] 
 Organismo: HOMO SAPIENS (HUMAN) 
 Keywords: ['G-PROTEIN COUPLED RECEPTOR', 'TRANSMEMBRANE', 'GLYCOPROTEIN']
None
      ***ALINHAMENTO***
Sequência:  sp|P49286.1| RecName: Full=Melatonin receptor type 1B; Short=Mel-1B-R; Short=Mel1b receptor [Homo sapiens]
Tamanho da sequência:  362
E-value: 0.0
Score:  1901.0
Caracteres iguais: 362
MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSAVLIVTTAVDVVGNLLVILSV

In [None]:
def PDB(id, pdb_file):
    """ 
    Variáveis:
        id: id da proteína obtido da base de datos de estruturas de proteínas PDB
        pdb_file: nome do ficheiro com as informações sobre a estrutura da proteína obtidas a partir da base de dados PDB ("2e1o.pdb") 
    Returns:
        imprime informações da estrutura da proteína e a sua estrutura 3D
    """
    p = PDBParser(PERMISSIVE=1)
    s = p.get_structure(id, pdb_file)
    for chain in s[0]:
        print(f'Chain ID: {chain.id}')
    smeth = s.header['structure_method']
    keywords = s.header['keywords']
    comp = s.header["compound"]
    print("Keywords: ", keywords)
    print("Structure Method: ", smeth)
    print("Composto: ", comp)

    import nglview as nv
    nv.show_biopython(s, gui=True)


# erro
PDB("6me6", "6me6.pdb")


In [32]:
def CDD(id, cdd_file):
    """ 
    Variáveis:
        id: 
        cdd_file: nome do ficheiro com as informações sobre os domínios das proteínas obtidas a partir da base de dados CDD 
    Returns:
        imprime as informações obtidas a partir da base de dados CDD
    """
    handle = ExPASy.get_sprot_raw(id)
    seq_record = SeqIO.read(handle, "swiss")
    seq_prot = seq_record.seq
    result_handle = NCBIWWW.qblast("blastp", "CDD", seq_prot)
    save_file = open(cdd_file, "w")
    blast_records = SearchIO.read(result_handle, "blast-xml")
    save_file.write(print(blast_records[:]))
    save_file.close()
    result_handle.close()


CDD("P49286.1", "cdd_file_MTNR1B")


Program: blastp (2.13.0+)
  Query: unnamed (362)
         protein product
 Target: CDD
   Hits: 0


TypeError: write() argument must be str, not None

ALINHAMENTO MÚLTIPLO

In [33]:
from Bio import SeqIO
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import AlignInfo
from Bio.Align import MultipleSeqAlignment
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW


In [34]:
def align(blast_file, align_file):
    """
    Variáveis:
        blast_file: ficheiro com o resultado do blast 
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta 
    Returns:
        gera um ficheiro, com extensão .fasta com as sequências alinhadas e imprime-as
    """
    result_blast = open(blast_file)
    blast_records = NCBIXML.read(result_blast)
    FILE = str(align_file)
    save_file = open(FILE, 'w+')
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            save_file.write('>' + alignment.title +
                            '\n' + hsp.sbjct[0:45] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()


align("blastn_MTNR1B.xml", "obtersequencias.fasta")


> gi|1789691083|ref|NM_005959.5| Homo sapiens melatonin receptor 1B (MTNR1B), mRNA 
 CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGGTGGCCAAAGCACAGCGCGGGAGAGTCTGCGATGTCAGAGAACGGCTCCTTCGCCAACTGCTGCGAGGCGGGCGGGTGGGCAGTGCGCCCGGGCTGGTCGGGGGCTGGCAGCGCGCGGCCCTCCAGGACCCCTCGACCTCCCTGGGTGGCTCCAGCGCTGTCCGCGGTGCTCATCGTCACCACCGCCGTGGACGTCGTGGGCAACCTCCTGGTGATCCTCTCCGTGCTCAGGAACCGCAAGCTCCGGAACGCAGGTAATTTGTTCTTGGTGAGTCTGGCATTGGCTGACCTGGTGGTGGCCTTCTACCCCTACCCGCTAATCCTCGTGGCCATCTTCTATGACGGCTGGGCCCTGGGGGAGGAGCACTGCAAGGCCAGCGCCTTTGTGATGGGCCTGAGCGTCATCGGCTCTGTCTTCAATATCACTGCCATCGCCATTAACCGCTACTGCTACATCTGCCACAGCATGGCCTACCACCGAATCTACCGGCGCTGGCACACCCCTCTGCACATCTGCCTCATCTGGCTCCTCACCGTGGTGGCCTTGCTGCCCAACTTCTTTGTGGGGTCCCTGGAGTACGACCCACGCATCTATTCCTGCACCTTCATCCAGACCGCCAGCACCCAGTACACGGCGGCAGTGGTGGTCATCCACTTCCTCCTCCCTATCGCTGTCGTGTCCTTCTGCTACCTGCGCATCTGGGTGCTGGTGCTTCAGGCCCGCAGGAAAGCCAAGCCAGAGAGCAGGCTGTGCCTGAAGCCCAGCGACTTGCGGAGCTTTCTAACCATGTTTGTGGTGTTTGTGATCTTTGCCATCTGCTGGGCTCCACTTAACTGCATCGGCCTCGCTGTGGCCATCAACCCCCAAGAA

In [35]:
def align_prot(blastp_file, align_file):
    """
    Variáveis:
        blastp_file: ficheiro com o resultado do blast 
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta 
        gera um ficheiro, com extensão .fasta com as sequências alinhadas e imprime-as
    """
    result_blast = open(blastp_file)
    blast_records = NCBIXML.read(result_blast)
    FILE = str(align_file)
    save_file = open(FILE, 'w+')
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            save_file.write('>' + alignment.title +
                            '\n' + hsp.sbjct[0:45] + '\n')
            print(">", alignment.title, "\n", hsp.query)
            print()


align_prot("protein_blastp_swiss.xml", "obterprotsequencias.fasta")


> sp|P49286.1| RecName: Full=Melatonin receptor type 1B; Short=Mel-1B-R; Short=Mel1b receptor [Homo sapiens] 
 MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSAVLIVTTAVDVVGNLLVILSVLRNRKLRNAGNLFLVSLALADLVVAFYPYPLILVAIFYDGWALGEEHCKASAFVMGLSVIGSVFNITAIAINRYCYICHSMAYHRIYRRWHTPLHICLIWLLTVVALLPNFFVGSLEYDPRIYSCTFIQTASTQYTAAVVVIHFLLPIAVVSFCYLRIWVLVLQARRKAKPESRLCLKPSDLRSFLTMFVVFVIFAICWAPLNCIGLAVAINPQEMAPQIPEGLFVTSYLLAYFNSCLNAIVYGLLNQNFRREYKRILLALWNPRHCIQDASKGSHAEGLQSPAPPIIGVQHQADAL

> sp|P48039.1| RecName: Full=Melatonin receptor type 1A; Short=Mel-1A-R; Short=Mel1a receptor [Homo sapiens] 
 SGAGSARPSRTP--------RPPWVAPALSAVLIVTTAVDVVGNLLVILSVLRNRKLRNAGNLFLVSLALADLVVAFYPYPLILVAIFYDGWALGEEHCKASAFVMGLSVIGSVFNITAIAINRYCYICHSMAYHRIYRRWHTPLHICLIWLLTVVALLPNFFVGSLEYDPRIYSCTFIQTASTQYTAAVVVIHFLLPIAVVSFCYLRIWVLVLQARRKAKPESRLCLKPSDLRSFLTMFVVFVIFAICWAPLNCIGLAVAINPQEMAPQIPEGLFVTSYLLAYFNSCLNAIVYGLLNQNFRREYKRILLALWNPRHCIQDASKGSHAEGLQSPAP

> sp|Q13585.3| RecName: Full=Melatonin-related receptor; AltName: Full=G pro

In [36]:
def parse_align(align_file, alinhamento):
    """ 
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta 
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta 
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file, format="fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")


parse_align("obtersequencias.fasta", "align_MTNR1B")


Alignment with 56 rows and 45 columns
CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGG gi|1789691083|ref|NM_005959.5|
CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGG gi|46575756|gb|BC069163.1|
CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGG gi|1753034566|ref|XM_004051965.3|
CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGG---------- gi|1849011891|ref|XM_003813777.2|
CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGG---------- gi|1367141642|ref|XM_016921786.2|
ATGTCAGAGAACGGCTCCTTCGCCAACTGCTGCGAGGCGGGCGGG gi|2217282951|ref|XM_011542839.3|
CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGG gi|1743164614|ref|XM_030829598.1|
CGGCTCAGTACTGCGCGCGCCCTGCGGCTGTCCGGGGCTGCGCGG gi|1800005746|ref|XM_032168105.1|
CGGCTCAGTACTGCGCGAGCCCTGCGGCTGTCCGAGGCCGCGCGG gi|795222065|ref|XM_011995005.1|
CGGCTCAGTACTGCGCCAGCCGTGCGGCTGTCCGAGGCCGCGCGG gi|1411108678|ref|XM_025357858.1|
CGGCTCAGTACTGCGCTCGCCCTGCGGCTGTCCGGGGCCGCGCGG gi|1059110641|ref|XM_017851599.1|
CGGCTCAGTACCGCGCGCGCCCTGCGGCTGTCCGGGGCCGCGCGG gi|1482585288|ref|XM_023209139.2|
GCTCAGTACTGCG

In [37]:
def parse_align_prot(align_file, alinhamento):
    """ 
    Variáveis:
        align_file_prot: ficheiro com as sequências alinhadas, com extensão .fasta (ex: obterseqs.fasta)
        alinhamento: parse do ficheiro align_file com as sequências alinhadas, com extensão .fasta (ex: align_HHEX.fasta)
    Returns:
        gera um ficheiro com as sequências alinhadas, com extensão .fasta, e imprime-as
        """
    alignments = AlignIO.parse(align_file, format="fasta")
    for alignment in alignments:
        print(alignment)
    AlignIO.write(alignment, alinhamento, "fasta")


parse_align_prot("obterprotsequencias.fasta", "align_MTNR1B_prot")


Alignment with 17 rows and 45 columns
MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSA sp|P49286.1|
QGNGSALPNASQPVLRGDGARPSWLASALACVLIFTIVVDILGNL sp|P48039.1|
MVITIVVDLIGNSMVILAVTKNKKLRNSGNIFVVSLSVADMLVAI sp|Q13585.3|
LALAYGAVIILG----VSGNLALIIIILKQKEMRNVTNILIVNLS sp|P25929.1|
TPELPGRAKLALVLTGVLIF--ALALFGNALVFYVVTRSKAMRTV sp|Q96P65.2|
SGGGDNRTLVGPAPSAGARAVLVPVLYLLVCAAGLGGNTLVIYVV sp|P35346.3|
MVGNTLVCFIVLKNRHMHTVTNMFILNLAVSDLLVGIFCMPTTLV sp|Q9GZQ6.1|
SAILISFIYSVVCLVGLCGNSMVIYVILRYAKMKTATNIYILNLA sp|P30872.1|
TITVVLAVLILITVAGNVVVCLAVGLNRRLRNLTNCFIVSLAITD sp|P25021.1|
PQVAAIFIISYFLIFFLCMMGNTVVCFIVMRNKHMHTVTNLFILN sp|Q9Y5X5.2|
AIQCIYALVCLVGLVGNALVIFVILRYAKMKTATNIYLLNLAVAD sp|P31391.2|
LTFIYFVVCIIGLCGNTLVIYVILRYAKMKTITNIYILNLAIADE sp|P30874.1|
ALGVLGNSLVITVLARSKPGKPRSTTNLFILNLSIADLAYLLFCI sp|P47211.3|
IETVVGVLGNLCLMCVTVRQKEKANVTNLLIANLAFSDFLMCLLC sp|P0DQD5.1|
GWA-EPDSNGSAGSEDAQL-EPAHISPAIPVIITAVYSVVFVVGL sp|P41145.2|
PPTGSPSMITAITIMALYSIVCVVGLFGNFLVMYVIVRYTKMKTA sp|P35372.2|
LLGNCLVMYVILRHTKMK

In [38]:
def consensus(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta 
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format="fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus


consensus("align_MTNR1B")





Seq('XXGXXXXXXXXXGXXXXXXXXXXXXXXXXXXXXGXGXXXGXXXGG')

In [39]:
def consensus_prot(align_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta 
    Returns
        retorna a sequência consensus
    """
    alignments = AlignIO.parse(align_file, format="fasta")
    for alignment in alignments:
        print("")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    return consensus


consensus_prot("align_MTNR1B_prot")





Seq('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

In [40]:
def stockholm(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta 
        stock_file: ficheiro com as sequências alinhadas, com extensão .sth 
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas 
    """
    form_stock = AlignIO.parse(align_file, "fasta")
    AlignIO.convert(align_file, "fasta", stock_file, "stockholm")


# erro de ids
stockholm("align_MTNR1B", "align_results_MTNR1B.sth")


ValueError: Duplicate record identifier: gi|324711000|ref|NG_028160.1|

In [41]:
def stockholm_prot(align_file, stock_file):
    """
    Variáveis:
        align_file: ficheiro com as sequências alinhadas, com extensão .fasta 
        stock_file: ficheio com as sequências alinhadas, com extensão .sth 
    Returns:
        converte o ficheiro .fasta num ficheiro no formato .stockholm, com as sequências alinhadas 
    """
    form_stock = AlignIO.parse(align_file, "fasta")
    AlignIO.convert(align_file, "fasta", stock_file, "stockholm")


stockholm_prot("align_MTNR1B_prot", "align_results_MTNR1B_prot.sth")


ÁRVORE FILOGETICA

In [42]:
from Bio import Phylo
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor


In [43]:
def phylo_tree(stockholm_file):
    """ 
    Variáveis:
        stockholm_file: ficheiro com as sequências alinhadas, com extensão .sth

    Returns:
        imprime a distância entre as espécies na árvore, constrói as árvores UPGMA e Neighbour Joining e cria um ficheiro newick (ex: phylo_trees.nhx) que contém ambas as árvores
    """
    alignment = AlignIO.read(open(stockholm_file), "stockholm")
    # print(alignment)
    #print("*" * 140)
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alignment)
    print(dm)
    print("*" * 140)
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    print(upgmatree)
    print("*" * 140)
    njtree = constructor.nj(dm)
    print(njtree)
    print("*" * 140)
    Phylo.write([upgmatree, njtree], "phylo_trees.nhx", "newick")

    tree_up = Phylo.draw_ascii(upgmatree)
    print("*" * 140)
    tree_nj = Phylo.draw_ascii(njtree)


phylo_tree("align_results_MTNR1B_prot.sth")


sp|P49286.1|	0
sp|P48039.1|	1.1333333333333333	0
sp|Q13585.3|	1.2392156862745098	1.087719298245614	0
sp|P25929.1|	1.2412280701754386	1.2115384615384615	1.1894736842105262	0
sp|Q96P65.2|	1.2100840336134453	1.1851851851851851	1.1844660194174756	1.1336898395721926	0
sp|P35346.3|	1.203921568627451	1.0745614035087718	1.2212389380530975	1.0869565217391304	1.1330275229357798	0
sp|Q9GZQ6.1|	1.1686274509803922	1.1637931034482758	1.0775862068965518	1.0961538461538463	1.1945701357466063	1.0948275862068966	0
sp|P30872.1|	1.1764705882352942	1.1885964912280702	1.1719457013574661	0.696969696969697	1.0754716981132075	1.1283185840707965	1.1336206896551724	0
sp|P25021.1|	1.1764705882352942	1.2280701754385965	1.074766355140187	1.0512820512820513	1.0728155339805825	1.2123893805309733	1.0689655172413792	1.0407239819004526	0
sp|Q9Y5X5.2|	1.215686274509804	1.2330508474576272	1.1101694915254237	1.074074074074074	1.1651785714285714	1.1991525423728813	1.1440677966101696	0.9830508474576272	1.042372881355932	0
sp

In [44]:
form_phyl = AlignIO.parse("align_results_MTNR1B_prot.sth", "stockholm")
AlignIO.convert("align_results_MTNR1B_prot.sth", "stockholm",
                "align_results_MTNR1B_protphy.phy", "phylip")

alignments = AlignIO.parse("align_results_MTNR1B_protphy.phy", "phylip")
for alignment in alignments:
    print(alignment)
    print()


Alignment with 17 rows and 45 columns
MSENGSFANCCEAGGWAVRPGWSGAGSARPSRTPRPPWVAPALSA sp|P49286.
QGNGSALPNASQPVLRGDGARPSWLASALACVLIFTIVVDILGNL sp|P48039.
MVITIVVDLIGNSMVILAVTKNKKLRNSGNIFVVSLSVADMLVAI sp|Q13585.
LALAYGAVIILG----VSGNLALIIIILKQKEMRNVTNILIVNLS sp|P25929.
TPELPGRAKLALVLTGVLIF--ALALFGNALVFYVVTRSKAMRTV sp|Q96P65.
SGGGDNRTLVGPAPSAGARAVLVPVLYLLVCAAGLGGNTLVIYVV sp|P35346.
MVGNTLVCFIVLKNRHMHTVTNMFILNLAVSDLLVGIFCMPTTLV sp|Q9GZQ6.
SAILISFIYSVVCLVGLCGNSMVIYVILRYAKMKTATNIYILNLA sp|P30872.
TITVVLAVLILITVAGNVVVCLAVGLNRRLRNLTNCFIVSLAITD sp|P25021.
PQVAAIFIISYFLIFFLCMMGNTVVCFIVMRNKHMHTVTNLFILN sp|Q9Y5X5.
AIQCIYALVCLVGLVGNALVIFVILRYAKMKTATNIYLLNLAVAD sp|P31391.
LTFIYFVVCIIGLCGNTLVIYVILRYAKMKTITNIYILNLAIADE sp|P30874.
ALGVLGNSLVITVLARSKPGKPRSTTNLFILNLSIADLAYLLFCI sp|P47211.
IETVVGVLGNLCLMCVTVRQKEKANVTNLLIANLAFSDFLMCLLC sp|P0DQD5.
GWA-EPDSNGSAGSEDAQL-EPAHISPAIPVIITAVYSVVFVVGL sp|P41145.
PPTGSPSMITAITIMALYSIVCVVGLFGNFLVMYVIVRYTKMKTA sp|P35372.
LLGNCLVMYVILRHTKMKTATNIYIFNLALADTLV-LLTLPFQGT sp|P