# Laboratórios de Bioinformática
### Análise da sequência e das features presentes no NCBI

Grupo 6: Joana Gabriel, Maria Couto, Teresa Coimbra

In [2]:
def get_idlist(termo_de_pesquisa):
    """Search the NCBI protein database and return the matching record IDs.

    Args:
        termo_de_pesquisa: Search term (string) sent to Entrez ESearch.

    Returns:
        The ``IdList`` entry of the parsed ESearch response. The query is
        sent with ``retmax=100``, so at most 100 IDs are requested.
    """
    from Bio import Entrez

    Entrez.email = 'lbiogrupo6@gmail.com'
    handle = Entrez.esearch(db='protein', term=termo_de_pesquisa, retmax=100)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()
    return record['IdList']

In [3]:
def get_ids(str1, str2, idlist):
    """Keep only the IDs whose GenBank record text contains both strings.

    Args:
        str1: First substring that must occur in the record text
            (e.g. the protein name).
        str2: Second substring that must occur in the record text
            (e.g. the organism name).
        idlist: Iterable of protein IDs to check; each one is fetched
            individually from the NCBI protein database.

    Returns:
        A list with the IDs from ``idlist``, in their original order, whose
        GenBank flat-file text contains both ``str1`` and ``str2``.
    """
    from Bio import Entrez

    # Identify ourselves to NCBI before the first request is sent; the
    # address used to be assigned only after efetch had already been called.
    Entrez.email = 'lbiogrupo6@gmail.com'
    res = []
    for id1 in idlist:
        handle = Entrez.efetch(db='protein', id=id1, rettype="gb", retmode="text")
        try:
            record = handle.read()
        finally:
            # One connection per ID: close each as soon as it has been read.
            handle.close()
        # Plain, case-sensitive substring match on the raw record text.
        if str1 in record and str2 in record:
            res.append(id1)
    return res

In [4]:
def get_seqio(res_ids):
    """Download the GenBank record of every ID and parse it with SeqIO.

    Args:
        res_ids: Iterable of protein IDs (typically the filtered list
            produced by ``get_ids``).

    Returns:
        A list with one parsed record per ID, in the same order as
        ``res_ids``.
    """
    from Bio import Entrez
    from Bio import SeqIO

    # The contact address does not change between requests; set it once.
    Entrez.email = 'lbiogrupo6@gmail.com'
    RES = []
    for id0 in res_ids:
        handle = Entrez.efetch(db="protein", id=id0, rettype="gb", retmode="text")
        try:
            RES.append(SeqIO.read(handle, "genbank"))
        finally:
            # Close the connection even if the record fails to parse.
            handle.close()
    return RES

In [18]:
def get_fasta(ids):
    """Write one FASTA file per protein ID into the current directory.

    Each sequence is fetched from the NCBI protein database in FASTA format
    and saved as ``<record.id>.faa``, after removing the characters
    ``* , " / \\ [ ] : ; |`` from the file name.

    Args:
        ids: Iterable of protein IDs for one gene.

    Returns:
        None. The function only creates files.
    """
    from Bio import Entrez
    from Bio import SeqIO
    import re

    # Raw string: the same character class as before, but without the
    # invalid string escapes (\/, \[, \], \|) that trigger warnings on
    # current Python versions. Compiled once, outside the loop.
    caracteres_proibidos = re.compile(r'[*,"/\\\[\]:;|]')

    Entrez.email = 'lbiogrupo6@gmail.com'
    for id0 in ids:
        handle = Entrez.efetch(db="protein", id=id0, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Close the connection even if the response is not valid FASTA.
            handle.close()
        # Strip the characters that are not allowed when saving the file.
        nome = caracteres_proibidos.sub('', record.id + '.faa')
        SeqIO.write(record, nome, 'fasta')
        
def get_annotations(record):
    """Map each record's ID to that record's annotations.

    Args:
        record: Iterable of record objects exposing ``id`` and
            ``annotations`` attributes.

    Returns:
        A dict of ``{record id: annotations}``. If two records share an ID,
        the later one wins.
    """
    return {entry.id: entry.annotations for entry in record}

def get_features(record):
    """Print the ID and every feature of each record.

    Args:
        record: Iterable of record objects exposing ``id`` and ``features``
            attributes.

    Returns:
        None. The report is written to standard output.
    """
    for rec in record:
        print('\n')
        print('\n')
        print('-->ID:', rec.id)
        # Walk this record's own feature list. Indexing it with the number
        # of records instead would drop features when a record has more
        # features than there are records, and raise IndexError when it has
        # fewer.
        for feature in rec.features:
            print('*********************')
            print('Features: ')
            print(feature)
            


In [19]:
def teste1():
    """Run the full pipeline for human hnRNP A1.

    Searches the protein database for 'hnrnpa1', keeps the IDs whose record
    text contains both the protein name and 'Homo sapiens', writes a FASTA
    file for each of them, and prints their annotations and features.
    """
    str1 = 'heterogeneous nuclear ribonucleoprotein A1'
    str2 = 'Homo sapiens'
    termo_de_pesquisa = 'hnrnpa1'

    idList = get_idlist(termo_de_pesquisa)
    ids = get_ids(str1, str2, idList)

    get_fasta(ids)

    record = get_seqio(ids)  # parsed records for the filtered IDs
    anota = get_annotations(record)
    print(anota)

    # get_features prints its report itself and returns nothing, so there
    # is no result to capture or print here.
    get_features(record)
        
# Run the hnRNP A1 example; this queries NCBI and writes FASTA files.
teste1()

{'NP_112420.1': {'topology': 'linear', 'data_file_division': 'PRI', 'date': '10-JAN-2021', 'accessions': ['NP_112420'], 'sequence_version': 1, 'db_source': 'REFSEQ: accession NM_031157.4', 'keywords': ['RefSeq', 'MANE Select'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='The nuclear localization sequence mediates hnRNPA1 amyloid fibril formation revealed by cryoEM structure', ...), Reference(title='Post-translational modifications of hnRNP A1 differentially modulate retroviral IRES-mediated translation initiation', ...), Reference(title='Heterogeneous Nuclear Ribonucleoprotein A1 (hnRNP A1) and hnRNP A2 Inhibit Splicing to Human Papillomavirus 16 Splice Site SA409 through a UAG-Containing Sequence in the E7 Coding Region', ...), Reference(tit

In [20]:
def teste2():
    """Run the full pipeline for the SARS-CoV-2 nucleoprotein.

    Searches for 'NCAP_SARS2', keeps the IDs whose record text contains both
    'Nucleoprotein' and 'SARS-CoV-2', writes their FASTA files, and prints
    their annotations and features.
    """
    nome_proteina = 'Nucleoprotein'
    nome_organismo = 'SARS-CoV-2'

    candidatos = get_idlist('NCAP_SARS2')
    filtrados = get_ids(nome_proteina, nome_organismo, candidatos)
    get_fasta(filtrados)

    registos = get_seqio(filtrados)
    print(get_annotations(registos))

    # The feature report is printed by get_features itself.
    get_features(registos)

# Run the SARS-CoV-2 nucleoprotein example; this queries NCBI and writes FASTA files.
teste2()

{'P0DTC9.1': {'topology': 'linear', 'data_file_division': 'VRL', 'date': '02-DEC-2020', 'accessions': ['P0DTC9'], 'sequence_version': 1, 'db_source': 'UniProtKB: locus NCAP_SARS2, accession P0DTC9; class: standard. created: Apr 22, 2020. sequence updated: Apr 22, 2020. annotation updated: Dec 2, 2020. xrefs: MN908947.3, QHD43423.2, YP_009724397.2, 6M3M_A, 6M3M_B, 6M3M_C, 6M3M_D, 6VYO_A, 6VYO_B, 6VYO_C, 6VYO_D, 6WJI_A, 6WJI_B, 6WJI_C, 6WJI_D, 6WJI_E, 6WJI_F, 6WKP_A, 6WKP_B, 6WKP_C, 6WKP_D, 6WZO_A, 6WZO_B, 6WZO_C, 6WZO_D, 6WZQ_A, 6WZQ_B, 6WZQ_C, 6WZQ_D, 6YI3_A, 6YUN_A, 6YUN_B, 6ZCO_A, 7C22_A, 7C22_B, 7C22_C, 7C22_D, 7CDZ_A, 7CDZ_B, 7CDZ_C, 7CDZ_D, 7CE0_A, 7CE0_B, 7CE0_C, 7CE0_D xrefs (non-sequence databases): PDBsum:6M3M, PDBsum:6VYO, PDBsum:6WJI, PDBsum:6WKP, PDBsum:6WZO, PDBsum:6WZQ, PDBsum:6YI3, PDBsum:6YUN, PDBsum:6ZCO, PDBsum:7C22, PDBsum:7CDZ, PDBsum:7CE0, BMRB:P0DTC9, SMR:P0DTC9, BioGRID:4383847, ComplexPortal:CPX-5686, IntAct:P0DTC9, GeneID:43740575, SIGNOR:P0DTC9, Proteomes:UP00

In [21]:
def teste3():
    """Run the full pipeline for the human SMAD3 protein.

    Searches the protein database for 'SMAD3', keeps the IDs whose record
    text contains both the protein name and 'Homo sapiens', writes a FASTA
    file for each of them, and prints their annotations and features.
    """
    str1 = 'mothers against decapentaplegic homolog 3'
    str2 = 'Homo sapiens'
    termo_de_pesquisa = 'SMAD3'

    idlist = get_idlist(termo_de_pesquisa)
    ids = get_ids(str1, str2, idlist)
    get_fasta(ids)

    record = get_seqio(ids)
    anota = get_annotations(record)
    print(anota)

    # get_features prints its report itself and returns nothing, so there
    # is no result to capture or print here.
    get_features(record)
# Run the SMAD3 example; this queries NCBI and writes FASTA files.
teste3()

{'NP_001138576.1': {'topology': 'linear', 'data_file_division': 'PRI', 'date': '03-JAN-2021', 'accessions': ['NP_001138576'], 'sequence_version': 1, 'db_source': 'REFSEQ: accession NM_001145104.2', 'keywords': ['RefSeq'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Role of AP-2alpha/TGF-beta1/Smad3 axis in rats with intervertebral disc degeneration', ...), Reference(title='Interactome Mapping Provides a Network of Neurodegenerative Disease Proteins and Uncovers Widespread Protein Aggregation in Affected Brains', ...), Reference(title='In Vitro Lineage-Specific Differentiation of Vascular Smooth Muscle Cells in Response to SMAD3 Deficiency: Implications for SMAD3-Related Thoracic Aortic Aneurysm', ...), Reference(title='Statin inhibits large h