# Searching the UniProt database and saving fastas:

This notebook is really just to demonstrate how Andrew finds the sequences for the datasets.  <br>

If you run it from within our GitHub repository, you'll probably want to add the downloaded FASTA files to the `.gitignore` file.

In [1]:
# Standard library
import os

# Import bioservices module, to run remote UniProt queries
# (will probably need to pip install this to use)
from bioservices import UniProt


## Connecting to UniProt using bioservices:


In [2]:
# Shared UniProt client; the query functions in this notebook call `service.search(...)` on it.
service = UniProt()
# String prefix prepended to every output filename (the trailing slash makes it act as a folder).
fasta_path = 'refined_query_fastas/' #optional file organization param

## Query with signal_peptide

In [3]:
def data_saving_function_with_SP(organism,save_path=''):
    """Query UniProt for three localisation classes (signal-peptide based) and save each as FASTA.

    Three searches are sent through the module-level ``service`` client, each
    restricted with ``reviewed:yes`` and to entries whose organism or host
    matches ``organism``:

    * ``secreted``  - signal peptide annotation + ``keyword:secreted``, not transmembrane
    * ``cytoplasm`` - cytoplasm location, neither transmembrane nor signal peptide
    * ``membrane``  - transmembrane annotation + signal peptide annotation

    Each result is written to ``{save_path}{organism}_{class}_SP_new.fasta``.
    Existing files are overwritten, so re-running the cell does not duplicate
    records, and a missing output directory is created on demand.

    Parameters
    ----------
    organism : str
        Value interpolated into the ``organism:`` / ``host:`` query fields
        (e.g. ``'human'``).
    save_path : str, optional
        String prefix for the output filenames; include a trailing path
        separator to use it as a directory. Defaults to the current directory.

    Raises
    ------
    TypeError
        If ``service.search`` returns something other than ``str``.
    """
    queries = (
        ('secreted', f'(((organism:{organism} OR host:{organism}) annotation:("signal peptide") keyword:secreted) NOT annotation:(type:transmem)) AND reviewed:yes'),
        ('cytoplasm', f'(((organism:{organism} OR host:{organism}) locations:(location:cytoplasm)) NOT (annotation:(type:transmem) OR annotation:("signal peptide"))) AND reviewed:yes'),
        ('membrane', f'(((organism:{organism} OR host:{organism}) annotation:(type:transmem)) annotation:("signal peptide")) AND reviewed:yes'),
    )

    for location, query in queries:
        result = service.search(query, frmt="fasta")
        # NOTE(review): search() appears to return a non-string (e.g. a status
        # code) when the request fails - confirm against the bioservices docs.
        # Fail before touching the file so a bad response never clobbers data.
        if not isinstance(result, str):
            raise TypeError(
                f'UniProt search for {location!r} returned {type(result).__name__} '
                f'({result!r}) instead of FASTA text'
            )

        outfile = f'{save_path}{organism}_{location}_SP_new.fasta'
        # save_path is a plain string prefix, so derive the folder from the full name.
        out_dir = os.path.dirname(outfile)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # 'w' rather than 'a': appending duplicated every record on each re-run.
        with open(outfile, 'w') as ofh:
            ofh.write(result)
    

    

In [4]:
# Fetch the signal-peptide-based secreted / cytoplasm / membrane sets for 'human'.
data_saving_function_with_SP('human',fasta_path)

In [4]:
# Same three signal-peptide-based queries, this time for 'escherichia'.
data_saving_function_with_SP('escherichia',fasta_path)

## Query without signal_peptide

In [3]:
def data_saving_function_without_SP(organism,save_path=''):
    """Query UniProt for three mutually exclusive localisation classes and save each as FASTA.

    Unlike the signal-peptide variant, these queries do not use the signal
    peptide annotation. Each class is selected by its keyword / location /
    GO term and explicitly excludes the terms that define the other two
    classes. All queries are restricted with ``reviewed:yes`` and to entries
    whose organism or host matches ``organism``:

    * ``secreted``  - ``keyword:secreted`` or GO "extracellular region [5576]"
    * ``cytoplasm`` - cytoplasm location or GO "cytoplasm [5737]"
    * ``membrane``  - transmembrane annotation or GO "membrane [16020]"

    Each result is written to ``{save_path}{organism}_{class}_noSP_new_new.fasta``.
    Existing files are overwritten, so re-running the cell does not duplicate
    records, and a missing output directory is created on demand.

    Parameters
    ----------
    organism : str
        Value interpolated into the ``organism:`` / ``host:`` query fields
        (e.g. ``'human'``).
    save_path : str, optional
        String prefix for the output filenames; include a trailing path
        separator to use it as a directory. Defaults to the current directory.

    Raises
    ------
    TypeError
        If ``service.search`` returns something other than ``str``.
    """
    queries = (
        ('secreted', f'(((organism:{organism} OR host:{organism}) AND (keyword:secreted OR goa:("extracellular region [5576]"))) NOT (annotation:(type:transmem) OR goa:("membrane [16020]") OR locations:(location:cytoplasm) OR goa:("cytoplasm [5737]") )) AND reviewed:yes'),
        ('cytoplasm', f'(((organism:{organism} OR host:{organism}) AND (locations:(location:cytoplasm) OR goa:("cytoplasm [5737]")) ) NOT (annotation:(type:transmem) OR goa:("membrane [16020]") OR keyword:secreted OR goa:("extracellular region [5576]") )) AND reviewed:yes'),
        ('membrane', f'(((organism:{organism} OR host:{organism}) AND ( annotation:(type:transmem) OR goa:("membrane [16020]") )) NOT ( keyword:secreted OR goa:("extracellular region [5576]") OR locations:(location:cytoplasm) OR goa:("cytoplasm [5737]") )) AND reviewed:yes'),
    )

    for location, query in queries:
        result = service.search(query, frmt="fasta")
        # NOTE(review): search() appears to return a non-string (e.g. a status
        # code) when the request fails - confirm against the bioservices docs.
        # Fail before touching the file so a bad response never clobbers data.
        if not isinstance(result, str):
            raise TypeError(
                f'UniProt search for {location!r} returned {type(result).__name__} '
                f'({result!r}) instead of FASTA text'
            )

        outfile = f'{save_path}{organism}_{location}_noSP_new_new.fasta'
        # save_path is a plain string prefix, so derive the folder from the full name.
        out_dir = os.path.dirname(outfile)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # 'w' rather than 'a': appending duplicated every record on each re-run.
        with open(outfile, 'w') as ofh:
            ofh.write(result)
    

    

In [4]:
# Fetch the keyword/GO-based secreted / cytoplasm / membrane sets for 'human'.
data_saving_function_without_SP('human',fasta_path)

In [None]:
# Same keyword/GO-based queries for 'yeast'.
data_saving_function_without_SP('yeast',fasta_path)

In [None]:
# Same keyword/GO-based queries for 'escherichia'.
data_saving_function_without_SP('escherichia',fasta_path)

## Query all of UniProt, with no organism filter (warning: do not run this unless you have lots of free time and computer memory)

In [3]:
def data_saving_function_without_SP_full_uniprot(save_path=''):
    """Run the keyword/GO-based localisation queries with no organism filter and save each as FASTA.

    The three queries select, with ``reviewed:yes``, entries that carry the
    terms for one class and none of the terms that define the other two:

    * ``secreted``  - ``keyword:secreted`` or GO "extracellular region [5576]"
    * ``cytoplasm`` - cytoplasm location or GO "cytoplasm [5737]"
    * ``membrane``  - transmembrane annotation or GO "membrane [16020]"

    Each result is held in memory as a single string and then written to
    ``{save_path}all_{class}_noSP_new_new.fasta``. Existing files are
    overwritten, so re-running the cell does not duplicate records, and a
    missing output directory is created on demand.

    Parameters
    ----------
    save_path : str, optional
        String prefix for the output filenames; include a trailing path
        separator to use it as a directory. Defaults to the current directory.

    Raises
    ------
    TypeError
        If ``service.search`` returns something other than ``str``.
    """
    queries = (
        ('secreted', '((keyword:secreted OR goa:("extracellular region [5576]")) NOT (annotation:(type:transmem) OR goa:("membrane [16020]") OR locations:(location:cytoplasm) OR goa:("cytoplasm [5737]") )) AND reviewed:yes'),
        ('cytoplasm', '(( locations:(location:cytoplasm) OR goa:("cytoplasm [5737]") ) NOT (annotation:(type:transmem) OR goa:("membrane [16020]") OR keyword:secreted OR goa:("extracellular region [5576]") )) AND reviewed:yes'),
        ('membrane', '(( annotation:(type:transmem) OR goa:("membrane [16020]") ) NOT ( keyword:secreted OR goa:("extracellular region [5576]") OR locations:(location:cytoplasm) OR goa:("cytoplasm [5737]") )) AND reviewed:yes'),
    )

    for location, query in queries:
        result = service.search(query, frmt="fasta")
        # NOTE(review): search() appears to return a non-string (e.g. a status
        # code) when the request fails - confirm against the bioservices docs.
        # Fail before touching the file so a bad response never clobbers data.
        if not isinstance(result, str):
            raise TypeError(
                f'UniProt search for {location!r} returned {type(result).__name__} '
                f'({result!r}) instead of FASTA text'
            )

        outfile = f'{save_path}all_{location}_noSP_new_new.fasta'
        # save_path is a plain string prefix, so derive the folder from the full name.
        out_dir = os.path.dirname(outfile)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # 'w' rather than 'a': appending duplicated every record on each re-run.
        with open(outfile, 'w') as ofh:
            ofh.write(result)
    

    

In [4]:
# Runs the three localisation queries with no organism filter; each result is held in memory before being written.
data_saving_function_without_SP_full_uniprot(fasta_path)