# Imports & General settings
Change the paths below to match your environment and folder structure.

In [1]:
%%time
# Standard library imports
import glob
import logging as log
import os
import shutil
import subprocess
import sys
import urllib.request

# Third party imports


CPU times: user 20 µs, sys: 4 µs, total: 24 µs
Wall time: 30.8 µs


In [2]:
# Logger configuration

log_message="verbose"

# Logging threshold for each verbosity keyword; any keyword that is not
# listed here falls back to ERROR.
_log_levels = {"verbose": log.INFO, "debug": log.WARNING}

log.basicConfig(
    format='%(levelname)s:%(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=_log_levels.get(log_message, log.ERROR),
)

# Logger used by the rest of the notebook.
logger = log.getLogger(__name__)


In [13]:
# General settings

# Sub-path (joined onto ``basepath``) under which all working directories live.
saving_dir_flag = 'projects/Xray_calc'


# The host name selects which cluster-specific settings are defined: the
# Rosetta variables exist only on hosts starting with 'fend', the GEMME
# variables only on every other host.
myhost = os.uname()[1]
if myhost.startswith('fend'):

    # Cluster where ROSETTA calculations are running
    basepath = ''

    path_to_rosetta_dir = '_path_to_software_/Rosetta_2021_Aug_c7009b3'
    path_to_prism_exec_dir = '_path_to_software_/PRISM_tools/rosetta_stability-v0.1.1/software/'

    rosetta_working_dir = os.path.join(basepath, saving_dir_flag)
    os.makedirs(rosetta_working_dir, exist_ok=True)

    # Local application imports
    # Make the PRISM helper scripts importable; this cannot raise an import
    # error itself, so it stays outside the try block.
    sys.path.insert(1, os.path.join(path_to_prism_exec_dir, 'scripts'))
    try:
        from pdb_to_prism import download_pdb, rosetta_energy_to_prism
    except ImportError as e:  # also covers ModuleNotFoundError (a subclass)
        logger.error("{} failure".format(type(e)))
        print(e)
    else:
        logger.info("Import succeeded")

else:

    # Cluster where GEMME calculations are running
    basepath = '/storage1/tiemann'#''

    path_to_gemme_exec = os.path.join(basepath, 'dev/repos/prism_gemme_pipeline/frag_pipeline.py')
    path_to_python = os.path.join(basepath, 'dev/miniconda3/bin/python')

    gemme_working_dir = os.path.join(basepath, saving_dir_flag)
    os.makedirs(gemme_working_dir, exist_ok=True)


In [4]:
# Target list to run on
#
# One record per protein, unpacked elsewhere in this notebook as
# ``accessionID, accessionName, classlist, pdblist``:
#   [0] accession ID (used for the UniProt query and for directory names)
#   [1] accession / entry name (only used in log messages)
#   [2] two-element class list, empty strings where not filled in
#       (presumably protein class and sub-class - not read by the code shown)
#   [3] list of '<pdb id>-<chain>' strings; the Rosetta cells only use the
#       first entry and split it at '-'

target_list = [
    ['P08100', 'OPSD', ['GPCR', 'A'], ['4zwj-A']],
    ['P11166', 'GTR1', ['Transporter', 'SLC2'], ['6tha-A']],
    ['O15118', 'NPC1', ['Transporter', 'SLC65'], ['5u74-A', '6w5s-A']],
    ['Q99835', 'SMO', ['', ''], ['5l7d-A']],
    ['P31213', 'S5A2', ['', ''], ['7bw1-A']],
    ['P41181', 'AQP2', ['', ''], ['4nef-A']],
    ['Q9ULV1', 'FZD4', ['', ''], ['6bd4-A']],
    ['P16615', 'AT2A2', ['', ''], ['7bt2-A']],
    ['P17787', 'ACHB2', ['', ''], ['5kxi-C']],
    ['P43681', 'ACHA4', ['IonChannel', 'ligand'], ['5kxi-A', '6cnj-A', '6usf-A']],
    ['Q9H221', 'ABCG8', ['', ''], ['5do7-D']],
    ['Q9H3H5', 'GPT', ['Enzyme', 'Glycosyltransferase'], ['6fm9-A', '6bw6-B']],
    ['P32245', 'MC4R', ['', ''], ['6w25-A']],
    ['P29033', 'CXB2', ['', ''], ['2zw3-A']],
    ['Q8N5M9', 'JAGN1', ['', ''], ['6wvd-A']],
    ['Q9H222', 'ABCG5', ['', ''], ['5do7-A']],
]

# Functions

In [5]:
def extract_by_uniprot_fasta(keyword):
    """Query UniProt for a reviewed entry and return the tab-separated rows.

    Args:
        keyword: Accession to search for; it is inserted into the query URL
            as-is.

    Returns:
        A list with one entry per response line, each entry being the list
        of tab-separated column strings of that line. The requested columns
        are id, sequence, organism and entry name. An empty response yields
        an empty list.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # extract information from uniprot
    # NOTE(review): this looks like the legacy UniProt endpoint and column
    # syntax - confirm it is still served before relying on it.
    url_base = "https://www.uniprot.org/uniprot/"
    search_params = "?query=reviewed:yes" + "+AND+accession:" + keyword
    return_params = "+&format=tab&columns=id,sequence,organism,entry%20name"
    url = url_base + search_params + return_params

    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url) as response:
        # Strip only the line terminator: slicing off the last character
        # would truncate a final line that has no trailing newline.
        return [
            line.decode('utf-8').rstrip('\r\n').split('\t')
            for line in response
        ]

In [6]:
def run_pipeline(input_pdb, working_dir, protein_id, mode='fullrun', chain='A', partition='sbinlab', mutfile=None, mut_mode='all',
                 relax_xml_file=None, outflag='', cartesian=False, scale=1.0):
    """Write a SLURM submit script that calls the ddG pipeline's run_pipeline.py.

    The script is written to ``<working_dir>/submit_<protein_id>.sh``; nothing
    is submitted here. The location of ``run_pipeline.py`` is taken from the
    notebook-level ``path_to_prism_exec_dir``.

    Args:
        input_pdb: Value passed to ``--structure``.
        working_dir: Directory receiving the submit script; the pipeline
            output path is ``run`` (or ``run_<outflag>``) inside it.
        protein_id: Used for the job name, the script name and, combined
            with ``chain``, for ``--mp_align_ref``.
        mode: Value passed to ``--mode``.
        chain: Value passed to ``--chainid``.
        partition: SLURM partition, used in the header and ``--slurm_partition``.
        mutfile: If truthy, appended as ``--mutations <mutfile>``.
        mut_mode: Value passed to ``--mutate_mode``.
        relax_xml_file: Accepted for interface compatibility; not used.
        outflag: Optional suffix of the output directory name.
        cartesian: If true, ``--mp_cart_ddg 1`` is appended.
        scale: Value passed to ``--scale``.

    Returns:
        Path of the submit script that was written.
    """
    submit_script = os.path.join(working_dir, f'submit_{protein_id}.sh')
    sbatch_header = [
        '#!/bin/sh \n',
        f'#SBATCH --job-name={protein_id} \n',
        '#SBATCH --time=24:00:00 \n',
        '#SBATCH --array=1 \n',
        f'#SBATCH --partition={partition} \n',
    ]

    with open(submit_script, 'w') as handle:
        handle.write(''.join(sbatch_header))

        run_name = 'run' if outflag == '' else f'run_{outflag}'
        out_dir = os.path.join(working_dir, run_name)

        # Fixed options first, optional ones afterwards.
        cli_parts = [
            '--structure', input_pdb,
            '--mutate_mode', mut_mode,
            '--outputpath', out_dir,
            '--chainid', chain,
            '--mode', mode,
            '--is_membrane', 'True',
            '--mp_calc_span_mode', 'DSSP',
            '--mp_align_ref', f'{protein_id}_{chain}',
            '--mp_prep_align_mode', 'OPM',
            '--slurm_partition', partition,
            '--scale', f"{scale}",
            '--overwrite_path', 'True',
        ]
        if cartesian:
            cli_parts.append('--mp_cart_ddg 1')
        if mutfile:
            cli_parts.append(f'--mutations {mutfile}')

        pipeline_script = os.path.join(
            path_to_prism_exec_dir, 'rosetta_ddG_pipeline', 'run_pipeline.py')
        command_line = ' '.join([f'python {pipeline_script}'] + cli_parts)

        handle.write(f'{command_line} \n')
    return submit_script

# Calculate GEMME

In [8]:
# Result tree (prism files) and scratch tree (per-target runs) for GEMME.
gemme_output = os.path.join(gemme_working_dir, 'prism_gemme')
gemme_workdir = os.path.join(gemme_working_dir, 'gemme')
for _gemme_dir in (gemme_output, gemme_workdir):
    os.makedirs(_gemme_dir, exist_ok=True)


In [14]:
%%time
def get_fasta(uniprot_id, run_dir):
    """Fetch a sequence from UniProt and write it as ``<uniprot_id>.fasta``.

    Args:
        uniprot_id: Accession to look up.
        run_dir: Directory that receives the FASTA file (created if needed).

    Returns:
        Path of the FASTA file that was written.

    Raises:
        ValueError: If the query returned fewer than two rows. The record is
            read from row 1 (row 0 is presumably the column header).
    """
    os.makedirs(run_dir, exist_ok=True)
    fasta_file = os.path.join(run_dir, f'{uniprot_id}.fasta')
    extracted = extract_by_uniprot_fasta(uniprot_id)
    if len(extracted) < 2:
        raise ValueError(f'No UniProt record returned for {uniprot_id}')
    record = extracted[1]
    with open(fasta_file, 'w') as fp:
        # Columns requested from UniProt: id, sequence, organism, entry name.
        fp.write(f'>sp|{uniprot_id}|{record[3]}\n')
        fp.write(record[1])
    return fasta_file


# Build (and log) the GEMME command for every target without a final result.
for target in target_list:

    accessionID, accessionName, classlist, pdblist = target
    # Results are sharded by accession: <output>/P0/81/00/ for P08100.
    result_gemme_subdir = os.path.join(gemme_output, accessionID[0:2], accessionID[2:4], accessionID[4:6])
    # NOTE(review): the collection cell writes 'prism_gemme_XXX_<id>.txt', so
    # this '002' check may never match its output - confirm the naming.
    final_gemme_prism = os.path.join(result_gemme_subdir, f'prism_gemme_002_{accessionID}.txt')
    if os.path.isfile(final_gemme_prism):
        logger.info(f'{accessionID} already calculated')
        continue

    gemme_workdir_target = os.path.join(gemme_workdir, accessionID)
    os.makedirs(gemme_workdir_target, exist_ok=True)
    gemme_workdir_input = os.path.join(gemme_workdir_target, 'input')
    os.makedirs(gemme_workdir_input, exist_ok=True)
    # NOTE(review): the collection cell reads from 'run', not 'run-v2' - confirm.
    gemme_run_dir = os.path.join(gemme_workdir_target, 'run-v2')

    # The FASTA download is switched off; the file is expected to exist already.
    # fasta_file = get_fasta(accessionID, gemme_workdir_input)
    fasta_file = os.path.join(gemme_workdir_input, f'{accessionID}.fasta')

    submitter = f'{path_to_python} {path_to_gemme_exec} -s {fasta_file} -o {gemme_run_dir} '
    logger.info(f'Submitting {accessionID}: {submitter}')
    # The actual execution is switched off, so this cell only logs the
    # command; the "done" message below is emitted regardless.
    #pipes = subprocess.Popen(submitter, shell=True, cwd=gemme_workdir_target,stdout=subprocess.PIPE,stderr=subprocess.PIPE,)
    #std_out, std_err = pipes.communicate()
    logger.info(f'Execution of {accessionID} done')


INFO:Submitting P08100: /storage1/tiemann/dev/miniconda3/bin/python /storage1/tiemann/dev/repos/prism_gemme_pipeline/frag_pipeline.py -s /storage1/tiemann/projects/Xray_calc/gemme/P08100/input/P08100.fasta -o /storage1/tiemann/projects/Xray_calc/gemme/P08100/run-v2 
INFO:Execution of P08100 done
INFO:Submitting P11166: /storage1/tiemann/dev/miniconda3/bin/python /storage1/tiemann/dev/repos/prism_gemme_pipeline/frag_pipeline.py -s /storage1/tiemann/projects/Xray_calc/gemme/P11166/input/P11166.fasta -o /storage1/tiemann/projects/Xray_calc/gemme/P11166/run-v2 
INFO:Execution of P11166 done
INFO:Submitting O15118: /storage1/tiemann/dev/miniconda3/bin/python /storage1/tiemann/dev/repos/prism_gemme_pipeline/frag_pipeline.py -s /storage1/tiemann/projects/Xray_calc/gemme/O15118/input/O15118.fasta -o /storage1/tiemann/projects/Xray_calc/gemme/O15118/run-v2 
INFO:Execution of O15118 done
INFO:Submitting Q99835: /storage1/tiemann/dev/miniconda3/bin/python /storage1/tiemann/dev/repos/prism_gemme_p

CPU times: user 30.7 ms, sys: 15.9 ms, total: 46.6 ms
Wall time: 46.8 ms


In [None]:
%%time
# Copy each finished GEMME result into the sharded result tree.
for target in target_list:

    accessionID, accessionName, classlist, pdblist = target
    # Results are sharded by accession: <output>/P0/81/00/ for P08100.
    result_gemme_subdir = os.path.join(gemme_output, accessionID[0:2], accessionID[2:4], accessionID[4:6])
    final_gemme_prism = os.path.join(result_gemme_subdir, f'prism_gemme_XXX_{accessionID}.txt')
    if os.path.isfile(final_gemme_prism):
        logger.info(f'{accessionID} already calculated')
        continue

    gemme_workdir_target = os.path.join(gemme_workdir, accessionID)
    os.makedirs(gemme_workdir_target, exist_ok=True)
    gemme_workdir_input = os.path.join(gemme_workdir_target, 'input')
    os.makedirs(gemme_workdir_input, exist_ok=True)
    # NOTE(review): the submission cell passes 'run-v2' as output directory,
    # while this cell reads from 'run' - confirm which one is intended.
    gemme_run_dir = os.path.join(gemme_workdir_target, 'run')
    os.makedirs(gemme_run_dir, exist_ok=True)

    # Sorted so the choice is deterministic if several files match.
    prism_candidates = sorted(glob.glob(os.path.join(gemme_run_dir, 'gemme_scores', 'prism_gemme_*')))
    if not prism_candidates:
        # An unfinished target must not abort the collection of the others.
        logger.warning(f'{accessionID}: no GEMME prism file found in {gemme_run_dir} - skipping')
        continue

    os.makedirs(result_gemme_subdir, exist_ok=True)
    tmp_prism_gemme = prism_candidates[0]
    shutil.copyfile(tmp_prism_gemme, final_gemme_prism)
    # shutil.rmtree(gemme_workdir_target, ignore_errors=True)


# Calculate ∆∆Gs

In [7]:
#prepare ddG calculation directory

rosetta_dir = os.path.join(rosetta_working_dir, 'rosetta')
shared_dir = os.path.join(rosetta_working_dir, 'shared')
rosetta_prism = os.path.join(rosetta_working_dir, 'prism_rosetta')
# Only the calculation and result roots are created; shared_dir is just a path.
for _root_dir in (rosetta_dir, rosetta_prism):
    os.makedirs(_root_dir, exist_ok=True)

# Per target: <rosetta_dir>/<accession>/ with 'input' and 'calc' inside.
for targetID, target in enumerate(target_list):
    target_uniprot = target[0]
    target_work_dir = os.path.join(rosetta_dir, target_uniprot)
    target_input_dir = os.path.join(target_work_dir, 'input')
    target_calc_dir = os.path.join(target_work_dir, 'calc')
    for _target_dir in (target_work_dir, target_input_dir, target_calc_dir):
        os.makedirs(_target_dir, exist_ok=True)


## Manually obtain and check structures
Execute the cells below, then check and clean the downloaded structures manually.

In [8]:
# Download the first listed structure of every target into its input directory.
for targetID, target in enumerate(target_list):
    target_uniprot, pdb_entries = target[0], target[3]
    target_work_dir = os.path.join(rosetta_dir, target_uniprot)
    target_input_dir = os.path.join(target_work_dir, 'input')
    # Entries look like '<pdb id>-<chain>'; only the id is needed here.
    pdb_fields = pdb_entries[0].split('-')
    target_pdbID = pdb_fields[0]
    download_pdb(target_pdbID, output_dir=target_input_dir)

Downloading PDB structure '4zwj'...
Downloading PDB structure '6tha'...
Downloading PDB structure '5u74'...
Downloading PDB structure '5l7d'...
Downloading PDB structure '7bw1'...
Downloading PDB structure '4nef'...
Downloading PDB structure '6bd4'...
Downloading PDB structure '7bt2'...
Downloading PDB structure '5kxi'...
Downloading PDB structure '5kxi'...
Downloading PDB structure '5do7'...
Downloading PDB structure '6fm9'...
Downloading PDB structure '6w25'...
Downloading PDB structure '2zw3'...
Downloading PDB structure '6wvd'...
Downloading PDB structure '5do7'...


In [9]:
# Duplicate each downloaded structure as '<pdb id>_<chain>-clean.pdb'.
for targetID, target in enumerate(target_list):
    target_uniprot = target[0]
    target_work_dir = os.path.join(rosetta_dir, target_uniprot)
    target_input_dir = os.path.join(target_work_dir, 'input')
    # The first structure entry has the form '<pdb id>-<chain>'.
    pdb_fields = target[3][0].split('-')
    target_pdbID = pdb_fields[0]
    target_pdbchain = pdb_fields[1]
    pdb_stem = target_pdbID.lower()
    pdb_input = os.path.join(target_input_dir, f"{pdb_stem}.pdb")
    pdb_clean = os.path.join(target_input_dir, f'{pdb_stem}_{target_pdbchain}-clean.pdb')
    shutil.copy(pdb_input, pdb_clean)

## make input files (rosetta-pdb and mutfiles)

In [10]:
execution_lists = []
# Loop-invariant: location of the prism2mutfile helper script.
script = os.path.join(path_to_prism_exec_dir, 'scripts/prism2mutfile.py')

# Run prism2mutfile.py once per target, inside the target's input directory.
for targetID, target in enumerate(target_list):
    target_uniprot = target[0]
    target_work_dir = os.path.join(rosetta_dir, target_uniprot)
    target_input_dir = os.path.join(target_work_dir, 'input')
    target_pdbID = target[3][0].split('-')[0]
    target_pdbchain = target[3][0].split('-')[1]

    pdb_input = os.path.join(target_input_dir, f'{target_pdbID.lower()}_{target_pdbchain}-clean.pdb')
    merged_prism_file = os.path.join(shared_dir, f'prism_merged_XXX_{target_uniprot}_MP.txt')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    func = ['python', script, merged_prism_file, '-o', target_input_dir, '-i', pdb_input, '-r', 'True']
    pipes = subprocess.run(func, cwd=target_input_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    std_out, std_err = pipes.stdout, pipes.stderr
    if pipes.returncode != 0:
        # Report the failure instead of claiming success; keep going with
        # the remaining targets.
        error_text = std_err.decode(errors='replace').strip()
        logger.error(f'prism2mutfile failed for {target_uniprot} (exit code {pipes.returncode}): {error_text}')
        continue
    logger.info(f'Execution of {targetID} done')
    


INFO:Execution of 0 done
INFO:Execution of 1 done
INFO:Execution of 2 done
INFO:Execution of 3 done
INFO:Execution of 4 done
INFO:Execution of 5 done
INFO:Execution of 6 done
INFO:Execution of 7 done
INFO:Execution of 8 done
INFO:Execution of 9 done
INFO:Execution of 10 done
INFO:Execution of 11 done
INFO:Execution of 12 done
INFO:Execution of 13 done
INFO:Execution of 14 done
INFO:Execution of 15 done


## run pipeline

In [13]:
# Set correct environment
# Export the pipeline and Rosetta locations as environment variables.
os.environ.update({
    'ddG_pipeline': os.path.join(path_to_prism_exec_dir, 'rosetta_ddG_pipeline'),
    'Rosetta_main_path': os.path.join(path_to_rosetta_dir, 'source'),
    'Rosetta_database_path': os.path.join(path_to_rosetta_dir, 'database'),
})
# Last expression of the cell, so the notebook displays this value.
os.getenv('Rosetta_database_path')


'/sbinlab/software/Rosetta_2021_Aug_c7009b3/database'

In [None]:
execution_lists = []
# Targets are assigned to these SLURM partitions in turn (index modulo 3).
_partitions = ('sbinlab', 'sbinlab_ib', 'sbinlab_ib2')

# Write one submit script per target and remember its path.
for targetID, target in enumerate(target_list):
    target_uniprot = target[0]
    target_work_dir = os.path.join(rosetta_dir, target_uniprot)
    target_input_dir = os.path.join(target_work_dir, 'input')
    target_calc_dir = os.path.join(target_work_dir, 'calc')
    # The first structure entry has the form '<pdb id>-<chain>'.
    pdb_fields = target[3][0].split('-')
    target_pdbID = pdb_fields[0]
    target_pdbchain = pdb_fields[1]

    pdb_name = f'{target_pdbID.lower()}_{target_pdbchain}-clean_renum.pdb'
    pdb_input = os.path.join(target_input_dir, pdb_name)
    mutfile_input = os.path.join(target_input_dir, 'mutfile_all')

    partition = _partitions[targetID % 3]

    # Other values used for mode: 'relax', 'create', ddg_calculation, proceed
    exect = run_pipeline(
        pdb_input, target_calc_dir, target_pdbID, mode='fullrun',
        chain=target_pdbchain, partition=partition,
        mutfile=mutfile_input, mut_mode='mut_file',
        outflag='cart', cartesian=True,
    )
    execution_lists.append(exect)


#Submit (switch with care....)
# Safety switch: when False, nothing is handed to SLURM.
do_execute=True
if do_execute:
    for submit in execution_lists:
        # An argument list (no shell) keeps unusual characters in the script
        # path intact.
        return_code = subprocess.call(['sbatch', submit], cwd=rosetta_dir)
        if return_code == 0:
            logger.info(submit)
        else:
            # Do not report a script as submitted when sbatch rejected it.
            logger.error(f'sbatch failed for {submit} (exit code {return_code})')
else:
    logger.warning("executing set to false - only switch on when you know what you're doing!")
    

## check if ready

In [None]:
# Copy finished Rosetta results into the sharded result tree.
for targetID, target in enumerate(target_list):
    target_uniprot = target[0]
    target_work_dir = os.path.join(rosetta_dir, target_uniprot)
    target_calc_dir = os.path.join(target_work_dir, 'calc')
    target_pdbID = target[3][0].split('-')[0]
    target_pdbchain = target[3][0].split('-')[1]
    logger.info(target_uniprot)

    target_output_dir = os.path.join(target_calc_dir, 'run_cart', 'output')
    prism_file = os.path.join(target_output_dir, f'prism_rosetta_XXX_{target_pdbID}_{target_pdbchain}-clean_renum.txt')
    # The presence of the prism file is the "calculation finished" marker.
    if not os.path.isfile(prism_file):
        logger.info(f"Calculation for {target_uniprot} ({target[1]}) not finished")
        continue
    logger.info(f"Calculation for {target_uniprot} ({target[1]}) finished")

    # Results are sharded by accession: <prism dir>/P0/81/00/ for P08100.
    final_target_rosetta_prism_dir = os.path.join(rosetta_prism, target_uniprot[0:2], target_uniprot[2:4], target_uniprot[4:6])
    final_target_rosetta_prism = os.path.join(final_target_rosetta_prism_dir, f'prism_rosetta_XXX_{target_uniprot}_{target_pdbID}_{target_pdbchain}_cartesian.txt')
    final_target_rosetta_pdb_prism = os.path.join(final_target_rosetta_prism_dir, f'prism_rosettapdb_XXX_{target_uniprot}_{target_pdbID}_{target_pdbchain}_cartesian.txt')

    if os.path.isfile(final_target_rosetta_prism):
        logger.info(f'{target_uniprot} already copied')
        continue

    # Looked up only when a copy is needed. Sorted so the choice is
    # deterministic if several files match.
    final_pdbs = sorted(glob.glob(os.path.join(target_output_dir, '*_final.pdb')))
    if not final_pdbs:
        # A missing structure must not abort the remaining targets.
        logger.warning(f'{target_uniprot}: no *_final.pdb found in {target_output_dir} - skipping')
        continue
    pdb_file = final_pdbs[0]

    os.makedirs(final_target_rosetta_prism_dir, exist_ok=True)
    shutil.copyfile(prism_file, final_target_rosetta_prism)
    rosetta_energy_to_prism(pdb_file, final_target_rosetta_pdb_prism, target_pdbID, target_pdbchain, final_target_rosetta_prism_dir, uniprot_id=target_uniprot)
    logger.info(f'{target_uniprot} now copied')

