In [1]:
from pathlib import Path
import Bio
from Bio.PDB import PDBList
import shutil
from Bio.PDB import PDBParser
import mdtraj
from Bio.Align.Applications import ClustalOmegaCommandline
import subprocess
import biotite.sequence as seq
import biotite.sequence.graphics as graphics
from Bio import SeqIO
import matplotlib.pyplot as plt
from os import path, system
from engens.core.CrystalUtils import *

# List of PDB files/ids - as input to the workflow

In [2]:
# Collect the PDB IDs of the structures that go into the workflow.
#
# Example data set: PI3K PDB IDs taken from
# https://doi.org/10.1016/j.jmb.2020.09.002 (Supplementary material, table S2).
#
# Only the four IDs below are active in this demo run.
pdbIds = "2rd0 3hhm 3hiz 4jps".split()

# Further IDs that were part of the original list (currently disabled);
# append them to the string above to run on the larger set:
#   5swg 5swo 5swp 5swr 5swt 5sx8 5sx9 5sxa
#   5sxb 5sxc 5sxd 5sxe 5sxf 5sxi 5sxj 5sxk
#   5uk8 5ukj 5ul1 5xgh 5xgi 5xgj 6nct
#   4a55 2y3a
#   5dxu 5m6u 5t8f 5ubt 5vlr 6g6w 6pyr 6pyu

# Report how many structures will be processed.
n_ids = len(pdbIds)
print("{} total PI3K pdb ids used here.".format(n_ids))

4 total PI3K pdb ids used here.


## Step 1  - download PDBs

In [3]:
# Make the directory to store the files in.
# Change this to any name you'd like.
pdir = "./PI3Ks_pdbs/"
# parents/exist_ok make this safe to re-run when the folder already exists.
Path(pdir).mkdir(parents=True, exist_ok=True)

# -----------------STEP 1.1 - download PDBs------------------------ #
# Initialize CrystalUtils - it will automatically (if no file_names are provided):
# Download PDBs
# Fix them
crystal_utils = CrystalUtils(pdb_codes = pdbIds, dst_folder = pdir)

100%|██████████| 4/4 [00:00<00:00, 1231.90it/s]

Fetching and fixing: 2rd0
Already found
Fetching and fixing: 3hhm
Already found
Fetching and fixing: 3hiz
Already found
Fetching and fixing: 4jps
Already found





In [4]:
# -----------------STEP 1.2 - extract protein sequences------------------------ #
# Extract the protein sequences of the downloaded structures.
# The recorded cell output shows the call returning the path of a
# sequences.fasta file inside the output folder.
crystal_utils.extract_protein_sequence()


'PI3Ks_pdbs/sequence_output/sequences.fasta'

In [5]:
# -----------------STEP 1.3 - compute MSA with clustal omega------------------------ #
# ----------------- and extract maximum common substructure ------------------------ #
# Per the recorded output, performMSA() reports the continuous common regions,
# writes per-structure *_bbstrip.pdb / *resstrip.pdb files and converts them to
# bb_traj.xtc / resstrip_traj.xtc.
# NOTE(review): the recorded output shows the message below twice, so
# performMSA() appears to print it as well - this print may be redundant.
print("Running MSA of the sequences")
crystal_utils.performMSA()

Running MSA of the sequences
Running MSA of the sequences
Found PI3Ks_pdbs/sequence_output/sequence_aligned.fasta
Continuous region #1 found starting in AA range 14-239
Continuous region #2 found starting in AA range 256-318
Continuous region #3 found starting in AA range 327-414
Continuous region #4 found starting in AA range 425-499
Continuous region #5 found starting in AA range 523-858
Continuous region #6 found starting in AA range 869-935
Continuous region #7 found starting in AA range 948-965
Visualization od MSA and common regions: PI3Ks_pdbs/sequence_output/sequence_aligned_regions.html
Extracting substructures
Extracting common regions from each file (backbone)


Extracting common regions from each file (backbone): 100%|██████████| 4/4 [00:00<00:00, 9383.23it/s]

File exists: PI3Ks_pdbs/structure_output/2rd0_fixed_bbstrip.pdb
File exists: PI3Ks_pdbs/structure_output/3hhm_fixed_bbstrip.pdb
File exists: PI3Ks_pdbs/structure_output/3hiz_fixed_bbstrip.pdb
File exists: PI3Ks_pdbs/structure_output/4jps_fixed_bbstrip.pdb
Converting to trajectory bb_traj.xtc





Extracting common regions from each file (full residues)


Extracting common regions from each file (full residues): 100%|██████████| 4/4 [00:00<00:00, 8077.62it/s]

File exists: PI3Ks_pdbs/structure_output/2rd0_fixedresstrip.pdb
File exists: PI3Ks_pdbs/structure_output/3hhm_fixedresstrip.pdb
File exists: PI3Ks_pdbs/structure_output/3hiz_fixedresstrip.pdb
File exists: PI3Ks_pdbs/structure_output/4jps_fixedresstrip.pdb
Converting to trajectory resstrip_traj.xtc





In [13]:
# ----------------- visualize and inspect the alignment! ------------------------ #
from IPython.display import IFrame

# Build the report location from `pdir` instead of hard-coding the example
# folder name, so the cell keeps working when the output directory chosen in
# Step 1 is renamed. The sub-folder and file name are the ones reported by
# performMSA() ("Visualization od MSA and common regions: ...").
msa_report = path.join(pdir, "sequence_output", "sequence_aligned_regions.html")
IFrame(msa_report, width=1050, height=200)

# Now continue with the normal workflow, but pass the list of PDB files as an argument to `EnGen`

- remember to align the trajectory (align = True when constructing EnGen)
- make sure your binding_site_selstr is something that is generalizable to different possibly mutated residues
- same for the featurization (do not use all atom featurization - since different residues have different number of atoms)
- do not use TICA/HDE
- do not use VAMP nets to select features

TICA/HDE and VAMP nets are only meant for time-series data (MD trajectories), not for a set of independent crystal structures.

In [15]:
import glob
from engens.core.EnGens import *

# input files: the per-structure PDBs restricted to the common residues.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to make the file list (and the structure names derived from it) reproducible.
pdb_files_processed = sorted(glob.glob(path.join(crystal_utils.dst_structure, "*resstrip.pdb")))
# topology: the backbone-stripped file of the first PDB code
top_loc = path.join(crystal_utils.dst_structure, crystal_utils.pdb_codes[0]+'_fixed_bbstrip.pdb')
# backbone - common residue trajectory
traj_loc = path.join(crystal_utils.dst_structure, "bb_traj.xtc")
# input files - containing full common residues
input_files = pdb_files_processed
# Structure name = first four characters of the file name (the PDB code in
# names such as "2rd0_fixedresstrip.pdb"). path.basename honours the platform's
# path separator, whereas searching for "/" by hand picks the wrong characters
# when the path uses a different separator.
structure_names = [path.basename(name)[:4] for name in input_files]

# Build the EnGen object from the crystal-structure list; align=True because
# the structures come from independent files.
engen = EnGen(traj_loc, top_loc, cryst_pdb_list = True, file_names = input_files, structure_names = structure_names, align=True)




Aligning trajectory: 100%|██████████| 1/1 [00:00<00:00,  5.80it/s]
Cleaning files...: 100%|██████████| 1/1 [00:00<00:00, 5825.42it/s]
Aligning pdb files (might take a while): 100%|██████████| 4/4 [00:00<00:00,  4.39it/s]
Loading files (might take a while): 100%|██████████| 4/4 [00:01<00:00,  2.14it/s]


# Workflow 1 - extract features from the PDB files

**Input:** reference PDB and trajectory


**Output:** featurized trajectory
<hr>
Steps:

1. Load reference PDB and trajectory in the EnGen object
2. Provide set of featurizations of interest (or use default)
3. Evaluate different featurization (optional)
4. Choose the best featurization
5. Extract those features

In [16]:
# required imports 
import engens.core.FeatureSelector as fs
import pickle
import mdshare
import mdtraj
import numpy as np
import nglview
from IPython.display import Javascript, display
import json

### Step 1 - load the structure and trajectory

Provide the path to the files with the reference trajectory and topology.
(You can use any format that <a href="https://mdtraj.org/1.9.4/api/generated/mdtraj.load.html"> mdtraj.load </a> will take as input).

Optionally, provide a subset of the structure that you will use for featurization (e.g. binding site) as a <a href=https://mdtraj.org/1.9.4/atom_selection.html> atom selection string </a> or a list of atom indices.


In [18]:
# Open a viewer on the topology file for interactive residue selection.
# NOTE(review): select_residues_nglview is not defined in this notebook;
# presumably it is provided by one of the `engens` star imports - confirm.
nglwidget = select_residues_nglview(top_loc)
# Last expression of the cell: renders the widget.
nglwidget

ThemeManager()

NGLWidget(gui_style='ngl')

In [19]:
## Option 4 - continue selection 1
# Reset the selection holder, then run the JavaScript snippet in the browser.
# NOTE(review): js_script is not defined in any visible cell; presumably it
# comes from an `engens` star import and writes the residues picked in the
# viewer back into `selection` - confirm. Must run before "continue selection 2".
selection = None
display(Javascript(js_script))

<IPython.core.display.Javascript object>

In [None]:
## Option 4 - continue selection 2
# Rebuild the EnGen object restricted to the residues picked in the viewer.
# When nothing was selected the block is skipped and the unrestricted `engen`
# created earlier stays in use.
if selection is not None and len(selection) > 0:
    binding_site_selstr = get_selstring(selection)
    #binding_site_selstr = "(10 <= resid) and (resid <= 50)"
    # Pass the crystal-structure arguments again: without cryst_pdb_list,
    # file_names and structure_names the new object would no longer be built
    # from the list of PDB files, which this crystal-structure workflow requires.
    engen = EnGen(traj_loc, top_loc, binding_site_selstr,
                  cryst_pdb_list = True, file_names = input_files,
                  structure_names = structure_names, align = True)

#------------------------end of options----------------------------#

In [20]:

# Visualize the trajectory (optional - if the trajectory is too large, skip this step).
nglwidget = engen.show_animated_traj()
# Replace the default representation with ball-and-stick and center the view.
nglwidget.clear_representations()
nglwidget.add_ball_and_stick()
nglwidget.center()
# Last expression of the cell: renders the widget.
nglwidget

NGLWidget(max_frame=48)

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [21]:
# Export the widget from the previous cell to a standalone HTML file.
# NOTE(review): frame_range=(0, 20) is hard-coded - confirm it fits the number
# of frames (structures) actually loaded.
nglview.write_html("tmp.html", [nglwidget], frame_range=(0, 20))

### Step 2 - select different featurizations

Here we select ways to featurize the trajectory. Any PyEmma <a href = http://www.emma-project.org/latest/api/generated/pyemma.coordinates.featurizer.html> trajectory featurization </a> can be used in this step and any of the parameters of the respective featurizations can be provided. Users can also use the default initialization which includes three sets of features: (1) amino-acid pairwise distances; (2) torsion angles and (3) amino-acid pairwise distances with the torsion angles.

In [None]:
# remove any existing featurizers, so re-running this cell does not stack
# featurizers on top of earlier ones
engen.reset_featurizers()
# initialize default features
engen.init_featurizers_default()
# print a description of the featurizers that are now registered
description = engen.describe_featurizers()
print(description)

In [None]:
# Split chains to groups and assign group distances as features

# Load the reference structure only to read its topology table.
tmp_traj = mdtraj.load(engen.ref)
# to_dataframe() returns a tuple; element 0 is the atom table, one row per atom.
df_top = tmp_traj.topology.to_dataframe()[0]

# groups: chainID -> atom indices of that chain
# groups_list: the same index lists, in order of first chain appearance
groups = {}
groups_list = []
for chain in df_top.chainID.unique():
    # Use the DataFrame index (the 0-based atom index), not the `serial`
    # column: PyEMMA's add_group_mindist expects atom indices, whereas `serial`
    # is the numbering from the PDB file (typically 1-based and not guaranteed
    # to be contiguous), which would select the wrong or non-existent atoms.
    chain_atom_indices = df_top.index[df_top.chainID == chain]
    groups_list.append(list(chain_atom_indices))
    groups[chain] = list(chain_atom_indices)

In [None]:
# Display the atom table to inspect the chain assignment used for the groups.
df_top

In [None]:
# remove any existing featurizers
engen.reset_featurizers()

# minimum distance between the groups in groups_list (one group per chain)
feat = {
    "add_group_mindist": {"group_definitions": groups_list}
}

#add the respective features to the engen structure
engen.add_featurizer(feat)
# print a description of the featurizers that are now registered
description = engen.describe_featurizers()
print(description)

### Step 3 - evaluate the featurizations

This step is optional - we recommend evaluating the featurizations and picking the best using PyEmma's implementation of <a href=http://www.emma-project.org/latest/tutorials/notebooks/00-pentapeptide-showcase.html#Feature-selection> VAMP approach </a>.

This helps you choose a set of features with which to proceed to the next Workflow.

### Not an option for crystal structure input!!

### Step 4 - pick the featurization

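For MD input, we suggest using the featurization which gives you the highest VAMP2 score from the analysis above. For crystal-structure input, where that analysis is not available, choose the featurization by its index (`feat_num`) from the printed list. Then run the cell below.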

In [None]:
#apply features
engen.apply_featurizations()
#print possible features
print(engen.describe_featurizers())
#select the number of the desired feature
# (index into the featurizers listed by the print above - TODO confirm numbering is 0-based)
feat_num = 0
# initialize selector
featsel = fs.UserFeatureSelection(feat_num, engen)
#select the feature
featsel.select_feature()

### Step 5 - save the results as input for Workflow2 - dimensionality reduction

In [None]:
# Persist the EnGen object so the next workflow can load it.
out_path = "wf1_resulting_EnGen.pickle"
with open(out_path, "wb") as handle:
    # HIGHEST_PROTOCOL is what a negative protocol number selects.
    pickle.dump(engen, handle, protocol=pickle.HIGHEST_PROTOCOL)