In [None]:
#install pyrosetta
!pip install pyrosettacolabsetup
import pyrosettacolabsetup; pyrosettacolabsetup.install_pyrosetta()
import pyrosetta; pyrosetta.init()

In [None]:
#change directory to the google drive home directory
# NOTE: the path is relative to the current working directory, so this cell is meant to be run
# once per session; running it again after the chdir raises unless a nested
# 'google_drive/MyDrive/' happens to exist. Presumably 'google_drive' is created by the
# PyRosetta Colab setup cell above -- verify.
import os
os.chdir('google_drive/MyDrive/')

In [None]:
#set up pyrosetta
from pyrosetta import *
from google.colab import files
init()

In [None]:
#navigate to folder that has your pdbs
# 'pdbs' is resolved relative to the current working directory (set by the chdir cell above),
# so this cell is also meant to be run once per session.
os.chdir('pdbs')

In [None]:
#define function to calculate distance between an atom in the protein and an atom in the substrate
def distance(pose, R1, R2, atom1="CA", atom2="CA"):
  """Return the distance between one atom of residue R1 and one atom of residue R2.

  Args:
    pose: object exposing ``residue(index)``; each returned residue must expose
      ``xyz(atom_name)`` giving a position that supports subtraction and ``.norm()``.
    R1: index of the first residue (the protein side), passed to ``pose.residue``.
    R2: index of the second residue (the substrate side), passed to ``pose.residue``.
    atom1: atom name looked up on R1. Defaults to the alpha carbon, "CA".
    atom2: atom name looked up on R2. Defaults to "CA"; if your substrate is not a
      peptide, pass whichever substrate atom the distance should be measured to
      instead of editing this function.

  Returns:
    The norm of the vector between the two atom positions, in the units of the pose
    coordinates.
  """
  p1 = pose.residue(R1).xyz(atom1)
  p2 = pose.residue(R2).xyz(atom2)
  return (p1 - p2).norm()

In the following step, import your structure by replacing the file name with the PDB file name of your substrate-bound homolog.

In [None]:
# Load the substrate-bound homolog structure into a pose; the file name is resolved
# relative to the current working directory (the pdbs folder selected above).
poseA = pose_from_pdb("1xtg.pdb")

In [None]:
sequenceA = poseA.sequence() #one-letter sequence of the whole pose: the protein, the bound substrate and any ligands (e.g. 'Z' for zinc)

In [None]:
sequenceA #view your sequence. Note that ligands will be present, too

Run the next cell with the sequence of the substrate pasted between the quotes to find its position in the sequence.

In [None]:
# Paste the one-letter substrate sequence between the quotes below.
substrate_sequence = ''

# str.find returns 0 for an empty pattern and -1 when nothing matches. Either value would be
# used as a slice boundary by the following cells and silently give wrong protein/substrate
# residue lists, so stop here with an explicit error instead.
if not substrate_sequence:
  raise ValueError("Paste the substrate sequence into substrate_sequence before running this cell.")

# 0-based position of the first substrate residue within sequenceA
substrate_site = sequenceA.find(substrate_sequence)
if substrate_site == -1:
  raise ValueError("Substrate sequence not found in sequenceA; check it against the sequence shown above.")

In [None]:
subResA = sequenceA[substrate_site:] #substrate sequence plus anything that follows it in the pose sequence (e.g. trailing ligand letters)

In [None]:
protResA = sequenceA[:substrate_site] #everything before the substrate start, i.e. the protein sequence

In [None]:
#this block gets the 1-based pose indices of the protein residues: everything that comes before
# the substrate and before the first ligand letter (in this case 'Z' for zinc).
# May need to change 'Z' to the ligand letter that appears in your sequence, if applicable.
ligand_site = sequenceA.find('Z')  # 0-based position of the first 'Z', or -1 if there is none

# The protein stops at whichever comes first, the substrate or the ligand. A 0-based string
# position p has exactly p residues before it, i.e. pose residues 1..p, so the last protein
# residue must be included (range end is p + 1).
if ligand_site != -1 and ligand_site < substrate_site:
  protein_end = ligand_site
else:
  # No ligand, or the ligand is listed after the substrate: the substrate start is the
  # boundary, so substrate residues are never counted as protein.
  protein_end = substrate_site

protIndA = list(range(1, protein_end + 1))

In [None]:
#get the 1-based pose indices of the substrate (and of anything listed after it, e.g. a ligand)
# subResA is the slice sequenceA[substrate_site:], so its first letter sits at 0-based position
# substrate_site. Using that position directly avoids searching sequenceA again on every
# iteration and cannot latch onto an earlier repeat of the same letters.
subIndA = [substrate_site + offset + 1 for offset in range(len(subResA))]

The next cell defines the distance cutoff, in Angstroms, for residues to constrain. Change this value to your desired cutoff.

In [None]:
# Distance cutoff; a protein residue is kept when its distance to any substrate residue is
# strictly below this value (see the `dist < cutoff` checks in the binding-pocket loop).
cutoff = 18

In [None]:
# Collect every protease residue that lies within `cutoff` of at least one substrate residue.
bindingPocketResA = []
for prot_idx in protIndA:
  for sub_idx in subIndA:
    sub_residue = poseA.residue(sub_idx)

    if sub_residue.name() == 'ZN':
      # Zn has no a-carbon, so measure from the metal atom itself to the protease a-carbon.
      # Other non-protein ligands need their own special case here.
      separation = (sub_residue.xyz("ZN") - poseA.residue(prot_idx).xyz("CA")).norm()
    else:
      # regular amino acid: a-carbon to a-carbon
      separation = distance(poseA, prot_idx, sub_idx)

    if separation < cutoff:
      # one close contact is enough; record the residue once and move to the next one
      bindingPocketResA.append(prot_idx)
      break


In [None]:
# Convert each binding-pocket pose index to PDB numbering, keeping only the first
# whitespace-separated field of pose2pdb's output (used below as the residue number string).
bindPocketPDB_A = [
  ' '.join(poseA.pdb_info().pose2pdb(pose_idx).split()[:1])
  for pose_idx in bindingPocketResA
]

Create a PyMOL command to select all the residues within the distance cutoff (the binding pocket). Change 1xtg below to the PDB ID of your substrate-bound homolog.

In [None]:
# Build the PyMOL selection string. '+'.join places the separator only between residue
# numbers, so the command does not end with a dangling '+'.
command = 'select bp, 1xtg and chain A and resi ' + '+'.join(bindPocketPDB_A)

In [None]:
#view the command so that it can be copied and pasted into pymol, allowing you to highlight the active site residues by running it in pymol with the homolog protein structure open to see the residues.
command

Next step: convert the active-site residue indices of the substrate-bound homolog to the corresponding active-site residues in your target protein with the following steps.

In [None]:
# Go back up to the Drive home directory. Uses os.chdir like the earlier cells rather than a
# bare `cd`, which only works while IPython automagic is enabled.
os.chdir('..')

In [None]:
# Enter the folder that holds the alignment file. Uses os.chdir like the earlier cells rather
# than a bare `cd`, which only works while IPython automagic is enabled.
os.chdir('fastas')

In [None]:
!pip install biopython
from Bio import AlignIO

Run the next cell with the name of your alignment file pasted between the first pair of quotes (include the .fasta extension).

In [None]:
# Paste the alignment file name between the first pair of quotes; it is resolved relative to
# the current working directory. Row order matters: index_convert reads row 0 as the
# substrate-bound homolog and row 1 as the target protein.
alignment = AlignIO.read('', "fasta")

In [None]:
#Define the function to map the homolog residues to the target protein residues by sequence alignment
def index_convert(resList, aln=None):
  """Map homolog residue numbers onto the target protein through a pairwise alignment.

  Row 0 of the alignment is treated as the homolog and row 1 as the target. Residues are
  counted along each row starting from 1, skipping '-' gap characters.

  Args:
    resList: residue numbers in the homolog (row 0) to translate. They are compared with
      the running residue count of row 0, so they are assumed to follow the numbering of
      the aligned sequence (first residue = 1) -- TODO confirm this matches the PDB
      numbering the list was built from.
    aln: alignment to read; any object offering ``get_alignment_length()`` and row
      indexing (``aln[0][pos]``) works. Defaults to the module-level ``alignment``.

  Returns:
    A tuple ``(outList, outList_num)``: the target residue letters and their 1-based
    positions in the target sequence, in alignment order. Homolog residues aligned to a
    gap in the target have no counterpart and are left out.
  """
  if aln is None:
    aln = alignment

  wanted = set(resList)  # constant-time membership test inside the loop
  row_a = aln[0]
  row_b = aln[1]

  outList = []
  outList_num = []
  res_counter_A = 0
  res_counter_B = 0
  for pos in range(aln.get_alignment_length()):
    b_has_residue = row_b[pos] != '-'
    if b_has_residue:
      res_counter_B += 1

    if row_a[pos] != '-':
      res_counter_A += 1
      # Only report a match when the target really has a residue in this column; on a gap
      # res_counter_B would still point at the previous target residue (or be 0).
      if b_has_residue and res_counter_A in wanted:
        # The letter lives at the alignment column `pos`; the ungapped counter is a residue
        # number, not a column index.
        outList.append(row_b[pos])
        outList_num.append(res_counter_B)

  return (outList, outList_num)


In [None]:
# Residue numbers were collected as strings; turn each one into an int for index_convert.
bindPocketPDB_A_int = [int(resnum) for resnum in bindPocketPDB_A]

In [None]:
# res: letters read from the target row of the alignment; num: the matching running residue
# counts in the target row (used below as the target residue numbers).
res, num = index_convert(bindPocketPDB_A_int)

Run the following cell, replacing BoNTE_AF with the PDB ID of your target protein, to generate the PyMOL command that highlights the active-site residues.
Copy this command and run it in PyMOL with the target protein structure open to see the residues.

In [None]:
# Build the PyMOL selection string for the target protein. '+'.join places the separator
# only between residue numbers, so the command does not end with a dangling '+'.
command = 'select bp, BoNTE_AF and chain A and resi ' + '+'.join(str(x) for x in num)

In [None]:
command

Now make the list of target protein residues to fix. This list will be pasted into the conservation analysis script in the next phase of design, when the full list of constraints is generated.

In [None]:
# Residues to keep fixed in the target protein. This is another name for the same list
# object as `num`, not a copy.
PDB_fix_Res = num

In [None]:
#get the number of fixed residues
len(PDB_fix_Res)

In [None]:
#print the list of fixed residues to be copied
PDB_fix_Res