# INITIALIZATION

In [8]:
# Loading some modules
%load_ext autoreload
%autoreload 2

# Getting basic libraries:
import os, sys, math
import numpy as np
from Bio.PDB import *

# Libraries for structure checking: --> 
import biobb_structure_checking
import biobb_structure_checking.constants as cts
from biobb_structure_checking.structure_checking import StructureChecking
# Location of the installed biobb_structure_checking package (first entry of
# the package's __path__); it is handed to both set_defaults and StructureChecking.
base_dir_path=biobb_structure_checking.__path__[0]
# Argument container for the checker, built with the 'notebook' option enabled.
# The cells below fill in further keys by subscript assignment.
args = cts.set_defaults(base_dir_path,{'notebook':True})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# PREPARATION

    1 - Obtain the required structure from the PDB

In [9]:
# Parse the local PDB file into `structure`; "6m0j" is the id given to the
# parsed structure. The path is relative to the current working directory.
parser = PDBParser()
structure = parser.get_structure("6m0j","./assignment_data/6m0j.pdb")

    2 - Check in the PDB what the composition of the “biological unit” is. Remove all chains except those involved in the biological unit, if necessary.

In [10]:
# Configure the structure checker: input/output locations and run options.
# All paths are built relative to base_path.
base_path = './assignment_data/'
args['output_format'] = "pdb"
args['keep_canonical'] = False
# Input is the mmCIF file; the fixed structure is written as PDB, and a PDBQT
# path is reserved for an output with charges.
args['input_structure_path'] = base_path + '6m0j.cif'
args['output_structure_path'] = base_path + '6m0j_fixed.pdb'
args['output_structure_path_charges'] = base_path + '6m0j_fixed.pdbqt'
args['time_limit'] = False
args['nocache'] = False
args['copy_input'] = False
args['build_warnings'] = False
args['debug'] = False
args['verbose'] = False
args['coords_only'] = False
args['overwrite'] = False
# NOTE(review): the string literal below only repeats assignments already made
# above. Because it is the last expression of the cell, the notebook echoes it
# as the cell output; it has no other effect.
'''
args['input_structure_path'] = base_path + '6m0j.cif'
args['output_structure_path'] = base_path + '6m0j_fixed.pdb'
args['output_structure_path_charges'] = base_path + '6m0j_fixed.pdbqt'
args['debug'] = False
args['verbose'] = False
'''

"\nargs['input_structure_path'] = base_path + '6m0j.cif'\nargs['output_structure_path'] = base_path + '6m0j_fixed.pdb'\nargs['output_structure_path_charges'] = base_path + '6m0j_fixed.pdbqt'\nargs['debug'] = False\nargs['verbose'] = False\n"

In [11]:
# Initialize the checking engine. Per the recorded output, construction loads
# the structure named in args['input_structure_path'] and prints a summary of it.
st_c = StructureChecking(base_dir_path, args)

Canonical sequence for model 0:
Structure ./assignment_data/6m0j.cif loaded
 PDB id: 6M0J 
 Title: Crystal structure of 2019-nCoV spike receptor-binding domain bound with ACE2
 Experimental method: X-RAY DIFFRACTION
 Keywords: VIRAL PROTEIN/HYDROLASE
 Resolution (A): 2.4500

 Num. models: 1
 Num. chains: 2 (A: Protein, E: Protein)
 Num. residues:  876
 Num. residues with ins. codes:  0
 Num. residues with H atoms: 0
 Num. HETATM residues:  85
 Num. ligands or modified residues:  5
 Num. water mol.:  80
 Num. atoms:  6543
Metal/Ion residues found
 ZN A901
Small mol ligands found
NAG A902
NAG A903
NAG A904
NAG E601


    3 - Remove all heteroatoms


In [12]:
# Strip hydrogen atoms (the recorded run found no residues with hydrogens)
st_c.rem_hydrogen()
# Remove water molecules
st_c.water("yes")
# Remove all metal ions
st_c.metals("All")
# Remove all ligands
st_c.ligands("All")


Running rem_hydrogen.
No residues with Hydrogen atoms found
Running water. Options: yes
Detected 80 Water molecules
Canonical sequence for model 0:
Removed 80 Water molecules
Running metals. Options: All
Found 1 Metal ions
  ZN A901.ZN 
Canonical sequence for model 0:
Metal Atoms removed All (1)
Running ligands. Options: All
Detected 4 Ligands
 NAG A902
 NAG A903
 NAG A904
 NAG E601
Canonical sequence for model 0:
Ligands removed All (4)


    4 - Perform a quality check on the structure, and add missing side chains, hydrogen atoms and atom charges, using
    the biobb_structure_checking module

In [13]:
import os, sys, math
import numpy as np
from Bio.PDB import *

# Libraries for structure checking: --> 
import biobb_structure_checking
import biobb_structure_checking.constants as cts
from biobb_structure_checking.structure_checking import StructureChecking
# Location of the installed biobb_structure_checking package (first entry of
# the package's __path__).
base_dir_path=biobb_structure_checking.__path__[0]
# Fresh argument container with the 'notebook' option enabled; this rebinds
# `args`, so the keys needed by the checker are assigned again further down.
args = cts.set_defaults(base_dir_path,{'notebook':True})

###Preparation

    #1 Obtain the required structure from the PDB
# Parse the PDB file through a context manager so the file handle is closed as
# soon as parsing is done (the original left it open for the whole session).
# For executing in Visual Studio Code, use an absolute path.
parser = PDBParser()
with open("./assignment_data/6m0j.pdb", "r") as f:
    structure = parser.get_structure("6m0j",f)

    #2 Check at PDB which is the composition of a “Biological unit”.
# Remove all chains but those involved in the biological unit, if necessary
# Configure the structure checker: input/output locations and run options.
# All paths are built relative to base_path.
base_path = './assignment_data/'
args['output_format'] = "pdb"
args['keep_canonical'] = False
# Input is the mmCIF file; the fixed structure is written as PDB, and a PDBQT
# path is reserved for an output with charges.
args['input_structure_path'] = base_path + '6m0j.cif'
args['output_structure_path'] = base_path + '6m0j_fixed.pdb'
args['output_structure_path_charges'] = base_path + '6m0j_fixed.pdbqt'
args['time_limit'] = False
args['nocache'] = False
args['copy_input'] = False
args['build_warnings'] = False
args['debug'] = False
args['verbose'] = False
args['coords_only'] = False
args['overwrite'] = False

# Initialize the checking engine. Per the recorded output, construction loads
# the structure named in args['input_structure_path'] and prints a summary of it.
st_c = StructureChecking(base_dir_path, args)

    #3 Remove all heteroatoms
# Strip hydrogen atoms (the recorded run found no residues with hydrogens)
st_c.rem_hydrogen()

# Remove water molecules
st_c.water("yes")

# Remove all metal ions
st_c.metals("All")

# Remove all ligands
st_c.ligands("All")

    #4

# Quality fixes with biobb_structure_checking, applied one at a time.
# (The original cell had a bare `biobb_structure_checking` expression here; it
# evaluated the module name and discarded it, so it has been removed.)
# Fix amides
st_c.amide("All")
# Fix the chirality of some amino acids
st_c.chiral("All")
# Fix the backbone
st_c.backbone('--fix_atoms All --fix_chain none --add_caps none')
# Detect and rebuild missing protein side chains
st_c.fixside("All")
# Add hydrogens
st_c.add_hydrogen("auto")

# Everything could also be done with st_c.checkall(); the steps are run
# manually so it is clearer what is being done.
# Kept as the last expression so the cell still displays the returned value.
st_c._save_structure(args['output_structure_path'])

#st_c.rem_hydrogen('yes')
#st_c.add_hydrogen('--add_charges --add_mode auto')
#st_c._save_structure(args['output_structure_path_charges'])

Canonical sequence for model 0:
Structure ./assignment_data/6m0j.cif loaded
 PDB id: 6M0J 
 Title: Crystal structure of 2019-nCoV spike receptor-binding domain bound with ACE2
 Experimental method: X-RAY DIFFRACTION
 Keywords: VIRAL PROTEIN/HYDROLASE
 Resolution (A): 2.4500

 Num. models: 1
 Num. chains: 2 (A: Protein, E: Protein)
 Num. residues:  876
 Num. residues with ins. codes:  0
 Num. residues with H atoms: 0
 Num. HETATM residues:  85
 Num. ligands or modified residues:  5
 Num. water mol.:  80
 Num. atoms:  6543
Metal/Ion residues found
 ZN A901
Small mol ligands found
NAG A902
NAG A903
NAG A904
NAG E601
Running rem_hydrogen.
No residues with Hydrogen atoms found
Running water. Options: yes
Detected 80 Water molecules
Canonical sequence for model 0:
Removed 80 Water molecules
Running metals. Options: All
Found 1 Metal ions
  ZN A901.ZN 
Canonical sequence for model 0:
Metal Atoms removed All (1)
Running ligands. Options: All
Detected 4 Ligands
 NAG A902
 NAG A903
 NAG A904
 NA

'./assignment_data/6m0j_fixed.pdb'

## STEP 1

In [18]:
#Finding out atoms in both chains
def chain_atoms(structure, dt):
    """Split the atoms of model 0 into two groups and compare the groups.

    Atoms of the first chain encountered go into one list; atoms of every
    other chain go into a second list. Both lists are handed to
    ``chain_comparison`` together with ``dt``.

    Args:
        structure: object indexable by model number; only model 0 is used.
        dt: distance threshold forwarded unchanged to ``chain_comparison``.

    Returns:
        The value returned by ``chain_comparison`` for the two atom lists.
    """
    first_chain_atoms = []
    other_chain_atoms = []
    for position, chain in enumerate(structure[0]):
        # Only the very first chain feeds the first list.
        bucket = first_chain_atoms if position == 0 else other_chain_atoms
        for residue in chain.get_residues():
            # Iterating a residue yields its atoms.
            bucket.extend(residue)
    return chain_comparison(first_chain_atoms, other_chain_atoms, dt)

#comparing atoms of both chains
def chain_comparison(chain1_atoms, chain2_atoms, dt):
    """Find residues having at least one atom within ``dt`` of the other group.

    Every atom of ``chain1_atoms`` is compared with every atom of
    ``chain2_atoms``; the distance is obtained by subtracting the two atoms.

    The original guarded the distance computation with a condition that
    compared each residue id with itself (always true); that no-op guard has
    been removed so the distance is plainly computed for every pair.

    Args:
        chain1_atoms: iterable of atoms from the first chain.
        chain2_atoms: iterable of atoms from the second chain (iterated once
            per atom of the first, so it must be re-iterable, e.g. a list).
        dt: distance threshold; a pair counts when its distance is <= dt.

    Returns:
        A set with the residue numbers (``parent.id[1]``) of both atoms of
        every pair within the threshold. Numbers from both groups share the
        same set, so equal numbers in different chains are indistinguishable.
    """
    interface_residues = set()
    for a1 in chain1_atoms:
        res1_number = a1.get_parent().id[1]
        for a2 in chain2_atoms:
            if a1 - a2 <= dt:  # dt is the distance threshold
                interface_residues.add(res1_number)
                interface_residues.add(a2.get_parent().id[1])
    return interface_residues

# Distance threshold used for the interface search (presumably in Angstrom,
# the unit of PDB coordinates -- TODO confirm). `dt` is reused by later cells.
dt=5
# Print the set of residue numbers found at the interface.
print(chain_atoms(structure, dt))

{393, 19, 24, 27, 28, 30, 31, 417, 34, 35, 37, 38, 41, 42, 45, 446, 447, 449, 453, 455, 456, 330, 79, 82, 83, 473, 475, 476, 353, 354, 355, 484, 357, 486, 487, 489, 493, 496, 498, 500, 501, 502, 505}


## Step 2

In [None]:
Step 2 preparation

In [None]:
import argparse
import sys
import os
import math

from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.NACCESS import NACCESS_atomic
from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB.PDBIO import PDBIO, Select

def residue_id(res):
    '''Build a readable residue label: "<resname> <parent id><residue number>"'''
    parent_id = res.get_parent().id
    return f'{res.get_resname()} {parent_id}{res.id[1]}'

def atom_id(at):
    '''Build a readable atom label: the parent residue label, a dot, then the atom id'''
    owner = at.get_parent()
    return f'{residue_id(owner)}.{at.id}'


class ResiduesDataLib():
    """Residue/atom parameter library loaded from a whitespace-separated file.

    Each data line is turned into a ``Residue`` record and stored in
    ``residue_data`` under the record's ``id``.

    Attributes:
        residue_data: dict mapping a record id to its ``Residue`` record.
        nres: number of records loaded.
    """
    def __init__(self, fname):
        """Load the library file.

        Lines whose first character is '#' and blank lines are skipped.

        Args:
            fname: path of the library file.

        If the file cannot be opened, an error message is printed and
        ``sys.exit(2)`` is called.
        """
        self.residue_data = {}
        try:
            fh = open(fname, "r")
        except OSError:
            print("#ERROR while loading library file (", fname, ")")
            sys.exit(2)
        # The context manager closes the handle even when a line fails to parse
        # (the original never closed it).
        with fh:
            for line in fh:
                data = line.split()
                # Blank lines have no fields and would otherwise crash the parser.
                if not data or line[0] == '#':
                    continue
                r = Residue(data)
                self.residue_data[r.id] = r
        self.nres = len(self.residue_data)

    def get_params(self, resid, atid):
        """Return the record stored for a residue/atom pair.

        Args:
            resid: residue name part of the key.
            atid: atom name part of the key.

        Returns:
            The record stored under ``resid + ':' + atid``, or ``None`` (after
            printing a warning) when the key is not in the library.
        """
        # Named `key` rather than `atom_id` to avoid shadowing the module-level
        # atom_id() helper.
        key = resid + ':' + atid
        if key in self.residue_data:
            return self.residue_data[key]
        print("WARNING: atom not found in library (", key, ')')
        return None

class Residue():
    """One library record built from the split fields of a data line."""
    def __init__(self, data):
        """Store the record.

        Args:
            data: sequence of strings; fields 0 and 1 form the id
                ("<field0>:<field1>"), field 2 is the atom type and field 3
                is parsed as a float charge.
        """
        self.id = ':'.join((data[0], data[1]))
        self.at_type = data[2]
        self.charge = float(data[3])
        
class AtomType():
    """Parameters of one atom type, read from the split fields of a data line."""
    def __init__(self, data):
        """Store the parameters.

        Args:
            data: sequence whose field 0 is the type id and whose fields 1-4
                are parsed as floats into ``eps``, ``sig``, ``mass`` and
                ``fsrf`` respectively.

        ``rvdw`` is derived as ``sig * 0.5612``.
        """
        self.id = data[0]
        self.eps, self.sig, self.mass, self.fsrf = (
            float(data[column]) for column in (1, 2, 3, 4)
        )
        self.rvdw = 0.5612 * self.sig
        
class VdwParamset():
    """Set of atom-type parameters loaded from a whitespace-separated file.

    Attributes:
        at_types: dict mapping the first field of each data line to the
            ``AtomType`` built from that line.
        ntypes: number of atom types loaded.
    """
    def __init__ (self, file_name):
        """Load the parameter file.

        Lines whose first character is '#' and blank lines are skipped.

        Args:
            file_name: path of the parameter file.

        If the file cannot be opened, an error message is printed and
        ``sys.exit(2)`` is called.
        """
        self.at_types = {}
        try:
            fh = open(file_name, "r")
        except OSError:
            print ("#ERROR while loading parameter file (", file_name, ")")
            sys.exit(2)
        # The context manager closes the handle even when a line fails to parse
        # (the original only closed it after a fully successful read).
        with fh:
            for line in fh:
                data = line.split()
                # Blank lines have no fields and would raise IndexError below.
                if not data or line[0] == '#':
                    continue
                self.at_types[data[0]] = AtomType(data)
        self.ntypes = len(self.at_types)

def calc_solvation(st, res):
    '''Sum, over the atoms of res, of EXP_NACCESS times the atom type's fsrf.

    Atoms without an 'EXP_NACCESS' entry in .xtra are ignored.
    The st argument is accepted but not used.
    '''
    total = 0.
    for atom in res.get_atoms():
        extra = atom.xtra
        if 'EXP_NACCESS' in extra:
            total += float(extra['EXP_NACCESS']) * extra['vdw'].fsrf
    return total


def vdw_int(at1, at2, r):
    '''12-6 van der Waals energy between two atoms at distance r.

    The well depth is the geometric mean of both eps values; the product of
    both sig values plays the role of sigma squared.
    '''
    vdw1, vdw2 = at1.xtra['vdw'], at2.xtra['vdw']
    well_depth = math.sqrt(vdw1.eps * vdw2.eps)
    sigma_sq = vdw1.sig * vdw2.sig
    repulsion = sigma_sq**6 / r**12
    attraction = sigma_sq**3 / r**6
    return 4 * well_depth * (repulsion - attraction)

def MH_diel(r):
    '''Mehler-Solmajer distance-dependent dielectric.

    eps(r) = A + B / (1 + k * exp(-lambda*B * r)) with A = -8.5525,
    B = 86.9525, k = 7.7839 and lambda*B = 0.3153.

    The denominator uses "1 + k*exp(...)". The previous "1 - k*exp(...)" gave
    a negative dielectric at short range and a division by zero near r = 6.5,
    whereas the sigmoidal form rises smoothly from about 1.35 at r = 0 to
    about 78.4 at long range.
    '''
    return 86.9525 / (1 + 7.7839 * math.exp(-0.3153 * r)) - 8.5525

def elec_int(at1, at2, r):
    '''Electrostatic energy of two atoms r apart, screened by MH_diel(r).

    Charges are read from each atom's .xtra['charge'].
    '''
    charge1 = at1.xtra['charge']
    charge2 = at2.xtra['charge']
    return 332.16 * charge1 * charge2 / MH_diel(r) / r

def calc_int_energies(st, res):
    '''Electrostatic and vdW energy between a residue and the other chains.

    Every atom of res is paired with every atom of st whose chain (the parent
    of its residue) differs from the chain of res; no distance cut-off is
    applied.

    Returns the tuple (electrostatic, vdw).
    '''
    own_chain = res.get_parent()
    elec_total = 0.
    vdw_total = 0.
    for res_atom in res.get_atoms():
        for other_atom in st.get_atoms():
            # pairs within the residue's own chain do not contribute
            if other_atom.get_parent().get_parent() != own_chain:
                dist = res_atom - other_atom
                elec_total += elec_int(res_atom, other_atom, dist)
                vdw_total += vdw_int(res_atom, other_atom, dist)
    return elec_total, vdw_total

def add_atom_parameters(st, res_lib, ff_params):
    ''' Annotates every atom of st through its .xtra dictionary.

        For each atom the library is queried with the parent residue name and
        the atom id. A hit provides 'atom_type' and 'charge'; a miss falls
        back to the atom element as type and a charge of 0. In both cases
        'vdw' is set to the ff_params.at_types entry for that atom type.
    '''
    for atom in st.get_atoms():
        residue_name = atom.get_parent().get_resname()
        lib_entry = res_lib.get_params(residue_name, atom.id)
        if lib_entry:
            atom_type, charge = lib_entry.at_type, lib_entry.charge
        else:
            # atom unknown to the library: use defaults
            atom_type, charge = atom.element, 0
        atom.xtra['atom_type'] = atom_type
        atom.xtra['charge'] = charge
        atom.xtra['vdw'] = ff_params.at_types[atom_type]

Step 2 execution

In [None]:
# NOTE(review): `structure` was parsed from 6m0j.pdb, not from the cleaned
# 6m0j_fixed.pdb written by the checking step -- confirm this is intended.
st=structure

# Locations are resolved from the working directory instead of a user-specific
# absolute path, so the notebook runs on other machines. This assumes the
# notebook is started from the assignment folder, as the './assignment_data/'
# paths used in earlier cells already do -- TODO confirm the 'soft' folder
# sits next to 'assignment_data'.
DATA_DIR = os.path.abspath('./assignment_data')
residue_library = ResiduesDataLib(os.path.join(DATA_DIR, 'parameters_step2.lib'))
ff_params = VdwParamset(os.path.join(DATA_DIR, 'parameters_vanderw.txt'))
# Made absolute so the path stays valid if the working directory changes while
# NACCESS is being run.
NACCESS_BINARY = os.path.abspath(os.path.join('.', 'soft', 'NACCESS', 'naccess'))
# Run NACCESS on model 0.
srfA = NACCESS_atomic(st[0], naccess_binary=NACCESS_BINARY)

# Attach atom type, charge and vdW parameters to every atom's .xtra dictionary.
add_atom_parameters(st, residue_library,ff_params)

def chain_atoms(structure, dt):
    """Find the interface residues of two chains and evaluate their energies.

    Note: this definition replaces the earlier ``chain_atoms`` of the
    notebook, which returned residue numbers instead.

    The first chain of model 0 is searched against the last chain of model 0
    (with exactly two chains these are simply the two chains). A residue of
    the first chain is kept when any of its atoms has at least one atom of the
    second chain within ``dt``; the residues owning those neighbouring atoms
    are kept for the second chain.

    Args:
        structure: structure object indexable by model number; model 0 is used.
        dt: search radius passed to ``NeighborSearch.search``.

    Returns:
        The value returned by ``calc`` for the two sets of interface residues.

    Raises:
        ValueError: if model 0 holds fewer than two chains (the original
            failed with an UnboundLocalError in that case).
    """
    chains = list(structure[0])
    if len(chains) < 2:
        raise ValueError(
            "chain_atoms needs at least two chains in model 0, found {}".format(len(chains))
        )
    chain1, chain2 = chains[0], chains[-1]

    # These sets hold residue objects (not atoms).
    interface_residues1 = set()
    interface_residues2 = set()
    neighbor_search_chain2 = NeighborSearch(list(chain2.get_atoms()))
    for residue1 in chain1.get_residues():
        for atom1 in residue1:
            close_atoms = neighbor_search_chain2.search(atom1.coord, dt)
            if close_atoms:
                interface_residues1.add(residue1)
                interface_residues2.update(atom2.get_parent() for atom2 in close_atoms)
    return calc(interface_residues1, interface_residues2)

def calc(chain1_atoms, chain2_atoms):
    """Accumulate and report energy terms for two groups of residues.

    Reads the module-level structure ``st``. For every residue of both groups
    the solvation term and the electrostatic / van der Waals interaction
    energies are added up; the five partial sums are printed.

    NOTE(review): ``calc_solvation`` does not use its first argument, so the
    value is the same whether ``st[0]`` or a single chain is passed. The
    complex solvation therefore equals the sum of the two per-chain values
    and the three solvation terms cancel in G (up to floating-point
    rounding) -- confirm this is intended.

    Args:
        chain1_atoms: iterable of residues attributed to chain 'A'.
        chain2_atoms: iterable of residues attributed to chain 'E'.

    Returns:
        A tuple ``(label, G)`` where G = elec + vdw + solvation of the
        complex - solvation of A - solvation of E.
    """
    elec_total = vdw_total = 0
    solv_complex = 0
    solv_per_chain = {'A': 0, 'E': 0}
    for residues, chain_id in ((chain1_atoms, 'A'), (chain2_atoms, 'E')):
        for res in residues:
            solv_complex += calc_solvation(st[0], res)
            solv_per_chain[chain_id] += calc_solvation(st[0][chain_id], res)
            res_elec, res_vdw = calc_int_energies(st, res)
            elec_total += res_elec
            vdw_total += res_vdw
    solv_a = solv_per_chain['A']
    solv_e = solv_per_chain['E']
    G = elec_total + vdw_total + solv_complex - solv_a - solv_e
    print("Electrostatic interactions:", elec_total)
    print("Van der Waals interactions", vdw_total)
    print("Solvation A-B complex:", solv_complex)
    print("Solvation of A:", solv_a)
    print("Solvation of B", solv_e)
    return("Interaction energy between components in A-B complex:",G)

# Run the interface search with the `dt` threshold defined in an earlier cell
# and print the (label, energy) tuple it returns.
print(chain_atoms(structure, dt))