# Symmetry check

This script measures, for each interface residue of a homodimer, the minimum distance between that residue in one chain and the same residue in the other subunit.

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib
import csv
import os
import sys
import subprocess
from Bio.PDB import PDBParser, PDBIO
import glob
from Bio import SeqIO
import re
from collections import OrderedDict
import math

In [2]:
## Some helper functions

# Use a class for PDB coordinates
class PDB_coordinates:
    '''Container for a single atom read from a PDB file.

    Holds the three coordinates together with the residue name, atom type,
    B-factor, residue position and chain passed to the constructor.
    '''

    def __init__(self, x_coord, y_coord, z_coord, residue, atomtype, bfactor, position, chain):
        '''Store the fields of one atom as attributes.'''
        # Coordinates
        self.x, self.y, self.z = x_coord, y_coord, z_coord

        # Residue-level information
        self.residue = residue
        self.position = position
        self.chain = chain

        # Atom-level information
        self.atomtype = atomtype
        self.bfactor = bfactor

    def measure_distance(self, target):
        '''Return the Euclidean distance between this object and target,
        another PDB_coordinates object.'''
        delta_x = self.x - target.x
        delta_y = self.y - target.y
        delta_z = self.z - target.z
        squared_sum = delta_x**2 + delta_y**2 + delta_z**2
        return math.sqrt(squared_sum)

##################################

def parse_coordinates(infile, alpha_only):
    '''Parse the ATOM records of a PDB file into PDB_coordinates objects.

    Arguments:
    infile -- path to the PDB file to read.
    alpha_only -- if true, only alpha carbons (atom name "CA") are kept;
                  otherwise every atom is kept.

    Returns an OrderedDict that maps each chain identifier to the list of
    PDB_coordinates objects read for that chain, in file order. Coordinates
    and B-factors are converted to float; the residue position is stored
    as it is returned by parse_pdb_line (no numeric conversion).

    Raises ValueError if a coordinate or B-factor field cannot be converted
    to float.
    '''
    coord_dict = OrderedDict()

    # The context manager closes the file even if a line fails to parse
    with open(infile, 'r') as handle:
        for line in handle:
            # Only lines starting with "ATOM" are read
            if not line.startswith('ATOM'):
                continue

            (_, _, atomtype, residue, chain, position,
             x_coord, y_coord, z_coord, bfactor) = parse_pdb_line(line)

            if alpha_only and atomtype != 'CA':
                continue

            # Add the atom to the list of its chain, creating the list if needed
            coord_dict.setdefault(chain, []).append(
                PDB_coordinates(float(x_coord), float(y_coord), float(z_coord),
                                residue, atomtype, float(bfactor), position, chain)
            )

    return coord_dict

##################################


def parse_pdb_line(pdb_line):
    '''Split one line of a PDB file into its fixed-width fields.

    The column ranges follow the PDB format explanation from this site:

    https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html.

    Returns a list of strings:
    [atom, atom_num, atom_name, resname, chain, res_num, x, y, z, bfactor].
    Every field except the chain has its surrounding spaces removed.
    '''
    def field(start, end):
        # Only spaces are stripped, other whitespace is kept
        return pdb_line[start:end].strip(' ')

    return [
        field(0, 4),      # record name
        field(6, 11),     # atom number
        field(12, 16),    # atom name
        field(17, 20),    # residue name
        pdb_line[21],     # chain (single character, not stripped)
        field(22, 26),    # residue number
        field(30, 38),    # x
        field(38, 46),    # y
        field(46, 54),    # z
        field(60, 66),    # B-factor
    ]


In [21]:
## Folder for the PDB structures
path_structures = '../../Data/Structures/004_interfaces/'

list_structures = glob.glob(os.path.join(path_structures, '*'))

## One row per interface position: [PDB, Position, Residue, Region, Min_dist]
min_dist_list = []

## Loop through the list of structures
for in_folder in list_structures:
    ## Check the PDB ID and load the structure
    pdb_id = os.path.basename(in_folder)

    infile = os.path.join(in_folder, 'dist_regions_' + pdb_id + '_bio_check_Repair.pdb')

    ## Folders that do not contain the expected file are skipped
    if not os.path.exists(infile):
        continue

    ## Parse all atoms; the B-factor column is used below as the region value
    struc_dict = parse_coordinates(infile, False)

    ## Collect the positions at the interface (any atom with region >= 0.75).
    ## The list keeps the first-seen order, the set makes the membership test
    ## constant-time instead of scanning the list for every atom.
    positions_interface = []
    seen_positions = set()
    for residue_data in struc_dict.values():
        for res in residue_data:
            if res.bfactor >= 0.75 and res.position not in seen_positions:
                seen_positions.add(res.position)
                positions_interface.append(res.position)

    ## Loop again to group the atoms of every interface position by chain
    interface_dict = {'A':{}, 'B':{}}
    for residue_data in struc_dict.values():
        for res in residue_data:
            if res.position in seen_positions:
                ## setdefault accepts chains other than A and B without failing;
                ## only chains A and B are compared below
                chain_positions = interface_dict.setdefault(res.chain, {})
                chain_positions.setdefault(res.position, []).append(res)

    ## Loop through the interface positions
    for position in positions_interface:
        atoms_a = interface_dict['A'].get(position, [])
        atoms_b = interface_dict['B'].get(position, [])

        ## A residue can only be compared to itself if it exists in both chains
        if not atoms_a or not atoms_b:
            print('Skipping ' + pdb_id + ' position ' + str(position) + ': not present in both chains A and B')
            continue

        ## Skip hydrogen atoms (filtered once per chain, not once per atom pair)
        heavy_a = [atom for atom in atoms_a if not atom.atomtype.startswith('H')]
        heavy_b = [atom for atom in atoms_b if not atom.atomtype.startswith('H')]

        # Initialize the minimum distance; 1000 is kept if there is no pair of heavy atoms
        minimum_distance = 1000

        for atom1 in heavy_a:
            for atom2 in heavy_b:
                ## Measure the distance from atom1 to atom2
                atom_distance = round(atom1.measure_distance(atom2), 2)

                if atom_distance < minimum_distance:
                    minimum_distance = atom_distance

        ## Residue name and region are taken from the last atom listed for each chain.
        ## NOTE(review): this assumes all atoms of a residue carry the same region value - confirm
        last_a = atoms_a[-1]
        last_b = atoms_b[-1]
        region = max(last_a.bfactor, last_b.bfactor)

        ## Save the minimum distance between that residue and itself in the other subunit
        min_dist_list.append([pdb_id, last_a.position, last_a.residue, region, minimum_distance])

## Convert to a dataframe
min_dist_df = pd.DataFrame(min_dist_list, columns = ['PDB', 'Position', 'Residue', 'Region', 'Min_dist'])


In [22]:
# Write the table of minimum self-distances as a tab-separated file without the index column
min_dist_df.to_csv(
    '../../Data/Structures/interface_distance_self.tsv',
    sep = '\t',
    index = False
)

In [23]:
# Read the PDB IDs listed in final_104_structures.txt into a single 'PDB_ID' column
df_used_structures = pd.read_csv(
    '../../Data/final_104_structures.txt',
    names = ['PDB_ID'],
    sep = '\t'
)

# Plain list of the IDs, shown as the cell output
used_structures = list(df_used_structures['PDB_ID'])
used_structures

['1gpr',
 '1ifu',
 '1gv3',
 '1a8l',
 '1ai2',
 '4cvt',
 '2raj',
 '3r2v',
 '1ok9',
 '1d0q',
 '1juv',
 '3wx4',
 '1a4b',
 '4rfp',
 '5abr',
 '19hc',
 '2o4c',
 '1m7p',
 '1dth',
 '1uuf',
 '3ib7',
 '1cpj',
 '4wpe',
 '1c1f',
 '1bbh',
 '5c94',
 '2i4z',
 '4p16',
 '4fgw',
 '3h65',
 '1pzw',
 '4zvl',
 '1g99',
 '1bcg',
 '1m6d',
 '3gi4',
 '1edt',
 '3euo',
 '1am2',
 '4o48',
 '1nmz',
 '1hg2',
 '1b57',
 '4z04',
 '2vtk',
 '5hbi',
 '1h0c',
 '4z5z',
 '3u2g',
 '2pih',
 '4yzg',
 '1a72',
 '3naq',
 '1ukw',
 '2rl2',
 '6ftq',
 '5bj4',
 '2xz2',
 '5rfd',
 '3cxk',
 '3zof',
 '1cei',
 '1f05',
 '1alu',
 '4cwd',
 '1oi2',
 '1mk4',
 '3bac',
 '1ge7',
 '6cso',
 '1mi8',
 '3w5z',
 '4cmd',
 '1nxp',
 '1hpc',
 '2p09',
 '1c8t',
 '1qq2',
 '4fum',
 '3tgs',
 '1b7e',
 '1m38',
 '7d83',
 '1asb',
 '1asu',
 '1i49',
 '1at0',
 '2b18',
 '1btm',
 '4nak',
 '1eye',
 '2hqv',
 '1bdo',
 '2p53',
 '1bkz',
 '2imt',
 '1p6o',
 '3ot2',
 '3ulh',
 '3f4d',
 '1ang',
 '2dfj',
 '5hke',
 '3rzn']

In [25]:
# Names of the structure folders found on disk
list_check = list(map(os.path.basename, list_structures))

# Print every folder name that is not in the list of used structures
for entry in list_check:
    if os.path.basename(entry) in used_structures:
        continue
    print(entry)