# Interface summary

This script reads the output files of the interface-calling step and builds tables with:
- Numbers of core residues
- Numbers of rim residues
- Percentage of residues that are in the core
- Percentage of residues that are in the rim


In [1]:
# Load libraries
import numpy as np
import pandas as pd
import csv
import os
import glob
import re

import math
from collections import OrderedDict
from Bio.PDB import *

In [2]:
## Load some helper functions

# Use a class for PDB coordinates
class PDB_coordinates:
    '''Container for one atom read from a PDB file: its position plus residue label, atom type and b-factor.'''

    def __init__(self, x_coord, y_coord, z_coord, residue, atomtype, bfactor):
        '''Store the three coordinates and the annotations that describe the atom.'''
        self.x, self.y, self.z = x_coord, y_coord, z_coord
        self.residue, self.atomtype, self.bfactor = residue, atomtype, bfactor

    def measure_distance(self, target):
        '''Return the Euclidean distance between this object and another PDB_coordinates object.'''
        delta_x = self.x - target.x
        delta_y = self.y - target.y
        delta_z = self.z - target.z
        return math.sqrt(delta_x**2 + delta_y**2 + delta_z**2)

##################################

def parse_coordinates(infile, alpha_only):
    '''Parse the ATOM records of a PDB file into PDB_coordinates objects grouped by chain.

    Arguments:
    - infile: path to the PDB file to read.
    - alpha_only: if True, only alpha carbons (atom name "CA") are kept;
      if False, every atom found on an ATOM line is kept.

    Returns an OrderedDict that maps each chain identifier to the list of
    PDB_coordinates objects of that chain, in the order they appear in the file.
    The "residue" label of each object is chain + residue number + residue name.

    Raises ValueError if a coordinate or b-factor field cannot be converted to a float.
    '''
    coord_dict = OrderedDict()

    # The context manager closes the file even if a line fails to parse
    with open(infile, 'r') as handle:
        for line in handle:
            # Only ATOM records are read; every other kind of line is ignored
            if not line.startswith('ATOM'):
                continue

            (_, _, atomtype, resname, chain, res_num,
             x_coord, y_coord, z_coord, bfactor) = parse_pdb_line(line)

            # In alpha-carbon mode every atom other than "CA" is skipped
            if alpha_only and atomtype != 'CA':
                continue

            atom = PDB_coordinates(float(x_coord), float(y_coord), float(z_coord),
                                   chain + res_num + resname, atomtype, float(bfactor))

            # Start the list of a chain the first time it is seen, then add the atom
            coord_dict.setdefault(chain, []).append(atom)

    return coord_dict

##################################


def parse_pdb_line(pdb_line):
    '''Split one fixed-width PDB line into its fields and return them as a list of strings.
    The column positions follow the PDB format explanation from this site:

    https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html.

    The list holds, in order: the first four characters of the record name, atom number,
    atom name, residue name, chain, residue number, x, y, z and b-factor.
    '''
    def column(start, stop):
        # Cut out a fixed-width field and drop the spaces that pad it
        return pdb_line[start:stop].strip(' ')

    return [
        column(0, 4),      # record name (first four characters)
        column(6, 11),     # atom number
        column(12, 16),    # atom name
        column(17, 20),    # residue name
        pdb_line[21],      # chain: a single character, taken as is
        column(22, 26),    # residue number
        column(30, 38),    # x
        column(38, 46),    # y
        column(46, 54),    # z
        column(60, 66),    # b-factor
    ]



In [3]:
# Main folder with the interface data, given relative to the current working directory
main_folder = '../../Data/Structures/004_interfaces'

In [4]:
# List every entry inside the main folder; each one is later treated as a structure folder named by its ID.
# NOTE: glob returns its matches in arbitrary order, so the order of the structures is not guaranteed
list_files = glob.glob(os.path.join(main_folder, '*'))

In [5]:
list_files

['../../Data/Structures/004_interfaces/1gpr',
 '../../Data/Structures/004_interfaces/1ifu',
 '../../Data/Structures/004_interfaces/1gv3',
 '../../Data/Structures/004_interfaces/1a8l',
 '../../Data/Structures/004_interfaces/1ai2',
 '../../Data/Structures/004_interfaces/4cvt',
 '../../Data/Structures/004_interfaces/2raj',
 '../../Data/Structures/004_interfaces/3r2v',
 '../../Data/Structures/004_interfaces/1ok9',
 '../../Data/Structures/004_interfaces/1d0q',
 '../../Data/Structures/004_interfaces/1juv',
 '../../Data/Structures/004_interfaces/3wx4',
 '../../Data/Structures/004_interfaces/1a4b',
 '../../Data/Structures/004_interfaces/4rfp',
 '../../Data/Structures/004_interfaces/5abr',
 '../../Data/Structures/004_interfaces/19hc',
 '../../Data/Structures/004_interfaces/2o4c',
 '../../Data/Structures/004_interfaces/1m7p',
 '../../Data/Structures/004_interfaces/1dth',
 '../../Data/Structures/004_interfaces/1uuf',
 '../../Data/Structures/004_interfaces/3ib7',
 '../../Data/Structures/004_interf

In [6]:
out_list = []

# Visit every structure folder and summarise chain A of its region-annotated PDB file
for pdb_folder in list_files:

    pdb_id = os.path.basename(pdb_folder)
    pdb_file = os.path.join(pdb_folder, 'dist_regions_' + pdb_id + '_bio_check_Repair.pdb')

    # Folders that do not contain the expected file contribute no row
    if not os.path.isfile(pdb_file):
        continue

    # Read alpha carbons only, so each parsed entry stands for one residue
    pdb_coords = parse_coordinates(pdb_file, True)

    # Count number of residues per subunit (should be the same)
    length_struc = len(pdb_coords['A'])

    # The b-factor value tells the region apart: 0.75 is counted as rim, 1 as core
    rim_res = sum(1 for residue in pdb_coords['A'] if residue.bfactor == 0.75)
    core_res = sum(1 for residue in pdb_coords['A'] if residue.bfactor == 1)

    out_list.append([pdb_id, length_struc, core_res, rim_res])

In [7]:
# Build the summary table with one row per structure.
# NOTE: the 'Interface_residues' column receives the count of residues whose b-factor is 1
# (core_res in the loop that fills out_list); the rim count goes into 'Rim_residues'
df = pd.DataFrame(out_list, columns = ['Structure', 'Total_residues', 'Interface_residues', 'Rim_residues'])

In [8]:
df

Unnamed: 0,Structure,Total_residues,Interface_residues,Rim_residues
0,1gpr,158,11,16
1,1ifu,258,9,10
2,1gv3,213,15,30
3,1a8l,226,17,26
4,1ai2,414,63,62
...,...,...,...,...
99,3f4d,323,32,39
100,1ang,123,8,15
101,2dfj,267,8,11
102,5hke,316,54,59


In [9]:
# Save the summary table as a tab-separated file, with the header row and without the index
df.to_csv('../../Data/Structures/interface_summary.tsv', 
         sep = '\t',  header = True, index = False)

## Write a dataframe with the stickiness values for residues at the interface

In [10]:
## Load the stickiness scale
# Tab-separated table; the columns read further down are 'Aminoacid.1.letter' and 'levy_propensity'
aa_properties = pd.read_csv('../../Data/Levy2012_propensity.tsv', sep = '\t')
aa_properties

Unnamed: 0,Aminoacid.1.letter,levy_propensity
0,A,0.0062
1,C,1.0372
2,D,-0.7485
3,E,-0.7893
4,F,1.2727
5,G,-0.1771
6,H,0.1204
7,I,1.1109
8,K,-1.1806
9,L,0.9138


In [11]:
# Standard genetic code: DNA codon -> one-letter amino acid code ('*' stands for the stop codons)
# NOTE(review): this table is not referenced by any of the cells visible in this notebook
codontable_standard = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
    'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
    }

## Three-letter residue names -> one-letter amino acid codes.
## This is not the inverse of the codon table; it is used to translate the residue names read from the PDB files.
## Only the 20 standard amino acids are listed, so any other residue name is missing from this map
aa_three2one = {
    'ALA': 'A', 'ILE': 'I', 'MET': 'M', 'THR': 'T', 'ASN': 'N',
    'LYS': 'K', 'SER': 'S', 'ARG': 'R', 'LEU': 'L', 'PRO': 'P',
    'HIS': 'H', 'GLN': 'Q', 'VAL': 'V', 'ASP': 'D', 'GLU': 'E',
    'GLY': 'G', 'PHE': 'F', 'TYR': 'Y', 'CYS': 'C', 'TRP': 'W'
}


In [12]:
out_list = []

# Build the amino acid -> stickiness lookup once instead of filtering the table for every residue.
# If a letter appears more than once the last row wins, as it did with the row-by-row scan.
stickiness_by_aa = dict(zip(aa_properties['Aminoacid.1.letter'], aa_properties['levy_propensity']))

# Region label for each b-factor value that marks an exported residue
region_by_bfactor = {0.75: 'Rim', 1: 'Core'}

# Residue labels are chain + residue number + residue name (e.g. 'A31ASP'):
# the first run of digits is the position and whatever follows is the residue name.
# Raw string so that '\d' is a regex class and not an invalid string escape.
# NOTE(review): a negative residue number would lose its sign, the pattern only captures the digits
residue_pattern = re.compile(r'(\d+)(.*)')

# Loop through each of the structures
for pdb_folder in list_files:

    pdb_id = os.path.basename(pdb_folder)

    # Parse the file; folders without it are skipped
    pdb_file = os.path.join(pdb_folder, 'dist_regions_' + pdb_id + '_bio_check_Repair.pdb')
    if not os.path.isfile(pdb_file):
        continue

    pdb_coords = parse_coordinates(pdb_file, True)

    # Keep only the residues of chain A that are at the rim or the core
    for residue_i in pdb_coords['A']:
        region = region_by_bfactor.get(residue_i.bfactor)
        if region is None:
            # Not rim or core, so the residue is not exported and needs no lookup
            continue

        # Extract the residue type and the position
        res_matches = residue_pattern.search(residue_i.residue)
        position = res_matches.group(1)
        res_type = aa_three2one[res_matches.group(2)]

        # A residue type that is absent from the scale gets NaN; before, it silently
        # inherited the value of the previously processed residue
        stickiness_value = stickiness_by_aa.get(res_type, float('nan'))

        out_list.append([pdb_id, position, res_type, stickiness_value, region])

out_df = pd.DataFrame(out_list, columns = ['PDB', 'Position', 'Res_type', 'Levy_propensity', 'Region'])
out_df

Unnamed: 0,PDB,Position,Res_type,Levy_propensity,Region
0,1gpr,31,D,-0.7485,Rim
1,1gpr,32,Q,-0.4114,Core
2,1gpr,33,V,0.7599,Core
3,1gpr,34,F,1.2727,Rim
4,1gpr,35,S,0.1376,Rim
...,...,...,...,...,...
6338,3rzn,153,Y,0.8806,Core
6339,3rzn,154,A,0.0062,Core
6340,3rzn,155,A,0.0062,Rim
6341,3rzn,208,K,-1.1806,Rim


In [13]:
# Mean propensity of the exported (rim and core) residues of each structure,
# followed by summary statistics of those per-structure means
means_stickiness = out_df.groupby('PDB')['Levy_propensity'].mean()
means_stickiness.describe()

count    104.000000
mean       0.153058
std        0.132686
min       -0.443060
25%        0.093359
50%        0.167739
75%        0.211704
max        0.402095
Name: Levy_propensity, dtype: float64

In [14]:
# Save the per-residue stickiness table as a tab-separated file, with the header row and without the index
out_df.to_csv('../../Data/Structures/interface_stickiness.tsv', 
         sep = '\t',  header = True, index = False)