## This file is to validate the AAC-1 results  
 **The general idea is to go backward. Randomly sample the results, then check whether the aminoacids and the contact number are correct**
* **Outline:**  
* Randomly sample the results  
* Extract the amino acids from the stored sequences accoring to the sampled results from above
* Extract the amino acids and coordinates from the original file, calculate the contact number
* Compare the results


### Do some settings.

In [5]:
import random
import math
import numpy as np

### sample_results  
* **Inputs:**  
* contact, it is a list, gives all the contact of one pdb file, in the form of [['l2LA', 54, 184, 6], ['h3HA', 102, 195, 3]....]  
* **Returns:**  
* sub_contact, a list, it is subset of the contact.


In [21]:
def sample_results(contact):
    sub_contact = random.sample(contact, math.ceil(0.5 * len(contact)))
    return sub_contact   

### extract_aa_from_results
* **Inputs:**  
* sub_contact, a list, in the form of [['l2LA', 54, 184, 6], ['h3HA', 102, 195, 3]....].  
* seq, a dictionary, gives the sequences of all the chains in one pdb, in the form of {'A': [ALA, THR, ARG, ...], 'L': [TRP,...],...}
* **Retruns:**  
* aa, a list, in the form of [[HA, ALA, THR, 3], ......], where the THR is from the antibody, and 3 is the contact number


In [7]:
def extract_aa_from_results(sub_contact, seq):
    aa = []
    for F_coordinates in sub_contact:
        antibody_chain = F_coordinates[0][2]
        antigen_chain = F_coordinates[0][3]
        aa.append([F_coordinates[0][2:3], seq[antibody_chain][F_coordinates[1]], seq[antigen_chain][F_coordinates[2]], F_coordinates[3]])
    return aa


### dist
**It is a function to calculate the Euclidean distance**

In [8]:
def dist(x, y):
    return np.sqrt(np.sum((np.array(x) - np.array(y))**2))

### get_contact_sub
* **Inputs:**  
 * list_1, list_2, are list of coordinates in the form of [[1.2, 2.1, 0.5, 8], ....], [[1.2, 2.1, 0.5, 8], ....]  
* **Returns:**
 * **contact**, a list in the form of [[8, 9, 3],....], where the first two elements indicates the amino acid positions in the antibody chaind and the antigen chain, the third element means the contact number

In [9]:
def get_contact_sub(list_1, list_2, cutoff = 5):
    contact = []
    dic_temp = {}
    for i in list_1:
        for j in list_2:
            if dist(i[:-1], j[:-1]) <= cutoff:
                if str(i[3])+'_'+str(j[3]) in dic_temp:
                    dic_temp[str(i[3])+'_'+str(j[3])] += 1
                else:
                    dic_temp[str(i[3])+'_'+str(j[3])] = 1
    for k in dic_temp:
        contact.append([int(k.split('_')[0]), int(k.split('_')[1]), dic_temp[k]])
    return contact

### extract_aa_cdn_from_origin
**This is a class to extract amino acids and calculate the conatact from the original pdb file**

In [10]:
class extract_aa_cdn_from_origin (object):
    def __init__(self, file, antibody_chain_id, antigen_chain_id,  position = {}, cutoff = 5.0):
        self.file = file
        self.antibody_chain_id = antibody_chain_id
        self.antigen_chain_id = antigen_chain_id
        self.position = position
        self.cutoff = cutoff
    '''
    inputs: the above arguments
    returns: aa, a dictionary, in the form of {'A':[[8,9,...],[ALA, ....]],...}
             coordinates, a dictionary in the form of {'A':[[1.2, 2.1, 0.5, 8], ....]...}
    '''        
    def get_aa_cdn(self):
        # initialize the returned values
        aa = {}
        for i in self.antibody_chain_id:
            aa[i] = [[], []]
        for j in self.antigen_chain_id :
            aa[j] = [[], []]
            
        coordinates = {}
        for i in self.antibody_chain_id:
            coordinates[i] = []
        for j in self.antigen_chain_id :
            coordinates[j] = []
        # Initialize the temperary values        
        aa_pdb_tracker = {}
        for i in self.antibody_chain_id:
            aa_pdb_tracker[i] = ' '
        for i in self.antigen_chain_id:
            aa_pdb_tracker[i] = ' '
            
        insersion_tracker = {}
        for i in self.antibody_chain_id:
            insersion_tracker[i] = ' '
        for i in self.antigen_chain_id:
            insersion_tracker[i] = ' '
            
        normal_tracker = {}
        for i in self.antibody_chain_id:
            normal_tracker[i] = -1
        for i in self.antigen_chain_id:
            normal_tracker[i] = -1
            
        count_tracker = {} 
        for i in self.antibody_chain_id:
            count_tracker[i] = -1
        for i in self.antigen_chain_id:
            count_tracker[i] = -1

       # extract the aa and coordinates        
        for line in self.file:
            a = 0
            b = 0              
            if line[:4] == "ATOM" and line[21] in self.antibody_chain_id:
                
                if insersion_tracker[line[21]] != line[26]:
                    if line[26] != ' ':
                        a = 1
                    insersion_tracker[line[21]] = line[26]  
                    
                if normal_tracker[line[21]] != int(line[22:26]) and normal_tracker[line[21]] == -1:#first line
                    b = 1
                    normal_tracker[line[21]] = int(line[22:26])
                if normal_tracker[line[21]] != int(line[22:26]) and normal_tracker[line[21]] != -1:
                    b = int(line[22:26]) - normal_tracker[line[21]]
                    normal_tracker[line[21]] = int(line[22:26])
                    
                if a + b >= 1:
                    count_tracker[line[21]] += 1
                                        
                if count_tracker[line[21]] in self.position[line[21]]:
                    if count_tracker[line[21]] not in aa[line[21]][0]:
                        aa[line[21]][0].append(count_tracker[line[21]])
                        aa[line[21]][1].append(line[17:20])
                    coordinates[line[21]].append([float(line[30:38]), float(line[38:46]), float(line[46:54]), count_tracker[line[21]]])
            
            a = 0 
            b = 0              
            if line[:4] == "ATOM" and line[21] in self.antigen_chain_id:
                
                if insersion_tracker[line[21]] != line[26]:
                    if line[26] != ' ':
                        a = 1
                    insersion_tracker[line[21]] = line[26]  
                    
                if normal_tracker[line[21]] != int(line[22:26]) and normal_tracker[line[21]] == -1:#first line
                    b = 1
                    normal_tracker[line[21]] = int(line[22:26])
                if normal_tracker[line[21]] != int(line[22:26]) and normal_tracker[line[21]] != -1:
                    b = int(line[22:26]) - normal_tracker[line[21]]
                    normal_tracker[line[21]] = int(line[22:26])
                    
                if a + b == 1:
                    count_tracker[line[21]] += 1
                if a + b >= 2:
                    count_tracker[line[21]] += 2
                                        
                if count_tracker[line[21]] in self.position[line[21]]:
                    if count_tracker[line[21]] not in aa[line[21]][0]:
                        aa[line[21]][0].append(count_tracker[line[21]])
                        aa[line[21]][1].append(line[17:20])
                    coordinates[line[21]].append([float(line[30:38]), float(line[38:46]), float(line[46:54]), count_tracker[line[21]]])
        return aa, coordinates 
    '''retruns contact in the form of [[HA, ALA, THR, 3], ......]'''   
    def get_contact(self, aa, coordinates):  
        contact = []
        contact_temp = []
        for i in self.antibody_chain_id:
            for j in self.antigen_chain_id:
                contact_temp = get_contact_sub(coordinates[i], coordinates[j], self.cutoff)
                contact = []
                for k in contact_temp:
                    aa_antibody = aa[i][1][aa[i][0].index(k[0])]
                    aa_antigen = aa[j][1][aa[j][0].index(k[1])]
                    contact.append([i+j, aa_antibody, aa_antigen, k[2]])
        return contact

### process_four_coordinates
* **Inputs:**  
 * four_coordinates, a list in the form of [['l2LA', 53, 202, 9], ...]  
* **Returns:**  
 * antibody_chain_id, antigen_chain_id,  position = {}, all those are as described in *extract_aa_cdn_from_origin*.



In [11]:
def process_four_coordinates(four_coordinates):
    antibody_chain_id = []
    antigen_chain_id = []
    for i in four_coordinates:
        if i[0][2] not in antibody_chain_id:
            antibody_chain_id.append(i[0][2])
        if i[0][3] not in antigen_chain_id:
            antigen_chain_id.append(i[0][3])
    # Initialize position
    position = {}
    for i in antibody_chain_id:
        position[i] = [] 
    for i in antigen_chain_id:
        position[i] = [] 
    # pour data to position
    for i in four_coordinates:
        if i[1] not in position[i[0][2]]:
            position[i[0][2]].append(i[1])
        if i[2] not in position[i[0][3]]:
            position[i[0][3]].append(i[2])  
    return antibody_chain_id, antigen_chain_id,  position

### main
* **Inputs:**  
 * contact_dict, a dictionary in the form of {'1dee': [['l2LA', 53, 202, 9], ...] , ...}  
 * PDB_seq, a dictionary in the form of {'1dee': {'A': [ALA, THR, ARG, ...], 'L': [TRP,...],...}, ...} 
 
* **Returns:**
 * if the results are the same, say"They are the same", otherwise say "files ... are different"

In [23]:
def main(contact_dic, PDB_seq):
    for pdb in contact_dic:
        sub_contact = sample_results(contact_dic[pdb])
        contact_from_results = extract_aa_from_results(sub_contact, PDB_seq[pdb][0])
        
        antibody_chain_id, antigen_chain_id,  position = process_four_coordinates(sub_contact)
        with open(pdb+'.pdb', 'r') as f:
            one_pdb = extract_aa_cdn_from_origin(f, antibody_chain_id, antigen_chain_id,  position)
            aa, coordinates = one_pdb.get_aa_cdn()
            contact_from_origin = one_pdb.get_contact(aa, coordinates)
        # compare
        if contact_from_results.sort() == contact_from_origin.sort():
            print('They are the same.')
        else:
            print('file '+ pdb + ' is different.')
            
            

In [19]:
import json
with open("contact_current", 'r') as f:
    contact_current = json.load(f)
with open('seq_and_coordinates_current', 'r') as f:
    seq_and_coordinates_current = json.load(f)

In [24]:
main(contact_current, seq_and_coordinates_current)

They are the same.
They are the same.
They are the same.
They are the same.
They are the same.
They are the same.
They are the same.
They are the same.
They are the same.
They are the same.
They are the same.
