### This file validates the results of AAC-1 by comparing them with all the possible interactions of a complex. This validation method is computationally costly, but it is a more robust way to expose errors in AAC-1. It may take about one hour for one CPU to process 200 pdb files.

### Do some preparations

In [9]:
import json
import os
# Show the current working directory (its value is echoed as the cell output).
os.getcwd()
# os.chdir()
# os.listdir()


'C:\\Users\\leo\\Documents\\Research\\Database\\PDB Learning'

### Extract_coordinates
* **Inputs:** 
 * *combineddict*, a list in the form of ['BDF', 'ACE', 'GH'] for one pdb. 
 * *pdb*, the pdb file corresponding to the combineddict.
* **Returns:**  
 * *coordinates*, a dictionary in the form of {'A': [[1.2, 1.2, 3.0, '  30 ', 'ALA'], ...]}. In each entry such as [1.2, 1.2, 3.0, '  30 ', 'ALA'], the first three components are the coordinates, the fourth is the residue position (kept as the text read from the pdb line), and the last one is the name of the amino acid. It gives the coordinates of all the atoms of all the requested chains in one pdb file.

In [1]:
def Extract_coordinates(combineddict, pdb):
    """Collect the ATOM records of the requested chains from one pdb file.

    Inputs:
        combineddict: a list of chain-id strings, e.g. ['BDF', 'ACE', 'GH'];
            every character of every string is treated as one chain id.
        pdb: an iterable of pdb text lines (for example an open file).

    Returns:
        A dictionary mapping each requested chain id to a list of
        [x, y, z, position, residue_name] entries, one per matching ATOM
        line. x, y, z are floats; position and residue_name are the raw
        text slices taken from the line.
    """
    wanted_chains = ''.join(combineddict)
    coordinates = {chain_id: [] for chain_id in wanted_chains}

    for record in pdb:
        # Only ATOM records carry the coordinates we are interested in.
        if record[:4] != 'ATOM':
            continue
        chain_id = record[21]
        if chain_id not in wanted_chains:
            continue
        x = float(record[30:38])
        y = float(record[38:46])
        z = float(record[46:54])
        coordinates[chain_id].append([x, y, z, record[22:27], record[17:20]])

    return coordinates

### Define a distance function

In [2]:
def dist(coordinates1, coordinates2):
    """Return the Euclidean distance between two points.

    The components of coordinates1 are paired by index with those of
    coordinates2; the number of components is taken from coordinates1.
    """
    squared_total = sum(
        (value - coordinates2[axis]) ** 2
        for axis, value in enumerate(coordinates1)
    )
    return squared_total ** 0.5

### count
#### This function will be used in *Extract_contact*
*  **Inputs:** 
 * *temp*, a list in the form of [['111 ', 'ALA', '20  ', 'LEU'], ...], i.e. one [position1, residue1, position2, residue2] entry for every pair of atoms in contact.
* **Returns:**  
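 * *contact*, a list in the form of [['ALA', 'GLN', 10], ...], with one entry per distinct pair of residue positions; the number is how many times that pair occurs in *temp*.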

In [None]:
def count(temp):
    """Count how often each residue pair occurs in a list of atom contacts.

    Inputs:
        temp: a list of [position1, residue1, position2, residue2] entries,
            e.g. [['111 ', 'ALA', '20  ', 'LEU'], ...].

    Returns:
        A list of [residue1, residue2, n] entries, one per distinct
        (position1, position2, residue1, residue2) combination, where n is
        the number of times that combination occurs in temp. Entries are
        listed in the order in which each combination first appears.
    """
    from collections import Counter

    # A tuple key keeps the four fields separate. Concatenating them into
    # one string would merge different pairs whenever the position strings
    # do not all have the same width (e.g. '1' + '11' and '11' + '1').
    occurrences = Counter(
        (entry[0], entry[2], entry[1], entry[3]) for entry in temp
    )
    return [[key[2], key[3], n] for key, n in occurrences.items()]

### Extract_contact
 * **Inputs:**  
  * *coordinates*, a dictionary for one pdb; it has the same form as the return value of the function *Extract_coordinates*.
  * *iddict*, a list given in the form of [['B', 'A', ''], ['D', 'C', 'G'], ['F', 'E', 'H']], which gives information about the relationship between chains. It belongs to the same pdb as *coordinates*.
  * *cutoff*, float, gives the cutoff distance
 * **Returns:**  
  * *contact*, a list in the form of [['ALA', 'GLN', 10], ...], which means that ALA contacts GLN somewhere in the pdb and the contact number is 10.

In [None]:
def Extract_contact(coordinates, iddict, cutoff = 4):
    """Find every residue contact between related chains of one pdb.

    Inputs:
        coordinates: a dictionary in the form returned by
            Extract_coordinates (chain id -> list of atom entries).
        iddict: a list of three-item groups of chain ids, e.g.
            [['B', 'A', ''], ['D', 'C', 'G'], ['F', 'E', 'H']]. Within a
            group, the first and the second chain are each compared with
            the third one; an empty string means "no chain".
        cutoff: two atoms are in contact when their distance is less than
            or equal to this value.

    Returns:
        A list of [residue1, residue2, n] entries as produced by count,
        gathered over all the chain pairs that were compared.
    """
    contact = []
    for group in iddict:
        # Index 0 and index 1 are each paired with the chain at index 2.
        for own_index in (0, 1):
            close_pairs = []
            if group[own_index] != '' and group[2] != '':
                for own_atom in coordinates[group[own_index]]:
                    for other_atom in coordinates[group[2]]:
                        if dist(own_atom[:3], other_atom[:3]) <= cutoff:
                            # Keep [position, residue] of both atoms.
                            close_pairs.append(own_atom[3:] + other_atom[3:])
            contact.extend(count(close_pairs))
    return contact

### Run
**This step will take a long time. It is better to divide the pdb files into batches.**

In [7]:
# with open('contact_homo', 'r') as f:
#     contact_homo = json.load(f)
# total_contact_homo ={}
# n = 0 
# for i in contact_homo:
#     with open(i +'.pdb', 'r') as pdb:
#         n += 1         
#         print('Dealing with ' + i + '   ' + str(n))
#         coordinates = Extract_coordinates(here_iddict_combineddict[1][i], pdb)
#         contact = Extract_contact(coordinates, here_iddict_combineddict[0][i])
#         total_contact_homo[i] = contact 

### Save the results
**We save these results for later usage**

In [None]:
# with open('total_contact', 'w') as f:
#     json.dump(total_contact_homo, f)

### extract_aa_from_results
* **Inputs:**  
 * *sub_contact*, a list of four-coordinate entries generated by AAC-1 for one pdb file
 * seq, a dictionary, gives the sequence for one pdb file, it is generated by AAC-1
* **Return:**  
 * *aa*, a list of contacts corresponding to *sub_contact* in the form of [['PRO', 'SER', 3], ...]

In [5]:
def extract_aa_from_results(sub_contact, seq):
    """Translate the AAC-1 contacts of one pdb into amino-acid pairs.

    Inputs:
        sub_contact: a list of four-component entries. In each entry,
            component 0 is indexed at [2] and [3] to obtain the two chain
            ids, components 1 and 2 are the indices into the sequences of
            those two chains, and component 3 is passed through unchanged
            (presumably the contact number -- confirm against AAC-1).
        seq: a dictionary mapping a chain id to its sequence.

    Returns:
        A list in the form of [['PRO', 'SER', 3], ...], one item per entry
        of sub_contact.
    """
    pairs = []
    for record in sub_contact:
        chain_ids = record[0]
        first_chain, second_chain = chain_ids[2], chain_ids[3]
        first_residue = seq[first_chain][record[1]]
        second_residue = seq[second_chain][record[2]]
        pairs.append([first_residue, second_residue, record[3]])
    return pairs

### validation
* **Inputs:**
 * *contact_homo_pdb*, the contact of one pdb generated by **AAC-1**
 * *seq_homo_pdb*, the sequence of the same pdb as *contact_homo_pdb*, generated by **AAC-1**
 * *total_contact_homo_pdb*, the total contact of the same pdb as above, generated by *Extract_contact*
* **Returns:** 
 * *res*, a string, either "Same" or "Different": "Same" when every contact generated by **AAC-1** is also found in the result of *Extract_contact*, "Different" otherwise

In [6]:
def validation(contact_homo_pdb, seq_homo_pdb, total_contact_homo_pdb):
    """Check the AAC-1 contacts of one pdb against the exhaustive contacts.

    Inputs:
        contact_homo_pdb: the contacts of one pdb generated by AAC-1.
        seq_homo_pdb: the sequences of the same pdb, generated by AAC-1.
        total_contact_homo_pdb: the contacts of the same pdb in the form
            [['ALA', 'GLN', 10], ...], as generated by Extract_contact.

    Returns:
        'Same' when every [residue1, residue2, n] entry derived from the
        AAC-1 contacts is present in total_contact_homo_pdb, otherwise
        'Different'. When AAC-1 reports no contact at all there is nothing
        that could differ, so the result is 'Same'.
    """
    contact_results = extract_aa_from_results(contact_homo_pdb, seq_homo_pdb)

    # all() is True for an empty sequence, so an empty AAC-1 result no
    # longer fails on names that were only bound inside the loop.
    if all(entry in total_contact_homo_pdb for entry in contact_results):
        return 'Same'
    return 'Different'

### Run the validation

In [None]:
# for i in contact_homo:
#     try:
#         print(i +'  '+ validation(contact_homo[i], seq_homo[i], total_contact_homo[i]))
#     except:
#         print(i)