## This file extracts the amino acids of the paratope and the epitope under the constraint of the frame length

### extract_four_coordinates_aa_pos
* **Inputs:**  
 * *DataExtract_pdb*, a dictionary, extracted data for one pdb in the form of {'h1HA': [[[30], None, None, None], [17, 0, 0, 0], [0.532, ...]], ...}
 * *Four_coordinates_pdb*, a list of four coordinates, from the same pdb as *DataExtract_pdb*, in the form of [['h1HA', 30, 16, 7], ...]
 * *frame_length*, an integer, gives the length of the reference frame; it has the same meaning as *length* in *extract_aa_consecutive_pos*.
 * *ref_chain*, a string, takes values as either 'Ab' or 'Ag'. 
* **Returns:**  
 * *extract_four_coordinates*, a dictionary, in the form of {'h1HA':[['h1HA', 30, 16, 7],...], ...}
 * *aa_pos_correspondence_framed*, a dictionary, in the form of {'h1HA': [[30], [16, 17], 14, 0.824],...}, where the paratope aa or epitope aa positions are arranged according to the reference frame; 14 is the contact number and 0.824 is the ratio of 14 to the total contact number of CDRh1. It corresponds to the same pdb as *DataExtract_pdb*.

In [1]:
def extract_four_coordinates_aa_pos(DataExtract_pdb, Four_coordinates_pdb, frame_length, ref_chain = 'Ag'):
    """Select the four-coordinates that fall inside the reference frame.

    Args:
        DataExtract_pdb: dict keyed by CDR/chain identifier. Each value is
            indexed as ``value[0][frame_length - 1]`` (reference positions or
            ``None``), ``value[1][frame_length - 1]`` and
            ``value[2][frame_length - 1]`` (copied to slots 2 and 3 of the
            framed result).
        Four_coordinates_pdb: list of rows ``[key, ab_pos, ag_pos, ...]``.
        frame_length: 1-based frame length used to index ``DataExtract_pdb``.
        ref_chain: ``'Ab'`` to frame on column 1 of each row, ``'Ag'`` to
            frame on column 2.

    Returns:
        A tuple ``(extract_four_coordinates, aa_pos_correspondence_framed)``.
        The first maps each key to its matching rows, sorted by the reference
        column; keys without any match are omitted. The second maps each key
        to ``[ab_positions, ag_positions, value[1][...], value[2][...]]``.

    Raises:
        ValueError: if ``ref_chain`` is neither ``'Ab'`` nor ``'Ag'``.
    """
    # Column of a four-coordinate row that holds the reference-chain position.
    column_by_chain = {'Ab': 1, 'Ag': 2}
    if ref_chain not in column_by_chain:
        # Fail fast: previously an unknown value left the column index unbound.
        raise ValueError("ref_chain must be 'Ab' or 'Ag', got %r" % (ref_chain,))
    ind = column_by_chain[ref_chain]

    extract_four_coordinates = {}
    for key in DataExtract_pdb:
        matches = []
        for row in Four_coordinates_pdb:
            if row[0] != key:
                continue
            ref_positions = DataExtract_pdb[key][0][frame_length - 1]
            if ref_positions is not None and row[ind] in ref_positions:
                matches.append(row)
        # Only keys with at least one matching row are kept.
        if matches:
            extract_four_coordinates[key] = matches

    aa_pos_correspondence_framed = {}
    for key, rows in extract_four_coordinates.items():
        # Order the rows along the reference chain (stable sort, in place).
        rows.sort(key=lambda row: row[ind])

        # Partner-chain positions, de-duplicated in order of first appearance.
        partner_positions = []
        for row in rows:
            if row[3 - ind] not in partner_positions:
                partner_positions.append(row[3 - ind])

        framed = [None, None, None, None]
        framed[ind - 1] = DataExtract_pdb[key][0][frame_length - 1]
        framed[2 - ind] = partner_positions
        framed[2] = DataExtract_pdb[key][1][frame_length - 1]
        framed[3] = DataExtract_pdb[key][2][frame_length - 1]
        aa_pos_correspondence_framed[key] = framed

    return extract_four_coordinates, aa_pos_correspondence_framed

### map_to_aa
* **Inputs:**  
 * *aa_pos_correspondence_framed*, the same as above.
 * *seq_pdb*, a dictionary of the form  {'H': ['GLU', 'VAL',...],...}, it should be of the same pdb as *aa_pos_correspondence_framed*
* **Returns:**
 * *aa_framed*, a dictionary, in the form of {'h1HA':[[ASP],[SER, ARG],...], ...} or {'h1HA':[[ASP],[SER, O, ARG],...], ...}, where O stands for insertion(s).

In [2]:
def _residues_with_gaps(chain_id, positions, seq_pdb):
    """Translate positions on one chain into residue names.

    An 'O' marker is placed before a residue whenever its position exceeds
    the previously visited position by more than one.
    """
    names = []
    previous = positions[0]
    for pos in positions:
        if pos - previous > 1:
            names.append('O')
        names.append(seq_pdb[chain_id][pos])
        previous = pos
    return names


def map_to_aa(aa_pos_correspondence_framed, seq_pdb):
    """Map framed paratope/epitope positions to amino-acid names.

    For each key, the third character of the key selects the antibody chain
    in ``seq_pdb`` and the fourth character selects the antigen chain. The
    last two entries of every value are carried over unchanged.
    """
    aa_framed = {}
    for key, entry in aa_pos_correspondence_framed.items():
        paratope = _residues_with_gaps(key[2], entry[0], seq_pdb)
        epitope = _residues_with_gaps(key[3], entry[1], seq_pdb)
        aa_framed[key] = [paratope, epitope, entry[2], entry[3]]
    return aa_framed

### Set the working directory

In [4]:
import os
# The value returned by getcwd() is discarded, so this call has no effect here.
os.getcwd()
# Switch to the data directory so the bare file names opened later in this
# file resolve against it.
# NOTE(review): hard-coded absolute Windows path; adjust before running on another machine.
os.chdir("C:\\Users\\leo\\Documents\\Research\\Database\\Humnized antibody\\PDB DATA\\Homo Sapiens with paptide 5+\\structure")

### Import the required data

In [5]:
import json
def _load_json(filename):
    """Read one JSON file from the current working directory."""
    with open(filename, 'r') as handle:
        return json.load(handle)


# The three inputs are loaded in the same order as before.
DataExtract_homo = _load_json('DataExtract_homo')
contact_homo = _load_json('contact_homo')
seq_homo = _load_json('seq_homo')

### Combine together

In [6]:
# Frame every pdb entry on the antigen chain with a frame length of 2.
four_coordinates_framed = {}
aa_pos_correspondence_framed = {}
for pdb_id, pdb_extract in DataExtract_homo.items():
    framed_coordinates, framed_positions = extract_four_coordinates_aa_pos(
        pdb_extract, contact_homo[pdb_id], frame_length = 2, ref_chain = 'Ag')
    four_coordinates_framed[pdb_id] = framed_coordinates
    aa_pos_correspondence_framed[pdb_id] = framed_positions

# Translate the framed positions of each pdb entry into amino-acid names.
aa_correspondence_framed = {
    pdb_id: map_to_aa(framed_positions, seq_homo[pdb_id])
    for pdb_id, framed_positions in aa_pos_correspondence_framed.items()
}

### Save the results

In [7]:
# Write each result dictionary to its own JSON file, in the original order.
_results_to_save = (
    ('ParaEpi_pos_framed', aa_pos_correspondence_framed),
    ('ParaEpi_framed', aa_correspondence_framed),
    ('ParaEpi_fcdn_framed', four_coordinates_framed),
)
for _filename, _payload in _results_to_save:
    with open(_filename, 'w') as _handle:
        json.dump(_payload, _handle)

In [8]:
# Look up the framed amino-acid result stored under the key '1adq'.
aa_correspondence_framed ['1adq']

{'h1HA': [['ASP'], ['SER', 'ARG'], 14, 0.824],
 'h2HA': [['TRP'], ['ILE', 'SER'], 11, 1.0],
 'h3HA': [['ARG', 'SER', 'TYR', 'VAL'], ['ASN', 'HIS'], 20, 0.488]}