In [1]:
import os
import pandas as pd
import json
from Bio.PDB import *
from Bio import SeqIO
import nglview as nv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from biofunctions.biofunctions import *
pd.set_option('display.max_columns', 100)
%matplotlib inline

**The objective of this notebook is to obtain the inputs and outputs to the Language Model from the interactions found in Notebook 1., and create the input-output dataset**

# 1. Create input and output per pdb

1. expand_ag_chain_seq must have a copy where each entry ist ag_letter-seqid -> Do this in all the the get_full_seq functions
2. the same for expand_cdr_seq
3. create a dataframe where columns are ag items and row are cdr items
4. For every item in ab_letters and ag_letters in interactions dict, add 1 in the dataframe
5. Fill the rest with null values
6. Save the matrix as an array

For each ab_chain I we'll get:

* ab: cdr1_start - 6 to cdr1_end + 6
* ag: min_seqid - 6 to max_seqid + 6
* out: contact matrix


* ab: cdr2_start - 6 to cdr2_end + 6
* ag: min_seqid - 6 to max_seqid + 6
* out: contact matrix


* ab: cdr3_start - 6 to cdr3_end + 6
* ag: min_seqid - 6 to max_seqid + 6
* out: contact matrix


And it will look like this:

````bash
<CDR2> P K T L I Y R A N R L M I G V <ag>  D A T P E D L N <out> . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ . | | | | | | | | _ | | | | | | | | . _ . | | | . | | . . _ . . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ . . . . . . . . . _ .
````

**Some terms:** 
* interactions_dict: a dictionary that contains all the contact information about a single pdb
* amino_acids_dict: a dictionary that contains a mapping between the 3-letter code to 1-letter code
* chains_dict: a dictionary mapping the chain_label to the chain structure of a given pdb.
* chain: a chain from the PDB.Bio.Structure.Structure
* seqid: the sequence id in the PDB according to the IMGT residue numbering
* AB: antibody
* AG: antigen
* cdr_dict: a dictionary that contains sequence information about the CDR and AG chains that are in contact. A single cdr_dict belongs to a single AB chain

````python
cdr1_start = 27
cdr1_end = 38
cdr2_start = 56
cdr2_end = 65
cdr3_start = 105
cdr3_end = 117
````

In [2]:
with open('pdb_dict.json','r') as f:
    pdb_dict = json.load(f)

In [4]:
total_pdbs = len(pdb_dict['pdbs'])
n=6
count = 1
all_in_out_list = []

for pdb in pdb_dict['pdbs']:  

    if not pdb_dict['pdbs'][pdb]:
        print(f'PDB {pdb} is pending')

        count += 1
        continue

    interactions_dict = pdb_dict['pdbs'][pdb]

    in_out_list = create_in_out_str(interactions_dict,n)

    if in_out_list:
        all_in_out_list = all_in_out_list + in_out_list

    if count%10 == 0:
        print(f'{count} analyzed pdbs out of {total_pdbs}')


    count += 1
    
with open('in_out.json','w') as f:
    json.dump(all_in_out_list, f)
print('Finished...')

10 analyzed pdbs out of 646
20 analyzed pdbs out of 646
30 analyzed pdbs out of 646
PDB 5gkr is pending
40 analyzed pdbs out of 646
50 analyzed pdbs out of 646
60 analyzed pdbs out of 646
70 analyzed pdbs out of 646
80 analyzed pdbs out of 646
90 analyzed pdbs out of 646
100 analyzed pdbs out of 646
PDB 6ii9 is pending
PDB 6ii4 is pending
110 analyzed pdbs out of 646
PDB 3eba is pending
PDB 3e8u is pending
PDB 4kze is pending
PDB 1h8s is pending
130 analyzed pdbs out of 646
PDB 1oau is pending
PDB 4hlz is pending
PDB 6db9 is pending
PDB 3g6j is pending
PDB 2y06 is pending
PDB 1i8m is pending
150 analyzed pdbs out of 646
PDB 1tet is pending
160 analyzed pdbs out of 646
PDB 2bmk is pending
170 analyzed pdbs out of 646
PDB 2wub is pending
PDB 2wuc is pending
190 analyzed pdbs out of 646
PDB 4hjj is pending
PDB 1qfu is pending
PDB 1qfw is pending
200 analyzed pdbs out of 646
PDB 1ob1 is pending
210 analyzed pdbs out of 646
PDB 2jix is pending
220 analyzed pdbs out of 646
230 analyzed pdbs 

# 2. Parse data

In [2]:
with open('in_out.json','r') as f:
    data = json.load(f)

In [3]:
with open('word_vocab.txt','r') as f:
    vocab = [line.strip() for line in f]

In [4]:
data_parser = DataParser(data,vocab)

In [6]:
data_parser[0]

{'<CDR1>_indices': [5,
  22,
  28,
  14,
  21,
  13,
  22,
  18,
  32,
  20,
  17,
  29,
  15,
  32,
  16,
  20,
  19,
  31,
  30,
  21,
  26],
 '<ag>_indices': [8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21],
 '<out>_indices': [9,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  10,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  10,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  10,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  10,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  10,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  10,
  11,
  12,
  12,
  12,
  12,
  11,
  11,
  11,
  11,
  11,
  11,
  10,
  11,
  12,
  12,
  12,
  12,
  12,
  12,
  11,
  12,
  11,
  11,
  10,
  11,
  12,
  12,
  12,
  12,
  12,
  12,
  11,
  12,
  11,
  12,
  10,
  11,
  12,
  12,
  12,
  12,
  11,
  12,
  11,
  12,
  11,
  12,
  10,
  12,
  11,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12

In [7]:
for i in range(0,3):
    print(data_parser[i])
    print('\n')

{'<CDR1>_indices': [5, 22, 28, 14, 21, 13, 22, 18, 32, 20, 17, 29, 15, 32, 16, 20, 19, 31, 30, 21, 26], '<ag>_indices': [8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21], '<out>_indices': [9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 12, 12, 12, 12, 12, 12, 11, 12, 11, 11, 10, 11, 12, 12, 12, 12, 12, 12, 11, 12, 11, 12, 10, 11, 12, 12, 12, 12, 11, 12, 11, 12, 11, 12, 10, 12, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 10, 12, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 10, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 11

In [5]:
examples_list = [[5, 22, 28, 14, 21, 13, 22, 18, 32, 20, 17, 29, 15, 32, 16, 20,
                  19, 31, 30, 21, 26, 8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21],
                 [6, 22, 16, 31, 20, 18, 18, 20, 19, 25, 18, 28, 28, 18, 29, 13,
                   8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21, 22],
                 [5, 20, 29, 14, 21, 13, 28, 26, 15, 20, 24, 28, 17, 22, 29, 31,
                  17, 22, 26, 8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21, 22]]

In [9]:
data_collator =  DataCollator(vocab)
print(data_collator.padding(examples_list))

[[0, 5, 22, 28, 14, 21, 13, 22, 18, 32, 20, 17, 29, 15, 32, 16, 20, 19, 31, 30, 21, 26, 8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21, 2], [0, 6, 22, 16, 31, 20, 18, 18, 20, 19, 25, 18, 28, 28, 18, 29, 13, 8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21, 22, 2, 1, 1, 1, 1], [0, 5, 20, 29, 14, 21, 13, 28, 26, 15, 20, 24, 28, 17, 22, 29, 31, 17, 22, 26, 8, 15, 13, 29, 25, 16, 15, 22, 24, 13, 21, 22, 2, 1]]


In [9]:
len(examples_list)

3

In [None]:
vocab_map = {item:idx for (idx,item) in enumerate(vocab)}

In [None]:
n_samples = 3
data_examples_list = [data_parser[i] for i in range(n_samples)]
data_examples_list[0]

In [None]:
data_collator = DataCollator(vocab)

In [None]:
batch = data_collator(data_examples_list)