## Data Understanding

### Set up

In [2]:
import os

In [41]:
# Root of the project data set, relative to this notebook.
data_folder = '../cs5242_project_data'

# The three CSV inputs live directly under the data root.
location_path, ligand_path, pair_path = (
    os.path.join(data_folder, csv_name)
    for csv_name in ('centroids.csv', 'ligand.csv', 'pair.csv')
)

# Directory that holds one .pdb file per protein.
proteins_folder = os.path.join(data_folder, 'proteins/pdbs')

### DataFrame Structure

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load centroids.csv: one row per protein ID (PID) with x / y / z columns
# (presumably the centroid coordinates the file is named after -- confirm with the data source).
location_df = pd.read_csv(location_path)
location_df

Unnamed: 0,PID,x,y,z
0,102D,9.819391,24.178348,71.561739
1,110M,35.189667,6.802667,12.175667
2,112M,34.892200,7.174000,12.498400
3,11BA,-14.688256,14.944487,0.193744
4,11BG,5.319879,55.114576,66.171818
...,...,...,...,...
2995,3GS0,21.835619,43.358381,5.922905
2996,3GS4,21.921150,43.452400,5.793400
2997,3GS7,21.058688,42.161750,6.279813
2998,3GSM,-3.797056,16.102389,11.830556


In [10]:
# Load ligand.csv: one row per ligand ID (LID) with its SMILES string in the 'Smiles' column.
ligand_df = pd.read_csv(ligand_path)
ligand_df

Unnamed: 0,LID,Smiles
0,1,S(=O)(=O)(N)c1ccc(cc1)C(=O)NCCOCCOCCNC(=O)[C@@...
1,2,c1cccc(c1)[C@@H](P(=O)(O)O)NCc1ccccc1
2,3,NCCCCCCNCCCCCCN
3,4,C(=O)(O)C(=O)Nc1ccc(cc1)CN
4,5,c1(ccc(cc1F)I)Nc1c(ccc(c1F)F)C(=O)NOC[C@@H](CO)O
...,...,...
3426,3427,N1=C2[C@H](N(C1)CCNC(CO)CO)C(=O)N=C(N)N2
3427,3428,C1(=NC2=NC=N[C@H]2C(=N1)OC[C@H]1CCC(=O)N1)N
3428,3429,O=C1C/C=C/[C@H](O)CC(=O)O[C@H]([C@@H]([C@H](C/...
3429,3430,c1cc(ccc1C[C@@H](C(=O)N[C@H]1CCCCN(C1=O)Cc1ccc...


In [11]:
# Load pair.csv: each row links a protein ID (PID) to a ligand ID (LID).
pair_df = pd.read_csv(pair_path)
pair_df

Unnamed: 0,PID,LID
0,102D,494
1,110M,1797
2,112M,732
3,11BA,1313
4,11BG,2537
...,...,...
2995,3GS0,2035
2996,3GS4,333
2997,3GS7,2892
2998,3GSM,834


### Combine DataFrames


In [20]:
# Array view of the (PID, LID) pairs: one row per pair, PID in column 0, LID in column 1.
pair_columns = ['PID', 'LID']
pair_arr = pair_df.loc[:, pair_columns].to_numpy()
pair_arr.shape

(3000, 2)

In [21]:
# 1-D array of SMILES strings, in ligand_df row order.
ligand_arr = ligand_df.loc[:, 'Smiles'].to_numpy()
ligand_arr.shape

(3431,)

In [23]:
# Resolve every (PID, LID) pair to its ligand's SMILES string, in pair_df row order.
#
# Ligands are looked up by their LID *value*. Indexing the ligand array by position
# (LID - 1) is only correct when ligand.csv is sorted by LID, has no gaps and starts
# at 1; otherwise it would silently pick the wrong molecule (and LID <= 0 would wrap
# around to the end of the array).
assert ligand_df['LID'].is_unique, 'duplicate LID values in ligand.csv'
smiles_by_lid = dict(zip(ligand_df['LID'], ligand_df['Smiles']))

location_list = []  # not populated in this cell; the name is kept defined as before
length = len(pair_df)

pid_list = pair_df['PID'].tolist()
# A pair that references an unknown ligand raises KeyError here instead of being mis-mapped.
Smiles_list = [smiles_by_lid[lid] for lid in pair_df['LID']]

In [27]:
# Work on a copy so that adding columns to combined_df leaves location_df untouched.
combined_df = location_df.copy()

In [28]:
# Smiles_list / pid_list follow pair_df row order, while combined_df follows
# location_df row order. The assignment below is purely positional, so verify that
# both tables list the same PIDs in the same order before attaching the column.
assert combined_df['PID'].tolist() == list(pid_list), (
    'pair.csv and centroids.csv rows are not aligned by PID'
)
combined_df['Smiles'] = Smiles_list

In [29]:
# Inspect the merged table: the location_df columns plus the new 'Smiles' column.
combined_df

Unnamed: 0,PID,x,y,z,Smiles
0,102D,9.819391,24.178348,71.561739,c1(ccc(cc1)C(=N)N)OCCCOc1ccc(cc1)C(=N)N
1,110M,35.189667,6.802667,12.175667,C=NC
2,112M,34.892200,7.174000,12.498400,C=NCCC
3,11BA,-14.688256,14.944487,0.193744,n1(c(=O)[nH]c(=O)cc1)[C@@H]1O[C@H](CO)[C@@H](O...
4,11BG,5.319879,55.114576,66.171818,n1c(N)[nH]c2c(c1=O)ncn2[C@H]1[C@H](O)[C@H](O)[...
...,...,...,...,...,...
2995,3GS0,21.835619,43.358381,5.922905,c12ccccc1-c1ccccc1C2=NOC[C@H](C)C(=O)O
2996,3GS4,21.921150,43.452400,5.793400,c12=C(C3=CC=CCC3=c1cccc2)NOCCC(=O)O
2997,3GS7,21.058688,42.161750,6.279813,COc1ccccc1/C=N\OCCC(=O)O
2998,3GSM,-3.797056,16.102389,11.830556,CCCCC(=O)N[C@@H]1[C@@H](O)[C@H](O)[C@@H](C=O)O...


In [30]:
# Persist the merged table next to the input CSVs.
# index=False: the frame already carries the row number in its 'Unnamed: 0' column
# (inherited from centroids.csv), so also writing the DataFrame index would add a
# second, redundant unnamed column to combined.csv.
combined_df.to_csv(os.path.join(data_folder, 'combined.csv'), index=False)

### Protein Data

In [42]:
import glob

# Collect every .pdb file in the proteins folder.
# glob.glob returns matches in arbitrary, platform-dependent order, so sort the result
# to make the list (and anything derived from its order) reproducible across machines.
protein_data_list = sorted(glob.glob(os.path.join(proteins_folder, '*.pdb')))
protein_data_list

['../cs5242_project_data\\proteins/pdbs\\102D.pdb',
 '../cs5242_project_data\\proteins/pdbs\\110M.pdb',
 '../cs5242_project_data\\proteins/pdbs\\112M.pdb',
 '../cs5242_project_data\\proteins/pdbs\\11BA.pdb',
 '../cs5242_project_data\\proteins/pdbs\\11BG.pdb',
 '../cs5242_project_data\\proteins/pdbs\\13GS.pdb',
 '../cs5242_project_data\\proteins/pdbs\\182L.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A0Q.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A4K.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A4L.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A4M.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A50.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A5H.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A6V.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A6W.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A72.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A7A.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A82.pdb',
 '../cs5242_project_data\\proteins/pdbs\\1A8A.pdb',
 '../cs5242_

In [47]:
# Map each protein ID to its .pdb path. The ID is taken as the first four characters
# of the file name (e.g. '102D.pdb' -> '102D').
file_map = {
    os.path.basename(pdb_path)[:4]: pdb_path
    for pdb_path in protein_data_list
}

In [48]:
# Inspect the PID -> file path mapping built in the previous cell.
file_map

{'102D': '../cs5242_project_data\\proteins/pdbs\\102D.pdb',
 '110M': '../cs5242_project_data\\proteins/pdbs\\110M.pdb',
 '112M': '../cs5242_project_data\\proteins/pdbs\\112M.pdb',
 '11BA': '../cs5242_project_data\\proteins/pdbs\\11BA.pdb',
 '11BG': '../cs5242_project_data\\proteins/pdbs\\11BG.pdb',
 '13GS': '../cs5242_project_data\\proteins/pdbs\\13GS.pdb',
 '182L': '../cs5242_project_data\\proteins/pdbs\\182L.pdb',
 '1A0Q': '../cs5242_project_data\\proteins/pdbs\\1A0Q.pdb',
 '1A4K': '../cs5242_project_data\\proteins/pdbs\\1A4K.pdb',
 '1A4L': '../cs5242_project_data\\proteins/pdbs\\1A4L.pdb',
 '1A4M': '../cs5242_project_data\\proteins/pdbs\\1A4M.pdb',
 '1A50': '../cs5242_project_data\\proteins/pdbs\\1A50.pdb',
 '1A5H': '../cs5242_project_data\\proteins/pdbs\\1A5H.pdb',
 '1A6V': '../cs5242_project_data\\proteins/pdbs\\1A6V.pdb',
 '1A6W': '../cs5242_project_data\\proteins/pdbs\\1A6W.pdb',
 '1A72': '../cs5242_project_data\\proteins/pdbs\\1A72.pdb',
 '1A7A': '../cs5242_project_data\\protei

### Develop Possible Features