In [4]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML Training libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier

# RDKit for chemical informatics
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem


In [5]:
# Read the tab-separated Ki binding table from disk into a DataFrame
ki_data = pd.read_csv(
    'data/Ki_bind.tsv',
    delimiter='\t',  # `delimiter` is pandas' alias for `sep`
)

# Preview the first rows to sanity-check the parse
ki_data.head()

Unnamed: 0,drug_id,target_id,smiles,target_seq,origin_affinity,affinity
0,1,P04183,Cc1cn([C@H]2C[C@H](O)[C@@H](CO)O2)c(=O)[nH]c1=O,MSCINLPTVLPGSPSKTRGQIQVILGPMFSGKSTELMRRVRRFQIA...,200,6.69897
1,1,P11413,Cc1cn([C@H]2C[C@H](O)[C@@H](CO)O2)c(=O)[nH]c1=O,MAEQVALSRTQVCGILREELFQGDAFHQSDTHIFIIMGASGDLAKK...,1.54e+4,4.812479
2,1,P23919,Cc1cn([C@H]2C[C@H](O)[C@@H](CO)O2)c(=O)[nH]c1=O,MAARRGALIVLEGVDRAGKSTQSRKLVEALCAAGHRAELLRFPERS...,180000,3.744727
3,1,P25099,Cc1cn([C@H]2C[C@H](O)[C@@H](CO)O2)c(=O)[nH]c1=O,MPPYISAFQAAYIGIEVLIALVSVPGNVLVIWAVKVNQALRDATFC...,>10000,4.999957
4,1,P30543,Cc1cn([C@H]2C[C@H](O)[C@@H](CO)O2)c(=O)[nH]c1=O,MGSSVYITVELAIAVLAILGNVLVCWAVWINSNLQNVTNFFVVSLA...,>10000,4.999957


In [3]:
# Inspect the schema: index range, per-column non-null counts, dtypes and memory usage
ki_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380365 entries, 0 to 380364
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   drug_id          380365 non-null  int64  
 1   target_id        380365 non-null  object 
 2   smiles           380365 non-null  object 
 3   target_seq       380365 non-null  object 
 4   origin_affinity  380365 non-null  object 
 5   affinity         380365 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 17.4+ MB


In [4]:
# Number of rows in the loaded table
ki_data.shape[0]

380365

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
ki_data.describe()

Unnamed: 0,drug_id,affinity
count,380365.0,380365.0
mean,36015090.0,6.822692
std,22574460.0,1.626958
min,1.0,-5.0
25%,432224.0,5.700275
50%,50135600.0,6.89279
75%,50319280.0,8.0
max,50545750.0,14.522879


In [6]:
# Count missing entries per column (isna is the canonical name; isnull is its alias)
ki_data.isna().sum()

drug_id            0
target_id          0
smiles             0
target_seq         0
origin_affinity    0
affinity           0
dtype: int64

In [24]:
## Exploratory Data Analysis (EDA)
# Count the distinct SMILES strings and the distinct protein target sequences

uniq_smiles = ki_data['smiles'].unique()
uniq_targets = ki_data['target_seq'].unique()

n_smiles, n_targets = len(uniq_smiles), len(uniq_targets)

print(f"Unique SMILES: {n_smiles}")
print(f"Unique Protein Targets: {n_targets}")

Unique SMILES: 199949
Unique Protein Targets: 2716


In [25]:
# Count the distinct (SMILES, target sequence) combinations
ki_copy = ki_data.copy()  # working copy of the loaded frame
pair_columns = ['smiles', 'target_seq']
# De-duplicate on the pair columns only; the surviving row count is the pair count
unique_pairs = len(ki_copy.drop_duplicates(subset=pair_columns))
print(f"Unique SMILES-Target pairs: {unique_pairs}")

Unique SMILES-Target pairs: 379743
