# Create LCIdb and CV datasets in different prediction scenarios

In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
import pandas as pd

## Preprocessing 
Download the file `Consensus_CompoundBioactivity_Dataset_v1.1.csv` from https://zenodo.org/record/6398019#.Y6A4nrKZPn4 before running the cell below.

In [None]:
from komet import process_LCIdb

# Run the komet preprocessing on the downloaded consensus file.
# Every option is passed by keyword, one per line, so each value is easy to spot and tune.
LCIdb = process_LCIdb(
    'Consensus_CompoundBioactivity_Dataset_v1.1.csv',
    data_dir="./",
    max_length_fasta=1000,
    bioactivity_choice="checkand1database",
    min_weight=100,
    max_weight=900,
    interaction_plus=1e-7,
    interaction_minus=1e-4,
)

## Create cross-validation (CV) datasets in different prediction scenarios
* Random
* Unseen_drugs
* Unseen_targets
* Orphan

### Load LCIdb

The dataset, built with the default parameters, can be downloaded from Zenodo (https://zenodo.org/records/10731712) as `LCIdb_v2.csv`.

In [2]:
# Location of the LCIdb CSV file, relative to the working directory.
LCIdb_path = 'LCIdb_v2.csv'

# Read the whole table into a DataFrame, then preview its first five rows.
LCIdb = pd.read_csv(filepath_or_buffer=LCIdb_path, low_memory=False)
LCIdb.head(5)

Unnamed: 0,smiles,fasta,ChEMBL ID,PubChem ID,IUPHAR ID,Ligand names,Target,uniprot,mean,mean pIC50,...,mean pKd,min,min pIC50,min pKi,min pKd,max,max pIC50,max pKi,max pKd,score
0,BrC(=C\c1ccccc1)/C=N/n1cnnc1,MNPTLILAAFCLGIASATLTFDHSLEAQWTKWKAMHNRLYGMNEEG...,CHEMBL3190095,6861939.0,,sid24812872,ctsl,P07711,4.2,4.2,...,,4.2,4.2,,,4.2,4.2,,,0.5
1,BrC(Cn1ncc2c(N3CCCC3)ncnc21)c1ccccc1,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,CHEMBL382012,11617657.0,,1-(2-bromo-2-phenylethyl)-4-(pyrrolidin-1-yl)-...,src,P12931,5.0,,...,,5.0,,5.0,,5.0,,5.0,,0.5
2,BrC(Cn1ncc2c(N3CCCCC3)ncnc21)c1ccccc1,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,CHEMBL203922,11696703.0,,1-(2-bromo-2-phenylethyl)-4-(piperidin-1-yl)-1...,src,P12931,5.8,,...,,5.8,,5.8,,5.8,,5.8,,0.5
3,BrC(Cn1ncc2c(NCc3ccccc3)ncnc21)c1ccccc1,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,CHEMBL382153,11545984.0,,n-benzyl-1-(2-bromo-2-phenylethyl)-1h-pyrazolo...,src,P12931,5.5,,...,,5.5,,5.5,,5.5,,5.5,,0.5
4,BrC1=CC2CNCCC2S1,MSGADRSPNAGAAPDSAPGQAAVASAYQRFEPRAYLRNNYAPPRGD...,CHEMBL398475,44447166.0,,"2-bromo-3a,4,5,6,7,7a-hexahydrothieno[3,2-c]py...",pnmt,P11086,5.9,,...,,5.9,,5.9,,5.9,,5.9,,0.5


## Compute CV folds

`make_CV_train_test` loads the interaction data from a CSV file, preprocesses it to generate numerical indices for the unique SMILES (molecules) and FASTA sequences (proteins),
and splits the data into cross-validation training and testing datasets according to the specified split type.

The available split types are:
* "full" 
* "unseen_drug" 
* "unseen_target" 
* "Orphan" 

In [8]:
from komet import make_CV_train_test

# Pass LCIdb_path (set in the loading cell) instead of repeating the file name,
# so the folds are always built from the same file that was loaded and previewed.
# "Orphan" is the split type; the other accepted values are listed in the text of this section.
train_arr, test_arr = make_CV_train_test(LCIdb_path, "Orphan", "./", 5)

cpu
nombre de smiles:  271180
nombre de fasta:  2060


  df = df.drop(indsmiles_index_with_nan,0)
  df = df.drop(indfasta_index_with_nan,0)


matrice d'interactions:  (2060, 271180)
train (352100, 3)
nb of interactions + in test 43653
number of interactions + deleted in test 0
number of interactions + in test 43653
number of interactions - (7965, 2)
number of np.nan (557958247, 2)
i_end 474771903
number of interactions - in test 43653
test (87306, 3)
train (335638, 3)
nb of interactions + in test 46706
number of interactions + deleted in test 0
number of interactions + in test 46706
number of interactions - (7965, 2)
number of np.nan (557958247, 2)
i_end 491762422
number of interactions - in test 46706
test (93412, 3)
train (321062, 3)
nb of interactions + in test 50310
number of interactions + deleted in test 0
number of interactions + in test 50310
number of interactions - (7965, 2)
number of np.nan (557958247, 2)
i_end 360985565
number of interactions - in test 50310
test (100620, 3)
train (331310, 3)
nb of interactions + in test 47831
number of interactions + deleted in test 0
number of interactions + in test 47831
numbe

In [9]:
# Work with the first entry (index 0) of the train and test collections.
train, test = train_arr[0], test_arr[0]

# Preview the first rows of the selected training set.
train.head()

cpu


Unnamed: 0,SMILES,Target Sequence,Label
0,BrC[C@H]1CC[C@H](c2nnn3cnc4[nH]ccc4c23)CC1,MGMACLTMTEMEGTSTSSIYQNGDISGNANSMKQIDPVLQVYLYHS...,1
1,BrC[C@H]1CC[C@H](c2nnn3cnc4[nH]ccc4c23)CC1,MQYLNIKEDCNAMAFCAKMRSSKKTEVNLEAPEPGVEVIFYLSDRE...,1
2,BrC[C@H]1CC[C@H](c2nnn3cnc4[nH]ccc4c23)CC1,MKTPWKVLLGLLGAAALVTIITVPVVLLNKGTDDATADSRKTYTLT...,0
3,BrC[C@H]1CC[C@H](c2nnn3cnc4[nH]ccc4c23)CC1,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,0
4,Brc1cc(CN2CCC(c3nnn4cnc5[nH]ccc5c34)CC2)sc1Br,MGMACLTMTEMEGTSTSSIYQNGDISGNANSMKQIDPVLQVYLYHS...,1


In [13]:
# Count the distinct molecules (SMILES) and proteins (target sequences) in each split.
n_drugs_train = train['SMILES'].nunique()
n_drugs_test = test['SMILES'].nunique()
n_proteins_train = train['Target Sequence'].nunique()
n_proteins_test = test['Target Sequence'].nunique()

print("number of drugs in train", n_drugs_train)
print("number of drugs in test", n_drugs_test)

# Visual separator between the drug and protein summaries.
print("*" * 50)
print("number of proteins in train", n_proteins_train)
print("number of proteins in test", n_proteins_test)


number of drugs in train 135084
number of drugs in test 62676
**************************************************
number of proteins in train 1180
number of proteins in test 880


In [16]:
print("number of interactions in train")
# Tally the training rows by their 'Label' value; the result is displayed as the cell output.
train.value_counts(subset='Label')

number of interactions in train


Label
0    176050
1    176050
dtype: int64

In [17]:
print("number of interactions in test")
# Tally the test rows by their 'Label' value; the result is displayed as the cell output.
# NOTE(review): in the recorded run this tally sums to 87305, one less than the
# "test (87306, 3)" shape printed by the fold log — possibly a row with a missing label; verify.
test.value_counts(subset='Label')

number of interactions in test


Label
1    43653
0    43652
dtype: int64