In [13]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from mordred import Calculator, descriptors

In [27]:
# Load the "Final data to train" sheet of the filtered HDAC2 workbook.
dataset = pd.read_excel(
    io="../data_for_modeling/filter_data/v1/HDAC2_data_after_filtering_raw.xlsx",
    sheet_name="Final data to train",
)

In [28]:
def canonical_smiles(smiles):
    """Convert each SMILES string to the canonical form produced by RDKit.

    Args:
        smiles: Iterable of SMILES strings (e.g. a pandas Series or a list).

    Returns:
        list[str]: Canonical SMILES, in the same order as the input.

    Raises:
        ValueError: If an entry cannot be parsed into a molecule. The message
            carries the position and the offending string so the bad record
            can be located in the source table.
    """
    canonical = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles reports a parse failure by returning None; passing
            # that on to MolToSmiles would fail with an error that does not
            # say which input was at fault.
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smi!r}"
            )
        canonical.append(Chem.MolToSmiles(mol))
    return canonical

In [29]:
# Canonicalise every SMILES in the table; the cell output is the number of entries processed.
Canon_SMILES = canonical_smiles(dataset['SMILES'])
len(Canon_SMILES)

993

In [30]:
# Replace the SMILES column with its canonical form, then display the table.
dataset = dataset.assign(SMILES=Canon_SMILES)
dataset

Unnamed: 0,STT,SMILES,CID,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,0,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.21400,Active,Inactive,1
1,1,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68000,Active,Active,1
2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12000,Active,Inactive,1
3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.25200,Active,Active,1
4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,Active,Inactive,4
...,...,...,...,...,...,...,...
988,769,O=C(Cc1ccccc1)NO,220184,,Unspecified,Inactive,1
989,769,CCCC(CCC)C(=O)NO,88129,,Unspecified,Inactive,1
990,769,O=C(O)CCCc1ccccc1,4775,,Unspecified,Inactive,1
991,769,CCCC(CCC)C(=O)O,3121,,Unspecified,Inactive,2


In [31]:
# SMILES of every row that repeats an earlier row (first occurrences are not flagged).
duplicates_smiles = dataset.loc[dataset['SMILES'].duplicated(), 'SMILES'].values
duplicates_smiles.size

9

In [32]:
# Show all records whose SMILES is repeated, sorted so identical structures sit on adjacent rows.
(
    dataset
    .loc[dataset['SMILES'].isin(duplicates_smiles)]
    .sort_values(by='SMILES')
)

Unnamed: 0,STT,SMILES,CID,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
252,252,CC(C)C1NC(=O)C2(C)CSC(=N2)c2csc(n2)CNC(=O)CC(C...,155564400,0.09551,Active,Active,1
290,290,CC(C)C1NC(=O)C2(C)CSC(=N2)c2csc(n2)CNC(=O)CC(C...,155545380,0.021,Active,Active,1
249,249,CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1,155566833,8.3,Active,Inactive,1
260,260,CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1,155560779,7.0,Active,Inactive,1
270,270,CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1,155555330,4.2,Active,Inactive,1
915,769,CCCCCCC(CCCCCC(=O)Nc1ccccc1)C(=O)NO,134156853,,Unspecified,Inactive,1
917,769,CCCCCCC(CCCCCC(=O)Nc1ccccc1)C(=O)NO,134136160,,Unspecified,Inactive,1
974,769,COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1,25065309,,Unspecified,Inactive,1
975,769,COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1,25065102,,Unspecified,Inactive,1
977,769,COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1,24779722,,Unspecified,Inactive,1


In [33]:
# Keep the first row for each SMILES and discard later repeats; the output is the remaining row count.
dataset_new = dataset.drop_duplicates(subset='SMILES', keep='first')
dataset_new.shape[0]

984

In [34]:
# Shuffled 80/20 split with a fixed seed so the partition is reproducible.
# No `stratify` argument is passed, so class proportions may differ between the two parts.
train_dataset, test_dataset = train_test_split(
    dataset_new,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [35]:
# Pull the label column of each partition out into its own NumPy array (copied, not a view).
y_Train = train_dataset['FINAL_LABEL'].to_numpy(copy=True)
y_Test = test_dataset['FINAL_LABEL'].to_numpy(copy=True)

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Turn the string labels into a 0/1 integer vector.
# The encoder is fitted ONCE, on the training labels, and then reused for the
# test labels so that column 0 refers to the same class in both partitions.
# (Re-fitting on the test labels could silently change what column 0 means if
# the set of labels present in the test partition differed.)
encoder = OneHotEncoder()

y_Train_2d = y_Train.reshape(-1, 1)  # the encoder expects a 2-D (n_samples, 1) input
encoder.fit(y_Train_2d)
one_hot_encoded_data = encoder.transform(y_Train_2d).toarray()
# Column 0 is the indicator for encoder.categories_[0][0], i.e. the first label
# in the encoder's category order (presumably "Active" for Active/Inactive
# labels -- check encoder.categories_ to confirm).
# astype returns a new array, so the result must be assigned to take effect.
y_Train_sc = one_hot_encoded_data[:, 0].astype(np.int64)

y_Test_2d = y_Test.reshape(-1, 1)
# transform only: a label unseen during fit raises here instead of being re-mapped.
one_hot_encoded_data = encoder.transform(y_Test_2d).toarray()
y_Test_sc = one_hot_encoded_data[:, 0].astype(np.int64)

# Display the encoded training labels.
y_Train_sc

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,

In [37]:
# Overwrite the label column of each partition with its encoded counterpart.
train_dataset['FINAL_LABEL'] = y_Train_sc
test_dataset['FINAL_LABEL'] = y_Test_sc

In [38]:
# Write the de-duplicated table and both partitions to CSV, omitting the index column.
dataset_new.to_csv(
    path_or_buf="../data_for_modeling/filter_data/v1/HDAC2_total.csv",
    index=False,
)
train_dataset.to_csv(
    path_or_buf="../data_for_modeling/filter_data/v1/HDAC2_train.csv",
    index=False,
)
test_dataset.to_csv(
    path_or_buf="../data_for_modeling/filter_data/v1/HDAC2_test.csv",
    index=False,
)