In [18]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

# Read the data

In [19]:
# Load the 'filter_data' worksheet of the workbook at all_data_path.
all_data_path = "../../data_for_modeling/filter_data/v3/HDAC2_all_data_filtered_p2.xlsx"
dataset = pd.read_excel(
    all_data_path,
    sheet_name="filter_data",
)
# Row count of the loaded frame; kept as the last expression so the cell shows it.
dataset_length = dataset.shape[0]
dataset_length

2801

In [20]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS,Original_CID,ZBG Classified
0,264,CCCC(=O)O,206000.0,Inactive,Inactive,1,264,3
1,2746,CC(=O)Nc1ccc(C(=O)Nc2ccccc2N)cc1,1495.875,Inactive,Inactive,1,2746,4
2,2788,Oc1c(I)cc(Cl)c2cccnc12,105800.0,Inactive,Inactive,1,2788,21
3,3121,CCCC(CCC)C(=O)O,288246.666667,Inactive,Inactive,1,3121,3
4,3994,CN(C)c1ccc(cc1)C(=O)NCCCCCCC(=O)NO,20.7,Active,Active,1,3994,1


# 1. Profile of the data

In [21]:
def check_activity_distribution(dataset, col_name):
    """Print how many rows are labelled "Active" vs "Inactive" in a column.

    Rows whose value in ``col_name`` is neither "Active" nor "Inactive" are
    counted in neither class, but they still count toward the percentage
    denominator (the total number of rows).

    Args:
        dataset: pandas DataFrame that contains the label column.
        col_name: Name of the column compared against "Active" / "Inactive".

    Returns:
        None. A table with the counts and percentages is printed to stdout.
    """
    labels = dataset[col_name]
    n_active = len(dataset.loc[labels == "Active"])
    n_inactive = len(dataset.loc[labels == "Inactive"])
    total = len(dataset)

    def _percentage(count):
        # Guard the empty-frame case, which used to raise ZeroDivisionError.
        return count / total * 100 if total else 0.0

    print("Total dataset")
    table = [
        ['', 'Active', 'Inactive'],
        ['Number', n_active, n_inactive],
        ['Percentage (%)', _percentage(n_active), _percentage(n_inactive)],
    ]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

__Train on the final label, not the first label__

In [22]:
# Report the class balance of the FINAL_LABEL column over the whole dataset.
check_activity_distribution(dataset, 'FINAL_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 899      │  1902      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  32.0957 │    67.9043 │
╘════════════════╧══════════╧════════════╛


# 2. Train test split

In [23]:
# Take a copy of the loaded frame for the split step and print its row count.
dataset_c = dataset.copy(deep=True)
print(dataset_c.shape[0])

2801


In [None]:
from sklearn.model_selection import train_test_split

# Output location and split configuration.
train_test_data_path = "../../data_for_modeling/train_test_data"
train_test_file_name = "HDAC2_train_test_data.xlsx"
validation_size = 0.1  # 10% of the total data
test_size = 0.2  # 20% of the remaining 90%
random_state = 42

# Drop the columns that are not carried into the split files.
train_test_data = dataset_c.drop(['FIRST_LABEL', 'DUPLICATE_COUNTS', 'Original_CID'], axis=1)

# Stage 1: hold out the validation set from the full data.
train_test_dataset, validation_dataset = train_test_split(
    train_test_data, test_size=validation_size, random_state=random_state
)
# Stage 2: split only the remainder into train and test. Splitting the full
# frame here instead would put validation rows into the train/test sets.
train_dataset, test_dataset = train_test_split(
    train_test_dataset, test_size=test_size, random_state=random_state
)

# The three subsets must partition the data: sizes add up and no row is shared
# between the validation set and the train/test sets.
assert len(train_dataset) + len(test_dataset) + len(validation_dataset) == len(train_test_data)
assert validation_dataset.index.intersection(train_dataset.index).empty
assert validation_dataset.index.intersection(test_dataset.index).empty

In [69]:
# Preview the first five rows of the training split.
train_dataset.head(n=5)

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FINAL_LABEL,ZBG Classified
0,264,CCCC(=O)O,206000.0,Inactive,3
1,2746,CC(=O)Nc1ccc(C(=O)Nc2ccccc2N)cc1,1495.875,Inactive,4
2,2788,Oc1c(I)cc(Cl)c2cccnc12,105800.0,Inactive,21
3,3121,CCCC(CCC)C(=O)O,288246.666667,Inactive,3
4,3994,CN(C)c1ccc(cc1)C(=O)NCCCCCCC(=O)NO,20.7,Active,1


In [70]:
# Print the size of the pre-split frame, then the size and label balance of each split.
print(f"Unclean data size: {len(train_test_data)}")
for split_name, split_frame in (
    ("train", train_dataset),
    ("test", test_dataset),
    ("external", validation_dataset),
):
    print(f"Unclean {split_name} dataset: {len(split_frame)}")
    check_activity_distribution(split_frame, 'FINAL_LABEL')
    print()

Unclean data size: 2801
Unclean train dataset: 2016
Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 649      │  1367      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  32.1925 │    67.8075 │
╘════════════════╧══════════╧════════════╛

Unclean test dataset: 504
Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 159      │   345      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  31.5476 │    68.4524 │
╘════════════════╧══════════╧════════════╛

Unclean external dataset: 281
Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │  91      │   190      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  32.3843 │  

In [None]:
# Write to file
with pd.ExcelWriter(train_test_data_path+train_test_file_name, engine='openpyxl') as writer:
    train_dataset.to_excel(writer, sheet_name='train_dataset', index=False)
    test_dataset.to_excel(writer, sheet_name='test_dataset', index=False)
    validation_dataset.to_excel(writer, sheet_name='validation_dataset', index=False)