In [2]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn import preprocessing

# Read the data

In [4]:
# Load the 'Sheet1' worksheet of the preprocessed HDAC2 workbook and show its row count.
all_data_path = "../../data_for_modeling/preprocessed_data/HDAC2_preprocessed_data.xlsx"
dataset = pd.read_excel(io=all_data_path, sheet_name="Sheet1")
dataset_length = dataset.shape[0]
dataset_length

2809

In [5]:
# Preview the first rows of the loaded table.
dataset.head()

Unnamed: 0,CID,SMILES,Categories,ZBG Classified,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,6918878,CC(=O)Nc1ccc(C(=O)Nc2cc(-c3cccs3)ccc2N)cc1,active,4,15.0,20.0,
1,76284329,Nc1ccc(-c2cccs2)cc1NC(=O)c1cnc2c(C3CC3)c(N3CCN...,active,4,15.0,20.0,
2,118721861,Cc1cccc(NC(=O)[C@H](CCCCCS)NC(=O)[C@H]2CCC(=O)...,active,4,9.0,15.0,
3,165430653,CN(C)c1ccc(C(=O)N(CC(=O)NCc2ccccc2)Cc2ccc(C(=O...,active,1,15.0,,
4,42601485,CCOP(=O)(CNCc1ccc(C(=O)Nc2cc(-c3cccs3)ccc2N)cc...,active,4,15.0,20.0,


# 1. Profile of the data

In [18]:
def check_activity_distribution(dataset, col_name):
    """Print how many rows are labelled "active" vs. "inactive" in one column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the label column.
    col_name : str
        Name of the column holding the "active" / "inactive" labels.

    Returns
    -------
    None
        The total row count and a count/percentage table are printed.

    Notes
    -----
    Percentages are relative to the total number of rows, so rows carrying
    any other label (or a missing value) lower both figures. An empty
    ``dataset`` reports 0 % for both classes instead of raising
    ``ZeroDivisionError``.
    """
    labels = dataset[col_name]
    # Count matches directly from the boolean masks; int() keeps plain Python
    # integers in the table, exactly as len() of the filtered frame would give.
    n_active = int((labels == "active").sum())
    n_inactive = int((labels == "inactive").sum())
    dataset_length = len(dataset)

    def _percentage(count):
        # Guard the empty-table case: there is nothing to divide by.
        return count / dataset_length * 100 if dataset_length else 0.0

    print(f"Total dataset: {dataset_length}")
    table = [['', 'active', 'inactive'],
             ['Number', n_active, n_inactive],
             ['Percentage (%)', _percentage(n_active), _percentage(n_inactive)]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

__Train on the final label, not the first label__

In [11]:
# Class balance of the complete table, based on the 'Categories' column.
check_activity_distribution(dataset, "Categories")

Total dataset
╒════════════════╤══════════╤════════════╕
│                │   active │   inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 897      │  1913      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  31.9217 │    68.0783 │
╘════════════════╧══════════╧════════════╛


# 2. Train test split

In [12]:
# Split from a separate copy of the loaded table; print its row count as a sanity check.
dataset_c = dataset.copy(deep=True)
print(dataset_c.shape[0])

2810


In [23]:
# Encode the string labels in 'Categories' as integers, then list the fitted classes.
le = preprocessing.LabelEncoder()
y = le.fit_transform(np.asarray(dataset['Categories']))
list(le.classes_)

['active', 'inactive']

In [25]:
# Display the integer code the fitted encoder assigns to each entry of le.classes_.
le.transform(le.classes_)

array([0, 1])

In [24]:
# Inspect the encoded label vector.
y

array([0, 0, 0, ..., 1, 0, 0])

In [29]:
from sklearn.model_selection import train_test_split

# Split the row index, not the rows themselves: 70 % goes to training, and the
# held-out 30 % is then halved into test and validation parts (15 % each).
# NOTE(review): neither call passes `stratify`, so the active/inactive ratio of
# each part is left to chance - confirm that this is intended.
X_train, X_temp, y_train, y_temp = train_test_split(
    dataset_c.index,
    y,
    test_size=0.3,
    random_state=42,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [32]:
# X_train / X_test / X_val contain index *labels* drawn from dataset_c.index,
# so rows are selected by label with .loc. Positional .iloc only returns the
# same rows while the index is the default 0..n-1 range; .loc stays correct if
# the table is ever filtered or re-indexed before the split.
train_dataset = dataset_c.loc[X_train]
test_dataset = dataset_c.loc[X_test]
validation_dataset = dataset_c.loc[X_val]

In [None]:
# Report the class balance of each split, in the order train, test, validation.
for split_frame in (train_dataset, test_dataset, validation_dataset):
    check_activity_distribution(split_frame, "Categories")

In [35]:
# Save the three splits as separate sheets of one workbook, without the index column.
sheets_to_write = (
    ('train_dataset', train_dataset),
    ('test_dataset', test_dataset),
    ('validation_dataset', validation_dataset),
)
with pd.ExcelWriter("../../data_for_modeling/train_test_data/new_HDAC2_train_test_data.xlsx", engine='openpyxl') as writer:
    for sheet_name, split_frame in sheets_to_write:
        split_frame.to_excel(writer, sheet_name=sheet_name, index=False)