In [20]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

# Read the data

In [21]:
# Absolute, machine-specific location of the source workbook.
all_data_path = "/home/mylab-pharma/Code/tuele/XO/data/raw_data/20240530_data_XO_with substructure.xlsx"
# Only the 'Preprocessed' sheet of the workbook is loaded.
dataset = pd.read_excel(io=all_data_path, sheet_name="Preprocessed")
# Number of rows in the loaded sheet; echoed as the cell output.
dataset_length = dataset.shape[0]
dataset_length

483

In [22]:
# Preview the first five rows to check which columns were loaded.
dataset.head()

Unnamed: 0,CID,SMILES,IC50(nM),aid,Type,Substructure
0,190,C1=NC2=NC=NC(=C2N1)N,10890.0,287937,active,3
1,471,C1=CC(=C(C=C1C2C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,100000.0,399340,active,14
2,675,CC1=CC2=C(C=C1C)N=CN2,200000.0,287937,active,16
3,938,C1=CC(=CN=C1)C(=O)O,518230.0,1444598,active,16
4,4947,CCCOC(=O)C1=CC(=C(C(=C1)O)O)O,628000.0,378145,active,2


# 1. Profile of the data

In [23]:
def check_activity_distribution(dataset, col_name, title="Total dataset"):
    """Print how many rows are labelled "active" vs "inactive", with percentages.

    Args:
        dataset: pandas DataFrame that contains the label column.
        col_name: Name of the column whose values are compared against the
            strings "active" and "inactive".
        title: Heading printed above the table. Defaults to "Total dataset",
            the heading this function has always printed, so existing calls
            produce the same output.

    Returns:
        None. The heading and the table are written to stdout.

    Notes:
        Percentages are relative to ``len(dataset)``, so rows carrying any
        other label (or a missing value) still count in the denominator.
        An empty dataset reports 0 % for both classes instead of raising
        ``ZeroDivisionError``.
    """
    label_values = dataset[col_name]
    # int(...) keeps plain Python ints in the table, as len() did before.
    n_active = int((label_values == "active").sum())
    n_inactive = int((label_values == "inactive").sum())
    total = len(dataset)

    def _percent(count):
        # Guard the empty-frame case; otherwise same arithmetic as before.
        return count / total * 100 if total else 0.0

    print(title)
    table = [
        ['', 'Active', 'Inactive'],
        ['Number', n_active, n_inactive],
        ['Percentage (%)', _percent(n_active), _percent(n_inactive)],
    ]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

__Train on the final label, not the first label__

In [24]:
# Class balance of the full dataset, using the 'Type' column as the label.
check_activity_distribution(dataset=dataset, col_name='Type')

Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 267      │   216      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  55.2795 │    44.7205 │
╘════════════════╧══════════╧════════════╛


# 2. Train / test / validation split

In [25]:
# Work on a copy so the originally loaded frame is left as it was read.
dataset_c = dataset.copy(deep=True)
print(dataset_c.shape[0])

483


In [26]:
from sklearn.model_selection import train_test_split

# --- Output location and split settings ---
train_test_data_path = "/home/mylab-pharma/Code/tuele/XO/data/train_test_data"
train_test_file_name = "XO_train_test_data_for_tmap.xlsx"
# The validation set is carved out first, as 15% of the full dataset.
validation_size = 0.15
# The test set is then taken from the remaining 85%, so 0.1765 (~0.15 / 0.85)
# makes it roughly 15% of the full dataset as well.
test_size = 0.1765
random_state = 42

# --- Inputs ---
# Every column is kept, including IC50(nM).
train_test_data = dataset_c
labels = dataset_c['Type']  # class label column (active/inactive)

# Stage 1: full data -> (train+test pool, validation), stratified on the label.
# Note that train_test_data is rebound to the pool from here on.
train_test_data, validation_dataset, train_labels, validation_labels = train_test_split(
    train_test_data,
    labels,
    test_size=validation_size,
    random_state=random_state,
    stratify=labels,
)

# Stage 2: train+test pool -> (train, test), stratified on the pool's labels.
train_dataset, test_dataset, train_labels, test_labels = train_test_split(
    train_test_data,
    train_labels,
    test_size=test_size,
    random_state=random_state,
    stratify=train_labels,
)

In [27]:
# Row counts of the train, validation and test splits, in that order.
print(*(len(split) for split in (train_dataset, validation_dataset, test_dataset)))

337 73 73


In [28]:
# Preview the first five rows of the training split; the index keeps the
# row labels the rows had before the split.
train_dataset.head()

Unnamed: 0,CID,SMILES,IC50(nM),aid,Type,Substructure
304,145967694,CC1=CC2=C(C=C1)N=C(O2)/C(=N/O)/CC3=CC=CC=C3,17500.0,1389558,active,16
158,76329670,CC1(C=CC2=CC(=C(C=C2O1)O)C(=O)/C=C/C3=CC(=C(C=...,1800.0,1485273,inactive,6
80,5320686,C1=CC(=CC=C1/C=C/C(=O)OC[C@@H]2[C@H]([C@@H]([C...,100000.0,399340,active,10
326,155903284,C1=CC(=CC=C1C2=NC=NN2)NC(=O)C3C(NC(=O)NC3=O)O,1400.0,1806026,inactive,1
274,137648214,CCCCC1=NN2C(=N1)C3=C(NC2=O)NN=C3,529.0,1485284,inactive,1


In [29]:
# Number of rows in the training split.
len(train_dataset)

337

In [30]:
# At this point train_test_data is the train+test pool (validation already
# split off), so this is the pool size rather than the full dataset size.
print("Unclean data size: " + str(len(train_test_data)))

# Size and class balance of each split, each followed by a blank line.
for _header, _frame in (
    ("Unclean train dataset: ", train_dataset),
    ("Unclean test dataset: ", test_dataset),
    ("Unclean external dataset: ", validation_dataset),
):
    print(_header + str(len(_frame)))
    check_activity_distribution(_frame, 'Type')
    print()

Unclean data size: 410
Unclean train dataset: 337
Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 187      │   150      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  55.4896 │    44.5104 │
╘════════════════╧══════════╧════════════╛

Unclean test dataset: 73
Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │  40      │    33      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  54.7945 │    45.2055 │
╘════════════════╧══════════╧════════════╛

Unclean external dataset: 73
Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │  40      │    33      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  54.7945 │    45

In [31]:
# Write to file
import os

# Join folder and file name with a path separator. Plain "+" glued them into
# ".../train_test_dataXO_train_test_data_for_tmap.xlsx", i.e. a file written
# next to the intended folder instead of inside it.
os.makedirs(train_test_data_path, exist_ok=True)  # the target folder may not exist yet
train_test_output_path = os.path.join(train_test_data_path, train_test_file_name)

# One workbook, one sheet per split; index=False leaves the DataFrame index
# out of the sheets.
with pd.ExcelWriter(train_test_output_path, engine='openpyxl') as writer:
    train_dataset.to_excel(writer, sheet_name='train_dataset', index=False)
    test_dataset.to_excel(writer, sheet_name='test_dataset', index=False)
    validation_dataset.to_excel(writer, sheet_name='validation_dataset', index=False)