In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.preprocessing import LabelEncoder 

# 1. Import data

In [2]:
train_test_path = "../../data/train_test_data/CL/20240216_clean_data_approach1_method2.xlsx"
train_dataset = pd.read_excel(train_test_path, sheet_name='train_dataset')
test_dataset = pd.read_excel(train_test_path, sheet_name='test_dataset')
validation_dataset = pd.read_excel(train_test_path, sheet_name='validation_dataset')
print(len(train_dataset), len(test_dataset), len(validation_dataset))

604 123 137


In [3]:
train_dataset.head()

Unnamed: 0,MOL_ID,SMILES,IC50,IC50 of reference (vorinostat),Bioactivity,Ref_DOI
0,1162325,ONC(c1cnc(N(C[C@H]23)C[C@@H]2[C@H]3Nc2nc3ccccc...,490.4,60.2,inactive,10.1016/j.ejmech.2021.113799
1,136030779,C1=CC=C(C=C1)CN2C=C(N=N2)C3=CC(=CC=C3)C(=O)NO,58.0,107.0,active,10.1021/jm101605z
2,71520630,CC(C)C1=CC=C(C=C1)C(=O)NOCCCCCC(=O)NO,75.1,50.1,inactive,10.1021/acs.jmedchem.1c00821
3,11723098,C1=CC=C(C=C1)CCN2C=CC(=N2)C3=CC=C(S3)C(=O)NO,5000.0,120.0,inactive,10.1016/s0960-894x(02)00622-4
4,164613013,C1=CC=C2C(=C1)N=C(S2)C(=O)NOCCCCCC(=O)NO,23000.0,280.0,inactive,10.1021/jm900125m


In [4]:
def show_activity_distribution(dataset):
    #Rows for specific labels
    active_rows = dataset.loc[dataset["Bioactivity"] == "active"]
    inactive_rows = dataset.loc[dataset["Bioactivity"] == "inactive"]
    dataset_length = len(dataset)
    print("Total dataset")
    table = [['', 'Active', 'Inactive'], 
            ['Number', len(active_rows), len(inactive_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length, len(inactive_rows)/dataset_length]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [5]:
show_activity_distribution(train_dataset)
show_activity_distribution(test_dataset)
show_activity_distribution(validation_dataset)

Total dataset
╒════════════════╤════════════╤════════════╕
│                │     Active │   Inactive │
╞════════════════╪════════════╪════════════╡
│ Number         │ 182        │ 422        │
├────────────────┼────────────┼────────────┤
│ Percentage (%) │   0.301325 │   0.698675 │
╘════════════════╧════════════╧════════════╛
Total dataset
╒════════════════╤═══════════╤════════════╕
│                │    Active │   Inactive │
╞════════════════╪═══════════╪════════════╡
│ Number         │ 33        │  90        │
├────────────────┼───────────┼────────────┤
│ Percentage (%) │  0.268293 │   0.731707 │
╘════════════════╧═══════════╧════════════╛
Total dataset
╒════════════════╤═══════════╤════════════╕
│                │    Active │   Inactive │
╞════════════════╪═══════════╪════════════╡
│ Number         │ 46        │  91        │
├────────────────┼───────────┼────────────┤
│ Percentage (%) │  0.335766 │   0.664234 │
╘════════════════╧═══════════╧════════════╛


## Split train dataset and remove randomly

In [6]:
active_dataset = train_dataset[train_dataset["Bioactivity"] == "active"]
inactive_dataset = train_dataset[train_dataset["Bioactivity"] == "inactive"]
print(len(inactive_dataset), len(active_dataset))

422 182


In [7]:
def remove_n_elements(df, n, seed):
    np.random.seed(seed)
    to_remove = np.random.choice(df.index, size=n, replace=False)
    return df.drop(to_remove)

no_removed = len(inactive_dataset) - len(active_dataset)
train_datasets = []
for i in range(10):
    train_dataset = remove_n_elements(df=inactive_dataset, n=no_removed, seed=i)
    train_dataset = pd.concat([train_dataset, active_dataset])
    train_datasets.append(train_dataset)
    print(len(train_dataset))

364
364
364
364
364
364
364
364
364
364


In [11]:
with pd.ExcelWriter('../../data/survey_data/CL_then_balance/20240216_CL_then_balance_survey_data.xlsx', engine='xlsxwriter') as writer:
    for i in range(10):
        train_datasets[i].to_excel(writer, sheet_name=f"train_set_{i}")