In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.preprocessing import LabelEncoder 

# 1. Import data

In [None]:
def show_activity_distribution(dataset):
    #Rows for specific labels
    active_rows = dataset.loc[dataset["Bioactivity"] == "active"]
    inactive_rows = dataset.loc[dataset["Bioactivity"] == "inactive"]
    dataset_length = len(dataset)
    print("Total dataset")
    table = [['', 'Active', 'Inactive'], 
            ['Number', len(active_rows), len(inactive_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length, len(inactive_rows)/dataset_length]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [6]:
train_test_path = "../../data/train_test_data/NoCL/20240307_pan_HDAC_train_test_data.xlsx"
train_dataset = pd.read_excel(train_test_path, sheet_name='train_dataset')
print(len(train_dataset))
show_activity_distribution(train_dataset)

1067
Total dataset
╒════════════════╤════════════╤════════════╕
│                │     Active │   Inactive │
╞════════════════╪════════════╪════════════╡
│ Number         │ 416        │ 651        │
├────────────────┼────────────┼────────────┤
│ Percentage (%) │   0.389878 │   0.610122 │
╘════════════════╧════════════╧════════════╛


In [7]:
train_dataset.head()

Unnamed: 0,MOL_ID,SMILES,Bioactivity
0,415892,ONC(CCC1=CCCN(CCCc(cc2)ccc2-c2ccccc2)C1=O)=O,inactive
1,11449823,C1=CC=C(C=C1)C2=CC(=CC=C2)NC(=O)CCCCCCS,active
2,2305748,CC(C)c(c(O)c1)cc(C(N(CC2)c(cc3)c2cc3NC(CCCC(NO...,inactive
3,1282795,CC(C)(C)OC(C[C@@H](c1nnc(C)[n]1-c1c2c(C)c(C)[s...,inactive
4,390828,COc1cccc(C(c2cc(cc(/C=C/C(NO)=O)cc3)c3[nH]2)=O)c1,inactive


## Split train dataset and remove randomly

In [18]:
active_dataset = train_dataset[train_dataset["Bioactivity"] == "active"].reset_index()
inactive_dataset = train_dataset[train_dataset["Bioactivity"] == "inactive"].reset_index()
print(len(inactive_dataset), len(active_dataset))

651 416


In [19]:
len(inactive_dataset)

651

In [21]:
def select_index(df: pd.DataFrame, sub_sample, seed: int):
    np.random.seed(seed)
    return np.random.choice(df.index, size=int(len(df)*sub_sample), replace=False).tolist()

train_datasets = None
for i in np.round(np.linspace(0.05, 0.95, num=19), decimals=2):
    for seed in range(10):
        active_idx = select_index(df=active_dataset, sub_sample=i, seed=seed)
        inactive_idx = select_index(df=inactive_dataset, sub_sample=i, seed=seed)
        rand_subsample = pd.concat([inactive_dataset.iloc[inactive_idx].copy(), active_dataset.iloc[active_idx].copy()]) 
        rand_subsample.loc[:, "subsample_size"] = i
        rand_subsample.loc[:, "rand_seed"] = seed
        if train_datasets is None:
            train_datasets = rand_subsample
        else:
            train_datasets = pd.concat([train_datasets, rand_subsample], axis=0)

In [31]:
test_data = train_datasets[(train_datasets["subsample_size"] == 0.1) & (train_datasets["rand_seed"] == 2)]
show_activity_distribution(test_data)
test_data.head()

Total dataset
╒════════════════╤═══════════╤════════════╕
│                │    Active │   Inactive │
╞════════════════╪═══════════╪════════════╡
│ Number         │ 41        │  65        │
├────────────────┼───────────┼────────────┤
│ Percentage (%) │  0.386792 │   0.613208 │
╘════════════════╧═══════════╧════════════╛


Unnamed: 0,index,MOL_ID,SMILES,Bioactivity,subsample_size,rand_seed
37,65,298980,CC(SCC(NCCCCCC(Nc1ccccc1)=O)=O)=O,inactive,0.1,2
520,864,166630425,C1=CC=C2C(=C1)NC(=N2)C3=CC4=C(C=C3)N=C(N4)C5=C...,inactive,0.1,2
475,790,1840395,ONC(CCCCC(N(C/C=C/CCOc1cc(-c2ccnc(N3)n2)ccc1)C...,inactive,0.1,2
641,1048,336043,CN(C)c(cc1)ccc1S(CCCCCC(NO)=O)=O,inactive,0.1,2
595,983,25118057,CNC(=O)C(=O)CCCCCCC(=O)NC1=CC=CC=C1,inactive,0.1,2


In [57]:
train_datasets.to_csv("../../data/survey/dataset_size/subsampled_train_dataset.csv")