In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.preprocessing import LabelEncoder 

# 1. Import preprocessed data

In [2]:
# Location of the preprocessed pan-HDAC workbook, relative to this notebook.
preprocessed_data_path = "../../data/preprocessed/20240321_pan-hdac-preprocessed.xlsx"

# Read the 'Sheet1' worksheet into a DataFrame, then show the number of rows.
dataset = pd.read_excel(io=preprocessed_data_path, sheet_name='Sheet1')
dataset.shape[0]

2183

In [3]:
# Preview the first five rows of the loaded table.
dataset.head(n=5)

Unnamed: 0,Code,SMILES,Bioactivity
0,11305,ONC(CCCCCCC(Nc1ccccc1)=O)=O,active
1,236556,[O-][N+](c(cc1)ccc1C(NCCCCCC(NO)=O)=O)=O,inactive
2,1352187,[O-][N+](c(cc1)ccc1S([n](cc1)c2c1cc(/C=C/C(NO)...,active
3,418270,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(/C=C/C(NO...,inactive
4,418245,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(C(NO)=O)[...,inactive


In [4]:
def check_activity_distribution(dataset, col_name):
    """Print the active/inactive class balance of ``dataset``.

    Counts the rows whose ``col_name`` value is exactly ``"active"`` or
    ``"inactive"`` and prints the total row count followed by a table with
    the absolute numbers and their share of all rows in percent.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the label column.
    col_name : str
        Name of the column holding the ``"active"`` / ``"inactive"`` labels.

    Returns
    -------
    None
        The summary is printed, not returned.

    Notes
    -----
    Percentages are relative to the total number of rows, so rows carrying
    any other label lower both percentages. An empty ``dataset`` is reported
    as 0 % for both classes instead of raising ``ZeroDivisionError``.
    """
    dataset_length = len(dataset)
    label_column = dataset[col_name]

    # Count matches directly on the boolean masks; no filtered copies needed.
    # int() keeps the table cells plain Python ints, as len() produced before.
    n_active = int((label_column == "active").sum())
    n_inactive = int((label_column == "inactive").sum())

    def percentage(count):
        # Guard against division by zero when the dataset has no rows.
        return count / dataset_length * 100 if dataset_length else 0.0

    print(f"Total dataset: {dataset_length}")
    table = [['', 'Active', 'Inactive'],
             ['Number', n_active, n_inactive],
             ['Percentage (%)', percentage(n_active), percentage(n_inactive)]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [5]:
# Class balance of the complete dataset before any splitting.
check_activity_distribution(dataset, "Bioactivity")

Total dataset: 2183
╒════════════════╤═══════════╤════════════╕
│                │    Active │   Inactive │
╞════════════════╪═══════════╪════════════╡
│ Number         │ 1006      │  1177      │
├────────────────┼───────────┼────────────┤
│ Percentage (%) │   46.0834 │    53.9166 │
╘════════════════╧═══════════╧════════════╛


In [6]:
from tqdm import tqdm

# Morgan fingerprints
def morgan_fpts(data, radius=2, n_bits=1024):
    """Compute Morgan bit-vector fingerprints for a collection of SMILES.

    Parameters
    ----------
    data : sized iterable of str
        SMILES strings; must support ``len()`` (used for the progress bar).
    radius : int, optional
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``
        (default 2, the value previously hard-coded).
    n_bits : int, optional
        Fingerprint length passed as ``nBits`` (default 1024, the value
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        One row per successfully parsed molecule, in input order.

    Notes
    -----
    Entries that cannot be converted to a molecule are reported on stdout
    with their zero-based position in ``data`` and skipped, so the result
    can have fewer rows than ``data`` has items. Callers that pair the
    fingerprints with labels must drop the reported positions as well.
    """
    fingerprints = []
    for position, smiles in enumerate(tqdm(data, total=len(data), desc='Progress')):
        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception:
            # Non-string input (e.g. a missing value) makes RDKit raise.
            mol = None
        if mol is None:
            # MolFromSmiles signals an unparsable SMILES by returning None
            # rather than raising, so this case needs an explicit check.
            print("An exception occurred with " + str(position))
            continue
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        fingerprints.append(np.array(bit_vect))
    return np.array(fingerprints)

# 2. Train test split

In [7]:
# Turn the string labels into integer codes. LabelEncoder stores its classes
# in sorted order, so 'active' is encoded as 0 and 'inactive' as 1.
bioactivity_labels = dataset['Bioactivity']
le = LabelEncoder()
labels = le.fit_transform(bioactivity_labels)

In [8]:
# Class names in encoding order (list position == integer code).
[class_name for class_name in le.classes_]

['active', 'inactive']

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed so both splits are reproducible.
random_state = 42

# Stage 1: keep 70 % of the index labels for training, hold out 30 %.
train_idx, temp_indeces, y_train, y_temp = train_test_split(
    dataset.index,
    labels,
    test_size=0.3,
    random_state=random_state,
)

# Stage 2: cut the held-out 30 % in half. The first returned part becomes the
# test set, the second (the `test_size` share) becomes the validation set.
test_idx, val_idx, y_test, y_val = train_test_split(
    temp_indeces,
    y_temp,
    test_size=0.5,
    random_state=random_state,
)

In [10]:
# train_idx / val_idx / test_idx hold index *labels* drawn from dataset.index,
# so rows are selected by label with .loc. Positional .iloc only gives the
# same rows while the index is the default 0..n-1 range.
train_df = dataset.loc[train_idx]
val_df = dataset.loc[val_idx]
test_df = dataset.loc[test_idx]

In [11]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,Code,SMILES,Bioactivity
376,415897,CC(Nc1ccc(CN(CCC=C2CCC(NO)=O)C2=O)cc1)=O,inactive
929,1589183,COc(cc(/C=C/C(Nc(cccc1)c1N)=O)cc1)c1OCC(Nc(cc1...,inactive
439,1161066,CC[C@H](C)[C@@H](C(N(Cc1c(C2)ccc(OCC(NO)=O)c1)...,active
271,2100074,CC(c1ccccc1)Nc1ncnc2c1cc(-c1ccc(CN3CCN(CCOCCCC...,inactive
244,386804,CC(C)SC(SCC(c1ccc(C)cc1)=O)=S,inactive


In [12]:
# Print the class balance of every split, in train / validation / test order.
for split_title, split_df in (
    ("Train dataset", train_df),
    ("Validation dataset", val_df),
    ("Test dataset", test_df),
):
    print(split_title)
    check_activity_distribution(dataset=split_df, col_name="Bioactivity")

Train dataset
Total dataset: 1528
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │  701     │    827     │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │   45.877 │     54.123 │
╘════════════════╧══════════╧════════════╛
Validation dataset
Total dataset: 328
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 149      │   179      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  45.4268 │    54.5732 │
╘════════════════╧══════════╧════════════╛
Test dataset
Total dataset: 327
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 156      │   171      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  47.7064 │    52.2936 │
╘════════════════╧══════════╧═══════

In [14]:
# Save the three splits as separate sheets of a single workbook.
# Dict order fixes the sheet order: train, test, validation.
split_output_path = "../../data/train_test_data/NoCL/20240321_pan_HDAC_train_test_data.xlsx"
sheets_to_write = {
    'train_dataset': train_df,
    'test_dataset': test_df,
    'validation_dataset': val_df,
}
with pd.ExcelWriter(split_output_path, engine='openpyxl') as writer:
    for sheet_name, split_frame in sheets_to_write.items():
        # index=False: the row index is not written to the sheet.
        split_frame.to_excel(writer, sheet_name=sheet_name, index=False)