In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

# Import raw data

In [4]:
all_data_path = "../../data/raw_data/20.3.pan_HDAC.csv"
dataset = pd.read_csv(all_data_path)

In [5]:
dataset_c = dataset.copy() #Get a copy of the original dataset
print(len(dataset_c))
dataset_c.head()

2183


Unnamed: 0,Code,Average IC50,Bioactivity,SMILES,INCHI-KEY
0,11305,236.6,active,ONC(CCCCCCC(Nc1ccccc1)=O)=O,
1,236556,1556.0,inactive,[O-][N+](c(cc1)ccc1C(NCCCCCC(NO)=O)=O)=O,GDAAHZPABCCCFD-UHFFFAOYSA-N
2,1352187,12.6,active,[O-][N+](c(cc1)ccc1S([n](cc1)c2c1cc(/C=C/C(NO)...,KAFPWADPNJMVFZ-KRXBUXKQSA-N
3,418270,1600.0,inactive,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(/C=C/C(NO...,DVMYLPRGYNBANT-ZZXKWVIFSA-N
4,418245,400.0,inactive,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(C(NO)=O)[...,SINFIJWJVFHMHP-UHFFFAOYSA-N


In [6]:
dataset_c["Bioactivity"].unique()

array(['active', 'inactive'], dtype=object)

In [7]:
activity_col_name = "Bioactivity"
smiles_col_name = "SMILES"
cid_col_name = "MOL_ID"

In [8]:
def show_activity_distribution(label, dataset):
    #Rows for specific labels
    active_rows = dataset.loc[dataset[label] == "active"]
    inactive_rows = dataset.loc[dataset[label] == "inactive"]
    dataset_length = len(dataset)
    print("Total dataset")
    table = [['', 'Active', 'Inactive'], 
            ['Number', len(active_rows), len(inactive_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length, len(inactive_rows)/dataset_length]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

# Data preprocessing

## 1. Check activity distribution

In [9]:
show_activity_distribution(dataset=dataset_c, label=activity_col_name)

Total dataset
╒════════════════╤═════════════╤═════════════╕
│                │      Active │    Inactive │
╞════════════════╪═════════════╪═════════════╡
│ Number         │ 1006        │ 1177        │
├────────────────┼─────────────┼─────────────┤
│ Percentage (%) │    0.460834 │    0.539166 │
╘════════════════╧═════════════╧═════════════╛


## 2. Check conflict labels

In [10]:
def check_label_intersection(dataset, col_name):
    active_rows = dataset.loc[dataset[col_name] == "active"]
    inactive_rows = dataset.loc[dataset[col_name] == "inactive"]
    cid_active = active_rows.loc[:, smiles_col_name]
    cid_inactive = inactive_rows.loc[:, smiles_col_name]
    #intersect1d check for string intersection while also matching the case 
    ac_inac_cid = np.intersect1d(cid_active, cid_inactive)
    print(f"Activity intersection: {len(ac_inac_cid)}")
    return ac_inac_cid

In [11]:
duplicate_smiles = check_label_intersection(dataset=dataset, col_name=activity_col_name)

Activity intersection: 0


In [12]:
duplicate_smiles[0:5]

array([], dtype=object)

## Remove labels intersections

In [13]:
duplicate_rows = pd.DataFrame(columns=dataset_c.columns)
for smiles in duplicate_smiles:
    dup_row = dataset.loc[dataset[smiles_col_name] == smiles]
    duplicate_rows = pd.concat([duplicate_rows, dup_row])

In [14]:
len(duplicate_rows)
duplicate_rows.to_excel("../../results/removed_smiles/20240321_pan_hdac_duplicate_smiles.xlsx", index=False) 
dup_indeces = duplicate_rows.index
dataset_c = dataset_c.drop(dup_indeces)

In [15]:
dataset_c.head()

Unnamed: 0,Code,Average IC50,Bioactivity,SMILES,INCHI-KEY
0,11305,236.6,active,ONC(CCCCCCC(Nc1ccccc1)=O)=O,
1,236556,1556.0,inactive,[O-][N+](c(cc1)ccc1C(NCCCCCC(NO)=O)=O)=O,GDAAHZPABCCCFD-UHFFFAOYSA-N
2,1352187,12.6,active,[O-][N+](c(cc1)ccc1S([n](cc1)c2c1cc(/C=C/C(NO)...,KAFPWADPNJMVFZ-KRXBUXKQSA-N
3,418270,1600.0,inactive,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(/C=C/C(NO...,DVMYLPRGYNBANT-ZZXKWVIFSA-N
4,418245,400.0,inactive,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(C(NO)=O)[...,SINFIJWJVFHMHP-UHFFFAOYSA-N


In [16]:
check_label_intersection(dataset=dataset_c, col_name=activity_col_name)

Activity intersection: 0


array([], dtype=object)

In [17]:
show_activity_distribution(dataset=dataset_c, label=activity_col_name)

Total dataset
╒════════════════╤═════════════╤═════════════╕
│                │      Active │    Inactive │
╞════════════════╪═════════════╪═════════════╡
│ Number         │ 1006        │ 1177        │
├────────────────┼─────────────┼─────────────┤
│ Percentage (%) │    0.460834 │    0.539166 │
╘════════════════╧═════════════╧═════════════╛


## Check if the smiles are encodeable

In [18]:
from tqdm import tqdm

def maccs_fpts(data):
    Maccs_fpts = []
    error_index = []
    count = 0
    with tqdm(total=len(data), desc='Progress') as pbar:
        for i in data:
            try:
                mol = Chem.MolFromSmiles(i)
                fpts = MACCSkeys.GenMACCSKeys(mol)
            except:
                print("An exception occurred with " + str(count))
                error_index.append(count)
                count += 1
                continue
            mfpts = np.array(fpts)
            Maccs_fpts.append(mfpts)
            count += 1
            pbar.update(1)  # Update the progress bar
    return np.array(Maccs_fpts), error_index

In [19]:
smiles_data, error_idx = maccs_fpts(dataset_c[smiles_col_name]) #All of them are encodable by rdkit, so none of them are removed

Progress:   0%|          | 0/2183 [00:00<?, ?it/s]

[15:19:47] Conflicting single bond directions around double bond at index 15.
[15:19:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[15:19:47] Conflicting single bond directions around double bond at index 27.
[15:19:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[15:19:47] Conflicting single bond directions around double bond at index 26.
[15:19:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[15:19:47] Conflicting single bond directions around double bond at index 7.
[15:19:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[15:19:47] Conflicting single bond directions around double bond at index 7.
[15:19:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[15:19:47] Conflicting single bond directions around double bond at index 8.
[15:19:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[15:19:47] Conflicting single bond directio

# Write to file

In [20]:
dataset_c.head()

Unnamed: 0,Code,Average IC50,Bioactivity,SMILES,INCHI-KEY
0,11305,236.6,active,ONC(CCCCCCC(Nc1ccccc1)=O)=O,
1,236556,1556.0,inactive,[O-][N+](c(cc1)ccc1C(NCCCCCC(NO)=O)=O)=O,GDAAHZPABCCCFD-UHFFFAOYSA-N
2,1352187,12.6,active,[O-][N+](c(cc1)ccc1S([n](cc1)c2c1cc(/C=C/C(NO)...,KAFPWADPNJMVFZ-KRXBUXKQSA-N
3,418270,1600.0,inactive,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(/C=C/C(NO...,DVMYLPRGYNBANT-ZZXKWVIFSA-N
4,418245,400.0,inactive,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(C(NO)=O)[...,SINFIJWJVFHMHP-UHFFFAOYSA-N


In [21]:
dataset_c = dataset_c[["Code", "SMILES", "Bioactivity"]]

In [19]:
dataset_c.to_excel("../../data/preprocessed/20240321_pan-hdac-preprocessed.xlsx", index=False)