__Data preprocessing of the raw train-test data, to remove any duplicates, label conflicts, or 
IC50 value errors__

In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

# Starting data preprocessing

## 1. Read the data

In [2]:
# Location of the raw HDAC2 workbook, relative to this notebook.
all_data_path = "../../data_for_modeling/raw_data/new_HDAC2_raw_data.xlsx"
# Load only the "original_data" sheet into a DataFrame.
dataset = pd.read_excel(io=all_data_path, sheet_name="original_data")

In [3]:
# Work on a separate copy so the frame loaded from Excel stays untouched.
dataset_c = dataset.copy()
# Row count of the raw data, followed by a preview of its first rows.
print(dataset.shape[0])
dataset.head()

2814


Unnamed: 0,CID,SMILES,Categories,ZBG Classified,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,6918878,CC(=O)Nc1ccc(C(=O)Nc2cc(-c3cccs3)ccc2N)cc1,active,4,15.0,20.0,
1,76284329,Nc1ccc(-c2cccs2)cc1NC(=O)c1cnc2c(C3CC3)c(N3CCN...,active,4,15.0,20.0,
2,118721861,Cc1cccc(NC(=O)[C@H](CCCCCS)NC(=O)[C@H]2CCC(=O)...,active,4,9.0,15.0,
3,165430653,CN(C)c1ccc(C(=O)N(CC(=O)NCc2ccccc2)Cc2ccc(C(=O...,active,1,15.0,,
4,42601485,CCOP(=O)(CNCc1ccc(C(=O)Nc2cc(-c3cccs3)ccc2N)cc...,active,4,15.0,20.0,


In [4]:
# Names of the columns this notebook reads from the dataset.
cid_col_name = "CID"                # identifier column
smiles_col_name = "SMILES"          # structure as a SMILES string
activity_col_name = "Categories"    # activity label ("active" / "inactive")

In [5]:
def show_activity_distribution(label, dataset):
    """Print a table with the number and percentage of active/inactive rows.

    Parameters
    ----------
    label : str
        Name of the column holding the activity labels.
    dataset : pandas.DataFrame
        Data to summarise.

    Notes
    -----
    Percentages are relative to the total number of rows in ``dataset``, so
    they only add up to 100 when every row is labelled "active" or
    "inactive". An empty ``dataset`` is reported as 0 % for both labels
    instead of raising ``ZeroDivisionError``.
    """
    n_total = len(dataset)
    n_active = int((dataset[label] == "active").sum())
    n_inactive = int((dataset[label] == "inactive").sum())

    def _percent(count):
        # Guard the division so an empty frame does not crash the report.
        return count / n_total * 100 if n_total else 0.0

    print("Total dataset")
    table = [['', 'active', 'inactive'],
             ['Number', n_active, n_inactive],
             ['Percentage (%)', _percent(n_active), _percent(n_inactive)]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

## 2. Activity distribution

The raw data used to contain inconclusive and unspecified labels; these have since been removed, so only active and inactive labels remain.

In [6]:
# Summarise how many rows of the raw dataset are labelled active vs. inactive.
show_activity_distribution(dataset=dataset, label=activity_col_name)

Total dataset
╒════════════════╤══════════╤════════════╕
│                │   active │   inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 897      │  1917      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  31.8763 │    68.1237 │
╘════════════════╧══════════╧════════════╛


## 3. Remove data with conflicting labels

In [7]:
def check_label_intersection(dataset, col_name, smiles_col="SMILES"):
    """Print how many SMILES strings occur under both "active" and "inactive".

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data holding the label column and the SMILES column.
    col_name : str
        Name of the activity-label column.
    smiles_col : str, optional
        Name of the SMILES column. Defaults to "SMILES", the column name that
        was previously hard-coded.

    Notes
    -----
    SMILES are compared as plain strings, exactly as stored in the column.
    Rows with a missing SMILES value are ignored.
    """
    is_active = dataset[col_name] == "active"
    is_inactive = dataset[col_name] == "inactive"

    # Drop missing values first: np.intersect1d sorts its inputs and cannot
    # order NaN against strings.
    smiles_active = dataset.loc[is_active, smiles_col].dropna()
    smiles_inactive = dataset.loc[is_inactive, smiles_col].dropna()
    conflicting_smiles = np.intersect1d(smiles_active, smiles_inactive)

    print("activity intersection:")
    table = [['active-inactive'],
             [len(conflicting_smiles)]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [8]:
# Count the SMILES strings that appear with both the active and the inactive label.
check_label_intersection(dataset=dataset, col_name=activity_col_name)

activity intersection:
╒═══════════════════╕
│   active-inactive │
╞═══════════════════╡
│                 0 │
╘═══════════════════╛


## Remove label intersections

__Remove label intersection__

In [9]:
def check_label_intersection(dataset, col_name, smiles_col="SMILES"):
    '''
        Description: Search for any data that have the same SMILES but have different labels,
        and print how many such SMILES exist.

        Args:
            dataset: pandas DataFrame holding the label column and the SMILES column.
            col_name: name of the activity-label column ("active" / "inactive" values).
            smiles_col: name of the SMILES column; defaults to "SMILES", the
                column name that was previously hard-coded.

        SMILES are compared as plain strings, exactly as stored in the column;
        rows with a missing SMILES value are ignored.
    '''
    active_mask = dataset[col_name] == "active"
    inactive_mask = dataset[col_name] == "inactive"

    # Missing values are removed before intersecting: np.intersect1d sorts its
    # inputs and cannot order NaN against strings.
    active_smiles = dataset.loc[active_mask, smiles_col].dropna()
    inactive_smiles = dataset.loc[inactive_mask, smiles_col].dropna()
    shared_smiles = np.intersect1d(active_smiles, inactive_smiles)

    print("activity intersection:")
    table = [['Active-Inactive'],
             [len(shared_smiles)]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [10]:
# Repeat the conflict check with the helper as redefined in the previous cell.
check_label_intersection(dataset=dataset, col_name=activity_col_name)

activity intersection:
╒═══════════════════╕
│   Active-Inactive │
╞═══════════════════╡
│                 0 │
╘═══════════════════╛


__Check whether the SMILES can be encoded__

In [11]:
from tqdm import tqdm

def check_encode(data):
    """Check that every SMILES can be parsed and fingerprinted with RDKit.

    Each entry is parsed with ``Chem.MolFromSmiles`` and then encoded both as
    a Morgan bit vector (radius 2, 1024 bits) and as MACCS keys. Entries for
    which any of these steps fails are reported on stdout and skipped.

    Parameters
    ----------
    data : sized iterable of str
        SMILES strings, e.g. a DataFrame column.

    Returns
    -------
    numpy.ndarray
        MACCS fingerprints of the entries that were encoded successfully, in
        input order (previously this array was always empty because the
        fingerprints were never collected).
    list of int
        Zero-based positions (not index labels) of the entries in ``data``
        that could not be encoded.
    """
    maccs_fpts = []
    error_index = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # RDKit signals an unparsable SMILES by returning None.
                    raise ValueError("SMILES could not be parsed")
                # Computed only to confirm that the Morgan encoding succeeds.
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
                maccs_fpts.append(MACCSkeys.GenMACCSKeys(mol))
            except Exception:
                # Exception (not a bare except) so Ctrl-C still interrupts.
                print("An exception occurred with " + str(position))
                error_index.append(position)
            finally:
                # Advance for failures too, so the bar reaches 100 %.
                pbar.update(1)
    return np.array(maccs_fpts), error_index

In [12]:
# Try to encode every SMILES; error_idx receives the positions of the ones that fail.
smiles_data, error_idx = check_encode(dataset[smiles_col_name])

Progress:  21%|██▏       | 599/2814 [00:00<00:01, 1523.06it/s]

Progress:  87%|████████▋ | 2439/2814 [00:01<00:00, 1741.84it/s][15:44:31] Can't kekulize mol.  Unkekulized atoms: 12 13 14 15 16
[15:44:31] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[15:44:31] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 18
[15:44:31] Can't kekulize mol.  Unkekulized atoms: 1 2 3 14 15 16 17 18 19
Progress: 100%|█████████▉| 2810/2814 [00:01<00:00, 1630.61it/s]

An exception occurred with 2467
An exception occurred with 2560
An exception occurred with 2562
An exception occurred with 2569





In [13]:
# Positions of the SMILES that could not be encoded.
error_idx

[2467, 2560, 2562, 2569]

In [15]:
# error_idx holds row positions relative to `dataset` (the frame that was
# passed to check_encode), so select from it rather than from `dataset_c`,
# whose row positions shift once the failing entries are dropped.
error_encode_data = dataset.iloc[error_idx]

In [17]:
# Save the rows whose SMILES failed to encode to a separate workbook.
# No index=False is passed here, so the DataFrame index is written as well.
error_encode_data.to_excel("../../results/error_data/error_encode_data.xlsx")

In [16]:
# error_idx contains row positions, while DataFrame.drop expects index labels.
# Translate positions to labels via the frame that was checked (`dataset`), and
# ignore labels that are already gone so the cell can be re-run safely.
dataset_c = dataset_c.drop(index=dataset.index.take(error_idx), errors="ignore")

In [17]:
# Number of rows left after dropping the SMILES that could not be encoded.
len(dataset_c)

2810

In [18]:
# Re-run the check on the filtered data; this overwrites error_idx with the new result.
smiles_data, error_idx = check_encode(dataset_c[smiles_col_name])

Progress:   0%|          | 0/2810 [00:00<?, ?it/s]

Progress: 100%|██████████| 2810/2810 [00:01<00:00, 1590.28it/s]


__Write to file__

In [54]:
# Write the SMILES-validated dataset to the preprocessed workbook.
# The multi-sheet export that was commented out here referred to frames that
# are not created anywhere in the visible notebook (new_data, dup_result,
# data_ic50_processed), so it has been dropped; a single sheet needs no
# explicit ExcelWriter context.
dataset_c.to_excel(
    '../../data_for_modeling/preprocessed_data/HDAC2_preprocessed_data.xlsx',
    sheet_name='Sheet1',
    index=False,
    engine='openpyxl',
)