__Notification__: The raw data from PubChem database have many errors and duplicates, therefore, this file perform the cleaning it.
- Use the my-rdkit-env environment
- The data that we used after preprocessing is availible in this link: https://drive.google.com/file/d/1YIhBD51oWA0s3p-egIHepNb3iZZbXqb1/view?usp=sharing

In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate

import os
import psycopg2

from dotenv import load_dotenv
import logging

#MACCS
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit import RDLogger  
import sys
sys.path.append("/home/mylab-pharma/Code/tuele/pan_HDAC/mylab_panHDAC-master/src/common")
from pharmacy_common import PharmacyCommon
#class to encode smiles
common = PharmacyCommon()

## Preparing

### Class definition

In [2]:
import csv

# Step 1: Read the text data
with open('/home/mylab-pharma/Code/tuele/pan_HDAC/mylab_panHDAC-master/data/screening_data/enamine_hits.txt', 'r') as file:
    text_data = file.read()

# Step 2: Parse the text data
rows = text_data.split('\n')  # Split text into rows based on newlines
data = [row.split('\t') for row in rows]  # Split rows into columns based on tab delimiter

# Step 3: Create a CSV file
with open('/home/mylab-pharma/Code/tuele/pan_HDAC/mylab_panHDAC-master/data/screening_data/enamine_hits.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    # Step 4: Write data to the CSV file
    writer.writerows(data)

print("Text data converted to CSV successfully!")

Text data converted to CSV successfully!


### Getting the train - test data

In [2]:
train_test_path = "../../data/train_test_data/NoCL/20240321_pan_HDAC_train_test_data.xlsx"
train_dataset = pd.read_excel(train_test_path, sheet_name='train_dataset')
test_dataset = pd.read_excel(train_test_path, sheet_name='test_dataset')
validation_dataset = pd.read_excel(train_test_path, sheet_name='validation_dataset')

train_test_dataset = pd.concat([train_dataset, test_dataset, validation_dataset], axis=0)

In [3]:
print(len(train_dataset), len(test_dataset), len(validation_dataset), len(train_test_dataset))

1528 327 328 2183


In [4]:
train_test_dataset.head()

Unnamed: 0,Code,SMILES,Bioactivity
0,415897,CC(Nc1ccc(CN(CCC=C2CCC(NO)=O)C2=O)cc1)=O,inactive
1,1589183,COc(cc(/C=C/C(Nc(cccc1)c1N)=O)cc1)c1OCC(Nc(cc1...,inactive
2,1161066,CC[C@H](C)[C@@H](C(N(Cc1c(C2)ccc(OCC(NO)=O)c1)...,active
3,2100074,CC(c1ccccc1)Nc1ncnc2c1cc(-c1ccc(CN3CCN(CCOCCCC...,inactive
4,386804,CC(C)SC(SCC(c1ccc(C)cc1)=O)=S,inactive


## Checking and insert data into the preprocessing table

In [5]:
def check_error_fpts(check_dataset, smiles_column):
    """
    Checks for errors in MACCS fingerprint calculation.

    Args:
        check_dataset (pd.DataFrame): The dataset to be checked.
        smiles_column (str): The name of the SMILES column.

    Returns:
        pd.DataFrame: The rows with errors in MACCS fingerprint calculation.
    """
    result_df = pd.DataFrame(columns=check_dataset.columns)
    for index, row in check_dataset.iterrows():
        current_smiles = str(row[smiles_column]).strip()
        if current_smiles is not None and len(current_smiles) > 0:
            try:
                RDLogger.DisableLog('rdApp.info')
                mol = Chem.MolFromSmiles(current_smiles)
                if mol is not None:
                    result_df = pd.concat([result_df, row.to_frame().T], axis=0)  # Concatenate the current row                
                else:
                    logging.info("Could not interpret " + current_smiles + " to mol object!")
            except Exception as e:
                logging.error("An exception occurred at row " + str(index) + ": " + str(e))
                continue
    return result_df

def preprocess_dataset(working_dataset, train_test_dataset):
    """
    Preprocesses the dataset by removing duplicate SMILES and rows with errors in MACCS fingerprint calculation.

    Args:
        working_dataset (pd.DataFrame): The dataset to be preprocessed.
        train_test_dataset (pd.DataFrame): The training/testing dataset.

    Returns:
        pd.DataFrame: The preprocessed dataset.
    """
    #Filter
    logging.info("[+] Working dataset: " + str(len(working_dataset)))
    #Check for error smiles while encoding
    working_dataset = check_error_fpts(working_dataset, 'SMILES')
    #Resert index
    working_dataset.reset_index(drop=True, inplace=True) 
    # Get the duplicate SMILES from the training/testing dataset.
    duplicate_smiles = working_dataset[working_dataset['SMILES'].isin(train_test_dataset['SMILES'])]
    logging.info("[+] Duplicate with the train-test data: " + str(len(duplicate_smiles)))
    # Get the indices of duplicate smiles in test_working_dataset
    duplicate_indices = duplicate_smiles.index
    # Remove rows with duplicate SMILES from test_working_dataset
    working_dataset.drop(index=duplicate_indices, inplace=True)
    #Ending report
    logging.info("[+] Ending preprocessing: " + str(len(working_dataset)))
    return working_dataset

## Screening data preprocessing

In [7]:
screening_data_path = "../../data/screening_data/all_screen_data.xlsx"
screening_data = pd.read_excel(screening_data_path, sheet_name='final_screen_data')
screening_data.head(10)

Unnamed: 0,SMILES
0,O=C1CCON1
1,CCC(=O)NO
2,NCC(=O)NO
3,CON(C)C(C)=O
4,ON1C(=O)CCC1=O
5,CC1(C)CONC1=O
6,CC(C)(C)C(=O)NO
7,CCC(=O)N(C)OC
8,CCC(N)C(=O)NO
9,COCONC(C)=O


### Test data preprocessing

In [9]:
test_working_dataset = screening_data.iloc[:10,:]
error_dataset = {
    # test_working_dataset.columns[0]: [6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    test_working_dataset.columns[0]: ['O=C(NO)c1cnc(NC2(c3ccc(C(F)(F)F)c(F)c3)CC2)nc1', '', '4453', 'hellow rodl', 'Cc1cn(C)nc1CN1CCC(c2ccc(C(=O)Nc3ccccc3N)cc2)CC1', 'qwe', '####@@qds', None, ' ', '']
}
error_dataset = pd.DataFrame(error_dataset)
test_working_dataset = pd.concat([test_working_dataset, error_dataset], axis=0)

In [10]:
test_working_dataset = preprocess_dataset(working_dataset=test_working_dataset, train_test_dataset=train_test_dataset)

[09:59:46] SMILES Parse Error: syntax error while parsing: 4453
[09:59:46] SMILES Parse Error: Failed parsing SMILES '4453' for input: '4453'
[09:59:46] SMILES Parse Error: syntax error while parsing: hellow
[09:59:46] SMILES Parse Error: Failed parsing SMILES 'hellow' for input: 'hellow'
[09:59:46] SMILES Parse Error: syntax error while parsing: qwe
[09:59:46] SMILES Parse Error: Failed parsing SMILES 'qwe' for input: 'qwe'
[09:59:46] SMILES Parse Error: syntax error while parsing: ####@@qds
[09:59:46] SMILES Parse Error: Failed parsing SMILES '####@@qds' for input: '####@@qds'
[09:59:46] SMILES Parse Error: syntax error while parsing: None
[09:59:46] SMILES Parse Error: Failed parsing SMILES 'None' for input: 'None'


### Starting preprocessing

In [11]:
print(len(screening_data))

542869


In [12]:
encoding_screen_data = common.gen_ecfp4_fpts(screening_data["SMILES"],bits= 1024)

Progress:   0%|          | 0/542869 [00:00<?, ?it/s]

Progress: 100%|██████████| 542869/542869 [03:17<00:00, 2743.23it/s]


In [13]:
print(len(encoding_screen_data))

542869
