# ChEMBL 
✅ Final curated ChEMBL CETP dataset saved: 1393 compounds

✅ Filtering biologically relevant IC50 range (1 fM – 10 mM) (step 3)

✅ Grouping by ligand-target system (step 2)

✅ Keeping the best pIC50 if multiple measurements exist (step 4)

✅ Detecting and removing rounded/duplicate/near-identical values (step 5)

(📌 Steps 1 and 6 of the Kramer curation protocol are not applied here: the target is already a valid one, and author data is not available via the ChEMBL API.)



In [None]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import numpy as np

# Step 1: Search for CETP target
# NOTE(review): the first search hit is used without checking organism or
# target type — confirm that it is the intended CETP target.
target_df = pd.DataFrame(new_client.target.search("CETP"))
if target_df.empty:
    raise ValueError("ChEMBL target search for 'CETP' returned no results")
target_id = target_df.loc[0, 'target_chembl_id']

# Step 2: Fetch all IC50 entries
page = new_client.activity.filter(
    target_chembl_id=target_id,
    standard_type="IC50"
).only([
    'molecule_chembl_id', 'canonical_smiles',
    'standard_value', 'standard_units',
    'standard_type', 'standard_relation', 'document_chembl_id'
])
activities = list(page)

# Step 3: Convert to DataFrame
df = pd.DataFrame(activities)

# Step 4: Filter rows with valid data
# The value column is renamed to IC50_nM and converted with a 1e-9 factor
# below, so only records reported in nM may pass this filter.
# NOTE(review): 'standard_relation' is fetched but not used; if censored
# records ('>' / '<') are present they are treated as exact values — confirm
# whether they should be excluded.
df = df[
    df['standard_value'].notna()
    & df['canonical_smiles'].notna()
    & (df['standard_units'] == 'nM')
].copy()

# Step 5: Rename and clean columns
df = df.rename(columns={
    'canonical_smiles': 'SMILES',
    'standard_value': 'IC50_nM',
    'document_chembl_id': 'DocumentID'
})
df['IC50_nM'] = pd.to_numeric(df['IC50_nM'], errors='coerce')
# Kramer Step 3: keep 1e-6 nM (1 fM) < IC50 < 1e7 nM (10 mM). Values that
# failed numeric conversion are NaN and fail both comparisons, so they are
# dropped here too. The copy keeps the later column assignments off a slice.
df = df[(df['IC50_nM'] > 1e-6) & (df['IC50_nM'] < 1e7)].copy()

# Step 6: Convert to pIC50 (nM -> M, then negative log10)
df['pIC50'] = -np.log10(df['IC50_nM'] * 1e-9)

# Step 7: Create SystemID (ligand ID + SMILES)
df['SystemID'] = df['molecule_chembl_id'] + "_" + df['SMILES']

# Step 8: Keep best pIC50 per publication (Kramer Step 4)
# Sorting pIC50 descending within each (SystemID, DocumentID) pair makes the
# first row of every pair the highest pIC50, which drop_duplicates then keeps.
df = df.sort_values(['SystemID', 'DocumentID', 'pIC50'], ascending=[True, True, False])
df = df.drop_duplicates(subset=['SystemID', 'DocumentID'], keep='first')

# Step 9: Kramer Step 5 – Remove rounded or near-duplicate pIC50 values across docs
def flag_suspect(group):
    """Flag every row of a group whose pIC50 values look mutually suspect.

    The whole group is flagged when any two of its pIC50 values either

    * differ by less than 0.02 without being exactly equal (rounding-level
      near-duplicates), or
    * differ by 3 or 6 log units within a tolerance of 0.01 (the offset
      produced by a nM/µM/mM unit mix-up).

    The unit check is done on pairwise differences: testing the raw values
    against multiples of 3 would discard any compound whose pIC50 merely
    happens to be close to 3, 6 or 9.

    NOTE(review): exactly identical values are not flagged by the
    near-duplicate test (the ``diffs > 0`` condition that excludes the
    diagonal also excludes exact ties) — confirm this is intended.

    Args:
        group: DataFrame holding a numeric 'pIC50' column.

    Returns:
        Boolean Series indexed like ``group``: all True when the group is
        suspect, all False otherwise.
    """
    vals = group['pIC50'].to_numpy(dtype=float)
    # Symmetric matrix of absolute pairwise differences; the diagonal is 0.
    diffs = np.abs(np.subtract.outer(vals, vals))
    near_duplicate = (diffs > 0) & (diffs < 0.02)
    unit_error = (
        np.isclose(diffs, 3, rtol=0, atol=0.01)
        | np.isclose(diffs, 6, rtol=0, atol=0.01)
    )
    suspect = bool(np.any(near_duplicate | unit_error))
    return pd.Series(np.full(len(group), suspect, dtype=bool), index=group.index)

# Evaluate flag_suspect once per SystemID and write its verdict back row by row.
# An explicit fill is used instead of groupby(...).apply(...): apply changes its
# return shape (a wide DataFrame instead of a Series) when every group yields an
# identically indexed Series, e.g. when only one SystemID is present.
suspect = pd.Series(False, index=df.index)
for _, system_rows in df.groupby('SystemID'):
    suspect.loc[system_rows.index] = flag_suspect(system_rows)
df['suspect_duplicate'] = suspect.astype(bool)
df = df[~df['suspect_duplicate']]

# Step 10: Final clean-up
df = df[['molecule_chembl_id', 'SMILES', 'IC50_nM', 'pIC50', 'SystemID', 'DocumentID']]
# One row per SMILES string; the first row in the current order is the one kept.
df = df.drop_duplicates(subset='SMILES')

# Step 11: Save to file
df.to_csv("chembl_cetp_ic50_curated.csv", index=False)
print(f"✅ Final curated ChEMBL CETP dataset saved: {len(df)} compounds")


# BindingDB
✅ Final curated dataset saved: bindingdb_cetp_curated.csv (1286 compounds)

New steps:  
| Section                                    | Added Functionality         |
| ------------------------------------------ | --------------------------- |
| ✅ `SystemID` creation                      | Enables tracking duplicates |
| ✅ pIC50 curation                           | Keeps best per publication  |
| ✅ Rounding + transcription error filtering | Removes noisy duplications  |
| ✅ Optional author filtering                | Removes biased entries      |


In [None]:
import pandas as pd
import numpy as np

# Step 1: Read headers and identify UniProt columns
bindingdb_path = r"C:\QSAR\QSAR_data\BindingDB_All.tsv"
header = pd.read_csv(bindingdb_path, sep="\t", nrows=0)
uniprot_cols = [col for col in header.columns if "UniProt" in col]

# Step 2: Prepare for chunked reading
chunksize = 10 ** 5
filtered_chunks = []

# Step 3: Process file in chunks
for chunk in pd.read_csv(bindingdb_path, sep="\t", low_memory=False, chunksize=chunksize):
    # Step 3.1: Filter rows with any UniProt column containing 'P11597'
    # apply() runs column by column here; any(axis=1) then collapses per row.
    mask = chunk[uniprot_cols].apply(
        lambda col: col.astype(str).str.contains('P11597', na=False)
    ).any(axis=1)
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of `chunk`.
    filtered = chunk[mask].copy()

    # Step 3.2: Keep valid IC50 in nM range
    filtered = filtered[filtered['IC50 (nM)'].notna()].copy()
    # Entries that are not plain numbers become NaN and fail the range test.
    filtered['IC50_nM'] = pd.to_numeric(filtered['IC50 (nM)'], errors='coerce')
    filtered = filtered[(filtered['IC50_nM'] > 1e-6) & (filtered['IC50_nM'] < 1e7)].copy()  # between 1 fM and 10 mM

    # Step 3.3: Handle SMILES column
    smiles_col = 'Smiles' if 'Smiles' in filtered.columns else 'Ligand SMILES'
    filtered = filtered[filtered[smiles_col].notna()]
    filtered = filtered.rename(columns={smiles_col: 'SMILES'})

    # Step 3.4: Create SystemID = SMILES + UniProt for grouping
    # NOTE(review): uniprot_cols is every header containing "UniProt", so the
    # first non-null one may not be an accession column, and for multi-chain
    # rows it need not be the chain that matched 'P11597' — confirm against
    # the BindingDB header.
    filtered['UniProt_ID'] = filtered[uniprot_cols].bfill(axis=1).iloc[:, 0]  # use first non-null UniProt column
    filtered['SystemID'] = filtered['SMILES'] + '_' + filtered['UniProt_ID']

    # Step 3.5: Calculate pIC50 (nM -> M, then negative log10)
    filtered['pIC50'] = -np.log10(filtered['IC50_nM'] * 1e-9)

    # Step 3.6: Keep key columns for curation
    keep_cols = ['BindingDB Reactant_set_id', 'SMILES', 'IC50_nM', 'pIC50', 'SystemID', 'PMID']
    for col in keep_cols:
        # Missing columns are created as NaN so every chunk has the same layout.
        if col not in filtered.columns:
            filtered[col] = np.nan
    filtered = filtered[keep_cols]

    filtered_chunks.append(filtered)

# Step 4: Combine all chunks
df = pd.concat(filtered_chunks, ignore_index=True)

# Step 5: Kramer Step 4 — Keep highest pIC50 per SystemID + PMID
# Sorting pIC50 descending within each (SystemID, PMID) pair makes the first row
# of every pair the highest pIC50, which drop_duplicates then keeps.
# NOTE(review): drop_duplicates treats missing PMIDs as equal, so all rows of a
# SystemID that lack a PMID collapse into a single row — confirm this is wanted.
df = df.sort_values(['SystemID', 'PMID', 'pIC50'], ascending=[True, True, False])
df = df.drop_duplicates(subset=['SystemID', 'PMID'], keep='first')

# Step 6: Kramer Step 5 — Remove suspect duplicates
def flag_suspect(group):
    """Flag every row of a group whose pIC50 values look mutually suspect.

    The whole group is flagged when any two of its pIC50 values either

    * differ by less than 0.02 without being exactly equal (rounding-level
      near-duplicates), or
    * differ by 3 or 6 log units within a tolerance of 0.01 (the offset
      produced by a nM/µM/mM unit mix-up).

    The unit check is done on pairwise differences: testing the raw values
    against multiples of 3 would discard any compound whose pIC50 merely
    happens to be close to 3, 6 or 9.

    NOTE(review): exactly identical values are not flagged by the
    near-duplicate test (the ``diffs > 0`` condition that excludes the
    diagonal also excludes exact ties) — confirm this is intended.

    Args:
        group: DataFrame holding a numeric 'pIC50' column.

    Returns:
        Boolean Series indexed like ``group``: all True when the group is
        suspect, all False otherwise.
    """
    vals = group['pIC50'].to_numpy(dtype=float)
    # Symmetric matrix of absolute pairwise differences; the diagonal is 0.
    diffs = np.abs(np.subtract.outer(vals, vals))
    near_duplicate = (diffs > 0) & (diffs < 0.02)
    unit_error = (
        np.isclose(diffs, 3, rtol=0, atol=0.01)
        | np.isclose(diffs, 6, rtol=0, atol=0.01)
    )
    suspect = bool(np.any(near_duplicate | unit_error))
    return pd.Series(np.full(len(group), suspect, dtype=bool), index=group.index)

# Evaluate flag_suspect once per SystemID and write its verdict back row by row.
# An explicit fill is used instead of groupby(...).apply(...): apply changes its
# return shape (a wide DataFrame instead of a Series) when every group yields an
# identically indexed Series, e.g. when only one SystemID is present.
suspect = pd.Series(False, index=df.index)
for _, system_rows in df.groupby('SystemID'):
    suspect.loc[system_rows.index] = flag_suspect(system_rows)
df['suspect_duplicate'] = suspect.astype(bool)
df = df[~df['suspect_duplicate']]

# Step 7: Kramer Step 6 — Drop by overlapping authors (optional)
# NOTE(review): the chunk loop earlier in this cell keeps only the columns in
# keep_cols, which does not include 'Authors', so this branch is skipped unless
# that list is extended.
if 'Authors' in df.columns:
    df['Authors'] = df['Authors'].fillna('')
    df = df.sort_values(['SystemID', 'Authors'])
    df = df.drop_duplicates(subset=['SystemID', 'Authors'], keep='first')

# Step 8: Finalize and save
# One row per SMILES string; the first row in the current order is the one kept.
df = df[['SystemID', 'SMILES', 'IC50_nM', 'pIC50', 'PMID']].drop_duplicates(subset='SMILES')
df.to_csv("bindingdb_cetp_curated.csv", index=False)
print(f"✅ Final curated dataset saved: bindingdb_cetp_curated.csv ({len(df)} compounds)")


# Combining
✅ Final deduplicated dataset saved: combined_cetp_dataset.csv (1423 compounds) — note that the cell below writes the file as `combined_chembel_bindingdb_dataset.csv`.

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MolToSmiles

# Step 1: Read the two curated CSV files produced by the earlier cells
shared_cols = ['SMILES', 'IC50_nM', 'pIC50']
df_binding = pd.read_csv("bindingdb_cetp_curated.csv")
df_chembl = pd.read_csv("chembl_cetp_ic50_curated.csv")

# Step 2: Reduce each table to the shared columns and tag its origin
df_binding = df_binding[shared_cols].assign(dataset='BindingDB')
df_chembl = df_chembl[shared_cols].assign(dataset='ChEMBL')

# Step 3: Stack the tables, BindingDB rows first, under a fresh 0..n-1 index
df_combined = pd.concat((df_binding, df_chembl), ignore_index=True)

# ✅ Step 4: Canonicalize SMILES using RDKit
def canonicalize_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or None on failure.

    Args:
        smiles: SMILES string to canonicalize. Any non-string value
            (None, NaN, numbers) is rejected without calling RDKit.

    Returns:
        The string produced by ``MolToSmiles(mol, canonical=True)``, or None
        when the input is not a string, when ``Chem.MolFromSmiles`` yields no
        molecule, or when RDKit raises while processing it.
    """
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort: a structure RDKit cannot handle is dropped by the caller.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return None

df_combined['canonical_smiles'] = df_combined['SMILES'].apply(canonicalize_smiles)

# ✅ Step 5: Drop rows without a canonical SMILES, then duplicates on it
# drop_duplicates keeps the first occurrence in the current row order.
# The copy keeps the column assignment below off a derived slice.
df_combined = (
    df_combined
    .dropna(subset=['canonical_smiles'])
    .drop_duplicates(subset='canonical_smiles')
    .copy()
)

# ✅ Step 6: Label activity: IC50 < 50 nM → Active
df_combined['Activity'] = (df_combined['IC50_nM'] < 50).astype(int)

# ✅ Step 7: Save final cleaned dataset
# The canonical form replaces the original SMILES column in the output.
df_combined = (
    df_combined[['canonical_smiles', 'IC50_nM', 'pIC50', 'dataset', 'Activity']]
    .rename(columns={'canonical_smiles': 'SMILES'})
)
output_path = "combined_chembel_bindingdb_dataset.csv"
df_combined.to_csv(output_path, index=False)

# Report the file that was actually written.
print(f"✅ Final deduplicated dataset saved: {output_path} ({len(df_combined)} compounds)")
