# Drug Detection

## Dataset

**Positive(drug):** 
- Chembl ( Approved + Small Molecules + Not Withdrawn)
- ZINC15's World Subset (Clean + In-Man)

**Negative(non-drug):** 
- ZINC15(Clean + Lead-like)
- GDB17(General Chemically possible Non-Drugs)

### Drug Dataset
Generating a combined Drug dataset of Chembl's Approved drugs and Zinc15's World subset.

Total Drugs in Drug combined dataset = 5901

In [None]:
import pandas as pd
from rdkit import Chem

# Paths
chembl_path = r"Data/positives/chembl/raw_approved_drug_data.csv"
zinc_path = r"Data/positives/zinc/world+in-man+clean.csv"
output_path = r"Dataset/positives/dataset.csv"

# Load datasets
chembl_df = pd.read_csv(chembl_path, sep=";")
zinc_df = pd.read_csv(zinc_path)


# Filter ChEMBL: small molecule & not withdrawn
chembl_df = chembl_df[
    (chembl_df["Drug Type"].str.contains("Small Molecule", case=False, na=False)) &
    (chembl_df["Withdrawn Flag"] != "True")
]

# Canonicalize SMILES
def canonicalize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None

# Apply to both datasets
chembl_df["smiles"] = chembl_df["Smiles"].apply(canonicalize_smiles)
zinc_df["smiles"] = zinc_df["smiles"].apply(canonicalize_smiles)

# Drop invalid entries
chembl_df = chembl_df.dropna(subset=["smiles"])
zinc_df = zinc_df.dropna(subset=["smiles"])

# Keep only canonical SMILES
chembl_df = chembl_df[["smiles"]]
zinc_df = zinc_df[["smiles"]]

# Merge, deduplicate, shuffle
combined_df = pd.concat([chembl_df, zinc_df], ignore_index=True)
combined_df.drop_duplicates(subset=["smiles"], inplace=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to file
combined_df.to_csv(output_path, index=False)
print(f"Saved: {len(combined_df)} to {output_path}")




Saved: 5901 to Dataset/positives/dataset.csv


### Non-Drug Dataset

Generating a combined Non-Drug Dataset of ZINC15's Lead-like molecules and GDB17's general non-drug molecules

Total non-drugs in combined Non-drug dataset = 5902

In [20]:
import pandas as pd
import os
import glob
import random
from rdkit import Chem

# -------------------------
# Paths and Params
# -------------------------
zinc_dir = r"Data/negatives/zinc"
gdb_file = r"Data/negatives/gdb/GDB17.50000000.smi"
drug_file = r"Dataset/positives/dataset.csv"
output_file = r"Dataset/negatives/dataset.csv"

sample_size_per_source = 2951
random.seed(42)

# -------------------------
# Canonicalization Function
# -------------------------
def canonicalize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None
    return None

# -------------------------
# Load known drug SMILES
# -------------------------
drug_df = pd.read_csv(drug_file)
known_drugs = set(drug_df["smiles"].dropna().unique())

# -------------------------
# ZINC SMILES (Lead-like, Clean)
# -------------------------
zinc_smiles = set()
for file_path in glob.glob(os.path.join(zinc_dir, "*.smi")):
    with open(file_path, "r") as f:
        for line in f:
            smiles = line.strip().split()[0]
            if smiles.lower() == "smiles":
                continue
            canon = canonicalize_smiles(smiles)
            if canon and canon not in known_drugs:
                zinc_smiles.add(canon)

zinc_sample = random.sample(list(zinc_smiles), min(sample_size_per_source * 2, len(zinc_smiles)))

# -------------------------
# GDB SMILES (sampled)
# -------------------------
gdb_smiles = set()
with open(gdb_file, "r") as f:
    lines = f.readlines()

sampled_lines = random.sample(lines, sample_size_per_source * 3)  # oversample
for line in sampled_lines:
    smiles = line.strip().split()[0]
    if smiles.lower() == "smiles":
        continue
    canon = canonicalize_smiles(smiles)
    if canon and canon not in known_drugs:
        gdb_smiles.add(canon)
    if len(gdb_smiles) >= sample_size_per_source * 2:
        break

# -------------------------
# Final Sampling & Merge
# -------------------------
zinc_final = random.sample(list(zinc_smiles), sample_size_per_source)
gdb_final = random.sample(list(gdb_smiles), sample_size_per_source)

combined = pd.DataFrame(zinc_final + gdb_final, columns=["smiles"])
combined = combined.drop_duplicates(subset=["smiles"])
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

# -------------------------
# Save
# -------------------------
combined.to_csv(output_file, index=False)
print(f"✅ Saved {len(combined)} non-drug SMILES to {output_file}")


✅ Saved 5902 non-drug SMILES to Dataset/negatives/dataset.csv


## Combined Dataset

Generating a Combined Dataset of both Drugs and Non-drugs
- Shuffled.
- Assigned target column "Is Drug" with 0 for non-drugs and 1 for drugs.

Total Molecules = 5901(Drugs) + 5902(Non-Drugs) = 11803

In [22]:
import pandas as pd

# -------------------------
# File paths
# -------------------------
drug_file = "Dataset/positives/dataset.csv"
non_drug_file = "Dataset/negatives/dataset.csv"
output_file = "Dataset/combined/dataset.csv"

# -------------------------
# Load and label
# -------------------------
drug_df = pd.read_csv(drug_file)
non_drug_df = pd.read_csv(non_drug_file)

drug_df["Is Drug"] = 1
non_drug_df["Is Drug"] = 0

# -------------------------
# Combine and shuffle
# -------------------------
combined_df = pd.concat([drug_df, non_drug_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# -------------------------
# Save
# -------------------------
os.makedirs(os.path.dirname(output_file), exist_ok=True)
combined_df.to_csv(output_file, index=False)

print(f"✅ Saved {len(combined_df)} labeled and shuffled samples to: {output_file}")


✅ Saved 11803 labeled and shuffled samples to: Dataset/combined/dataset.csv


## Final Dataset

Preparing the final dataset with numerical features extracted from the SMILES string.

The features extracted are:
* Physiochemical
  * Molecular Weight
  * clogP
  * TPSA
  * HBD
  * HBA
  * Rotatable Bonds
  * Ring Count
* Structural
  * EFCP4 (2048-bits)
  * MACCS (166-bits)

In [5]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski, rdMolDescriptors, MACCSkeys
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from tqdm import tqdm
import os

tqdm.pandas()

# ------------------------
# Input/Output
# ------------------------
input_file = "Dataset/combined/dataset.csv"
output_file = "Dataset/final/dataset.csv"

# ------------------------
# Initialize Fingerprint Generators
# ------------------------
morgan_gen = GetMorganGenerator(radius=2, fpSize=2048)

# ------------------------
# Feature Functions
# ------------------------
def smiles_to_mol(smiles):
    try:
        return Chem.MolFromSmiles(smiles)
    except:
        return None

def compute_features(smiles):
    mol = smiles_to_mol(smiles)
    if mol is None:
        return None

    try:
        # Physicochemical
        mw = Descriptors.MolWt(mol)
        logp = Crippen.MolLogP(mol)
        tpsa = rdMolDescriptors.CalcTPSA(mol)
        hbd = Lipinski.NumHDonors(mol)
        hba = Lipinski.NumHAcceptors(mol)
        rot_bonds = Lipinski.NumRotatableBonds(mol)
        ring_count = rdMolDescriptors.CalcNumRings(mol)

        # ECFP4 (2048-bit)
        ecfp = morgan_gen.GetFingerprint(mol)
        ecfp_bits = list(ecfp)

        # MACCS (166-bit)
        maccs = MACCSkeys.GenMACCSKeys(mol)
        maccs_bits = list(maccs)[1:]  # Remove bit 0

        return [mw, logp, tpsa, hbd, hba, rot_bonds, ring_count] + ecfp_bits + maccs_bits
    except:
        return None

# ------------------------
# Load Dataset
# ------------------------
df = pd.read_csv(input_file)
df["features"] = df["smiles"].progress_apply(compute_features)

# ------------------------
# Drop Invalid Entries
# ------------------------
df = df[df["features"].notnull()].reset_index(drop=True)

# ------------------------
# Expand Features
# ------------------------
feature_names = (
    ["MW", "clogP", "TPSA", "HBD", "HBA", "RotatableBonds", "RingCount"] +
    [f"ECFP4_{i}" for i in range(2048)] +
    [f"MACCS_{i+1}" for i in range(166)]
)

features_df = pd.DataFrame(df["features"].tolist(), columns=feature_names)
final_df = pd.concat([df["smiles"], features_df, df["Is Drug"]], axis=1)

# ------------------------
# Save Final Dataset
# ------------------------
os.makedirs(os.path.dirname(output_file), exist_ok=True)
final_df.to_csv(output_file, index=False)

print(f"✅ Final dataset saved to {output_file} — shape = {final_df.shape}")


100%|██████████| 11803/11803 [00:21<00:00, 550.75it/s]


✅ Final dataset saved to Dataset/final/dataset.csv — shape = (11803, 2223)
