# Drug ATC Classification

Creating the Drug-Atc pair dataset

In [2]:
import pandas as pd

input_file = 'Data/chembl/drugs.csv'
output_file = 'Dataset/drug-atc/dataset.csv'
atc_mapping_file = 'Dataset/atc/dataset.csv'

# Load your ChEMBL CSV file (update the path accordingly)
df = pd.read_csv(input_file, sep=';', encoding='utf-8')

# Extract relevant columns
df_subset = df[["Smiles", "Level 1 ATC Codes"]].copy()

# Drop rows with missing values
df_subset.dropna(inplace=True)

# Split multiple ATC Level 1 codes and explode into rows
df_exploded = df_subset.assign(
    atc_level1=df_subset["Level 1 ATC Codes"].str.split(" \| ")
).explode("atc_level1")

# Keep only SMILES and ATC Level 1 code columns
final_df = df_exploded[["Smiles", "atc_level1"]].rename(columns={"Smiles": "smiles"})

# Step 4: Encode 'atc_level1' into numeric values
unique_atcs = sorted(final_df["atc_level1"].unique())
atc_to_num = {atc: idx for idx, atc in enumerate(unique_atcs)}
final_df["atc_numeric"] = final_df["atc_level1"].map(atc_to_num)
#final_df.drop_duplicates(subset=["smiles"], inplace=True)

# Filter out ATC code 'V'
final_df = final_df[~final_df['atc_level1'].str.contains("V - VARIOUS")]

'''
# Remove drugs with multiple Level 1 ATC codes
counts = df_exploded.groupby('Smiles')["atc_level1"].nunique()
single_label_smiles = counts[counts == 1].index
final_df = final_df[final_df["smiles"].isin(single_label_smiles)]
'''

# OPTIONAL: Remove common salt patterns (simplified example)
final_df = final_df[~final_df["smiles"].str.contains(r"(?:Cl|Na|HCl|Br|SO4)", regex=True)]

# OPTIONAL: Remove singleton ATC classes
counts = final_df["atc_level1"].value_counts()
valid_classes = counts[counts > 1].index
final_df = final_df[final_df["atc_level1"].isin(valid_classes)]

# Step 5: Save combined dataset and mapping
final_df.to_csv(output_file, index=False)

# Save mapping separately
used_atcs = final_df["atc_level1"].unique()
atc_mapping_df = pd.DataFrame({
    "atc_level1": used_atcs,
    "atc_numeric": [atc_to_num[a] for a in used_atcs]
}).set_index("atc_level1")
atc_mapping_df.to_csv(atc_mapping_file)

print(f"✅ Saved: {output_file} and {atc_mapping_file}")

# Save or inspect
print(f"Final dataset size: {len(final_df)} samples, {len(valid_classes)} ATC classes")



✅ Saved: Dataset/drug-atc/dataset.csv and Dataset/atc/dataset.csv
Final dataset size: 2053 samples, 13 ATC classes


## Final Dataset

Extracting ECFP4 Features

In [3]:
import pandas as pd, numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import rdkit.DataStructs as ds
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from Mold2_pywrapper import Mold2          # <- make sure wrapper is installed

# ---------- paths ----------
input_file   = "Dataset/drug-atc/dataset.csv"   # smiles | atc_numeric | label
output_file  = "Dataset/final/dataset.csv"

ecfp_bits  = 2048

# ---------- helper ----------
def fp_to_array(fp, n_bits):
    arr = np.zeros((n_bits,), dtype=int)
    ds.ConvertToNumpyArray(fp, arr)
    return arr

# ---------- 1. Load combined dataset ----------
raw_df = pd.read_csv(input_file, dtype={"atc_numeric": int})
raw_df.dropna(subset=["smiles"], inplace=True)
raw_df = raw_df[~raw_df["smiles"].str.contains(r"\.", regex=True)]  # drop mixtures if desired
raw_df.reset_index(drop=True, inplace=True)

# ---------- 2. RDKit mols (keep index map) ----------
valid_idx, mols = [], []
for idx, smi in enumerate(raw_df["smiles"]):
    m = Chem.MolFromSmiles(smi)
    if m:
        valid_idx.append(idx)
        mols.append(m)

if not mols:
    raise ValueError("No valid SMILES found!")

core_df = raw_df.loc[valid_idx].reset_index(drop=True)   # keep only valid rows

# ---------- 4. ECFP4 (Morgan radius 2, 2048 bits) ----------
print("🧬 Calculating ECFP4 …")
morgan_gen = GetMorganGenerator(
    radius=2, fpSize=ecfp_bits,
    includeChirality=False, useBondTypes=True
)
ecfp_mat = [fp_to_array(morgan_gen.GetFingerprint(m), ecfp_bits) for m in mols]
ecfp_df  = pd.DataFrame(ecfp_mat, columns=[f"ECFP4_{i}" for i in range(ecfp_bits)])

# ---------- 6. Assemble final feature table ----------
features_df = pd.DataFrame(ecfp_df)

# Append numeric ATC code & label
features_df["label"] = core_df["atc_numeric"].values

# ---------- 7. Clean & save ----------
features_df.replace([np.inf, -np.inf], np.nan, inplace=True)
features_df.dropna(inplace=True)

features_df.to_csv(output_file, index=False)
print(f"✅ Saved final dataset {features_df.shape} → {output_file}")


  from .autonotebook import tqdm as notebook_tqdm


🧬 Calculating ECFP4 …
✅ Saved final dataset (2010, 2049) → Dataset/final/dataset.csv
