# In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, Descriptors
from keras.utils import to_categorical
import pickle

# In [None]:
# ======================================================
# Load Kedro-processed drug data
# ======================================================

def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; these files are
    # presumably produced by the local pipeline -- confirm they are trusted.
    with open(path, "rb") as handle:
        return pickle.load(handle)


X_drug = _load_pickle("data/05_model_input/drug_X.pkl")
y_drug = _load_pickle("data/05_model_input/drug_y_drug.pkl")
y_atc = _load_pickle("data/05_model_input/drug_y_atc.pkl")

atc_mapping = pd.read_csv("data/08_reporting/atc_mapping.csv")
# The number of ATC classes is the size of the second axis of the ATC labels.
n_classes = y_atc.shape[1]

# ======================================================
# Load your non-drug dataset
# ======================================================

non_drug_df = pd.read_csv("data/01_raw/non_drugs_dataset.csv")
print(f"Loaded {len(non_drug_df)} non-drug molecules.")


# ======================================================
# Fingerprint generation (same function as in Kedro)
# ======================================================

def mol_from_smiles(smiles: str):
    """Parse ``smiles`` with ``Chem.MolFromSmiles``, never raising.

    Args:
        smiles: SMILES text to parse.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns for ``smiles``, or ``None``
        if the call raises any ``Exception`` (for example when the input is
        not a string).
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
    except Exception:
        # Best-effort parsing: callers treat None as "could not build a mol".
        return None
    else:
        return molecule


def compute_fingerprints(smiles: str, radius=2, n_bits=2048):
    """Build one feature vector for a molecule given as a SMILES string.

    The vector is the concatenation, in this order, of the Morgan
    fingerprint bits, the MACCS key bits and the TPSA descriptor.

    Args:
        smiles: SMILES text of the molecule.
        radius: Radius passed to the Morgan fingerprint.
        n_bits: Number of bits requested for the Morgan fingerprint.

    Returns:
        A 1-D NumPy array. When ``mol_from_smiles`` yields ``None`` the
        result is an all-zero array of length ``n_bits + 167 + 1``
        (167 presumably being the MACCS key length, plus one TPSA slot).
    """
    molecule = mol_from_smiles(smiles)
    if molecule is None:
        # Placeholder row so every input still produces a vector.
        return np.zeros(n_bits + 167 + 1)

    # Morgan fingerprint bits.
    morgan_vect = AllChem.GetMorganFingerprintAsBitVect(
        molecule, radius, nBits=n_bits
    )
    morgan_bits = np.array(morgan_vect)

    # MACCS keys: expand the "0101..." bit string into an integer array.
    maccs_string = MACCSkeys.GenMACCSKeys(molecule).ToBitString()
    maccs_bits = np.array(list(maccs_string), dtype=int)

    # TPSA descriptor as a one-element array so it can be concatenated.
    tpsa_value = np.array([Descriptors.TPSA(molecule)])

    return np.concatenate([morgan_bits, maccs_bits, tpsa_value])


# ======================================================
# Featurize the non-drug molecules
# ======================================================

print("Computing fingerprints for non-drugs...")
non_drug_smiles = non_drug_df["IsomericSMILES"]
# One fingerprint vector per molecule, stacked row-wise into a single matrix.
X_non_drug = np.stack([compute_fingerprints(smi) for smi in non_drug_smiles])

n_non_drugs = len(non_drug_df)
y_non_drug = np.zeros(n_non_drugs)  # every non-drug gets label 0
y_non_drug_atc = np.zeros((n_non_drugs, n_classes))  # all-zero ATC rows

# ======================================================
# Combine with drug dataset
# ======================================================

# Drug rows come first and non-drug rows second in all three arrays, so the
# features and both label sets stay row-aligned.
X_final = np.vstack([X_drug, X_non_drug])
y_drug_final = np.concatenate([y_drug, y_non_drug])
y_atc_final = np.vstack([y_atc, y_non_drug_atc])

# Presumably y_drug holds 0/1 labels, so its sum is the drug count -- confirm.
# np.sum avoids iterating the array element by element in Python.
n_drugs = int(np.sum(y_drug))

# Status messages use the intended characters; the previous literals were
# UTF-8 text that had been mis-decoded and printed as garbage.
print(f"✅ Final dataset shape: {X_final.shape}")
print(f"   → Drugs: {n_drugs}, Non-drugs: {len(y_non_drug)}")
print(f"   → ATC classes: {n_classes}")

# ======================================================
# Save combined dataset
# ======================================================

# Each combined array is pickled to its own file under data/05_model_input/.
for out_path, payload in (
    ("data/05_model_input/drug_non_drug_X.pkl", X_final),
    ("data/05_model_input/drug_non_drug_y_drug.pkl", y_drug_final),
    ("data/05_model_input/drug_non_drug_y_atc.pkl", y_atc_final),
):
    with open(out_path, "wb") as f:
        pickle.dump(payload, f)

# The message uses the intended character; the previous literal was UTF-8
# text that had been mis-decoded and printed as garbage.
print("💾 Saved combined dataset to data/05_model_input/")
