In [1]:
import os
import pandas as pd
import numpy as np
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, Descriptors
from keras.utils import to_categorical
import pickle

In [2]:
drug_dataset = pd.read_csv(os.path.join('..', 'data', '01_raw', 'drugs_dataset', 'all_drugs_dataset.csv'))

In [3]:
drug_dataset.head()

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,LogP,RuleFive,MATC_Code_Short,MATC_Code_Explanation
0,24769,2.0,0.0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,1.0,B,Blood and blood forming organs
1,134694070,9.0,6.0,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,0.0,C,Cardiovascular system
2,5121,2.0,0.0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,1.0,J,Antiinfectives for systemic use
3,4660557,1.0,1.0,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,1.0,N,Nervous system
4,122175,2.0,2.0,CC(CCC(C#C)N)N,126.203,-0.4,1.0,L,Antineoplastic and immunomodulating agents


In [31]:
drug_dataset_cols = drug_dataset[['CID', 'IsomericSMILES', 'MATC_Code_Short', 'MATC_Code_Explanation']]

In [32]:
drug_dataset_cols.head()

Unnamed: 0,CID,IsomericSMILES,MATC_Code_Short,MATC_Code_Explanation
0,24769,CN(C)CCCCCCN(C)C.C(CBr)CBr,B,Blood and blood forming organs
1,134694070,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,C,Cardiovascular system
2,5121,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,J,Antiinfectives for systemic use
3,4660557,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,N,Nervous system
4,122175,CC(CCC(C#C)N)N,L,Antineoplastic and immunomodulating agents


In [5]:
no_drug_dataset = pd.DataFrame({'CID': drug_dataset['CID']+1 ,'MATC_Code_Short': 'ND','MATC_Code_Explanation': 'No Drug'})

In [6]:
type(no_drug_dataset['CID'][0])


numpy.int64

In [8]:
def get_smiles(cid):
    str_cid = str(cid)
    try:
        compounds = pcp.get_compounds(str_cid, 'cid')
        
        return(compounds[0].smiles)
    except Exception:
        print(f"Could not retrieve SMILES for CID: {cid}")
        return None
smi = get_smiles(2244)
print(smi)

CC(=O)OC1=CC=CC=C1C(=O)O


In [9]:
no_drug_dataset['CID']

0           24770
1       134694071
2            5122
3         4660558
4          122176
          ...    
9949       121099
9950      3034027
9951        64940
9952        65451
9953      9851776
Name: CID, Length: 9954, dtype: int64

In [10]:
no_drug_dataset['IsomericSMILES'] = no_drug_dataset['CID'].apply(get_smiles)

Could not retrieve SMILES for CID: 12899070
Could not retrieve SMILES for CID: 9549214
Could not retrieve SMILES for CID: 146383
Could not retrieve SMILES for CID: 5702180
Could not retrieve SMILES for CID: 134693235
Could not retrieve SMILES for CID: 122185
Could not retrieve SMILES for CID: 122032
Could not retrieve SMILES for CID: 18406187
Could not retrieve SMILES for CID: 20824
Could not retrieve SMILES for CID: 54607814
Could not retrieve SMILES for CID: 517574
Could not retrieve SMILES for CID: 104788
Could not retrieve SMILES for CID: 122173046
Could not retrieve SMILES for CID: 101545480
Could not retrieve SMILES for CID: 133109137
Could not retrieve SMILES for CID: 132274086
Could not retrieve SMILES for CID: 23622963
Could not retrieve SMILES for CID: 122130147
Could not retrieve SMILES for CID: 14374
Could not retrieve SMILES for CID: 5282046
Could not retrieve SMILES for CID: 5326761
Could not retrieve SMILES for CID: 5311011
Could not retrieve SMILES for CID: 11962413
Cou

In [18]:
no_drug_dataset.dropna(inplace=True)

In [19]:
no_drug_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9839 entries, 0 to 9953
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   CID                    9839 non-null   int64 
 1   MATC_Code_Short        9839 non-null   object
 2   MATC_Code_Explanation  9839 non-null   object
 3   IsomericSMILES         9839 non-null   object
dtypes: int64(1), object(3)
memory usage: 384.3+ KB


In [20]:
no_drug_dataset.to_csv(os.path.join('..', 'data', '01_raw', 'drugs_dataset', 'no_drug_dataset.csv'), index=False)

In [21]:
no_drug_dataset[no_drug_dataset['IsomericSMILES'].isnull()]

Unnamed: 0,CID,MATC_Code_Short,MATC_Code_Explanation,IsomericSMILES


In [22]:
no_drug_dataset['IsomericSMILES'].isnull().sum()

0

In [33]:
no_drug_dataset.head()

Unnamed: 0,CID,MATC_Code_Short,MATC_Code_Explanation,IsomericSMILES
0,24770,ND,No Drug,C1=CC(=CC=C1CCC(=O)C2=C(C=C(C=C2O)O)O)O.OP(=O)...
1,134694071,ND,No Drug,C[N+](C)(C)CCOP(=O)(OCC1[C@H](C(C(O1)N2C=CC(=N...
2,5122,ND,No Drug,CCCN(CCC)C1CCC2=C(C=CC(=C2C1)O)F
3,4660558,ND,No Drug,CC1CSC(=NC2(C3CCC(C3)C2(C)C)C)O1
4,122176,ND,No Drug,CC(C)[C@@H](C(=O)N)NC(=O)[C@@H](CCCN=C(N)N)NC(...


In [34]:
df_final = pd.concat([drug_dataset_cols, no_drug_dataset], ignore_index=True)

In [35]:
df_final['is_drug'] = (df_final['MATC_Code_Short'] != 'ND').astype(int)

In [36]:
drugs_dataset_final = df_final.sort_values(by='CID').reset_index(drop=True)

In [37]:
drugs_dataset_final.head(20)

Unnamed: 0,CID,IsomericSMILES,MATC_Code_Short,MATC_Code_Explanation,is_drug
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,N,Nervous system,1
1,2,CC(=O)OC(CC(=O)O)C[N+](C)(C)C,ND,No Drug,0
2,72,C1=CC(=C(C=C1C(=O)O)O)O,L,Antineoplastic and immunomodulating agents,1
3,73,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O...,ND,No Drug,0
4,137,C(CC(=O)O)C(=O)CN,L,Antineoplastic and immunomodulating agents,1
5,138,C(CCN)CC(=O)O,ND,No Drug,0
6,144,C1=CC2=C(C=C1O)C(=CN2)CC(C(=O)O)N,N,Nervous system,1
7,145,C(C(C(C(=O)O)O)O)C(=O)C(=O)O,ND,No Drug,0
8,149,CSCC1C(C(C(O1)N2C=NC3=C2N=CN=C3N)O)O,J,Antiinfectives for systemic use,1
9,150,C(C1C(C(C(O1)N)O)O)OP(=O)(O)O,ND,No Drug,0


In [40]:
drugs_dataset_final.to_csv(os.path.join('..', 'data', '01_raw', 'drugs_dataset', 'drugs_dataset_final.csv'), index=False)

In [None]:
#Function templates

# ======================================================
# Load Kedro-processed drug data
# ======================================================

with open("data/05_model_input/drug_X.pkl", "rb") as f:
    X_drug = pickle.load(f)

with open("data/05_model_input/drug_y_drug.pkl", "rb") as f:
    y_drug = pickle.load(f)

with open("data/05_model_input/drug_y_atc.pkl", "rb") as f:
    y_atc = pickle.load(f)

atc_mapping = pd.read_csv("data/08_reporting/atc_mapping.csv")
n_classes = y_atc.shape[1]

# ======================================================
# Load your non-drug dataset
# ======================================================

non_drug_df = pd.read_csv("data/01_raw/non_drugs_dataset.csv")
print(f"Loaded {len(non_drug_df)} non-drug molecules.")


# ======================================================
# Fingerprint generation (same function as in Kedro)
# ======================================================

def mol_from_smiles(smiles: str):
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        return None


def compute_fingerprints(smiles: str, radius=2, n_bits=2048):
    mol = mol_from_smiles(smiles)
    if mol is None:
        return np.zeros(n_bits + 167 + 1)

    # Morgan fingerprint
    morgan = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    morgan = np.array(morgan)

    # MACCS keys
    maccs = MACCSkeys.GenMACCSKeys(mol)
    maccs = np.array(list(maccs.ToBitString()), dtype=int)

    # TPSA descriptor
    tpsa = np.array([Descriptors.TPSA(mol)])

    return np.concatenate([morgan, maccs, tpsa])


# ======================================================
# Process non-drugs
# ======================================================

print("Computing fingerprints for non-drugs...")
X_non_drug = np.stack(non_drug_df["IsomericSMILES"].map(compute_fingerprints))
y_non_drug = np.zeros(len(non_drug_df))  # non-drugs labeled 0
y_non_drug_atc = np.zeros((len(non_drug_df), n_classes))  # no ATC class

# ======================================================
# Combine with drug dataset
# ======================================================

X_final = np.vstack([X_drug, X_non_drug])
y_drug_final = np.concatenate([y_drug, y_non_drug])
y_atc_final = np.vstack([y_atc, y_non_drug_atc])

print(f"âœ… Final dataset shape: {X_final.shape}")
print(f"   â†’ Drugs: {int(sum(y_drug))}, Non-drugs: {int(len(y_non_drug))}")
print(f"   â†’ ATC classes: {n_classes}")

# ======================================================
# Save combined dataset
# ======================================================

with open("data/05_model_input/drug_non_drug_X.pkl", "wb") as f:
    pickle.dump(X_final, f)

with open("data/05_model_input/drug_non_drug_y_drug.pkl", "wb") as f:
    pickle.dump(y_drug_final, f)

with open("data/05_model_input/drug_non_drug_y_atc.pkl", "wb") as f:
    pickle.dump(y_atc_final, f)

print("ðŸ’¾ Saved combined dataset to data/05_model_input/")


FileNotFoundError: [Errno 2] No such file or directory: 'data/05_model_input/drug_X.pkl'