In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.3


In [3]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler

In [4]:
# SMILES string of the molecule to score. It is passed to calculate_descriptors
# and echoed in the final results table.
# NOTE(review): the leading "CC.CC." adds two dot-separated fragments to the
# record — confirm the training data was featurised the same way.
SMILES = "CC.CC.CC1=CC=C(CN2CCC3=C(C2)C(=O)N(CC2=C(C)C=CC=C2)C2=NCCN23)C=C1"

In [5]:
def calculate_descriptors(SMILES):
    """Compute the RDKit molecular descriptors for a single SMILES string.

    Args:
        SMILES: SMILES representation of the molecule to featurise.

    Returns:
        The result of ``Descriptors.CalcMolDescriptors`` for the parsed
        molecule (a descriptor-name -> value mapping).

    Raises:
        ValueError: If RDKit cannot parse ``SMILES`` into a molecule.
    """
    mol = Chem.MolFromSmiles(SMILES)
    # MolFromSmiles reports a parse failure by returning None. Validate before
    # the descriptor pass so an unparsed molecule is never handed to RDKit.
    if mol is None:
        raise ValueError("Invalid SMILES string provided")
    return Descriptors.CalcMolDescriptors(mol)

In [6]:
# Compute the descriptors for the input molecule.
descriptors = calculate_descriptors(SMILES)
# Wrap the single result in a list so the DataFrame holds one row, with one
# column per descriptor.
descriptors_df = pd.DataFrame([descriptors])
# Show the row as a quick sanity check of the computed values.
descriptors_df.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.594583,13.594583,0.123851,0.123851,0.602213,17.235294,460.666,420.346,460.320212,182,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def pu_loss(prior):
    """Build a positive-unlabeled (PU) loss closure for a given class prior.

    Labels equal to 1 are treated as positives and labels equal to 0 as
    unlabeled samples.

    Args:
        prior: Weight applied to the positive-sample terms.

    Returns:
        A ``loss(y_true, y_pred)`` callable computing
        ``prior * R_p(+) + max(R_u(-) - prior * R_p(-), 0)``, where each ``R``
        is a mean negative log-likelihood over the named group.
    """
    eps = 1e-7  # shared guard against log(0) and division by zero

    # Keep the inner name `loss`: the model-loading cell registers this closure
    # under the key 'loss' in custom_objects.
    def loss(y_true, y_pred):
        labels = tf.cast(y_true, tf.float32)
        probs = tf.clip_by_value(y_pred, eps, 1 - eps)

        # Indicator masks and (smoothed) group sizes.
        is_positive = tf.cast(tf.equal(labels, 1), tf.float32)
        is_unlabeled = tf.cast(tf.equal(labels, 0), tf.float32)
        positive_count = tf.reduce_sum(is_positive) + eps
        unlabeled_count = tf.reduce_sum(is_unlabeled) + eps

        # Log-likelihoods of predicting "positive" and "negative".
        log_p = tf.math.log(probs + eps)
        log_not_p = tf.math.log(1 - probs + eps)

        # Mean risks: positives scored as positive, unlabeled scored as
        # negative, and positives scored as negative.
        risk_pos_as_pos = -tf.reduce_sum(is_positive * log_p) / positive_count
        risk_unl_as_neg = -tf.reduce_sum(is_unlabeled * log_not_p) / unlabeled_count
        risk_pos_as_neg = -tf.reduce_sum(is_positive * log_not_p) / positive_count

        # Clamp the corrected negative risk at zero so it cannot go negative.
        corrected_negative_risk = tf.maximum(risk_unl_as_neg - prior * risk_pos_as_neg, 0)
        return prior * risk_pos_as_pos + corrected_negative_risk

    return loss

In [None]:
# Load the trained classifier and the scaler saved with it.
# Both artifacts sit in the same run directory, so the location is defined once;
# point MODEL_DIR at your own copy of the run to execute the notebook elsewhere.
MODEL_DIR = Path(r"C:\Users\rudra\OneDrive\Desktop\Wrap_Up_101\Codes\Class_Prior_Runs\pi_0.5_Chosen_One\t5")
MODEL_PATH = MODEL_DIR / "drug_classification_model_with_PU_Loss.h5"
SCALER_PATH = MODEL_DIR / "scaler.save"

# The custom PU loss is supplied under the key 'loss' so Keras can resolve it
# while deserialising the model (presumably it was compiled with pu_loss — confirm).
model = tf.keras.models.load_model(
    str(MODEL_PATH),
    custom_objects={'loss': pu_loss(prior=0.5)},
)

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code — only load scaler files from a trusted source.
scaler = joblib.load(str(SCALER_PATH))  # Load the scaler used during training

In [None]:
# Align the descriptor table with the feature set recorded by the scaler.
# Selecting by scaler.feature_names_in_ drops any extra descriptors and puts the
# columns in the order stored on the scaler.
required_columns = list(scaler.feature_names_in_)
missing_columns = [name for name in required_columns if name not in descriptors_df.columns]
if missing_columns:
    # Report exactly which descriptors are absent rather than dumping both
    # complete column lists.
    raise KeyError(f"Descriptors required by the model are missing: {missing_columns}")

X_new = descriptors_df[required_columns]

# Standardize using the original scaler
X_new_scaled = scaler.transform(X_new)

In [None]:
# Score the scaled descriptors with the loaded model.
predictions = model.predict(X_new_scaled)
# Threshold the model output at 0.5 to obtain integer 0/1 labels.
predicted_labels = (predictions > 0.5).astype(int)  # Convert probabilities to 0/1

# Collect the input SMILES, the thresholded label and the raw model output in
# one table; flatten() turns the prediction arrays into 1-D columns.
# NOTE(review): 'Prediction_Confidence' holds the raw model output — confirm it
# is meant to be read as a probability before reporting it as a confidence.
results = pd.DataFrame({
    'SMILES': SMILES,
    'Predicted_Label': predicted_labels.flatten(),
    'Prediction_Confidence': predictions.flatten()
})
print("\nPrediction results:")
print(results.head())


Prediction results:
                                              SMILES  Predicted_Label  \
0  CC.CC.CC1=CC=C(CN2CCC3=C(C2)C(=O)N(CC2=C(C)C=C...                1   

   Prediction_Confidence  
0               0.986676  
