In [3]:
import pandas as pd
import pickle
import os
from rdkit import Chem
from rdkit.Chem import Descriptors
from mordred import Calculator, descriptors
from sklearn.metrics import accuracy_score

# Function to calculate RDKit descriptors
def calculate_rdkit_descriptors(smiles):
    """Compute the RDKit descriptor subset for a single SMILES string.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per descriptor (in the
        order listed below), or ``None`` when the SMILES cannot be parsed.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    # Each entry is both the output column name and the name of the function
    # looked up on ``rdkit.Chem.Descriptors``.
    descriptor_names = (
        'MinEStateIndex',
        'qed',
        'MaxAbsPartialCharge',
        'MinAbsPartialCharge',
        'BCUT2D_MWHI',
        'BCUT2D_MWLOW',
        'BCUT2D_MRHI',
        'BCUT2D_MRLOW',
        'AvgIpc',
        'BalabanJ',
        'VSA_EState5',
    )
    values = {
        name: getattr(Descriptors, name)(molecule)
        for name in descriptor_names
    }
    return pd.DataFrame([values])

# Mordred descriptor names used as model features (output column order).
_MORDRED_FEATURES = (
    'ATSC2pe',
    'AATSC1c',
    'MATS2pe',
    'BCUTdv-1h',
    'BCUTs-1h',
    'BCUTs-1l',
    'BCUTse-1h',
    'BCUTse-1l',
    'BCUTi-1h',
    'BCUTi-1l',
    'RPCG',
    'SpMAD_Dt',
)

# Lazily created Mordred calculator shared by every call (see helper below).
_MORDRED_CALCULATOR = None


def _get_mordred_calculator():
    """Return the shared Mordred ``Calculator``, building it on first use."""
    global _MORDRED_CALCULATOR
    if _MORDRED_CALCULATOR is None:
        # Registering the full descriptor set is expensive, so it is done
        # once per process instead of once per molecule.
        _MORDRED_CALCULATOR = Calculator(descriptors, ignore_3D=True)
    return _MORDRED_CALCULATOR


# Function to calculate Mordred descriptors
def calculate_mordred_descriptors(smiles):
    """Compute the selected Mordred descriptors for a single SMILES string.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are the names in
        ``_MORDRED_FEATURES`` (same order), or ``None`` when the SMILES
        cannot be parsed.

    Raises:
        KeyError: if a selected descriptor name is missing from the Mordred
            result.
    """
    # Validate the input before touching the (costly) calculator.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    all_values = _get_mordred_calculator()(mol).asdict()

    # Keep only the descriptors the model uses.
    selected_descriptors = {name: all_values[name] for name in _MORDRED_FEATURES}
    return pd.DataFrame([selected_descriptors])

# Function to check if the molecule is connected
def is_connected(mol):
    """Tell whether all atoms of ``mol`` form a single bonded component.

    Performs an iterative depth-first walk over atom neighbours, starting
    from the first atom, and compares the number of atoms reached with the
    molecule's atom count.

    Args:
        mol: object exposing ``GetNumAtoms()``, ``GetAtoms()`` and
            ``GetAtomWithIdx(idx)`` (an RDKit molecule in this file).

    Returns:
        ``True`` if every atom is reachable from the first one, ``False``
        otherwise. A molecule with zero atoms is reported as not connected.
    """
    atom_count = mol.GetNumAtoms()
    if atom_count == 0:
        # Nothing to walk; indexing the first atom below would raise
        # IndexError (an empty SMILES string presumably parses to such a
        # molecule - verify against RDKit).
        return False

    visited = set()
    stack = [mol.GetAtoms()[0].GetIdx()]

    while stack:
        idx = stack.pop()
        if idx in visited:
            continue
        visited.add(idx)
        for neighbor in mol.GetAtomWithIdx(idx).GetNeighbors():
            neighbor_idx = neighbor.GetIdx()
            # Skip atoms already reached to keep the stack small.
            if neighbor_idx not in visited:
                stack.append(neighbor_idx)

    return len(visited) == atom_count

# Function to calculate combined descriptors
def calculate_descriptors(smiles):
    """Build the combined RDKit + Mordred feature row for one SMILES string.

    The cheap validity checks (type, parseability, connectivity) run before
    either descriptor set is computed, so rejected inputs cost almost nothing.

    Args:
        smiles: SMILES string. Non-string values (e.g. NaN read from an empty
            spreadsheet cell) are rejected.

    Returns:
        A one-row ``pandas.DataFrame`` with exactly the columns in
        ``expected_columns`` (in that order), or ``None`` when the input is
        not a string, cannot be parsed, is not a single connected molecule,
        or either descriptor calculation returns ``None``.
    """
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None or not is_connected(mol):
        return None

    rdkit_df = calculate_rdkit_descriptors(smiles)
    mordred_df = calculate_mordred_descriptors(smiles)
    if rdkit_df is None or mordred_df is None:
        return None

    combined_descriptors = pd.concat([rdkit_df, mordred_df], axis=1)

    expected_columns = ['MinEStateIndex', 'qed', 'MaxAbsPartialCharge', 'MinAbsPartialCharge',
                        'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
                        'AvgIpc', 'BalabanJ', 'VSA_EState5', 'ATSC2pe', 'AATSC1c',
                        'MATS2pe', 'BCUTdv-1h', 'BCUTs-1h', 'BCUTs-1l', 'BCUTse-1h',
                        'BCUTse-1l', 'BCUTi-1h', 'BCUTi-1l', 'RPCG', 'SpMAD_Dt']

    # Enforce the expected column order; any column that is somehow absent
    # is added and filled with 0.
    return combined_descriptors.reindex(columns=expected_columns, fill_value=0)

# Load model function
def load_model(model_path):
    """Read the pickled object stored at ``model_path`` and return it.

    NOTE: unpickling can execute arbitrary code, so only point this at
    model files from a trusted source.
    """
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    return model

# Load data function
def load_data(data_path):
    """Load the Excel dataset and split it into features and labels.

    Rows containing any missing value are dropped. The ``Smiles`` column is
    removed from the features when present; it is optional, matching the
    caller, which only computes descriptors if that column exists.

    Args:
        data_path: path of the Excel file, passed to ``pandas.read_excel``.

    Returns:
        Tuple ``(X_new, y_new_true)``: the feature DataFrame (every column
        except ``Smiles`` and ``Labels``) and the ``Labels`` Series. Both keep
        the row labels of the rows that survived ``dropna`` (the index is not
        reset).

    Raises:
        KeyError: if the dataset has no ``Labels`` column.
    """
    data = pd.read_excel(data_path).dropna()
    y_new_true = data['Labels']  # True labels
    # Drop only the non-feature columns that actually exist, so a dataset
    # without 'Smiles' no longer fails here.
    non_feature_columns = [name for name in ('Smiles', 'Labels') if name in data.columns]
    X_new = data.drop(columns=non_feature_columns)  # Features
    return X_new, y_new_true

# Evaluate model function
def evaluate_model(model, X_new, y_new_true):
    """Return the accuracy of ``model``'s predictions for ``X_new``.

    The predictions from ``model.predict(X_new)`` are scored against
    ``y_new_true`` with ``accuracy_score``.
    """
    predictions = model.predict(X_new)
    return accuracy_score(y_new_true, predictions)

if __name__ == "__main__":
    model_path = 'model.pkl'
    # Tolerate stray whitespace and the surrounding quotes that pasted
    # paths often carry.
    data_path = input("Enter the path to your dataset (Excel file): ").strip().strip('"')

    if os.path.exists(model_path) and os.path.exists(data_path):
        loaded_model = load_model(model_path)
        X_new, y_new_true = load_data(data_path)

        # Read the raw sheet once to get at the 'Smiles' column, which
        # load_data removes from the features.
        raw_data = pd.read_excel(data_path)
        if 'Smiles' in raw_data.columns:
            # Only the rows load_data kept, addressed by their row labels, so
            # descriptors stay aligned with features and labels.
            smiles_column = raw_data.loc[X_new.index, 'Smiles']

            descriptor_frames = []
            kept_rows = []
            for row_label, smiles in smiles_column.items():
                row_descriptors = calculate_descriptors(smiles)
                if row_descriptors is None:
                    # pd.concat would silently discard a None entry and shift
                    # every later row, so skip such rows explicitly.
                    print("Skipping row", row_label, "- no descriptors for SMILES:", smiles)
                    continue
                descriptor_frames.append(row_descriptors)
                kept_rows.append(row_label)

            if not descriptor_frames:
                raise SystemExit("Error: no usable SMILES found in the dataset.")

            descriptors_df = pd.concat(descriptor_frames, ignore_index=True)
            descriptors_df.index = kept_rows

            # Keep features, descriptors and labels restricted to the same rows.
            X_new = pd.concat([X_new.loc[kept_rows], descriptors_df], axis=1)
            y_new_true = y_new_true.loc[kept_rows]

        accuracy = evaluate_model(loaded_model, X_new, y_new_true)
        print("Accuracy:", accuracy)
    else:
        print("Error: Model or dataset file not found. Please check the paths.")


Enter the path to your dataset (Excel file): D:\Car Wallpaper\Genotoxicity\Train_&_Test_set\232 external set\232 Smiles Test Set.xlsx
Accuracy: 0.9612068965517241
