In [None]:
import pandas as pd
import pickle
import os
from rdkit import Chem
from rdkit.Chem import Descriptors
from mordred import Calculator, descriptors
from sklearn.metrics import accuracy_score

# Function to calculate RDKit descriptors
def calculate_rdkit_descriptors(smiles):
    """Compute the RDKit descriptors used by the model for one SMILES string.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A single-row ``pandas.DataFrame`` with one column per descriptor
        listed in ``descriptor_names`` (same names as the attributes of
        ``rdkit.Chem.Descriptors``), or ``None`` when
        ``Chem.MolFromSmiles`` returns ``None`` for the input.
    """
    # Each name is both the output column and the function looked up on
    # rdkit.Chem.Descriptors; the order here is the column order.
    descriptor_names = (
        'MinAbsEStateIndex',
        'MinAbsPartialCharge',
        'BCUT2D_MWHI',
        'BCUT2D_MWLOW',
        'BCUT2D_CHGHI',
        'BCUT2D_CHGLO',
        'BCUT2D_LOGPHI',
        'BCUT2D_LOGPLOW',
        'BCUT2D_MRHI',
        'BCUT2D_MRLOW',
        'BalabanJ',
        'MolLogP',
    )

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    row = {name: getattr(Descriptors, name)(molecule) for name in descriptor_names}
    return pd.DataFrame([row])

# Function to calculate Mordred descriptors
def calculate_mordred_descriptors(smiles):
    """Compute the selected Mordred 2D descriptors for one SMILES string.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns are the names in
        ``selected_names`` (in that order), or ``None`` when
        ``Chem.MolFromSmiles`` returns ``None`` for the input.

    Raises:
        KeyError: If the Mordred result does not contain one of the
            selected descriptor names.
    """
    # Output columns, each taken from the Mordred descriptor of the same name.
    selected_names = (
        'BCUTi-1h',
        'AATSC2s',
        'GATS2m',
        'GATS1s',
        'ATSC3s',
        'GATS2v',
        'AETA_eta_L',
        'GATS2pe',
        'ATSC3se',
        'ATSC4v',
        'GATS1m',
        'MIC5',
        'GATS1v',
        'BCUTd-1l',
        'ATSC2d',
        'ATSC3d',
        'AATSC0p',
        'AATSC0i',
        'Mv',
        'MIC0',
        'CIC2',
        'BCUTi-1l',
        'BIC5',
    )

    # Validate the input first so an unparsable SMILES does not pay for
    # building the full Mordred calculator.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    calc = Calculator(descriptors, ignore_3D=True)
    mordred_descriptors = calc(mol).asdict()

    # Every column reads the descriptor with its own name. Previously the
    # 'AATSC0i' column was filled with the value of 'BIC5'.
    selected_descriptors = {
        name: mordred_descriptors[name] for name in selected_names
    }

    return pd.DataFrame([selected_descriptors])

# Function to check if the molecule is connected
def is_connected(mol):
    """Return whether every atom of ``mol`` is reachable from its first atom.

    Performs an iterative depth-first traversal over atom neighbours,
    starting from the first atom returned by ``mol.GetAtoms()``.

    Args:
        mol: Molecule object exposing ``GetNumAtoms()``, ``GetAtoms()``,
            ``GetAtomWithIdx(idx)``, and atoms exposing ``GetIdx()`` and
            ``GetNeighbors()``.

    Returns:
        ``True`` if the traversal visits all atoms, ``False`` otherwise.
        A molecule with no atoms is reported as not connected so callers
        can reject it instead of hitting an ``IndexError``.
    """
    atom_count = mol.GetNumAtoms()
    if atom_count == 0:
        # There is no first atom to start from; GetAtoms()[0] would raise.
        return False

    visited = set()
    stack = [mol.GetAtoms()[0].GetIdx()]

    while stack:
        idx = stack.pop()
        if idx in visited:
            continue
        visited.add(idx)
        # Only queue neighbours that still need a visit to keep the stack small.
        stack.extend(
            neighbor.GetIdx()
            for neighbor in mol.GetAtomWithIdx(idx).GetNeighbors()
            if neighbor.GetIdx() not in visited
        )

    return len(visited) == atom_count

# Function to calculate combined descriptors
def calculate_descriptors(smiles):
    """Build the combined RDKit + Mordred feature row for one SMILES string.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns are exactly
        ``expected_columns`` in that order, or ``None`` when the SMILES
        cannot be parsed, the molecule has no atoms, the molecule is not
        connected, or either descriptor helper returns ``None``.
    """
    # Validate before computing anything: the descriptor calculations are
    # the expensive part and should not run for input that is rejected.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None or mol.GetNumAtoms() == 0 or not is_connected(mol):
        return None

    rdkit_df = calculate_rdkit_descriptors(smiles)
    mordred_df = calculate_mordred_descriptors(smiles)
    if rdkit_df is None or mordred_df is None:
        return None

    combined_descriptors = pd.concat([rdkit_df, mordred_df], axis=1)

    # Column order of the returned frame.
    # NOTE(review): presumably this must match the feature order the model
    # was trained with - confirm before changing.
    expected_columns = [
        "BCUTi-1h", "AATSC2s", "BCUTi-1l", "GATS2m", "GATS1s", "ATSC3s",
        "GATS2v", "AETA_eta_L", "GATS2pe", "ATSC3se", "ATSC4v", "GATS1m",
        "MIC5", "GATS1v", "BCUTd-1l", "ATSC2d", "ATSC3d", "AATSC0p",
        "AATSC0i", "BIC5", "Mv", "MIC0", "CIC2",
        "MinAbsEStateIndex", "MinAbsPartialCharge",
        "BCUT2D_MWHI", "BCUT2D_MWLOW", "BCUT2D_CHGHI", "BCUT2D_CHGLO",
        "BCUT2D_LOGPHI", "BCUT2D_LOGPLOW", "BCUT2D_MRHI", "BCUT2D_MRLOW",
        "BalabanJ", "MolLogP",
    ]

    # Any expected column that the helpers did not produce is filled with 0
    # so the returned frame always has the full column set.
    for col in expected_columns:
        if col not in combined_descriptors.columns:
            combined_descriptors[col] = 0

    return combined_descriptors[expected_columns]

# Load model function
def load_model(model_path):
    """Unpickle and return the object stored in the file at ``model_path``.

    Args:
        model_path: Path of the pickle file to read.

    Returns:
        The object produced by ``pickle.load`` on that file.
    """
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    return model

# Load data function
def load_data(data_path):
    """Read the Excel dataset and split it into features and labels.

    Rows containing any missing value are dropped, then the 'Smiles'
    column is removed. Row labels are left as read, so they may have gaps
    after the drop.

    Args:
        data_path: Path of the Excel file to read.

    Returns:
        A tuple ``(X_new, y_new_true)``: every remaining column except
        'Labels' as a DataFrame, and the 'Labels' column as a Series.
    """
    cleaned = pd.read_excel(data_path).dropna().drop(columns=['Smiles'])
    features = cleaned.drop(columns=['Labels'])
    labels = cleaned['Labels']
    return features, labels

# Evaluate model function
def evaluate_model(model, X_new, y_new_true):
    """Score the model's predictions on ``X_new`` against the true labels.

    Args:
        model: Object exposing ``predict(X)``.
        X_new: Feature data passed to ``model.predict``.
        y_new_true: True labels to compare the predictions with.

    Returns:
        The value of ``accuracy_score(y_new_true, predictions)``.
    """
    return accuracy_score(y_new_true, model.predict(X_new))

if __name__ == "__main__":
    # Script entry point: load the pickled model and an Excel dataset,
    # append computed descriptors to the features, and print the accuracy.
    model_path = 'xgboost_model.pkl'
    data_path = input("Enter the path to your dataset (Excel file): ")

    if os.path.exists(model_path) and os.path.exists(data_path):
        loaded_model = load_model(model_path)
        X_new, y_new_true = load_data(data_path)
        can_evaluate = True

        # Read the raw sheet once (instead of once per use) to get at the
        # 'Smiles' column that load_data removes from the features.
        raw_data = pd.read_excel(data_path)
        if 'Smiles' in raw_data.columns:
            # load_data drops rows with missing values but keeps the row
            # labels, so selecting by X_new.index yields exactly the SMILES
            # of the rows that survived, in the same order.
            smiles_column = raw_data.loc[X_new.index, 'Smiles']

            # Keep each descriptor row keyed by its original row label.
            # pd.concat silently discards None entries, which would shift
            # every later row against its features and label.
            descriptor_rows = {}
            for row_label, smiles in smiles_column.items():
                row_descriptors = calculate_descriptors(smiles)
                if row_descriptors is not None:
                    descriptor_rows[row_label] = row_descriptors

            skipped = len(smiles_column) - len(descriptor_rows)
            if skipped:
                print("Warning: skipped", skipped,
                      "row(s) whose descriptors could not be computed.")

            if descriptor_rows:
                descriptors_df = pd.concat(
                    list(descriptor_rows.values()), ignore_index=True
                )
                # Restore the original row labels so the column-wise concat
                # below lines up with X_new and y_new_true.
                descriptors_df.index = list(descriptor_rows)
                kept_rows = descriptors_df.index

                X_new = pd.concat([X_new.loc[kept_rows], descriptors_df], axis=1)
                y_new_true = y_new_true.loc[kept_rows]
            else:
                can_evaluate = False
                print("Error: no descriptors could be computed for any row of the dataset.")

        if can_evaluate:
            accuracy = evaluate_model(loaded_model, X_new, y_new_true)
            print("Accuracy:", accuracy)
    else:
        print("Error: Model or dataset file not found. Please check the paths.")
