# In [None]:
import pandas as pd
import numpy as np
import torch
import os
from tqdm import tqdm
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, average_precision_score, precision_recall_curve, auc, balanced_accuracy_score
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Directory where results are written; created up front if it does not exist.
save_dir = "C:\\Users\\jen\\Proteins\\MDM2\\Scaffold outersplit\\Results\\MDM2_similarity_scaffold"
os.makedirs(save_dir, exist_ok=True)

# Seed the random number generators used by this script
def set_seed(seed=42):
    """Seed NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    Also sets the cuDNN ``deterministic`` flag and disables cuDNN benchmark
    mode.
    """
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)


set_seed(42)

# SMILES -> ECFP feature batches
def smiles_to_ecfp_counts(smiles_list, radius=3, nBits=2048, batch_size=2000):
    """Yield Morgan fingerprint matrices for ``smiles_list``, one batch at a time.

    Despite the name, the fingerprints come from
    ``GetMorganFingerprintAsBitVect``, so each row holds the fingerprint's
    bits rather than counts. A SMILES that RDKit cannot parse keeps its
    all-zero row, so output rows stay aligned with the input order.

    Args:
        smiles_list: Sliceable sequence of SMILES strings.
        radius: Morgan fingerprint radius.
        nBits: Fingerprint length, i.e. the number of columns per row.
        batch_size: Maximum number of molecules per yielded array.

    Yields:
        ``uint8`` arrays of shape ``(len(batch), nBits)``. Nothing is yielded
        for an empty ``smiles_list``.
    """
    total = len(smiles_list)
    n_batches = (total + batch_size - 1) // batch_size
    with tqdm(total=n_batches, desc="Generating ECFP Features", leave=True) as progress:
        for start in range(0, total, batch_size):
            chunk = smiles_list[start : start + batch_size]
            features = np.zeros((len(chunk), nBits), dtype=np.uint8)

            # Iterating the 2-D array gives row views, so writes land in `features`.
            for row, smi in zip(features, chunk):
                molecule = AllChem.MolFromSmiles(smi)
                if not molecule:
                    continue  # unparsable SMILES: leave the zero row in place
                bit_vect = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=radius, nBits=nBits)
                DataStructs.ConvertToNumpyArray(bit_vect, row)

            progress.update(1)
            yield features
# Train SVM
def train_svc(train_features, train_labels):
    """Fit and return an ``SVC`` with this script's fixed hyperparameters.

    Args:
        train_features: Feature matrix passed to ``SVC.fit``.
        train_labels: Class labels passed to ``SVC.fit``.

    Returns:
        The fitted ``SVC``. It is built with ``probability=True``, which the
        rest of the script relies on when calling ``predict_proba``.
    """
    hyperparameters = {
        "C": 14.760052670334735,
        "gamma": 0.011087183655158359,
        "class_weight": "balanced",
        "probability": True,
        "random_state": 42,
    }
    classifier = SVC(**hyperparameters)
    classifier.fit(train_features, train_labels)
    return classifier

# Max Tanimoto similarity of each selection-pool molecule to the training actives
def calculate_max_tanimoto_similarity(selection_pool, train_active_smiles, radius=3, nBits=2048):
    """Return, per pool molecule, its highest Tanimoto similarity to any training active.

    Args:
        selection_pool: DataFrame with a ``canonical_smiles`` column.
        train_active_smiles: Iterable of SMILES strings for the training
            actives. Entries RDKit cannot parse are skipped.
        radius: Morgan fingerprint radius.
        nBits: Morgan fingerprint length in bits.

    Returns:
        A list with one value per row of ``selection_pool``, in row order.
        The value is ``0`` when the pool SMILES cannot be parsed or when no
        training fingerprint is available.
    """
    # Parse every training SMILES exactly once, then fingerprint the valid ones.
    train_mols = (AllChem.MolFromSmiles(smiles) for smiles in train_active_smiles)
    train_fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        for mol in train_mols
        if mol is not None
    ]

    max_similarities = []
    for smiles in tqdm(selection_pool["canonical_smiles"], desc="Calculating Tanimoto similarities"):
        mol = AllChem.MolFromSmiles(smiles)
        if mol is None or not train_fps:
            max_similarities.append(0)
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        # One bulk call replaces a Python-level loop over every training fingerprint.
        max_similarities.append(max(DataStructs.BulkTanimotoSimilarity(fp, train_fps)))
    return max_similarities

# Evaluation
def evaluate_model(predictions, expected):
    """Score positive-class scores against the true labels.

    Args:
        predictions: Array-like of scores for the positive class. It is
            converted with ``np.asarray`` so a plain list works as well as an
            ndarray.
        expected: True labels, passed unchanged to the scikit-learn metrics.

    Returns:
        Tuple ``(accuracy, balanced_accuracy, average_precision, auc_pr)``.
        Accuracy and balanced accuracy use labels obtained by thresholding
        ``predictions`` at 0.5. Average precision and the area under the
        precision-recall curve use the raw scores.
    """
    # Without this, `>=` on a plain list raises TypeError.
    predictions = np.asarray(predictions)
    predicted_labels = (predictions >= 0.5).astype(int)

    accuracy = accuracy_score(expected, predicted_labels)
    balanced_acc = balanced_accuracy_score(expected, predicted_labels)
    average_precision = average_precision_score(expected, predictions)
    precision, recall, _ = precision_recall_curve(expected, predictions)
    auc_pr = auc(recall, precision)
    return accuracy, balanced_acc, average_precision, auc_pr

# Active Learning loop
def _ecfp_matrix(frame):
    """Return one stacked ECFP array for the ``canonical_smiles`` column of ``frame``."""
    batches = smiles_to_ecfp_counts(frame["canonical_smiles"].tolist())
    return np.concatenate(list(batches))


def _fit_on(train_set):
    """Featurise ``train_set`` and fit a fresh SVC on its ``pChEMBL_gt6`` labels."""
    return train_svc(_ecfp_matrix(train_set), train_set["pChEMBL_gt6"].values)


def _score(model, iteration, heading, test_features, test_labels):
    """Evaluate ``model`` on the test data, print the metrics, and return the result row."""
    test_predictions = model.predict_proba(test_features)[:, 1]
    accuracy, balanced_acc, avg_precision, auc_pr = evaluate_model(test_predictions, test_labels)
    print(f"{heading}: Accuracy={accuracy:.4f}, Balanced Accuracy={balanced_acc:.4f}, AP={avg_precision:.4f}, AUC-PR={auc_pr:.4f}")
    return {
        "iteration": iteration,
        "accuracy": accuracy,
        "balanced_accuracy": balanced_acc,
        "average_precision": avg_precision,
        "auc_pr": auc_pr,
    }


def active_learning_loop(selection_pool, train_set, test_set, max_iterations=10, molecules_to_select=400):
    """Run similarity-driven active learning and return per-iteration test metrics.

    An initial SVC is trained on ``train_set`` and evaluated on ``test_set``.
    Each iteration then moves the ``molecules_to_select`` pool molecules with
    the highest maximum Tanimoto similarity to the current training actives
    (rows with ``pChEMBL_gt6 == 1``) into the training set, retrains, and
    re-evaluates.

    Args:
        selection_pool: DataFrame with ``canonical_smiles`` and ``pChEMBL_gt6``
            columns. It is copied, so the caller's frame is left unchanged.
        train_set: DataFrame with ``canonical_smiles`` and ``pChEMBL_gt6``.
        test_set: DataFrame with ``canonical_smiles`` and ``pChEMBL_gt6``.
        max_iterations: Maximum number of selection rounds after the initial
            model.
        molecules_to_select: Number of pool molecules moved per round.

    Returns:
        List of dicts with keys ``iteration``, ``accuracy``,
        ``balanced_accuracy``, ``average_precision`` and ``auc_pr``.
        Iteration 0 is the initial model. The list is shorter than
        ``max_iterations + 1`` if the selection pool runs out first.
    """
    # Copy so the helper similarity column is never written into the caller's DataFrame.
    selection_pool = selection_pool.copy()

    print("Training initial SVC model...")

    # Convert the training data and fit the initial model
    model = _fit_on(train_set)

    # Convert the test data once; it is reused for every evaluation
    test_features = _ecfp_matrix(test_set)
    test_labels = test_set["pChEMBL_gt6"].values

    results = [_score(model, 0, "Initial Model Results", test_features, test_labels)]

    for iteration in range(1, max_iterations + 1):
        print(f"Iteration {iteration}: Selection Pool Size = {len(selection_pool)}")

        # Nothing left to select: further rounds would only retrain on the same data.
        if len(selection_pool) == 0:
            print(f"Iteration {iteration}: Selection pool exhausted, stopping early.")
            break

        train_active_smiles = train_set.loc[train_set["pChEMBL_gt6"] == 1, "canonical_smiles"].tolist()
        selection_pool["max_tanimoto_similarity"] = calculate_max_tanimoto_similarity(selection_pool, train_active_smiles)

        selected_molecules = selection_pool.nlargest(molecules_to_select, "max_tanimoto_similarity")
        selection_pool = selection_pool.drop(selected_molecules.index)

        # Drop the helper column so it does not end up in train_set as a NaN-filled column.
        train_set = pd.concat([train_set, selected_molecules.drop(columns="max_tanimoto_similarity")])
        print(f"Iteration {iteration}: Train set size = {len(train_set)}")

        model = _fit_on(train_set)
        results.append(_score(model, iteration, f"Iteration {iteration} Results", test_features, test_labels))

    return results


# Load the train / selection-pool / test splits (read in this order)
train_set, selection_pool, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/MDM2_scaffold_train_set_20.csv",
        "./Data/MDM2_scaffold_selection_pool_20.csv",
        "./Data/MDM2_scaffold_test_set_20.csv",
    )
)

# Active-learning settings
max_iterations = 10
molecules_to_select = 400
results = active_learning_loop(
    selection_pool,
    train_set,
    test_set,
    max_iterations=max_iterations,
    molecules_to_select=molecules_to_select,
)

# Write the per-iteration metrics to a CSV inside save_dir
save_path = os.path.join(save_dir, "MDM2_scaffold_similarity_R1.csv")
results_table = pd.DataFrame(results)
results_table.to_csv(save_path, index=False)
print(f"Results saved to {save_path}")
print("Active Learning Process Completed!")
