# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, average_precision_score, precision_recall_curve, auc, balanced_accuracy_score
from tqdm import tqdm
import os

# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the directory where results are saved.
# The Windows path must be a raw string: in a normal literal the "\U" of
# "C:\Users" starts a \UXXXXXXXX unicode escape, which is a SyntaxError.
save_dir = r"C:\Users\jen\Proteins\DRD2\Scaffold outersplit\Results\MDM2_active_learning_scaffold"
os.makedirs(save_dir, exist_ok=True)

def set_seed(seed=42):
    """Seed NumPy and PyTorch and switch cuDNN to deterministic mode.

    Args:
        seed: Value handed to NumPy's global RNG, PyTorch's CPU RNG and the
            RNG of every CUDA device.
    """
    # Same seed for every generator, applied in a fixed order.
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Request deterministic cuDNN behaviour and turn off its benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)  # Set the global random seed

# Convert SMILES into ECFP descriptors
def smiles_to_ecfp(smiles_list, radius=3, nBits=2048):
    """Convert SMILES strings into Morgan (ECFP-style) bit-vector fingerprints.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Morgan fingerprint radius passed to RDKit.
        nBits: Length of each fingerprint bit vector.

    Returns:
        A float32 array with one row per input SMILES and ``nBits`` columns.
        An empty input yields shape ``(0, nBits)`` so the result is always
        two-dimensional.

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    fingerprints = []
    for position, smiles in enumerate(smiles_list):
        mol = AllChem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None for unparsable input. Fail loudly and
            # name the offending entry; rows are not skipped because callers
            # pair the returned rows with labels by position.
            raise ValueError(f"Invalid SMILES at position {position}: {smiles!r}")
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        arr = np.zeros((nBits,), dtype=np.float32)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fingerprints.append(arr)

    if not fingerprints:
        # np.array([]) would be 1-D float64; keep the (rows, nBits) contract.
        return np.zeros((0, nBits), dtype=np.float32)
    return np.array(fingerprints)

# Train SVC 
def train_svc(train_features, train_labels):
    """Fit a class-weight-balanced SVC with probability estimates enabled.

    Args:
        train_features: Feature matrix handed to ``SVC.fit``.
        train_labels: Target labels handed to ``SVC.fit``.

    Returns:
        The fitted ``SVC`` instance.
    """
    # Fixed hyperparameters (presumably from an earlier tuning run -- confirm).
    svc_params = {
        "C": 14.760052670334735,
        "class_weight": 'balanced',
        "gamma": 0.011087183655158359,
        "probability": True,  # needed so predict_proba is available later
        "random_state": 42,
    }
    classifier = SVC(**svc_params)
    classifier.fit(train_features, train_labels)
    return classifier

# Predict class-1 probabilities
def predict_with_uncertainty(model, test_features):
    """Return column 1 of ``model.predict_proba(test_features)``.

    Despite the function name, no separate uncertainty estimate is computed:
    the result is the predicted probability of the class stored at index 1
    (the positive class when the labels are 0/1).

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        test_features: Feature matrix to score.

    Returns:
        One-dimensional array with one probability per input row.
    """
    return model.predict_proba(test_features)[:, 1]

# Evaluate metrics
def evaluate_model(predictions, expected):
    """Score predicted probabilities against the true labels.

    Args:
        predictions: NumPy array of predicted scores; values ``>= 0.5`` count
            as class 1 for the label-based metrics.
        expected: True labels.

    Returns:
        Tuple ``(accuracy, balanced_accuracy, average_precision, auc_pr)``.
    """
    hard_labels = (predictions >= 0.5).astype(int)

    # Threshold-dependent metrics are computed on the hard 0/1 labels.
    acc = accuracy_score(expected, hard_labels)
    bal_acc = balanced_accuracy_score(expected, hard_labels)  # Balanced Accuracy

    # Ranking metrics are computed on the raw scores.
    avg_prec = average_precision_score(expected, predictions)
    precision_pts, recall_pts, _thresholds = precision_recall_curve(expected, predictions)
    pr_auc = auc(recall_pts, precision_pts)

    return acc, bal_acc, avg_prec, pr_auc

# Active Learning cycle
def active_learning_loop(selection_pool, train_set, test_set, *, n_select=None, n_iterations=None):
    """Run greedy pool-based active learning with an SVC and track test metrics.

    An initial model is trained on ``train_set`` and evaluated on ``test_set``
    (recorded as iteration 0). In every following iteration the pool molecules
    with the highest predicted class-1 probability are moved from the pool
    into the training set, the model is retrained from scratch and evaluated
    again on the same test set.

    Args:
        selection_pool: DataFrame of candidate molecules with the columns
            ``canonical_smiles`` and ``pChEMBL_gt6``. It is copied, so the
            caller's frame is left unmodified.
        train_set: DataFrame with the same columns used for the initial fit.
        test_set: DataFrame with the same columns used for evaluation only.
        n_select: Molecules moved to the training set per iteration. Defaults
            to the module-level ``molecules_to_select``.
        n_iterations: Maximum number of acquisition rounds. Defaults to the
            module-level ``max_iterations``.

    Returns:
        A new list with one dict per evaluated model, holding the keys
        ``iteration``, ``accuracy``, ``balanced_accuracy``,
        ``average_precision`` and ``auc_pr``.
    """
    # Fall back to the module-level settings so that existing three-argument
    # calls behave exactly as before.
    if n_select is None:
        n_select = molecules_to_select
    if n_iterations is None:
        n_iterations = max_iterations

    # Private copy: the 'predictions' scratch column must not appear on the
    # caller's DataFrame.
    selection_pool = selection_pool.copy()
    # Local history instead of a shared global list, so repeated calls do not
    # accumulate rows from earlier runs.
    history = []

    print("Training initial SVC model...")
    test_features = smiles_to_ecfp(test_set["canonical_smiles"].tolist())
    test_labels = test_set["pChEMBL_gt6"].values

    model, metrics = _train_and_score(train_set, test_features, test_labels)
    history.append(_metrics_record(0, metrics))
    accuracy, balanced_acc, avg_precision, auc_pr = metrics
    print(f"Initial Model Results: Accuracy={accuracy:.4f}, Balanced Accuracy={balanced_acc:.4f}, AP={avg_precision:.4f}, AUC-PR={auc_pr:.4f}")

    for iteration in range(1, n_iterations + 1):
        if selection_pool.empty:
            # Nothing left to acquire; scoring an empty pool would fail.
            print(f"Iteration {iteration}: Selection pool exhausted, stopping early.")
            break
        print(f"Iteration {iteration}: Selection Pool Size = {len(selection_pool)}")

        # Score every remaining candidate with the current model.
        selection_features = smiles_to_ecfp(selection_pool["canonical_smiles"].tolist())
        selection_pool["predictions"] = predict_with_uncertainty(model, selection_features)

        # Greedy acquisition: take the candidates ranked most likely class 1.
        n_instances = min(n_select, len(selection_pool))
        selected_molecules = selection_pool.nlargest(n_instances, "predictions")
        selection_pool = selection_pool.drop(selected_molecules.index)

        # Drop the scratch column so it does not leak into the training frame.
        train_set = pd.concat([train_set, selected_molecules.drop(columns="predictions")])
        print(f"Iteration {iteration}: Train set size = {len(train_set)}")

        model, metrics = _train_and_score(train_set, test_features, test_labels)
        history.append(_metrics_record(iteration, metrics))
        accuracy, balanced_acc, avg_precision, auc_pr = metrics
        print(f"Iteration {iteration} Results: Accuracy={accuracy:.4f}, Balanced Accuracy={balanced_acc:.4f}, AP={avg_precision:.4f}, AUC-PR={auc_pr:.4f}")

        # Clear the GPU cache after every iteration (kept from the original;
        # nothing in this loop visibly allocates GPU tensors).
        torch.cuda.empty_cache()

    return history


def _train_and_score(train_set, test_features, test_labels):
    """Fit an SVC on ``train_set`` and return it with its test-set metrics tuple."""
    train_features = smiles_to_ecfp(train_set["canonical_smiles"].tolist())
    train_labels = train_set["pChEMBL_gt6"].values
    model = train_svc(train_features, train_labels)
    test_predictions = predict_with_uncertainty(model, test_features)
    return model, evaluate_model(test_predictions, test_labels)


def _metrics_record(iteration, metrics):
    """Build one result row; the key order fixes the column order of the CSV."""
    accuracy, balanced_acc, avg_precision, auc_pr = metrics
    return {
        "iteration": iteration,
        "accuracy": accuracy,
        "balanced_accuracy": balanced_acc,
        "average_precision": avg_precision,
        "auc_pr": auc_pr,
    }

# Settings
# molecules_to_select and max_iterations are module-level names that
# active_learning_loop looks up when it runs, so define them before the call.
molecules_to_select = 400
max_iterations = 10
results = []  # start from an empty metrics history

# Data initialisation
train_set = pd.read_csv("./Data/MDM2_scaffold_train_set_20.csv")
selection_pool = pd.read_csv("./Data/MDM2_selection_pool_20.csv")
test_set = pd.read_csv("./Data/MDM2_scaffold_test_set_20.csv")

# Start Active Learning
torch.cuda.empty_cache()
results = active_learning_loop(
    selection_pool=selection_pool,
    train_set=train_set,
    test_set=test_set,
)

# Write one CSV row per evaluated model into the results directory.
save_path = os.path.join(save_dir, "MDM2_AL_results.csv")
results_frame = pd.DataFrame(results)
results_frame.to_csv(save_path, index=False)
print(f"Results saved to {save_path}")
print("Active Learning Process Completed!")
