In [None]:
import pandas as pd
import numpy as np
import torch
import os
from tqdm import tqdm
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, average_precision_score, precision_recall_curve, auc, balanced_accuracy_score
)
from joblib import Parallel, delayed  # For parallel computation

# Pick the torch device: start from CPU and switch to CUDA when torch reports it
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
print(f"Using device: {device}")

# Folder that receives the result files; it is created if it does not exist yet
save_dir = "C:\\Users\\jen\\Proteins\\MDM2\\Scaffold outersplit\\Results\\MDM2_random_scaffold"
os.makedirs(save_dir, exist_ok=True)

# Set random seed
def set_seed(seed=42):
    """Seed the NumPy and PyTorch random number generators and pin cuDNN flags.

    Args:
        seed: Integer passed unchanged to ``np.random.seed``,
            ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
    """
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and switch off its auto-tuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

# **Parallelized SMILES -> ECFP conversion**
def smiles_to_ecfp_counts(smiles_list, radius=3, nBits=2048):
    """Convert SMILES strings into a 2-D array of Morgan bit fingerprints.

    Despite the ``_counts`` suffix (kept so existing callers keep working),
    the fingerprints come from ``GetMorganFingerprintAsBitVect`` and are
    therefore binary bit vectors, not count vectors.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Morgan radius forwarded to RDKit.
        nBits: Length of each fingerprint vector.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` with one row per input SMILES and
        ``nBits`` columns. A SMILES that RDKit fails to parse yields an
        all-zero row. Empty input yields an array of shape ``(0, nBits)``.
    """
    smiles_list = list(smiles_list)
    if not smiles_list:
        # Keep the result 2-D so callers can vstack / predict on it without
        # special-casing; np.array([]) would collapse to shape (0,).
        return np.zeros((0, nBits), dtype=np.uint8)

    def compute_fingerprint(smiles):
        arr = np.zeros((nBits,), dtype=np.uint8)
        mol = AllChem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: fall back to the all-zero vector.
            return arr
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr

    # One joblib task per molecule, using every available core (n_jobs=-1).
    fingerprints = Parallel(n_jobs=-1)(delayed(compute_fingerprint)(smiles) for smiles in smiles_list)
    return np.array(fingerprints)

# Train SVM model
def train_svc(train_features, train_labels):
    """Fit an SVC with fixed hyperparameters and return the fitted estimator.

    Args:
        train_features: Feature matrix handed to ``SVC.fit``.
        train_labels: Target labels handed to ``SVC.fit``.

    Returns:
        The fitted ``SVC`` instance.
    """
    hyperparameters = {
        "C": 14.760052670334735,
        "gamma": 0.011087183655158359,
        "class_weight": "balanced",
        # probability=True is what makes predict_proba available on the model
        "probability": True,
        "random_state": 42,
    }
    classifier = SVC(**hyperparameters)
    classifier.fit(train_features, train_labels)
    return classifier

# Active Learning main loop
def active_learning_loop(selection_pool, train_set, test_set, max_iterations=10, molecules_to_select=400):
    """Run a random-selection active-learning baseline, scoring each round on the test set.

    An SVC is trained on ``train_set`` and evaluated (recorded as iteration 0).
    Then, for up to ``max_iterations`` rounds, rows are sampled at random from
    ``selection_pool``, appended to the training data and removed from the
    pool, and the model is retrained and re-evaluated.

    Args:
        selection_pool: DataFrame with ``canonical_smiles`` and ``pChEMBL_gt6``
            columns from which new training molecules are drawn.
        train_set: DataFrame with the same two columns used for the initial model.
        test_set: DataFrame with the same two columns used for evaluation.
        max_iterations: Maximum number of selection rounds.
        molecules_to_select: Number of molecules requested per round. When the
            pool holds fewer rows, all remaining rows are taken.

    Returns:
        List of dicts, one per evaluated model, with keys ``iteration``,
        ``accuracy``, ``balanced_accuracy``, ``average_precision`` and ``auc_pr``.

    Notes:
        The caller's DataFrames are not modified: the pool and training set are
        rebound locally instead of being edited in place. The loop stops early
        once the selection pool is empty rather than raising from ``sample``.
    """

    def score_on_test(iteration, model):
        # Column 1 of predict_proba (the second entry of model.classes_) is
        # used as the score for every metric.
        test_predictions = model.predict_proba(test_features)[:, 1]
        accuracy, balanced_acc, avg_precision, auc_pr = evaluate_model(test_predictions, test_labels)
        return {
            "iteration": iteration,
            "accuracy": accuracy,
            "balanced_accuracy": balanced_acc,
            "average_precision": avg_precision,
            "auc_pr": auc_pr,
        }

    def format_metrics(metrics):
        return (
            f"Accuracy={metrics['accuracy']:.4f}, "
            f"Balanced Accuracy={metrics['balanced_accuracy']:.4f}, "
            f"AP={metrics['average_precision']:.4f}, "
            f"AUC-PR={metrics['auc_pr']:.4f}"
        )

    print("Training initial SVC model...")

    train_features = np.asarray(smiles_to_ecfp_counts(train_set["canonical_smiles"].tolist()))
    train_labels = train_set["pChEMBL_gt6"].values
    model = train_svc(train_features, train_labels)

    # The test set never changes, so it is featurised exactly once.
    test_features = smiles_to_ecfp_counts(test_set["canonical_smiles"].tolist())
    test_labels = test_set["pChEMBL_gt6"].values

    results = [score_on_test(0, model)]
    print(f"Initial Model Results: {format_metrics(results[0])}")

    for iteration in range(1, max_iterations + 1):
        print(f"Iteration {iteration}: Selection Pool Size = {len(selection_pool)}")

        if selection_pool.empty:
            print(f"Iteration {iteration}: selection pool is empty, stopping early.")
            break

        # Sampling without replacement raises if n exceeds the pool size,
        # so cap the request at what is actually left.
        n_select = min(molecules_to_select, len(selection_pool))

        # Randomly sample molecules from the selection pool (same seed every
        # round; the draws still differ because the pool shrinks).
        selected_molecules = selection_pool.sample(n=n_select, random_state=42)
        train_set = pd.concat([train_set, selected_molecules], ignore_index=True)
        selection_pool = selection_pool.drop(index=selected_molecules.index)

        print(f"Iteration {iteration}: Train set size = {len(train_set)}")

        # Only the newly added molecules need fingerprints; earlier rows are
        # already in train_features in the same order as train_set.
        if not selected_molecules.empty:
            new_features = smiles_to_ecfp_counts(selected_molecules["canonical_smiles"].tolist())
            train_features = np.vstack([train_features, new_features])
        train_labels = train_set["pChEMBL_gt6"].values

        model = train_svc(train_features, train_labels)

        metrics = score_on_test(iteration, model)
        results.append(metrics)
        print(f"Iteration {iteration} Results: {format_metrics(metrics)}")

    return results

# **Evaluate the model**
def evaluate_model(predictions, expected):
    """Score predicted probabilities against the true labels.

    Args:
        predictions: Array of predicted scores; values ``>= 0.5`` are turned
            into label 1, everything else into label 0.
        expected: True labels.

    Returns:
        Tuple ``(accuracy, balanced_accuracy, average_precision, auc_pr)``.
        The first two use the thresholded labels, the last two use the raw
        scores; ``auc_pr`` is the area under the precision-recall curve.
    """
    thresholded = (predictions >= 0.5).astype(int)

    # Label-based metrics share the same call signature, so evaluate them together
    acc, bal_acc = (
        metric(expected, thresholded)
        for metric in (accuracy_score, balanced_accuracy_score)
    )

    # Score-based metrics work on the untouched probabilities
    ap_score = average_precision_score(expected, predictions)
    pr_precision, pr_recall, _ = precision_recall_curve(expected, predictions)
    pr_area = auc(pr_recall, pr_precision)

    return acc, bal_acc, ap_score, pr_area

# **Initialize data**
# The CSV files are read in the order: training set, selection pool, test set
train_set, selection_pool, test_set = map(
    pd.read_csv,
    (
        "./Data/MDM2_scaffold_train_set_20.csv",
        "./Data/MDM2_scaffold_selection_pool_20.csv",
        "./Data/MDM2_scaffold_test_set_20.csv",
    ),
)

# **Start Active Learning**
molecules_to_select, max_iterations = 400, 10
results = active_learning_loop(
    selection_pool,
    train_set,
    test_set,
    max_iterations=max_iterations,
    molecules_to_select=molecules_to_select,
)

# **Save results**
# One CSV row per evaluated model, written into the results folder
save_path = os.path.join(save_dir, "MDM2_random_scaffold_R1.csv")
pd.DataFrame(results).to_csv(save_path, index=False)
print(f"Results saved to {save_path}", "Active Learning Process Completed!", sep="\n")
