In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit import DataStructs
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Hex line colors for the recovery curves. plot_recovery_curves indexes this
# list cyclically (i % len(PAPER_COLORS)), so the palette repeats when more
# curves are plotted than there are entries.
PAPER_COLORS = ['#1B9E77', '#7570B3', '#66A61E']

In [None]:
class ActiveLearningLoop:
    """Greedy active-learning search for the lowest-affinity compounds in a table.

    A regressor is trained on the fingerprints of the compounds "evaluated" so
    far, every remaining compound is scored, and the ones with the lowest
    predicted affinity are added to the evaluated set.  After each round the
    loop records how many of the true top compounds (lowest measured affinity)
    have been recovered.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must provide the columns ``ID``, ``smiles`` and ``affinity``.
    fingerprint_type : {'morgan', 'maccs', 'map4'}
        Any other value raises ``KeyError``.  NOTE(review): no MAP4 encoder is
        implemented in this class; every type other than ``'maccs'`` is encoded
        with the Morgan generator.
    model_type : {'RF', 'SVR', 'MLR'}
        Regressor built by :meth:`_make_model`.
    model_params : dict, optional
        Keyword overrides merged on top of the built-in estimator defaults.
    preset, test_size
        Stored on the instance; not used by any method of this class.
    init_fraction, selection_fraction, top_fraction : float
        Fractions of the table used for the random initial set, for the batch
        added per iteration, and for the definition of "top" compounds.
    max_iterations : int
        Maximum number of select-and-retrain rounds.
    random_state : int
        Seed for the initial random draw and for the random forest.
    """

    # Vector lengths produced by the two encoders implemented below.
    MORGAN_BITS = 2048
    MACCS_BITS = 167

    def __init__(self, data_df, fingerprint_type='morgan', model_type='RF',
                 model_params=None, preset='large',
                 init_fraction=0.20, selection_fraction=0.02, top_fraction=0.01,
                 max_iterations=5, random_state=42, test_size=0.2):
        self.data = data_df.reset_index(drop=True)
        self.fingerprint_type = fingerprint_type
        self.model_type = model_type
        # Copy so later changes to the caller's dict cannot alter this loop.
        self.model_params = dict(model_params or {})
        self.preset = preset
        self.init_fraction = init_fraction
        self.selection_fraction = selection_fraction
        self.top_fraction = top_fraction
        self.max_iterations = max_iterations
        self.random_state = random_state
        self.test_size = test_size
        self.smiles_dict = dict(zip(self.data['ID'], self.data['smiles']))
        self.affinity_dict = dict(zip(self.data['ID'], self.data['affinity']))
        self.fingerprint_cache = {}
        self.evaluated_ids = set()
        self.all_ids = self.data["ID"].tolist()
        self.progress_log = []
        # Kept for backward compatibility; the length of the vectors actually
        # produced is given by _vector_length().
        self.fp_length = {'morgan': 2048, 'maccs': 167, 'map4': 1024}[fingerprint_type]
        self.morgan_generator = GetMorganGenerator(radius=3, fpSize=self.MORGAN_BITS)

    def _vector_length(self):
        """Return the length of the vectors smiles_to_fingerprint really emits."""
        if self.fingerprint_type == 'maccs':
            return self.MACCS_BITS
        return self.MORGAN_BITS

    def _fp_morgan(self, smiles):
        """Return the Morgan bit vector of ``smiles`` as a float array (zeros if unparsable)."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros(self.MORGAN_BITS, dtype=float)
        fp = self.morgan_generator.GetFingerprint(mol)
        arr = np.zeros((self.MORGAN_BITS,), dtype=int)
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr.astype(float)

    def _fp_maccs(self, smiles):
        """Return the MACCS key vector of ``smiles`` as a float array (zeros if unparsable)."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros(self.MACCS_BITS, dtype=float)
        fp = MACCSkeys.GenMACCSKeys(mol)
        arr = np.zeros((self.MACCS_BITS,), dtype=int)
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr.astype(float)

    def smiles_to_fingerprint(self, smiles):
        """Encode one SMILES string; fall back to an all-zero vector on failure.

        The fallback has the same length as a successful encoding so that the
        cached vectors can always be stacked into one matrix.
        """
        try:
            if self.fingerprint_type == 'maccs':
                return self._fp_maccs(smiles)
            return self._fp_morgan(smiles)
        except Exception:
            # Best effort: a bad input must not abort the whole campaign, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return np.zeros(self._vector_length(), dtype=float)

    def compute_fingerprints(self):
        """Fill ``fingerprint_cache`` for every ID, skipping IDs already cached."""
        for idx, smiles in self.smiles_dict.items():
            if idx not in self.fingerprint_cache:
                self.fingerprint_cache[idx] = self.smiles_to_fingerprint(smiles)

    def _make_model(self):
        """Build a fresh, unfitted regressor for ``self.model_type``.

        Built-in defaults are overridden by ``self.model_params``.

        Raises
        ------
        ValueError
            If ``self.model_type`` is not 'RF', 'SVR' or 'MLR'.
        """
        overrides = self.model_params or {}
        if self.model_type == 'RF':
            params = dict(n_estimators=600, max_features=0.3,
                          n_jobs=-1, random_state=self.random_state)
            params.update(overrides)
            return RandomForestRegressor(**params)
        if self.model_type == 'SVR':
            params = dict(kernel='rbf', C=5.0, epsilon=0.1, gamma='scale')
            params.update(overrides)
            return Pipeline([('scaler', StandardScaler()), ('svr', SVR(**params))])
        if self.model_type == 'MLR':
            params = dict(alpha=1.0)
            params.update(overrides)
            return Pipeline([('scaler', StandardScaler()), ('ridge', Ridge(**params))])
        raise ValueError(
            f"Unknown model_type {self.model_type!r}; expected 'RF', 'SVR' or 'MLR'."
        )

    def get_top_affinities(self, top_percent):
        """Return ``(top_ids, top_n)`` for the lowest-affinity compounds.

        ``top_percent`` is a fraction (0.01 means 1 %) of the rows that have a
        non-missing affinity; at least one compound is always requested.
        """
        df = self.data[['ID', 'affinity']].dropna().copy()
        df = df.sort_values('affinity', ascending=True)
        top_n = max(1, int(top_percent * len(df)))
        top_ids = set(df.head(top_n)['ID'].tolist())
        return top_ids, top_n

    def _log_progress(self, iteration, top_ids, top_n):
        """Append one recovery record for the current evaluated set."""
        captured = self.evaluated_ids.intersection(top_ids)
        self.progress_log.append({'iteration': iteration,
                                  'evaluated': len(self.evaluated_ids),
                                  'recovered': len(captured),
                                  'recovered_pct': 100.0 * len(captured) / max(1, top_n)})

    def run(self):
        """Run the campaign and fill ``progress_log`` (one record per round, round 0 = random start).

        State from a previous call (``evaluated_ids``, ``progress_log``) is
        discarded, so the method can be re-run safely.

        Raises
        ------
        ValueError
            If the dataset contains no compounds.
        """
        if not self.all_ids:
            raise ValueError("Cannot run active learning on an empty dataset.")

        rng = np.random.RandomState(self.random_state)
        n_total = len(self.all_ids)
        n_initial = max(1, int(self.init_fraction * n_total))
        selection_size = max(1, int(self.selection_fraction * n_total))
        top_ids, top_n = self.get_top_affinities(top_percent=self.top_fraction)
        self.compute_fingerprints()

        # Sample positions rather than ID values so the IDs keep their native
        # Python type (no coercion through a numpy array).
        picked = rng.choice(n_total, n_initial, replace=False)
        self.evaluated_ids = {self.all_ids[i] for i in picked}
        self.progress_log = []
        self._log_progress(0, top_ids, top_n)

        # Fixed dataset order: iterating a set of string IDs would change
        # between interpreter sessions and make results irreproducible.
        ordered_ids = list(dict.fromkeys(self.all_ids))

        for iteration in range(self.max_iterations):
            print(f"\nIteration {iteration+1}/{self.max_iterations}")

            train_ids = [cid for cid in ordered_ids
                         if cid in self.evaluated_ids
                         and pd.notna(self.affinity_dict.get(cid))]
            if not train_ids:
                break
            candidates = [cid for cid in ordered_ids if cid not in self.evaluated_ids]
            if not candidates:
                break

            X_train = np.stack([self.fingerprint_cache[cid] for cid in train_ids], axis=0)
            y_train = np.array([self.affinity_dict[cid] for cid in train_ids])
            model = self._make_model()
            model.fit(X_train, y_train)

            X_candidates = np.stack([self.fingerprint_cache[c] for c in candidates], axis=0)
            preds = model.predict(X_candidates)
            # Lowest predicted affinity first; ties keep dataset order.
            order = np.argsort(preds, kind='stable')
            take = min(selection_size, len(candidates))
            selected_ids = [candidates[i] for i in order[:take]]
            self.evaluated_ids.update(selected_ids)
            self._log_progress(iteration + 1, top_ids, top_n)

In [None]:
def build_progress_df(loop):
    """Turn a finished loop's ``progress_log`` into a DataFrame for plotting.

    Two columns are appended to the logged fields: ``explored`` (the number of
    evaluated compounds) and ``explored_pct`` (that number as a percentage of
    ``len(loop.all_ids)``, with the denominator floored at 1).
    """
    population = max(1, len(loop.all_ids))
    return pd.DataFrame(loop.progress_log).assign(
        explored=lambda frame: frame['evaluated'],
        explored_pct=lambda frame: 100.0 * frame['evaluated'] / population,
    )

def plot_recovery_curves(progress_by_model, fingerprint_type, init_fraction, top_fraction):
    """Plot one recovery curve per model on a shared figure and show it.

    Parameters
    ----------
    progress_by_model : dict
        Maps a model label to a DataFrame with ``explored`` and
        ``recovered_pct`` columns.
    fingerprint_type : str
        Shown in the figure title.
    init_fraction, top_fraction : float
        Fractions (0.01 means 1 %) shown in the figure title.
    """
    fig, ax = plt.subplots()
    for i, (model, dfp) in enumerate(progress_by_model.items()):
        # Cycle through the palette when there are more curves than colors.
        color = PAPER_COLORS[i % len(PAPER_COLORS)]
        ax.plot(dfp['explored'], dfp['recovered_pct'], marker='o',
                label=model, linewidth=2, markersize=5, color=color)
    ax.set_xlabel("Molecules explored")
    ax.set_ylabel("Percentage of top-n scores found")
    # ':g' formatting instead of int(): int() truncates binary float error
    # (0.29 * 100 -> 28) and collapses sub-percent fractions to 0.
    ax.set_title(
        f"Init={init_fraction * 100:g}%, Fingerprint={fingerprint_type}, "
        f"Top={top_fraction * 100:g}%"
    )
    ax.grid(True, alpha=0.3)
    if progress_by_model:
        # legend() warns when there are no labelled curves to list.
        ax.legend()
    plt.show()

def run_models_and_plot(df, fingerprint_type='morgan', models=('RF','SVR','MLR'),
                        init_fraction=0.10, selection_fraction=0.02, top_fraction=0.01,
                        max_iterations=5, random_state=42, test_size=0.2):
    """Run one active-learning campaign per model type and plot all recovery curves together.

    Every campaign is built from ``df`` with identical settings (and the
    ``'large'`` preset); only ``model_type`` varies.  The resulting progress
    tables are passed to ``plot_recovery_curves``.  Nothing is returned.
    """
    shared_settings = dict(
        fingerprint_type=fingerprint_type,
        preset='large',
        init_fraction=init_fraction,
        selection_fraction=selection_fraction,
        top_fraction=top_fraction,
        max_iterations=max_iterations,
        random_state=random_state,
        test_size=test_size,
    )

    def _progress_for(model_name):
        # One independent campaign for this model type.
        campaign = ActiveLearningLoop(df, model_type=model_name, **shared_settings)
        campaign.run()
        return build_progress_df(campaign)

    curves = {model_name: _progress_for(model_name) for model_name in models}
    plot_recovery_curves(curves, fingerprint_type, init_fraction, top_fraction)

Base filtrada: 5280 compuestos con afinidad disponible.

Iteración 1/5
X_train shape: (105, 167)
y_train stats: min=-10.300, max=-6.600, std=0.811
[maccs] Modelo RF - R2: -0.089 RMSE: 0.841 MAE: 0.720
[maccs] Modelo SVR - R2: 0.143 RMSE: 0.747 MAE: 0.610
[maccs] Modelo MLR - R2: 0.193 RMSE: 0.724 MAE: 0.620
Usando modelo MLR para predicción de candidatos.
Seleccionados 105 nuevos candidatos para la siguiente iteración.

Iteración 2/5
X_train shape: (210, 167)
y_train stats: min=-10.700, max=-6.600, std=0.860
[maccs] Modelo RF - R2: 0.454 RMSE: 0.601 MAE: 0.505
[maccs] Modelo SVR - R2: 0.425 RMSE: 0.616 MAE: 0.488
[maccs] Modelo MLR - R2: 0.209 RMSE: 0.723 MAE: 0.567
Usando modelo RF para predicción de candidatos.
Seleccionados 105 nuevos candidatos para la siguiente iteración.

Iteración 3/5
X_train shape: (315, 167)
y_train stats: min=-10.700, max=-6.600, std=0.813
[maccs] Modelo RF - R2: 0.142 RMSE: 0.724 MAE: 0.601
[maccs] Modelo SVR - R2: 0.225 RMSE: 0.688 MAE: 0.537
[maccs] Modelo

In [None]:
# Load the compound table used by every experiment below.
df = pd.read_csv("df_merged.csv")

# ActiveLearningLoop reads these three columns in its constructor; check them
# here so a malformed file fails with a clear message instead of a bare
# KeyError deep inside the loop.
_missing_columns = [col for col in ("ID", "smiles", "affinity") if col not in df.columns]
if _missing_columns:
    raise KeyError(f"df_merged.csv is missing required column(s): {_missing_columns}")

print(df.head())
print(f"Total compounds loaded: {len(df)}")

In [None]:
# Sweep the size of the random initial set; every other setting is held fixed
# so the resulting figures differ only in their starting fraction.
sweep_settings = dict(
    fingerprint_type='morgan',
    models=('RF','SVR','MLR'),
    selection_fraction=0.02,
    top_fraction=0.01,
    max_iterations=5,
    random_state=42,
    test_size=0.2,
)
for init_f in (0.01, 0.05, 0.10):
    run_models_and_plot(df, init_fraction=init_f, **sweep_settings)