### 1. Key Instance Detection

Some MIL algorithms can identify key instances (those that provide a `get_instance_weights` method). In this section, `AttentionNetworkRegressor` is used to estimate the conformer weights. Several 3D descriptors are compared to see how the estimated weight distribution depends on the representation type.

**Conclusion:** With the currently available representations, the estimated weight distribution is inconclusive (almost uniform).

In [None]:
from qsarmil.descriptor.rdkit import (RDKitGEOM, 
                                      RDKitAUTOCORR, 
                                      RDKitRDF, 
                                      RDKitMORSE, 
                                      RDKitWHIM, 
                                      RDKitGETAWAY)

from molfeat.calc import (Pharmacophore3D, 
                          USRDescriptors, 
                          ElectroShapeDescriptors)

from qsarmil.descriptor.wrapper import DescriptorWrapper

In [None]:
# 3D descriptor sets to compare, as (label, wrapped calculator) pairs.
# The label is later used as the column / row name of the result tables.
# Commented-out entries are kept as opt-in toggles: they can take a long
# time to compute.
desc_list = [
    ("RDKitGEOM", DescriptorWrapper(RDKitGEOM())),
    ("RDKitAUTOCORR", DescriptorWrapper(RDKitAUTOCORR())),
    ("RDKitRDF", DescriptorWrapper(RDKitRDF())),
    ("RDKitMORSE", DescriptorWrapper(RDKitMORSE())),
    ("RDKitWHIM", DescriptorWrapper(RDKitWHIM())),
    # ("RDKitGETAWAY", DescriptorWrapper(RDKitGETAWAY())),  # can be long
    # ("MolFeatPmapper", DescriptorWrapper(Pharmacophore3D(factory='pmapper'))),  # can be long
    ("MolFeatUSRD", DescriptorWrapper(USRDescriptors())),
    ("MolFeatElectroShape", DescriptorWrapper(ElectroShapeDescriptors())),
]

In [None]:
# Keyword arguments unpacked into AttentionNetworkRegressor(**network_hparams).
# The same settings are reused for every descriptor set so that differences in
# the resulting instance weights come from the representation only.
network_hparams = {
    "hidden_layer_sizes": (256, 128, 64),
    "num_epoch": 300,
    "batch_size": 128,
    "learning_rate": 0.001,
    "weight_decay": 0.001,
    "instance_weight_dropout": 0.01,
    "init_cuda": False,
    "verbose": False,
}

In [None]:
# One (initially empty) table per entry of confs_test. After the loop each
# table has one row per conformer and one column per descriptor set.
w_list = [pd.DataFrame() for _ in confs_test]
for desc_name, desc_calc in desc_list:
    
    # calc descriptors
    x_train = desc_calc.transform(confs_train)
    x_test = desc_calc.transform(confs_test)

    # scale descriptors
    # The scaler is fit on the training bags only and then applied to both
    # splits, so no test-set statistics enter the model input.
    scaler = BagMinMaxScaler()
    scaler.fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    # train model
    # A fresh model is trained for every descriptor set with identical
    # hyperparameters.
    model = AttentionNetworkRegressor(**network_hparams)
    model.fit(x_train_scaled, y_train)

    # get instance weights
    # NOTE(review): assumes w_pred yields one weight vector per test bag, in
    # the same order as confs_test -- confirm against get_instance_weights.
    # zip() stops at the shorter sequence, so a length mismatch would pass
    # silently.
    w_pred = model.get_instance_weights(x_test_scaled)
    for w, df in zip(w_pred, w_list):
        df[desc_name] = w
        # The row labels are re-assigned on every pass; they only depend on
        # the number of weights in the bag.
        df.index = [f"Conformer_{i + 1}" for i in range(len(w))]

In [None]:
w_list[0].round(2) # molecule 0: conformer weights, one column per descriptor set, rounded to 2 decimals

In [None]:
w_list[1].round(2) # molecule 1: conformer weights, one column per descriptor set, rounded to 2 decimals

### 2. Intra-bag vs. Inter-bag Variance

In [None]:
import numpy as np

def compute_bag_variances(bags):
    """
    Compare the spread of instances inside each bag with the spread between bags.

    The intra-bag variance of a bag is the mean squared Euclidean distance of
    its instances from the bag centroid. The inter-bag variance is the mean
    squared Euclidean distance of the bag centroids from the mean of all
    centroids.

    Parameters:
    - bags: list of np.ndarray, each of shape (n_instances, descriptor_dim)

    Returns:
    - intra_bag_variances: list of float (one per bag)
    - mean_intra_bag_variance: float
    - inter_bag_variance: float

    Raises:
    - ValueError: if a bag contains no instances
    """

    def _mean_squared_distance(points, center):
        # Mean over rows of the squared Euclidean distance to `center`.
        return np.mean(np.linalg.norm(points - center, axis=1) ** 2)

    centroids = []
    intra_bag_variances = []
    for instances in bags:
        if instances.shape[0] == 0:
            raise ValueError("A bag is empty.")
        centroid = instances.mean(axis=0)
        centroids.append(centroid)
        intra_bag_variances.append(_mean_squared_distance(instances, centroid))

    # Stack the per-bag centroids into an (n_bags, descriptor_dim) array so
    # they can be treated like the instances of one "bag of bags".
    centroid_matrix = np.stack(centroids, axis=0)
    inter_bag_variance = _mean_squared_distance(
        centroid_matrix, centroid_matrix.mean(axis=0)
    )

    mean_intra_bag_variance = np.mean(intra_bag_variances)
    return intra_bag_variances, mean_intra_bag_variance, inter_bag_variance

def normalized_entropy(weights, epsilon=1e-12):
    """
    Computes normalized entropy of a vector of attention weights.

    The weights are rescaled to sum to one, their Shannon entropy is computed
    and divided by the maximum possible entropy log(n) of n weights.

    Parameters:
    - weights: array-like of shape (n,) — non-negative, need not be normalized
    - epsilon: small value to avoid division by zero and log(0)

    Returns:
    - norm_entropy: float in [0, 1], where 0 = sharp, 1 = flat.
      For n <= 1 the maximum entropy is zero and the ratio is undefined;
      0.0 is returned in that case (a single weight carries all the mass).
    """
    weights = np.asarray(weights, dtype=np.float64)
    n_weights = len(weights)

    # Degenerate input: log(n) is 0 (or undefined), so dividing by it would
    # only amplify floating-point noise. Report the distribution as sharp.
    if n_weights <= 1:
        return 0.0

    weights = weights / (weights.sum() + epsilon)  # normalize

    entropy = -np.sum(weights * np.log(weights + epsilon))
    # Exact upper bound of the entropy; n >= 2 here, so no epsilon is needed
    # inside the logarithm.
    max_entropy = np.log(n_weights)

    return entropy / max_entropy

In [None]:
# One row per descriptor set with the columns "intra", "inter" and "ratio".
var_df = pd.DataFrame()
for desc_name, desc_calc in desc_list:

    # calc descriptors
    x_train = desc_calc.transform(confs_train)
    x_test = desc_calc.transform(confs_test)
    
    # scale bags
    # NOTE(review): `+` only pools the two splits if transform() returns
    # Python lists; with numpy arrays it would add element-wise -- confirm.
    # Unlike the modelling loop, the scaler is fit on train and test together
    # because this is a descriptive statistic over all bags.
    bags = x_train + x_test
    scaler = BagMinMaxScaler()
    scaler.fit(bags)
    bags_scaled = scaler.transform(bags)
    
    # calc var
    # intra_vars (the per-bag values) is not used below; mean_inter receives
    # the single inter-bag variance returned as the third value.
    intra_vars, mean_intra, mean_inter = compute_bag_variances(bags_scaled)
    
    # save results
    # .item() converts the numpy scalars to plain Python floats.
    var_df.loc[desc_name, "intra"] = mean_intra.item()
    var_df.loc[desc_name, "inter"] = mean_inter.item()
    var_df.loc[desc_name, "ratio"] = (mean_intra / mean_inter).item()

In [None]:
var_df  # per descriptor set: mean intra-bag variance, inter-bag variance and their ratio

In [None]:
from collections import defaultdict

# Per descriptor set, collect the normalized entropy of the attention weights
# of every test bag (one weights table per bag, one column per descriptor set).
ent_dict = defaultdict(list)
for bag in w_list:
    for dsc, bag_weights in bag.items():
        ent_dict[dsc].append(normalized_entropy(bag_weights))

# Average over bags: one mean entropy per descriptor set.
ent_df = pd.DataFrame()
for k, v in ent_dict.items():
    ent_df.loc[k, "ent"] = np.mean(v)

In [None]:
# Join the variance statistics and the mean entropy per descriptor set
# (aligned on the descriptor name) and order rows by ascending intra/inter ratio.
pd.concat([var_df, ent_df], axis=1).sort_values(by="ratio")