In [1]:
import numpy as np


Step 1: Compute Expert’s Feature Expectations 
For each trajectory, compute the expected outcomes using the feature map (

Example: The feature map might capture tumor reduction and side effects:

$\phi(h_t, a_t)$ =[expected tumor size, side effect severity]

Aggregate these over the trajectories to compute the expert's feature expectations.

Step 2: Initialize Random Reward Weights ($w_0$)

Step 3: Compute the Initial Policy

Step 4: Iteratively Improve the Reward Weights

In [2]:

def feature_map(history, action):
    """
    Computes the feature map φ(h_t, a_t) = E[Y_{t+1}[a_t] | h_t]
    Placeholder: You need to define your own feature computation.
    """
    # Example: Placeholder for feature computation
    return np.random.rand(3)  # Example: 3-dimensional feature vector


def compute_feature_expectations(data, policy, feature_map_fn):
    """
    Computes feature expectations μ^π for a given policy π.
    :param data: Dataset of trajectories.
    :param policy: Current policy.
    :param feature_map_fn: Function to compute feature map φ(h_t, a_t).
    :return: Feature expectations μ^π.
    """
    feature_expectations = np.zeros(feature_map_fn(None, None).shape)
    for trajectory in data:
        for history, action in trajectory:
            feature_expectations += feature_map_fn(history, action) * policy(history, action)
    return feature_expectations / len(data)



In [3]:

def orthogonal_projection(mu_pi_e, mu_bar_prev, mu_pi_k):
    """
    Orthogonally project μ^π_E onto the line through μ̄_{k-1} and μ^π_k.
    :param mu_pi_e: Expert feature expectations.
    :param mu_bar_prev: Previous projection μ̄_{k-1}.
    :param mu_pi_k: Current feature expectations μ^π_k.
    :return: Updated μ̄_k.
    """
    direction = mu_pi_k - mu_bar_prev
    t = np.dot(mu_pi_e - mu_bar_prev, direction) / np.dot(direction, direction)
    return mu_bar_prev + t * direction, t



In [16]:
def batch_max_margin_cirl(data, feature_map_fn, max_iterations, epsilon):
    """
    Implements the Batch Max-Margin CIRL algorithm.
    :param data: Batch dataset D of trajectories.
    :param feature_map_fn: Function to compute feature map φ(h_t, a_t).
    :param max_iterations: Maximum number of iterations.
    :param epsilon: Convergence threshold.
    :return: Final reward function, policies Π, and feature expectations Δ.
    """
    # Initialize variables
    mu_pi_e = compute_feature_expectations(data, lambda h, a: 1, feature_map_fn)  # Expert's feature expectations
    w = np.random.rand(mu_pi_e.shape[0])  # Random initial reward weights
    pi_0 = lambda h, a: 1  # Initial policy (Uniform policy for example)
    mu_pi_0 = compute_feature_expectations(data, pi_0, feature_map_fn)
    policies = [pi_0]
    feature_expectations = [mu_pi_0]
    mu_bar = mu_pi_0  # μ̄_0

    for k in range(1, max_iterations + 1):
        # Compute reward function R_k = w_k · φ(h, a)
        R_k = lambda h, a: np.dot(w, feature_map_fn(h, a))
        
        # Derive optimal policy π_k based on R_k (e.g., via RL)
        pi_k = lambda h, a: np.random.choice([0, 1])  # Placeholder policy
        mu_pi_k = compute_feature_expectations(data, pi_k, feature_map_fn)

        # Update policies and feature expectations
        policies.append(pi_k)
        feature_expectations.append(mu_pi_k)

        # Orthogonal projection
        mu_bar, t = orthogonal_projection(mu_pi_e, mu_bar, mu_pi_k)

        # Update reward weights
        w = mu_pi_e - mu_bar

        # Check for convergence
        if t < epsilon:
            break

    # Find optimal reward
    K = np.argmin([np.linalg.norm(mu_pi_e - mu) for mu in feature_expectations])
    final_reward = lambda h, a: np.dot(w, feature_map_fn(h, a))

    return final_reward, policies, feature_expectations


In [17]:
# Example dataset: List of trajectories (history, action)
dataset = [
    [(np.array([1, 2, 3]), 0), (np.array([2, 3, 4]), 1)],
    [(np.array([3, 4, 5]), 1), (np.array([4, 5, 6]), 0)],
]


In [18]:
print(f'trajectories: {dataset}')

trajectories: [[(array([1, 2, 3]), 0), (array([2, 3, 4]), 1)], [(array([3, 4, 5]), 1), (array([4, 5, 6]), 0)]]


In [19]:


# Run Batch Max-Margin CIRL
reward, policies, feature_expectations = batch_max_margin_cirl(
    data=dataset,
    feature_map_fn=feature_map,
    max_iterations=10,
    epsilon=1e-3
)

print("Final Reward Function Weights:", reward)
print("Policies:", policies)
print("Feature Expectations:", feature_expectations)


Final Reward Function Weights: <function batch_max_margin_cirl.<locals>.<lambda> at 0x000001E24F575620>
Policies: [<function batch_max_margin_cirl.<locals>.<lambda> at 0x000001E24F576200>, <function batch_max_margin_cirl.<locals>.<lambda> at 0x000001E24F5762A0>, <function batch_max_margin_cirl.<locals>.<lambda> at 0x000001E24F574860>]
Feature Expectations: [array([1.07027557, 1.06531077, 0.44895422]), array([0.35799618, 0.12431546, 0.47416783]), array([0.19498702, 0.16758535, 0.41970448])]
