In [1]:
import numpy as np


In [2]:
# Presumably the intended cap on trajectory length.
# NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed.
MAX_STEPS = 20
# Denominator reward_function uses to normalise tumor size.
TUMOR_MAX = 50
# Upper clamp for side effects in the simulator/counterfactual estimator, and the
# denominator reward_function uses to normalise side effects.
SIDE_EFFECT_MAX = 15

In [None]:
# Parameters for simulation
# Seeds NumPy's legacy global RNG, which the np.random.* calls in the cells below draw from.
# NOTE(review): this cell has no execution count in the export, so the saved outputs
# may not have been produced with this seed -- confirm with Restart Kernel -> Run All.
np.random.seed(42)

In [3]:
def simulate_patient(initial_state, actions, p=5):
    """
    Roll a patient forward through a sequence of treatment actions.

    Args:
        initial_state: (tumor, side_effects) pair the rollout starts from.
        actions: iterable of numeric actions, applied in order.
        p: kept for interface compatibility; the simulation does not use it.

    Returns:
        List of (tumor, side_effects) tuples, one per applied action. The
        rollout ends right after the first step where the tumor hits 0 or the
        side effects reach SIDE_EFFECT_MAX, so the list can be shorter than
        `actions`.
    """
    current_tumor, current_side_effects = initial_state
    visited_states = []

    for dose in actions:
        # The tumor noise is drawn before the side-effect noise; keeping that
        # order keeps the global RNG stream reproducible under a fixed seed.
        raw_tumor = current_tumor - 2.5 * dose + np.random.normal(0, 0.1)
        current_tumor = raw_tumor if raw_tumor > 0 else 0  # floor at 0

        raw_side_effects = current_side_effects + 0.5 * dose + np.random.normal(0, 0.1)
        if raw_side_effects < SIDE_EFFECT_MAX:
            current_side_effects = raw_side_effects
        else:
            current_side_effects = SIDE_EFFECT_MAX  # cap at the maximum

        visited_states.append((current_tumor, current_side_effects))

        # Terminal state: tumor eliminated or side effects at the cap.
        tumor_cleared = current_tumor == 0
        toxicity_limit_hit = current_side_effects >= SIDE_EFFECT_MAX
        if tumor_cleared or toxicity_limit_hit:
            break

    return visited_states

In [4]:
def reward_function(tumor, side_effects, weights):
    """
    Weighted sum of the normalised tumor size and normalised side effects.

    Args:
        tumor: tumor size; divided by TUMOR_MAX before weighting.
        side_effects: side-effect level; divided by SIDE_EFFECT_MAX before weighting.
        weights: indexable with at least two entries; weights[0] scales the
            tumor term and weights[1] scales the side-effect term.

    Returns:
        weights[0] * (tumor / TUMOR_MAX) + weights[1] * (side_effects / SIDE_EFFECT_MAX)
    """
    tumor_weight = weights[0]
    side_effect_weight = weights[1]

    tumor_term = tumor_weight * (tumor / TUMOR_MAX)
    side_effect_term = side_effect_weight * (side_effects / SIDE_EFFECT_MAX)
    return tumor_term + side_effect_term

In [5]:
def estimate_counterfactuals(history, actions):
    """
    Mock counterfactual estimator: project the latest state one step ahead
    for each candidate action.

    The projection is deterministic (no random draws). Only the last entry of
    `history` is used.

    Args:
        history: non-empty sequence of (tumor, side_effects) pairs.
        actions: iterable of numeric candidate actions.

    Returns:
        List with one (tumor, side_effects) estimate per action, in the same
        order, with tumor floored at 0 and side effects capped at
        SIDE_EFFECT_MAX.
    """
    last_tumor, last_side_effects = history[-1]

    estimates = []
    for candidate in actions:
        projected_tumor = max(0, last_tumor - 2.0 * candidate)
        projected_side_effects = min(SIDE_EFFECT_MAX, last_side_effects + 0.4 * candidate)
        estimates.append((projected_tumor, projected_side_effects))
    return estimates

In [None]:
# Batch IRL Framework
class CIRL:
    """
    Toy batch inverse-RL helper operating on (tumor, side_effects) trajectories.

    Attributes:
        discount_factor: per-step discount applied when accumulating features.
    """

    def __init__(self, discount_factor=0.99):
        self.discount_factor = discount_factor

    def compute_feature_expectations(self, trajectories, weights):
        """
        Average discounted feature sum over a batch of trajectories.

        For each trajectory the raw features [tumor, side_effects] at step t
        are scaled by discount_factor ** t and summed; the per-trajectory sums
        are then averaged over the batch.

        Args:
            trajectories: non-empty sequence of trajectories, each a sequence
                of (tumor, side_effects) pairs.
            weights: accepted for interface compatibility. Feature
                expectations depend only on the visited states, so this
                argument does not influence the result.

        Returns:
            np.ndarray of shape (2,): [tumor expectation, side-effect expectation].

        Raises:
            ValueError: if `trajectories` is empty (the average is undefined;
                previously this silently produced NaN via division by zero).
        """
        n_trajectories = len(trajectories)
        if n_trajectories == 0:
            raise ValueError("trajectories must contain at least one trajectory")

        feature_expectations = np.zeros(2)
        for trajectory in trajectories:
            for t, (tumor, side_effects) in enumerate(trajectory):
                feature_expectations += (self.discount_factor ** t) * np.array([tumor, side_effects])
        return feature_expectations / n_trajectories

    def max_margin_irl(self, trajectories, expert_weights, n_iter=10):
        """
        Iteratively adjust reward weights toward the expert's feature expectations.

        Args:
            trajectories: non-empty sequence of trajectories (see
                compute_feature_expectations).
            expert_weights: reward weights of the expert.
            n_iter: number of update iterations.

        Returns:
            np.ndarray of shape (2,) with the learned weights.

        Raises:
            ValueError: if `trajectories` is empty.

        NOTE(review): both the expert and the "current policy" feature
        expectations are computed from the same `trajectories`, and
        compute_feature_expectations does not depend on the weights. The
        update term is therefore always zero and the returned weights equal
        the random initialisation. Making this learn requires feature
        expectations from rollouts of a policy induced by `learned_weights`.
        """
        n_features = 2  # Tumor size and side effects
        learned_weights = np.random.uniform(-1, 1, n_features)

        # The expert's feature expectations do not change between iterations,
        # so compute them once instead of on every pass.
        expert_feature_expectations = self.compute_feature_expectations(trajectories, expert_weights)

        for _ in range(n_iter):
            # Estimate feature expectations for the current policy
            current_feature_expectations = self.compute_feature_expectations(trajectories, learned_weights)

            # Move the weights along the feature-expectation gap (step size 0.1)
            learned_weights += 0.1 * (expert_feature_expectations - current_feature_expectations)
        return learned_weights

    


In [10]:
# Simulate data
# 100 initial states, each (tumor ~ U(30, 50), side_effects ~ U(2, 10)); the tumor value
# is drawn before the side-effect value for every patient.
initial_states = [(np.random.uniform(30, 50), np.random.uniform(2, 10)) for _ in range(100)]
# A single sequence of 100 actions, each drawn from {-1, 1}.
# NOTE(review): this one sequence is reused for every patient below, and its length is the
# literal 100 rather than MAX_STEPS -- confirm both are intended.
actions = [np.random.choice([-1, 1]) for _ in range(100)]
# One rollout per initial state; a trajectory can be shorter than len(actions) because
# simulate_patient stops at its termination condition.
trajectories = [simulate_patient(state, actions) for state in initial_states]

In [11]:
# Print the simulated inputs and rollouts for inspection.
# NOTE(review): this dumps every state, action and trajectory in full; showing a short
# slice (e.g. the first few entries and the lengths) would keep the output readable.
print(f'initial_states: {initial_states}\nactions: {actions}\ntrajectories: {trajectories}')

initial_states: [(43.68843919714433, 9.082777432209674), (44.260884143372635, 8.760531161491741), (33.22070262212187, 6.532258667065674), (31.855659273000423, 6.7091008647576), (47.02858025636044, 7.845175782833733), (36.198153363346236, 3.311359959396807), (38.42158072716884, 5.657638330004279), (44.66734345537759, 3.439570722130286), (38.93434574473787, 2.6092049819607324), (43.90532704278573, 9.516172653656525), (32.44433238272561, 2.567066712514748), (35.71146326712474, 9.65578572713006), (37.990814539803466, 5.889082169611854), (45.25090696721019, 5.234624590466546), (33.37504943708036, 2.4872453363555396), (36.43283852331111, 2.3378913201815035), (34.704764292168676, 3.2596754429940553), (41.97221114076282, 5.944717243847362), (34.98271277129385, 3.4117496221132946), (34.752454656742884, 4.996648345151111), (33.564534809541236, 6.486876428463443), (41.028252473200304, 6.363433847219289), (48.50587930827133, 9.297747390569706), (33.62156934923436, 6.0177826286074945), (45.58094424

In [13]:
# Expert weights (known for simulation)
# Both weights are negative, so under reward_function a larger tumor or more side effects
# lowers the reward.
expert_weights = np.array([-0.7, -0.3])

# CIRL Training
cirl = CIRL(discount_factor=0.99)
# NOTE(review): max_margin_irl derives both the expert and the current feature expectations
# from this same `trajectories` batch, so the update term is zero and the result is the
# random initialisation -- which is why the printed learned weights do not resemble the
# expert weights.
learned_weights = cirl.max_margin_irl(trajectories, expert_weights, n_iter=50)

print("Expert Weights:", expert_weights)
print("Learned Weights:", learned_weights)

Expert Weights: [-0.7 -0.3]
Learned Weights: [0.6822616  0.97050079]
