Necessary libraries

In [12]:
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

Define the actions and simulation parameters

In [13]:
# Constants representing actions
ACTION_BACK = 0
ACTION_END = 1

# Simulation parameters
RUNS = 10
EPISODES = 10 ** 6

Create the Agent and Environment classes to encapsulate the agent's policies and the environment's dynamics.

In [14]:
class Agent:
    def behavior_policy(self):
        # Randomly choose ACTION_BACK or ACTION_END with equal probability
        return np.random.choice([ACTION_BACK, ACTION_END])

    def target_policy(self):
        # Always choose ACTION_BACK
        return ACTION_BACK

In [15]:
class Environment:
    def step(self, action):
        # If the action is ACTION_END, the episode ends without reward
        if action == ACTION_END:
            return 0, True
        # With 10% probability, receive a reward and end the episode
        if np.random.rand() < 0.1:
            return 1, True
        else:
            # Continue the episode without reward
            return 0, False

Define helper functions for computing the importance sampling ratio and simulating episodes.

In [16]:
def compute_importance_ratio(trajectory):
    # If ACTION_END is in the trajectory, the target policy probability is zero
    if ACTION_END in trajectory:
        return 0
    # Importance ratio is (1/0.5)^len(trajectory) = 2^len(trajectory)
    return 2 ** len(trajectory)

In [17]:
def simulate_episode(agent, env):
    trajectory = []
    done = False
    while not done:
        action = agent.behavior_policy()
        trajectory.append(action)
        reward, done = env.step(action)
    rho = compute_importance_ratio(trajectory)
    return reward, trajectory, rho

Implement functions to perform Ordinary Importance Sampling and Weighted Importance Sampling

In [18]:
def run_ordinary_importance_sampling(episodes):
    rewards = []
    for _ in range(episodes):
        agent = Agent()
        env = Environment()
        reward, trajectory, rho = simulate_episode(agent, env)
        rewards.append(rho * reward)
    cumulative_rewards = np.cumsum(rewards)
    estimations = cumulative_rewards / np.arange(1, episodes + 1)
    return estimations

In [19]:
def run_weighted_importance_sampling(episodes):
    cumulative_reward = 0
    cumulative_rho = 0
    estimations = []
    for _ in range(episodes):
        agent = Agent()
        env = Environment()
        reward, trajectory, rho = simulate_episode(agent, env)
        cumulative_reward += rho * reward
        cumulative_rho += rho
        estimation = cumulative_reward / cumulative_rho if cumulative_rho != 0 else 0
        estimations.append(estimation)
    return estimations

Create a function to perform the simulations across multiple runs

In [20]:
def perform_simulation(runs, episodes):
    ois_estimations = []
    wis_estimations = []
    for _ in range(runs):
        ois_estimation = run_ordinary_importance_sampling(episodes)
        wis_estimation = run_weighted_importance_sampling(episodes)
        ois_estimations.append(ois_estimation)
        wis_estimations.append(wis_estimation)
    return ois_estimations, wis_estimations

Define a function to plot the estimations from both OIS and WIS with labels

In [21]:
def plot_estimations(ois_estimations, wis_estimations, episodes):
    plt.figure(figsize=(10, 6))
    x_values = np.arange(1, episodes + 1)
    # Plot OIS estimations
    for idx, estimation in enumerate(ois_estimations):
        if idx == 0:
            plt.plot(x_values, estimation, color='blue', linewidth=1, label='OIS')
        else:
            plt.plot(x_values, estimation, color='blue', linewidth=1)
    # Plot WIS estimations
    for idx, estimation in enumerate(wis_estimations):
        if idx == 0:
            plt.plot(x_values, estimation, color='red', linewidth=1, label='WIS')
        else:
            plt.plot(x_values, estimation, color='red', linewidth=1)
    plt.xlabel('Episodes (log scale)')
    plt.ylabel('Importance Sampling Estimates')
    plt.xscale('log')
    plt.xlim([1, episodes])
    plt.legend()
    plt.savefig('figure_5_4.png')
    plt.close()

Run the simulations and plot the results

In [22]:
def main():
    ois_estimations, wis_estimations = perform_simulation(RUNS, EPISODES)
    plot_estimations(ois_estimations, wis_estimations, EPISODES)

if __name__ == '__main__':
    main()

  plt.savefig('figure_5_4.png')
