In [4]:
# Install dependencies into the environment of the *running kernel* (%pip, not !pip).
# The extra is quoted so the shell does not treat the square brackets as a glob pattern.
%pip install gymnasium "stable_baselines3[extra]"



In [3]:
import numpy as np
import gymnasium as gym
import random

class HikingEnv(gym.Env):
    """
    Custom Gymnasium environment for a hiking insurance company.

    Every step presents one customer (a hiker). The agent either refuses the
    customer, which costs a fee that grows with every refusal, or approves the
    insurance, which earns a fixed premium plus the customer's (non-positive)
    claim payout. The payout is hidden from the agent until it has decided.

    Observation (float32 vector of length 4):
        [balance, hiker_experience, hiker_age, refusal_cost]

    Actions:
        REFUSE (0) or APPROVE (1).

    Episode end:
        The episode terminates when ``customers`` customers have been handled
        (the reward of that step is the final balance) or when the balance drops
        below zero (the reward of that step is ``BANKRUPTCY_REWARD``). Both are
        terminal states of the task, so they are reported through ``terminated``;
        ``truncated`` is always ``False``.

    Randomness:
        Customers are drawn from ``self.np_random``, so ``reset(seed=...)``
        makes an episode reproducible.
    """
    # google colab - no GUI
    metadata = {"render_modes": ["console"]}

    # Define constants for clearer code
    REFUSE = 0
    APPROVE = 1

    # Economics of the simulation
    INITIAL_BALANCE = 35          # balance at the start of every episode
    INITIAL_REFUSAL_COST = 15     # cost of the first refusal in an episode
    REFUSAL_COST_INCREMENT = 5    # every subsequent refusal is more expensive
    PREMIUM = 150                 # income for every approved customer
    LARGEST_CLAIM = 1000          # magnitude of the worst possible payout
    BANKRUPTCY_REWARD = -4000     # reward returned when the balance goes below 0

    def __init__(self, customers=10, render_mode="console"):
        """
        Args:
            customers: number of customers handled in one episode.
            render_mode: ``"console"`` prints the state in ``render``; any other
                value makes ``render`` a no-op.
        """
        super().__init__()
        self.render_mode = render_mode

        # Simulation params
        self.customers = customers
        self.step_n = 0

        # Episode params
        self.balance = self.INITIAL_BALANCE
        self.hiker_experience = 0.0
        self.hiker_age = 10
        self.refusal_cost = self.INITIAL_REFUSAL_COST

        # Secret info about the paycheck - model gets it only once it decides to insure
        self.customer_paycheck_SECRET = 0

        # Define action and observation space. They must be gym.spaces objects
        self.action_space = gym.spaces.Discrete(2)

        # Bounds are derived from the episode length so that every observation the
        # environment can emit lies inside the declared space:
        #  * refusal cost reaches at most 15 + 5 * customers (all customers refused),
        #  * balance grows by at most PREMIUM per customer,
        #  * balance is >= 0 before any step, so the lowest reachable value is one
        #    worst-case step below zero (largest claim or largest refusal fee).
        # 265 and 100000 are kept as minimum upper bounds.
        max_refusal_cost = max(
            265.0, self.INITIAL_REFUSAL_COST + self.REFUSAL_COST_INCREMENT * customers
        )
        max_balance = max(100000.0, self.INITIAL_BALANCE + self.PREMIUM * customers)
        min_balance = -max(self.LARGEST_CLAIM - self.PREMIUM, max_refusal_cost)

        # Our observation space:
        # balance - if it goes under 0 then we lose
        # HikerExperience - 0%-100%
        # HikerAge - 10-90
        # Refusal cost 15+5*x
        self.observation_space = gym.spaces.Box(
            low=np.array([min_balance, 0.0, 10.0, 15.0], dtype=np.float32),
            high=np.array([max_balance, 1.0, 90.0, max_refusal_cost], dtype=np.float32),
            dtype=np.float32,
        )

    def _get_obs(self):
        """Return the current observation as a float32 vector."""
        return np.array(
            [self.balance, self.hiker_experience, self.hiker_age, self.refusal_cost],
            dtype=np.float32,
        )

    def get_new_customer(self):
        """
        Draw the next customer and store it on the instance.

        Generates examples for the demo - in an actual scenario we would have
        historical data. Experience and age use an "unreal" uniform distribution.
        Sets ``hiker_experience``, ``hiker_age`` and ``customer_paycheck_SECRET``.
        """
        # Environment-owned generator: seeded through reset(seed=...)
        rng = self.np_random

        self.hiker_experience = round(float(rng.random()), 3)
        self.hiker_age = int(rng.integers(10, 90, endpoint=True))
        threshold = float(rng.random())

        # The more experienced (and, slightly, the older) the hiker, the more
        # likely the trip ends without any claim.
        if (self.hiker_experience * 0.6 + self.hiker_age * 0.001 + 0.0) > threshold:
            paycheck = 0
        elif rng.random() < 0.75:
            # Ordinary accident: moderate payout
            paycheck = int(rng.integers(-500, -10, endpoint=True))
        elif self.hiker_age < 20:
            # Severe accident of a very young hiker always costs the maximum
            paycheck = -self.LARGEST_CLAIM
        else:
            paycheck = int(rng.choice([0, -self.LARGEST_CLAIM]))
        self.customer_paycheck_SECRET = paycheck

    def reset(self, seed=None, options=None):
        """
        Start a new episode.

        Args:
            seed: optional seed for the environment's random generator; it is
                forwarded to ``gym.Env.reset`` so customer generation becomes
                reproducible.
            options: forwarded to ``gym.Env.reset``; not used otherwise.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed, options=options)
        # Reset values to defaults and draw the first customer
        self.step_n = 0
        self.balance = self.INITIAL_BALANCE
        self.refusal_cost = self.INITIAL_REFUSAL_COST
        self.get_new_customer()
        return self._get_obs(), {}

    def step(self, action):
        """
        Apply the decision for the current customer and draw the next one.

        Args:
            action: ``REFUSE`` (0) or ``APPROVE`` (1).

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``reward`` is
            the change of balance caused by the decision, except on the last
            step of an episode: the final balance when all customers were
            handled, or ``BANKRUPTCY_REWARD`` when the balance fell below zero
            (bankruptcy takes precedence if both happen on the same step).

        Raises:
            ValueError: if ``action`` is neither ``REFUSE`` nor ``APPROVE``.
        """
        self.step_n += 1
        if action == self.REFUSE:
            reward = -self.refusal_cost
            self.refusal_cost += self.REFUSAL_COST_INCREMENT  # Every subsequent refusal is more expensive
        elif action == self.APPROVE:
            reward = self.PREMIUM
            reward += self.customer_paycheck_SECRET  # Temporary simplification
        else:
            raise ValueError(
                f"Received invalid action={action} which is not part of the action space")

        self.balance += reward

        all_customers_served = self.step_n >= self.customers
        bankrupt = self.balance < 0
        if all_customers_served:
            reward = self.balance
        if bankrupt:
            reward = self.BANKRUPTCY_REWARD

        # Bankruptcy is a genuine terminal state of the task, not a time limit, so
        # it is signalled through `terminated`. Reporting it as `truncated` would
        # make learning algorithms bootstrap a value beyond the end of the episode.
        terminated = bool(all_customers_served or bankrupt)
        truncated = False
        info = {}

        self.get_new_customer()
        return (
            self._get_obs(),
            reward,
            terminated,
            truncated,
            info,
        )

    def render(self):
        """Print the balance and the current customer when in console mode."""
        if self.render_mode == "console":
            print(f"\t Balance: {self.balance}")
            print(f"\t Hiker's experience: {self.hiker_experience:.1%} age: {self.hiker_age:n} Probable paycheck: {self.customer_paycheck_SECRET}")

    def close(self):
        """Nothing to release."""
        pass

In [None]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env
from gymnasium.wrappers import FlattenObservation

# Experiment settings
SEED = 42                 # fixed seed so that Restart & Run All gives comparable results
N_CUSTOMERS = 25          # customers per episode
TRAIN_TIMESTEPS = 10_000  # PPO training budget

# Instantiate the envs: `env` is used for training, `vec_env` for the showcase
vec_env = make_vec_env(HikingEnv, n_envs=1, seed=SEED, env_kwargs=dict(customers=N_CUSTOMERS))
env = HikingEnv(customers=N_CUSTOMERS)
# Train the agent
model = PPO("MlpPolicy", env, seed=SEED, verbose=0).learn(TRAIN_TIMESTEPS)

# Showcase the learned behaviour
obs = vec_env.reset()
n_steps = 100
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print(f"Step {step + 1}")
    vec_env.render()
    print("\t Action: ", action)
    obs, reward, done, info = vec_env.step(action)
    # print("obs=", obs, "reward=", reward, "done=", done)
    print("\t reward=", reward, "done=", done)
    if done[0]:
        # The VecEnv resets automatically when a done signal is encountered, so
        # `obs` already holds the first observation of the next episode. Calling
        # reset() again here would silently throw that customer away.
        print("-"*20+"Episode ends!", "reward=", reward, "-"*20)

Step 1
	 Balance: 35
	 Hiker's experience: 39.6% age: 80 Probable paycheck: -232
	 Action:  [1]
	 reward= [-4000.] done= [ True]
--------------------Episode ends! reward= [-4000.] --------------------
Step 2
	 Balance: 35
	 Hiker's experience: 18.2% age: 88 Probable paycheck: -428
	 Action:  [1]
	 reward= [-4000.] done= [ True]
--------------------Episode ends! reward= [-4000.] --------------------
Step 3
	 Balance: 35
	 Hiker's experience: 75.3% age: 34 Probable paycheck: 0
	 Action:  [1]
	 reward= [150.] done= [False]
Step 4
	 Balance: 185
	 Hiker's experience: 68.1% age: 58 Probable paycheck: 0
	 Action:  [1]
	 reward= [150.] done= [False]
Step 5
	 Balance: 335
	 Hiker's experience: 55.9% age: 64 Probable paycheck: -406
	 Action:  [0]
	 reward= [-15.] done= [False]
Step 6
	 Balance: 320
	 Hiker's experience: 17.1% age: 67 Probable paycheck: -94
	 Action:  [0]
	 reward= [-20.] done= [False]
Step 7
	 Balance: 300
	 Hiker's experience: 3.0% age: 82 Probable paycheck: 0
	 Action:  [0]
	

In [None]:
# Probe the trained policy with one hand-made observation, laid out like the
# environment's observation vector: balance, experience, age, refusal cost.
probe_features = [1000.  ,  0.01, 10.  , 35.]
obs = np.array([probe_features])  # batch of one observation
action, _ = model.predict(obs, deterministic=True)
print(action)

[0]
