<a href="https://colab.research.google.com/github/HarshithaGM01/customRLEnv/blob/main/CustomRLEnv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys, platform, random
import numpy as np

# Environment report: record interpreter version and OS alongside the results.
print(f"Python: {sys.version.split()[0]}")
print(f"Platform: {platform.platform()}")

# One shared seed for both global RNGs (legacy NumPy and the stdlib).
SEED_271 = 271
np.random.seed(SEED_271)
random.seed(SEED_271)


Python: 3.12.12
Platform: Linux-6.6.105+-x86_64-with-glibc2.35


In [2]:
!pip -q install "gymnasium>=0.29" "stable-baselines3>=2.2.1" "shimmy>=1.3.0"


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/188.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m184.3/188.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.0/188.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np


In [10]:
class LongHorizonDecisionEnv(gym.Env):
    """Toy long-horizon decision environment with delayed payoffs.

    Observation: float32 vector of five values, each kept within [0, 1]:
    ``[skill, energy, resources, confidence, market_noise]``.

    Actions (``Discrete(5)``):
        0: invest in skill (costs resources and energy now, raises skill)
        1: exploit current skill (gains resources, drains energy)
        2: rest / recover (restores energy, small resource cost)
        3: take a risky opportunity (success probability grows with skill)
        4: play safe (small reliable resource gain)

    An episode terminates when resources or energy drop to 0.02 or below and
    is truncated once ``max_steps`` steps have been taken.
    """

    metadata = {"render_modes": []}

    def __init__(self, max_steps=200, seed=271):
        """Create the environment.

        Args:
            max_steps: number of steps after which an episode is truncated.
            seed: seed for the environment's own NumPy generator, which drives
                the initial state and all stochastic dynamics.
        """
        super().__init__()

        self.max_steps = max_steps
        self.current_step = 0

        self._rng = np.random.default_rng(seed)

        # Observation: [skill, energy, resources, confidence, market_noise]
        self.observation_space = spaces.Box(
            low=0.0,
            high=1.0,
            shape=(5,),
            dtype=np.float32,
        )

        # Actions 0..4, see the class docstring for their meaning.
        self.action_space = spaces.Discrete(5)

        # Populated by reset(); None means no episode has been started yet.
        self.state = None

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: if given, the environment's generator is re-created from it,
                so the episode that follows is reproducible.
            options: accepted for API compatibility; not used.

        Returns:
            A copy of the initial float32 observation and an empty info dict.
        """
        super().reset(seed=seed)

        if seed is not None:
            self._rng = np.random.default_rng(seed)

        self.current_step = 0

        # Draw order matters for reproducibility; keep it fixed.
        skill = self._rng.uniform(0.1, 0.3)
        energy = self._rng.uniform(0.6, 0.9)
        resources = self._rng.uniform(0.4, 0.7)
        confidence = self._rng.uniform(0.3, 0.6)
        market_noise = self._rng.uniform(0.0, 1.0)

        self.state = np.array(
            [skill, energy, resources, confidence, market_noise],
            dtype=np.float32,
        )

        # Hand out a copy so in-place edits by the caller cannot alter the
        # environment's internal state.
        return self.state.copy(), {}

    def step(self, action):
        """Apply ``action`` and advance the environment by one step.

        Args:
            action: one of 0..4 (see the class docstring).

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``info`` holds the individual state values and reward components.

        Raises:
            RuntimeError: if called before ``reset()``.
            ValueError: if ``action`` is not one of the five valid actions.
        """
        if self.state is None:
            raise RuntimeError("step() called before reset(); call reset() first.")

        # Validate before touching the RNG so a rejected call has no side effects.
        if action not in range(int(self.action_space.n)):
            raise ValueError(f"Invalid action: {action}")

        skill, energy, resources, confidence, market_noise = self.state.astype(np.float32)

        # --- dynamics knobs (tunable later) ---
        invest_cost = 0.04
        exploit_cost_energy = 0.05
        exploit_gain_res = 0.06

        rest_gain_energy = 0.10
        rest_cost_res = 0.02

        safe_gain_res = 0.03
        safe_cost_energy = 0.01

        risky_cost_energy = 0.06
        risky_cost_res = 0.03
        risky_success_base = 0.20  # base success prob
        risky_success_skill_boost = 0.60  # extra prob from skill

        # Market noise is redrawn each step (stochastic world); the previous
        # value from the state is discarded.
        market_noise = float(self._rng.uniform(0.0, 1.0))

        # Track reward components for analysis
        immediate_reward = 0.0

        # --- apply action ---
        if action == 0:
            # invest in skill: slow payoff, upfront cost
            resources -= invest_cost
            energy -= 0.02
            skill += 0.03 + 0.01 * confidence
            confidence += 0.01
            immediate_reward -= 0.01  # tiny immediate penalty (delayed reward theme)

        elif action == 1:
            # exploit current skill: immediate gain but drains energy
            energy -= exploit_cost_energy
            gain = exploit_gain_res * (0.5 + skill) * (0.7 + 0.3 * market_noise)
            resources += gain
            confidence += 0.01
            immediate_reward += 0.5 * gain  # partial immediate reward

        elif action == 2:
            # rest: recovers energy, small cost to resources (time passes)
            energy += rest_gain_energy
            resources -= rest_cost_res
            confidence += 0.005
            immediate_reward -= 0.005

        elif action == 3:
            # risky opportunity: success depends on skill + randomness
            energy -= risky_cost_energy
            resources -= risky_cost_res

            p_success = risky_success_base + risky_success_skill_boost * skill
            p_success = float(np.clip(p_success, 0.0, 0.95))
            success = (self._rng.random() < p_success)

            if success:
                big_gain = 0.20 + 0.25 * market_noise + 0.10 * confidence
                resources += big_gain
                confidence += 0.04
                immediate_reward += big_gain
            else:
                confidence -= 0.03
                immediate_reward -= 0.03

        else:
            # action == 4 (guaranteed by the validation above)
            # play safe: small reliable gain, minimal drain
            resources += safe_gain_res * (0.8 + 0.2 * market_noise)
            energy -= safe_cost_energy
            confidence += 0.002
            immediate_reward += 0.01

        # --- natural dynamics / constraints ---
        # burnout effect: if energy too low, confidence and resources suffer
        # (checked on the unclamped energy value)
        if energy < 0.15:
            immediate_reward -= 0.05
            confidence -= 0.02
            resources -= 0.02

        # clamp to [0,1]
        skill = float(np.clip(skill, 0.0, 1.0))
        energy = float(np.clip(energy, 0.0, 1.0))
        resources = float(np.clip(resources, 0.0, 1.0))
        confidence = float(np.clip(confidence, 0.0, 1.0))

        self.state = np.array(
            [skill, energy, resources, confidence, market_noise],
            dtype=np.float32,
        )

        # --- reward ---
        # growth term rewards resources and skill; the burnout term only
        # penalizes when clamped energy is below 0.25; the per-action
        # immediate reward is mixed in with a small weight.
        burnout_penalty = max(0.0, 0.25 - energy)
        growth_reward = 0.6 * resources + 0.4 * skill
        reward = float(growth_reward - 0.8 * burnout_penalty + 0.2 * immediate_reward)

        self.current_step += 1

        # terminate if resources depleted badly (collapse) or extreme burnout
        terminated = bool(resources <= 0.02 or energy <= 0.02)

        # truncate if time horizon reached
        truncated = bool(self.current_step >= self.max_steps)

        info = {
            "skill": skill,
            "energy": energy,
            "resources": resources,
            "confidence": confidence,
            "market_noise": market_noise,
            "burnout_penalty": burnout_penalty,
            "immediate_reward": immediate_reward,
            "growth_reward": growth_reward,
        }

        # Return a copy so callers cannot mutate the internal state in place.
        return self.state.copy(), reward, terminated, truncated, info



In [11]:
# Create env + run a few random steps to sanity check
env_271 = LongHorizonDecisionEnv(max_steps=20, seed=SEED_271)

# action_space.sample() draws from the space's own RNG, which the env seed
# does not touch; seed it as well so the sampled actions are reproducible.
env_271.action_space.seed(SEED_271)

obs, info = env_271.reset()
print("Reset obs:", obs)

for t in range(5):
    a = env_271.action_space.sample()
    obs, reward, terminated, truncated, info = env_271.step(a)
    print(f"\nStep {t+1}")
    print("  action:", a)
    print("  obs:", obs)
    print("  reward:", reward)
    print("  terminated:", terminated, "| truncated:", truncated)
    print("  info (key fields):",
          {k: round(info[k], 3) for k in ["skill","energy","resources","confidence","burnout_penalty"]})
    # Stop early if the episode ended within the five sampled steps.
    if terminated or truncated:
        break


Reset obs: [0.2563959  0.8538769  0.5808103  0.3747876  0.19264777]

Step 1
  action: 4
  obs: [0.2563959 0.8438769 0.6078727 0.3767876 0.5104025]
  reward: 0.4692819972038269
  terminated: False | truncated: False
  info (key fields): {'skill': 0.256, 'energy': 0.844, 'resources': 0.608, 'confidence': 0.377, 'burnout_penalty': 0.0}

Step 2
  action: 1
  obs: [0.2563959  0.7938769  0.6438893  0.3867876  0.31200263]
  reward: 0.4924936294555664
  terminated: False | truncated: False
  info (key fields): {'skill': 0.256, 'energy': 0.794, 'resources': 0.644, 'confidence': 0.387, 'burnout_penalty': 0.0}

Step 3
  action: 3
  obs: [0.2563959  0.7338769  0.922181   0.42678759 0.27845174]
  reward: 0.7175253033638
  terminated: False | truncated: False
  info (key fields): {'skill': 0.256, 'energy': 0.734, 'resources': 0.922, 'confidence': 0.427, 'burnout_penalty': 0.0}

Step 4
  action: 1
  obs: [0.2563959  0.6838769  0.9609017  0.43678758 0.51061034]
  reward: 0.6829714179039001
  terminate