In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys

# ============================================================================
# STEP 1: Apply Compatibility Patches
# ============================================================================

def apply_compatibility_patches() -> None:
    """Apply necessary compatibility patches for gym/gymnasium compatibility."""
    print("Applying compatibility patches...")
    print("OK Compatibility patches applied\n")

apply_compatibility_patches()

# ============================================================================
# STEP 2: Import Main Libraries
# ============================================================================

print("Importing libraries...")

# Clean uninstall
! pip uninstall -y gym gymnasium numpy minari d3rlpy scipy scikit-learn 2>/dev/null

# Install ONLY compatible versions
!pip install -q 'numpy==1.26.4'
!pip install -q 'scipy>=1.6.0'
!pip install -q 'scikit-learn>=1.5.0,<1.8'
!pip install -q d3rlpy
!pip install -q minari

# NOW import the packages AFTER they're installed
import gymnasium
import gym
from typing import Optional, Tuple, Dict, List, Any, Callable
import time
import numpy as np
import shutil
import os

try:
    import d3rlpy
    import minari
    print("OK All imports successful\n")
    print(f"DEBUG:  Minari version after (re)install: {minari.__version__}")
    print(f"DEBUG: d3rlpy version:  {d3rlpy.__version__}")
except Exception as e: 
    print(f"ERROR Import error: {e}")
    import traceback
    traceback.print_exc()
    raise


Applying compatibility patches...
OK Compatibility patches applied

Importing libraries...
Found existing installation: gym 0.26.2
Uninstalling gym-0.26.2:
  Successfully uninstalled gym-0.26.2
Found existing installation: gymnasium 1.0.0
Uninstalling gymnasium-1.0.0:
  Successfully uninstalled gymnasium-1.0.0
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: minari 0.5.3
Uninstalling minari-0.5.3:
  Successfully uninstalled minari-0.5.3
Found existing installation: d3rlpy 2.8.1
Uninstalling d3rlpy-2.8.1:
  Successfully uninstalled d3rlpy-2.8.1
Found existing installation: scipy 1.16.3
Uninstalling scipy-1.16.3:
  Successfully uninstalled scipy-1.16.3
Found existing installation: scikit-learn 1.7.2
Uninstalling scikit-learn-1.7.2:
  Successfully uninstalled scikit-learn-1.7.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviou

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


OK All imports successful

DEBUG:  Minari version after (re)install: 0.5.3
DEBUG: d3rlpy version:  2.8.1


In [2]:
# ============================================================================
# MOCK T1D ENVIRONMENT (Copied from previous cell to make this cell self-contained)
# ============================================================================

class MockT1DEnv:
    """Mock Type 1 Diabetes environment for offline RL training."""

    def __init__(self):
        """Initialize mock environment."""
        self.current_glucose = 120.0
        self.time_step = 0
        self.max_steps = 480

    def reset(self):
        """Reset the environment."""
        min_glucose = 100.0
        max_glucose = 150.0
        self.current_glucose = np.random.uniform(min_glucose, max_glucose)
        self.time_step = 0
        return self.current_glucose

    def step(self, action):
        """Step the environment."""
        action = float(action)
        baseline = 15.0
        mean_noise = 0.0
        std_noise = 5.0
        noise = np.random.normal(mean_noise, std_noise)
        factor = 0.5
        delta = (action - baseline) * factor + noise
        self.current_glucose = self.current_glucose + delta
        self.current_glucose = np.clip(self.current_glucose, 40.0, 300.0)

        self.time_step = self.time_step + 1
        done = self.time_step >= self.max_steps

        if self.current_glucose < 70.0:
            reward = -1.0
        elif self.current_glucose > 180.0:
            reward = -0.5
        else:
            reward = 1.0

        info = {'glucose': self.current_glucose}

        return self.current_glucose, reward, done, info

    def close(self):
        """Close environment."""
        pass


# ============================================================================
# SIMGLUCOSE ENVIRONMENT WRAPPER - DIRECT INSTANTIATION (Now wraps MockT1DEnv)
# ============================================================================

class SimglucoseGymEnv(gymnasium.Env):
    """
    Gymnasium-compatible wrapper for SimGlucose T1DSimEnv.
    Now directly instantiates MockT1DEnv due to simglucose dependency issues.
    """

    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(
        self,
        patient_name: str = 'adolescent#001',
        seed: Optional[int] = None,
        render_mode: Optional[str] = None
    ):
        """Initialize the SimglucoseGymEnv."""
        super().__init__() # Removed seed=seed here

        self.render_mode = render_mode
        self.patient_name = patient_name
        self._episode_steps = 0
        self._max_episode_steps = 480
        self._episode_rewards = []
        self._last_obs = None

        # Use MockT1DEnv instead of T1DSimEnv due to dependency conflicts
        self.env = MockT1DEnv()
        print(f"OK Successfully initialized MockT1DEnv (instead of T1DSimEnv due to compatibility issues).")

        self.action_space = gymnasium.spaces.Box(
            low=np.float32(0.0),
            high=np.float32(30.0),
            shape=(1,),
            dtype=np.float32
        )

        self.observation_space = gymnasium.spaces.Box(
            low=np.float32(0.0),
            high=np.float32(1000.0),
            shape=(1,),
            dtype=np.float32
        )

        # Ensure reproducibility for internal random operations if seed is provided
        if seed is not None:
            np.random.seed(seed)

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
        """Perform one step in the environment."""
        if isinstance(action, np.ndarray):
            scalar_action = float(action[0]) if action.size == 1 else float(action)
        else:
            scalar_action = float(action)

        try:
            observation, reward, done, info = self.env.step(scalar_action)

        except Exception as e:
            print(f"ERROR in step: {e}")
            import traceback
            traceback.print_exc()
            raise

        self._episode_steps += 1
        self._episode_rewards.append(float(reward))

        if observation is None:
            observation = self._last_obs if self._last_obs is not None else np.array([0.0], dtype=np.float32)
        else:
            if not isinstance(observation, np.ndarray):
                observation = np.array([float(observation)], dtype=np.float32)
            else:
                if observation.ndim == 0:
                    observation = np.array([float(observation)], dtype=np.float32)
                elif observation.shape == (1,):
                    observation = observation.astype(np.float32)
                else:
                    observation = np.array([float(observation.flat[0])], dtype=np.float32)
            self._last_obs = observation.copy()

        truncated = self._episode_steps >= self._max_episode_steps

        return observation, float(reward), bool(done), truncated, info

    def reset(
        self,
        seed: Optional[int] = None,
        options: Optional[dict] = None
    ) -> Tuple[np.ndarray, dict]:
        """Reset the environment."""
        super().reset(seed=seed)

        try:
            # If a seed is provided to reset, ensure MockT1DEnv uses it for reproducibility
            if seed is not None:
                np.random.seed(seed)
            observation = self.env.reset()

        except Exception as e:
            print(f"ERROR in reset: {e}")
            import traceback
            traceback.print_exc()
            raise

        if observation is None:
            observation = np.array([0.0], dtype=np.float32)
        else:
            if not isinstance(observation, np.ndarray):
                observation = np.array([float(observation)], dtype=np.float32)
            else:
                if observation.ndim == 0:
                    observation = np.array([float(observation)], dtype=np.float32)
                elif observation.shape == (1,):
                    observation = observation.astype(np.float32)
                else:
                    observation = np.array([float(observation.flat[0])], dtype=np.float32)

        self._last_obs = observation.copy()
        self._episode_steps = 0
        self._episode_rewards = []

        return observation, {}

    def render(self) -> Optional[Any]:
        """Render the environment (if applicable)."""
        return None

    def close(self) -> None:
        """Close the environment and cleanup resources."""
        if hasattr(self, 'env'):
            self.env.close()


# ============================================================================
# ENVIRONMENT SETUP AND TESTING
# ============================================================================

def setup_simglucose_environment(
    patient_name: str = 'adolescent#001',
    seed: int = 42
) -> SimglucoseGymEnv:
    """Setup and initialize a SimGlucose environment."""
    print("Initializing SimglucoseGymEnv...")
    env = SimglucoseGymEnv(patient_name=patient_name, seed=seed)
    print("OK SimglucoseGymEnv initialized successfully!")
    return env


def test_environment(env: SimglucoseGymEnv, n_steps: int = 5) -> None:
    """Test the environment with random actions."""
    print("\n" + "=" * 80)
    print("Testing Environment with Random Actions")
    print("=" * 80)

    obs, info = env.reset(seed=42)
    print(f"\nOK Reset successful!")
    print(f"Initial Observation: {obs}")

    print("\n--- Testing Steps ---")
    episode_rewards = []

    for i in range(n_steps):
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        episode_rewards.append(reward)

        print(f"Step {i + 1}: obs={obs[0]:.2f}, reward={reward:.4f}")

        if terminated or truncated:
            print("  Episode ended!")
            break

    print(f"\n--- Episode Summary ---")
    print(f"  Total Steps: {len(episode_rewards)}")
    print(f"  Total Return: {sum(episode_rewards):.4f}")
    print(f"  Average Reward: {np.mean(episode_rewards):.4f}")
    print("\n" + "=" * 80)


def define_behavior_policy(observation: np.ndarray, env: SimglucoseGymEnv) -> np.ndarray:
    """Simple random behavior policy for data collection."""
    return env.action_space.sample()

In [3]:
# ============================================================================
# MOCK T1D ENVIRONMENT
# ============================================================================

class MockT1DEnv: 
    """Mock Type 1 Diabetes environment for offline RL training."""

    def __init__(self):
        """Initialize mock environment."""
        self. current_glucose = 120.0
        self.time_step = 0
        self. max_steps = 480

    def reset(self) -> float:
        """Reset the environment and return initial observation."""
        min_glucose = 100.0
        max_glucose = 150.0
        self.current_glucose = np.random.uniform(min_glucose, max_glucose)
        self.time_step = 0
        return self.current_glucose

    def step(self, action: float) -> Tuple[float, float, bool, Dict[str, Any]]:
        """
        Step the environment. 
        
        Args:
            action:  Insulin dosage (0.0 to 30.0)
        
        Returns:
            observation, reward, done, info
        """
        action = float(action)
        baseline = 15.0
        mean_noise = 0.0
        std_noise = 5.0
        noise = np.random.normal(mean_noise, std_noise)
        factor = 0.5
        delta = (action - baseline) * factor + noise
        self.current_glucose = self.current_glucose + delta
        self.current_glucose = np.clip(self.current_glucose, 40.0, 300.0)

        self.time_step += 1
        done = self.time_step >= self.max_steps

        if self.current_glucose < 70.0:
            reward = -1.0
        elif self.current_glucose > 180.0:
            reward = -0.5
        else:
            reward = 1.0

        info = {'glucose': self.current_glucose}

        return self.current_glucose, reward, done, info

    def close(self) -> None:
        """Close environment and cleanup resources."""
        pass


# ============================================================================
# SIMGLUCOSE ENVIRONMENT WRAPPER
# ============================================================================

class SimglucoseGymEnv(gymnasium.Env):
    """
    Gymnasium-compatible wrapper for SimGlucose T1D environment.
    Uses MockT1DEnv as the underlying simulation.
    """

    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(
        self,
        patient_name: str = 'adolescent#001',
        seed: Optional[int] = None,
        render_mode: Optional[str] = None,
        verbose: bool = False
    ):
        """
        Initialize the SimglucoseGymEnv.
        
        Args:
            patient_name: Name of patient profile (default: adolescent#001)
            seed: Random seed for reproducibility
            render_mode: Render mode (not used, kept for compatibility)
            verbose:  Whether to print initialization messages
        """
        super().__init__()

        self.render_mode = render_mode
        self.patient_name = patient_name
        self.verbose = verbose
        self._episode_steps = 0
        self._max_episode_steps = 480
        self._episode_rewards = []
        self._last_obs = None

        # Initialize mock environment
        self. env = MockT1DEnv()
        if self.verbose:
            print(f"OK Successfully initialized MockT1DEnv")

        # Define action and observation spaces
        self.action_space = gymnasium.spaces.Box(
            low=np.float32(0.0),
            high=np.float32(30.0),
            shape=(1,),
            dtype=np.float32
        )

        self.observation_space = gymnasium.spaces.Box(
            low=np.float32(0.0),
            high=np.float32(1000.0),
            shape=(1,),
            dtype=np.float32
        )

        # Set seed if provided
        if seed is not None:
            self.seed(seed)

    def seed(self, seed: Optional[int] = None) -> List[int]:
        """Set random seed for reproducibility."""
        if seed is not None:
            np.random.seed(seed)
            return [seed]
        return []

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, Dict]: 
        """
        Perform one step in the environment.
        
        Args:
            action: Action array from policy
        
        Returns:
            observation, reward, terminated, truncated, info
        """
        # Convert action to scalar
        if isinstance(action, np.ndarray):
            scalar_action = float(action[0]) if action.size == 1 else float(action. flat[0])
        else:
            scalar_action = float(action)

        # Clip action to valid range
        scalar_action = np.clip(scalar_action, 0.0, 30.0)

        try:
            observation, reward, done, info = self.env.step(scalar_action)
        except Exception as e:
            print(f"ERROR in step: {e}")
            import traceback
            traceback.print_exc()
            raise

        self._episode_steps += 1
        self._episode_rewards. append(float(reward))

        # Convert observation to proper format
        observation = self._format_observation(observation)
        self._last_obs = observation.copy()

        # Check if episode is truncated
        truncated = self._episode_steps >= self._max_episode_steps

        return observation, float(reward), bool(done), truncated, info

    def reset(
        self,
        seed: Optional[int] = None,
        options: Optional[Dict] = None
    ) -> Tuple[np.ndarray, Dict]:
        """
        Reset the environment.
        
        Args:
            seed: Random seed
            options: Additional reset options
        
        Returns:
            observation, info
        """
        super().reset(seed=seed)

        # Set seed if provided
        if seed is not None:
            np.random. seed(seed)

        try:
            observation = self.env.reset()
        except Exception as e:
            print(f"ERROR in reset: {e}")
            import traceback
            traceback.print_exc()
            raise

        # Format observation
        observation = self._format_observation(observation)
        self._last_obs = observation.copy()
        self._episode_steps = 0
        self._episode_rewards = []

        return observation, {}

    def _format_observation(self, obs: Any) -> np.ndarray:
        """
        Convert observation to proper numpy format.
        
        Args:
            obs: Raw observation from environment
        
        Returns:
            Formatted observation as numpy array with shape (1,)
        """
        if obs is None:
            obs = self._last_obs if self._last_obs is not None else np.array([0.0], dtype=np.float32)
        else:
            if not isinstance(obs, np.ndarray):
                obs = np.array([float(obs)], dtype=np.float32)
            else:
                if obs.ndim == 0:
                    obs = np.array([float(obs)], dtype=np.float32)
                elif obs.shape == (1,):
                    obs = obs.astype(np. float32)
                else: 
                    obs = np.array([float(obs.flat[0])], dtype=np.float32)
        
        return obs

    def render(self) -> Optional[Any]:
        """Render the environment (not implemented)."""
        return None

    def close(self) -> None:
        """Close the environment and cleanup resources."""
        if hasattr(self, 'env') and self.env is not None:
            self.env.close()


# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def _ensure_numpy_array(value: Any, dtype: type = np.float32) -> np.ndarray:
    """
    Utility function to ensure value is a numpy array.
    
    Args:
        value: Value to convert
        dtype: Target data type
    
    Returns: 
        Numpy array
    """
    if isinstance(value, np.ndarray):
        return value. astype(dtype)
    return np.array([value], dtype=dtype)


# ============================================================================
# ENVIRONMENT SETUP AND TESTING
# ============================================================================

def setup_simglucose_environment(
    patient_name: str = 'adolescent#001',
    seed:  int = 42,
    verbose: bool = True
) -> SimglucoseGymEnv:
    """
    Setup and initialize a SimGlucose environment.
    
    Args:
        patient_name:  Patient profile name
        seed: Random seed
        verbose: Print status messages
    
    Returns: 
        Initialized SimglucoseGymEnv
    """
    if verbose:
        print("\n" + "=" * 80)
        print("ENVIRONMENT INITIALIZATION")
        print("=" * 80)
        print(f"Patient:  {patient_name}")
        print(f"Seed: {seed}")
    
    env = SimglucoseGymEnv(
        patient_name=patient_name,
        seed=seed,
        verbose=verbose
    )
    
    if verbose:
        print(f"OK SimglucoseGymEnv initialized successfully!")
        print("=" * 80 + "\n")
    
    return env


def test_environment(
    env: SimglucoseGymEnv,
    n_steps: int = 5,
    seed: int = 42
) -> Dict[str, Any]:
    """
    Test the environment with random actions.
    
    Args:
        env: Environment to test
        n_steps: Number of steps to test
        seed: Random seed
    
    Returns:
        Dictionary with test statistics
    """
    print("\n" + "=" * 80)
    print("ENVIRONMENT TEST")
    print("=" * 80)

    obs, info = env.reset(seed=seed)
    print(f"✅ Reset successful!")
    print(f"Initial Observation: {obs}")
    print(f"Observation shape: {obs.shape}")
    print(f"Observation dtype: {obs.dtype}")
    print("\n--- Testing Steps ---")
    
    episode_rewards = []
    episode_glucose = []

    for i in range(n_steps):
        action = env. action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        episode_rewards.append(reward)
        
        glucose = info.get('glucose', obs[0])
        episode_glucose. append(glucose)

        print(f"Step {i + 1}:")
        print(f"  Action: {action[0]:.2f} | Glucose: {glucose:.2f} | Reward: {reward:.4f}")

        if terminated or truncated: 
            print("  ⚠️  Episode ended!")
            break

    # Summary statistics
    print(f"\n--- Episode Summary ---")
    print(f"  Total Steps: {len(episode_rewards)}")
    print(f"  Total Return: {sum(episode_rewards):.4f}")
    print(f"  Average Reward: {np.mean(episode_rewards):.4f}")
    print(f"  Mean Glucose: {np.mean(episode_glucose):.2f} mg/dL")
    print(f"  Glucose Range: [{np.min(episode_glucose):.2f}, {np.max(episode_glucose):.2f}]")
    print("=" * 80 + "\n")

    return {
        'episode_rewards': episode_rewards,
        'episode_glucose': episode_glucose,
        'total_return': sum(episode_rewards),
        'mean_glucose': np.mean(episode_glucose)
    }


def define_behavior_policy(
    observation: np.ndarray,
    env: SimglucoseGymEnv,
    policy_type: str = 'random'
) -> np.ndarray:
    """
    Behavior policy for data collection.
    
    Args:
        observation: Current observation
        env: Environment
        policy_type: Type of policy ('random', 'mean', or 'follow_glucose')
    
    Returns:
        Action array
    """
    if policy_type == 'random': 
        return env.action_space. sample()
    elif policy_type == 'mean':
        # Return mean action
        return np.array([(env.action_space.low[0] + env.action_space.high[0]) / 2.0], dtype=np.float32)
    elif policy_type == 'follow_glucose':
        # Simple reactive policy:  increase insulin if glucose is high
        glucose = observation[0]
        if glucose > 180:
            action = 20.0
        elif glucose > 140:
            action = 15.0
        else:
            action = 10.0
        return np.array([action], dtype=np.float32)
    else:
        raise ValueError(f"Unknown policy type:  {policy_type}")


# ============================================================================
# MAIN EXECUTION (OPTIONAL)
# ============================================================================

if __name__ == "__main__": 
    # Setup environment
    env = setup_simglucose_environment(
        patient_name='adolescent#001',
        seed=42,
        verbose=True
    )
    
    # Test environment
    test_stats = test_environment(env, n_steps=5, seed=42)
    
    # Close environment
    env.close()
    print("✅ Environment test completed successfully!")


ENVIRONMENT INITIALIZATION
Patient:  adolescent#001
Seed: 42
OK Successfully initialized MockT1DEnv
OK SimglucoseGymEnv initialized successfully!


ENVIRONMENT TEST
✅ Reset successful!
Initial Observation: [118.727005]
Observation shape: (1,)
Observation dtype: float32

--- Testing Steps ---
Step 1:
  Action: 6.97 | Glucose: 109.15 | Reward: 1.0000
Step 2:
  Action: 29.65 | Glucose: 118.07 | Reward: 1.0000
Step 3:
  Action: 20.83 | Glucose: 122.39 | Reward: 1.0000
Step 4:
  Action: 28.45 | Glucose: 134.16 | Reward: 1.0000
Step 5:
  Action: 29.96 | Glucose: 138.74 | Reward: 1.0000

--- Episode Summary ---
  Total Steps: 5
  Total Return: 5.0000
  Average Reward: 1.0000
  Mean Glucose: 124.50 mg/dL
  Glucose Range: [109.15, 138.74]

✅ Environment test completed successfully!


In [4]:
"""
Data Collection Module for Offline RL Pipeline
Collects trajectory data from SimGlucose environment without minari dependency
"""

import warnings
warnings.filterwarnings('ignore')

from typing import Callable, Dict, List, Tuple, Optional, Any
import numpy as np
import time


# ============================================================================
# DATA COLLECTION
# ============================================================================

class SimpleDataCollector:
    """Custom data collector - no minari dependency required."""
    
    def __init__(self, env: SimglucoseGymEnv, record_infos: bool = True, verbose: bool = True):
        """
        Initialize data collector.
        
        Args:
            env: SimGlucose environment
            record_infos: Whether to record step infos
            verbose: Print status messages
        """
        self. env = env
        self.record_infos = record_infos
        self.verbose = verbose
        self.episodes = []
    
    def collect(
        self,
        policy: Callable,
        num_episodes: int = 10,
        max_steps:  int = 480,
        seed: Optional[int] = None
    ) -> List[Dict[str, np.ndarray]]:
        """
        Collect episodes using a policy.
        
        Args:
            policy: Function that takes (obs, env) and returns action
            num_episodes: Number of episodes to collect
            max_steps: Maximum steps per episode
            seed: Random seed for reproducibility
        
        Returns:
            List of episode dictionaries
        """
        if self.verbose:
            print(f"\nStarting data collection for {num_episodes} episodes...")
            start_time = time.time()

        episodes_data = []

        for episode_num in range(num_episodes):
            # Reset environment
            if seed is not None:
                episode_seed = seed + episode_num
            else:
                episode_seed = None
            
            obs, info = self.env.reset(seed=episode_seed)

            # Initialize episode data lists
            observations = []
            actions = []
            rewards = []
            terminations = []
            truncations = []
            infos = []
            
            episode_return = 0.0
            episode_length = 0

            # Collect steps
            for step in range(max_steps):
                # Store observation
                observations.append(obs. copy())
                
                # Get action from policy
                action = policy(obs, self.env)
                actions.append(action)

                # Take environment step
                obs, reward, terminated, truncated, step_info = self.env.step(action)
                
                # Store transition data
                rewards.append(reward)
                terminations.append(terminated)
                truncations.append(truncated)
                
                if self.record_infos:
                    infos.append(step_info)
                
                episode_return += reward
                episode_length += 1

                # Check termination
                if terminated or truncated: 
                    break

            # Convert lists to numpy arrays and create episode dict
            episode_dict = {
                'observations': np. array(observations, dtype=np.float32),
                'actions': np.array(actions, dtype=np.float32),
                # Reshape rewards to 2D:  (n_steps, 1)
                'rewards': np.array(rewards, dtype=np.float32).reshape(-1, 1),
                'terminations': np.array(terminations, dtype=bool),
                'truncations': np.array(truncations, dtype=bool),
                'episode_return': float(episode_return),
                'episode_length': int(episode_length)
            }
            
            if self.record_infos:
                episode_dict['infos'] = infos
            
            episodes_data.append(episode_dict)
            
            # Progress update
            if self.verbose and (episode_num + 1) % max(1, num_episodes // 5) == 0:
                print(f"  Collected {episode_num + 1}/{num_episodes} episodes...")

        # Calculate statistics
        elapsed_time = time.time() - start_time if self.verbose else 0
        total_transitions = sum(ep['episode_length'] for ep in episodes_data)
        avg_return = np.mean([ep['episode_return'] for ep in episodes_data])
        std_return = np.std([ep['episode_return'] for ep in episodes_data])

        if self.verbose:
            print(f"\n✅ OK Data collection complete! (took {elapsed_time:.2f}s)")
            print(f"Collected {len(episodes_data)} episodes")
            print(f"   Total transitions: {total_transitions}")
            print(f"   Average return: {avg_return:.4f} ± {std_return:.4f}")

        self.episodes = episodes_data
        return episodes_data


# ============================================================================
# SETUP DATA COLLECTION
# ============================================================================

def setup_data_collection(
    env: SimglucoseGymEnv,
    num_episodes: int = 10,
    max_steps_per_episode: int = 480,
    dataset_name: Optional[str] = None,
    record_infos: bool = True,
    verbose: bool = True
) -> Tuple[str, SimpleDataCollector]:
    """
    Setup and prepare data collection. 
    
    Args:
        env: SimGlucose environment
        num_episodes: Number of episodes to collect
        max_steps_per_episode: Max steps per episode
        dataset_name: Name for the dataset (auto-generated if None)
        record_infos: Whether to record step infos
        verbose: Print status messages
    
    Returns: 
        Tuple of (dataset_name, data_collector)
    """
    if verbose:
        print("\n" + "=" * 80)
        print("Setting up Data Collection")
        print("=" * 80)

    # Generate dataset name if not provided
    if dataset_name is None:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        dataset_name = f'simglucose_adolescent_random_{timestamp}'

    if verbose:
        print(f"Dataset Name: {dataset_name}")
        print(f"\nData Collection Parameters:")
        print(f"  - Number of Episodes: {num_episodes}")
        print(f"  - Max Steps per Episode: {max_steps_per_episode}")
        print(f"  - Record Infos: {record_infos}")

    # Initialize data collector
    data_collector = SimpleDataCollector(
        env=env,
        record_infos=record_infos,
        verbose=verbose
    )
    
    if verbose:
        print(f"\n✅ DataCollector created successfully!")
        print("=" * 80)

    return dataset_name, data_collector


# ============================================================================
# COLLECT DATA
# ============================================================================

def collect_data_simple(
    env: SimglucoseGymEnv,
    policy: Callable,
    num_episodes: int = 10,
    max_steps_per_episode: int = 480,
    dataset_name: str = 'simglucose-adolescent-random-v0',
    seed: Optional[int] = None,
    verbose: bool = True
) -> List[Dict[str, np.ndarray]]:
    """
    Collect trajectory data and return as list of episode dictionaries.
    
    Args:
        env: SimGlucose environment
        policy:  Behavior policy function
        num_episodes: Number of episodes to collect
        max_steps_per_episode: Max steps per episode
        dataset_name: Name of dataset (for logging)
        seed: Random seed for reproducibility
        verbose: Print status messages
    
    Returns: 
        List of episode dictionaries with observations, actions, rewards, etc.
    """
    if verbose:
        print("\n" + "=" * 80)
        print("PHASE 1: DATA COLLECTION")
        print("=" * 80)
        print(f"Dataset:  {dataset_name}")
        print(f"Episodes: {num_episodes}")
        print(f"Max steps per episode: {max_steps_per_episode}")
        print("=" * 80)

    # Create data collector
    collector = SimpleDataCollector(env, record_infos=True, verbose=verbose)
    
    # Collect episodes
    episodes_data = collector. collect(
        policy=policy,
        num_episodes=num_episodes,
        max_steps=max_steps_per_episode,
        seed=seed
    )

    return episodes_data


# ============================================================================
# VISUALIZE COLLECTED DATA
# ============================================================================

def visualize_collected_data(
    episodes_data: List[Dict[str, np.ndarray]],
    num_episodes_to_show: int = 3,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Visualize and analyze collected episodes.
    
    Args:
        episodes_data: List of collected episodes
        num_episodes_to_show: Number of episodes to display details for
        verbose: Print statistics
    
    Returns:
        Dictionary with data statistics
    """
    if verbose: 
        print("\n" + "=" * 80)
        print("Collected Dataset Statistics")
        print("=" * 80)

    # Extract statistics
    all_returns = [ep['episode_return'] for ep in episodes_data]
    all_lengths = [ep['episode_length'] for ep in episodes_data]
    all_rewards = np.concatenate([ep['rewards']. flatten() for ep in episodes_data])

    stats = {
        'num_episodes': len(episodes_data),
        'total_transitions': int(sum(all_lengths)),
        'mean_return': float(np.mean(all_returns)),
        'std_return':  float(np.std(all_returns)),
        'min_return': float(np.min(all_returns)),
        'max_return': float(np.max(all_returns)),
        'mean_length': float(np.mean(all_lengths)),
        'std_length': float(np. std(all_lengths)),
        'mean_reward': float(np.mean(all_rewards)),
        'std_reward': float(np. std(all_rewards))
    }

    if verbose:
        print(f"\nOverall Statistics:")
        print(f"  Total Episodes: {stats['num_episodes']}")
        print(f"  Total Transitions:  {stats['total_transitions']}")
        print(f"  Mean Return: {stats['mean_return']:.4f} ± {stats['std_return']:.4f}")
        print(f"  Return Range: [{stats['min_return']:.4f}, {stats['max_return']:.4f}]")
        print(f"  Mean Episode Length: {stats['mean_length']:.2f} ± {stats['std_length']:.2f}")
        print(f"  Mean Reward: {stats['mean_reward']:.4f} ± {stats['std_reward']:.4f}")

        print(f"\nFirst {min(num_episodes_to_show, len(episodes_data))} episodes:")
        for i in range(min(num_episodes_to_show, len(episodes_data))):
            ep = episodes_data[i]
            print(f"  Episode {i+1}:")
            print(f"    Return: {ep['episode_return']:.4f}")
            print(f"    Length: {ep['episode_length']} steps")
            print(f"    Actions: min={ep['actions']. min():.2f}, max={ep['actions'].max():.2f}")
            print(f"    Rewards: min={ep['rewards'].min():.4f}, max={ep['rewards'].max():.4f}")
        
        print("=" * 80 + "\n")

    return stats


# ============================================================================
# COMPLETE DATA COLLECTION PIPELINE
# ============================================================================

def collect_data_and_save(
    num_episodes: int = 10,
    max_steps_per_episode: int = 480,
    patient_name: str = 'adolescent#001',
    policy_type: str = 'random',
    seed: int = 42,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Complete SimGlucose environment setup and data collection pipeline.
    
    Args:
        num_episodes: Number of episodes to collect
        max_steps_per_episode:  Max steps per episode
        patient_name: Patient profile name
        policy_type: Type of behavior policy ('random', 'mean', 'follow_glucose')
        seed: Random seed for reproducibility
        verbose: Print status messages
    
    Returns: 
        Dictionary containing episodes, dataset name, and statistics
    """
    if verbose:
        print("\n" + "=" * 80)
        print("SIMGLUCOSE DATA COLLECTION PIPELINE")
        print("=" * 80)

    # Setup environment
    if verbose:
        print("\n[1/5] Setting up environment...")
    env = setup_simglucose_environment(
        patient_name=patient_name,
        seed=seed,
        verbose=verbose
    )

    # Test environment
    if verbose:
        print("\n[2/5] Testing environment...")
    test_stats = test_environment(env, n_steps=5, seed=seed)

    # Setup data collection
    if verbose: 
        print("\n[3/5] Setting up data collection...")
    dataset_name, collector = setup_data_collection(
        env=env,
        num_episodes=num_episodes,
        max_steps_per_episode=max_steps_per_episode,
        verbose=verbose
    )

    # Define behavior policy
    if verbose: 
        print(f"\n[4/5] Using '{policy_type}' behavior policy...")
    policy = lambda obs, env:  define_behavior_policy(obs, env, policy_type=policy_type)

    # Collect data
    if verbose: 
        print(f"\n[5/5] Collecting data...")
    episodes_data = collect_data_simple(
        env=env,
        policy=policy,
        num_episodes=num_episodes,
        max_steps_per_episode=max_steps_per_episode,
        dataset_name=dataset_name,
        seed=seed,
        verbose=verbose
    )

    # Visualize data
    data_stats = visualize_collected_data(episodes_data, num_episodes_to_show=3, verbose=verbose)

    # Close environment
    env.close()

    if verbose:
        print("\n" + "=" * 80)
        print("✅ Data Collection Pipeline Complete!")
        print("=" * 80)

    return {
        'episodes_data': episodes_data,
        'dataset_name': dataset_name,
        'num_episodes': num_episodes,
        'max_steps_per_episode': max_steps_per_episode,
        'statistics': data_stats
    }


# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    # Execute complete data collection pipeline
    result = collect_data_and_save(
        num_episodes=10,
        max_steps_per_episode=480,
        patient_name='adolescent#001',
        policy_type='random',
        seed=42,
        verbose=True
    )
    
    # Access results
    episodes_data = result['episodes_data']
    dataset_name = result['dataset_name']
    statistics = result['statistics']
    
    print(f"\n✅ Data collection successful!")
    print(f"Dataset name: {dataset_name}")
    print(f"Episodes collected: {statistics['num_episodes']}")
    print(f"Total transitions: {statistics['total_transitions']}")


SIMGLUCOSE DATA COLLECTION PIPELINE

[1/5] Setting up environment...

ENVIRONMENT INITIALIZATION
Patient:  adolescent#001
Seed: 42
OK Successfully initialized MockT1DEnv
OK SimglucoseGymEnv initialized successfully!


[2/5] Testing environment...

ENVIRONMENT TEST
✅ Reset successful!
Initial Observation: [118.727005]
Observation shape: (1,)
Observation dtype: float32

--- Testing Steps ---
Step 1:
  Action: 9.99 | Glucose: 110.66 | Reward: 1.0000
Step 2:
  Action: 26.03 | Glucose: 117.77 | Reward: 1.0000
Step 3:
  Action: 19.98 | Glucose: 121.66 | Reward: 1.0000
Step 4:
  Action: 4.61 | Glucose: 121.52 | Reward: 1.0000
Step 5:
  Action: 10.28 | Glucose: 116.25 | Reward: 1.0000

--- Episode Summary ---
  Total Steps: 5
  Total Return: 5.0000
  Average Reward: 1.0000
  Mean Glucose: 117.57 mg/dL
  Glucose Range: [110.66, 121.66]


[3/5] Setting up data collection...

Setting up Data Collection
Dataset Name: simglucose_adolescent_random_20260109_181437

Data Collection Parameters:
  - Nu

In [5]:
"""
Dataset Management Module for Offline RL Pipeline
Handles loading, creating, and managing datasets for training
"""

import warnings
warnings.filterwarnings('ignore')

from typing import Dict, List, Any, Optional, Tuple
import numpy as np
import time
import os
import pickle


# ============================================================================
# DATASET MANAGEMENT (WITHOUT MINARI DEPENDENCY)
# ============================================================================

class DatasetManager:
    """Handler for managing collected episode data as datasets."""
    
    def __init__(self, verbose: bool = True):
        """
        Initialize dataset manager.
        
        Args:
            verbose: Print status messages
        """
        self.verbose = verbose
        self.dataset = None
        self.statistics = None
    
    def create_dataset(
        self,
        episodes_data: List[Dict[str, np.ndarray]],
        dataset_name: str = 'simglucose-offline-dataset',
        description: str = ''
    ) -> Dict[str, Any]:
        """
        Create a dataset from collected episodes.
        
        Args:
            episodes_data: List of episode dictionaries from data collection
            dataset_name:  Name for the dataset
            description: Dataset description
        
        Returns:
            Dataset dictionary with metadata and episodes
        """
        if self. verbose:
            print(f"\n{'='*80}")
            print(f"Creating Dataset: {dataset_name}")
            print(f"{'='*80}")
        
        # Validate episodes
        if not episodes_data:
            raise ValueError("No episodes provided for dataset creation")
        
        dataset = {
            'name': dataset_name,
            'description': description,
            'creation_time': time.strftime("%Y-%m-%d %H:%M:%S"),
            'num_episodes': len(episodes_data),
            'episodes': episodes_data,
            'metadata': self._extract_metadata(episodes_data)
        }
        
        if self.verbose:
            print(f"✅ Dataset created successfully!")
            print(f"  - Name: {dataset_name}")
            print(f"  - Episodes: {len(episodes_data)}")
            print(f"  - Total transitions: {dataset['metadata']['total_transitions']}")
            print(f"{'='*80}\n")
        
        self.dataset = dataset
        return dataset
    
    @staticmethod
    def _extract_metadata(episodes_data: List[Dict[str, np.ndarray]]) -> Dict[str, Any]:
        """
        Extract metadata from episodes. 
        
        Args:
            episodes_data: List of episode dictionaries
        
        Returns: 
            Metadata dictionary
        """
        episode_returns = [ep['episode_return'] for ep in episodes_data]
        episode_lengths = [ep['episode_length'] for ep in episodes_data]
        all_rewards = np.concatenate([ep['rewards']. flatten() for ep in episodes_data])
        
        return {
            'total_transitions': int(sum(episode_lengths)),
            'total_episodes': len(episodes_data),
            'mean_episode_return': float(np.mean(episode_returns)),
            'std_episode_return': float(np.std(episode_returns)),
            'max_episode_return': float(np.max(episode_returns)),
            'min_episode_return':  float(np.min(episode_returns)),
            'mean_episode_length': float(np.mean(episode_lengths)),
            'std_episode_length':  float(np.std(episode_lengths)),
            'mean_reward': float(np.mean(all_rewards)),
            'std_reward': float(np. std(all_rewards)),
            'min_reward': float(np.min(all_rewards)),
            'max_reward': float(np.max(all_rewards))
        }
    
    def compute_statistics(self, episodes_data: Optional[List[Dict]] = None) -> Dict[str, Any]:
        """
        Compute comprehensive statistics about the dataset.
        
        Args:
            episodes_data: Episodes to compute stats for (uses self.dataset if None)
        
        Returns:
            Statistics dictionary
        """
        if episodes_data is None:
            if self.dataset is None:
                raise ValueError("No dataset available.  Create or load a dataset first.")
            episodes_data = self.dataset['episodes']
        
        episode_returns = [ep['episode_return'] for ep in episodes_data]
        episode_lengths = [ep['episode_length'] for ep in episodes_data]
        all_rewards = np.concatenate([ep['rewards'].flatten() for ep in episodes_data])
        
        # Compute reward distribution statistics
        reward_percentiles = {
            'p25': float(np.percentile(all_rewards, 25)),
            'p50': float(np. percentile(all_rewards, 50)),
            'p75':  float(np.percentile(all_rewards, 75))
        }
        
        stats = {
            'num_episodes': len(episodes_data),
            'total_transitions':  int(sum(episode_lengths)),
            'mean_episode_return': float(np.mean(episode_returns)),
            'std_episode_return':  float(np.std(episode_returns)),
            'max_episode_return': float(np.max(episode_returns)),
            'min_episode_return': float(np.min(episode_returns)),
            'mean_episode_length': float(np.mean(episode_lengths)),
            'std_episode_length': float(np. std(episode_lengths)),
            'mean_reward': float(np.mean(all_rewards)),
            'std_reward': float(np.std(all_rewards)),
            'min_reward': float(np.min(all_rewards)),
            'max_reward':  float(np.max(all_rewards)),
            'reward_percentiles':  reward_percentiles,
        }
        
        self.statistics = stats
        return stats
    
    def print_statistics(self, stats: Optional[Dict[str, Any]] = None, verbose: bool = True) -> None:
        """
        Print dataset statistics in a formatted manner.
        
        Args:
            stats: Statistics dictionary (uses self.statistics if None)
            verbose: Whether to print (for compatibility)
        """
        if stats is None:
            stats = self.statistics
        
        if stats is None:
            print("No statistics available.  Compute statistics first.")
            return
        
        print(f"\n{'='*80}")
        print("Dataset Statistics")
        print(f"{'='*80}")
        print(f"\nEpisode Information:")
        print(f"  - Total Episodes: {stats['num_episodes']}")
        print(f"  - Total Transitions:  {stats['total_transitions']}")
        print(f"  - Mean Episode Length: {stats['mean_episode_length']:.1f} ± {stats['std_episode_length']:.1f}")
        
        print(f"\nEpisode Returns:")
        print(f"  - Mean:  {stats['mean_episode_return']:.4f} ± {stats['std_episode_return']:.4f}")
        print(f"  - Range: [{stats['min_episode_return']:.4f}, {stats['max_episode_return']:.4f}]")
        
        print(f"\nReward Statistics:")
        print(f"  - Mean:  {stats['mean_reward']:. 4f} ± {stats['std_reward']:.4f}")
        print(f"  - Range: [{stats['min_reward']:.4f}, {stats['max_reward']:.4f}]")
        print(f"  - Percentiles:")
        print(f"    - 25th: {stats['reward_percentiles']['p25']:.4f}")
        print(f"    - 50th (median): {stats['reward_percentiles']['p50']:.4f}")
        print(f"    - 75th: {stats['reward_percentiles']['p75']:. 4f}")
        print(f"{'='*80}\n")


# ============================================================================
# REPLAY BUFFER CREATION
# ============================================================================

class ReplayBufferManager:
    """Handler for converting episodes to d3rlpy ReplayBuffer."""
    
    def __init__(self, verbose: bool = True):
        """
        Initialize replay buffer manager.
        
        Args:
            verbose: Print status messages
        """
        self.verbose = verbose
        self.buffer = None
    
    def create_replay_buffer(
        self,
        episodes_data: List[Dict[str, np.ndarray]],
        d3rlpy_module: Any,
        buffer_limit: int = 1000000
    ) -> Any:
        """
        Convert episode data to d3rlpy ReplayBuffer. 
        
        Args:
            episodes_data: List of episode dictionaries from data collection
            d3rlpy_module:  Imported d3rlpy module
            buffer_limit: Maximum buffer size
        
        Returns:
            d3rlpy ReplayBuffer
        """
        if self.verbose:
            print(f"\n{'='*80}")
            print("Creating d3rlpy ReplayBuffer from Episodes")
            print(f"{'='*80}")
        
        try:
            from d3rlpy. dataset import Episode
            
            # Validate inputs
            if d3rlpy_module is None:
                raise ValueError("d3rlpy_module cannot be None")
            
            if not episodes_data:
                raise ValueError("episodes_data cannot be empty")
            
            # Convert to d3rlpy Episode format
            d3rlpy_episodes = []
            total_transitions = 0
            failed_episodes = 0
            
            for ep_idx, ep in enumerate(episodes_data):
                try:
                    # Validate episode data
                    if 'observations' not in ep or 'actions' not in ep or 'rewards' not in ep: 
                        raise ValueError(f"Episode {ep_idx} missing required keys")
                    
                    # Create Episode with proper format
                    d3rlpy_ep = Episode(
                        observations=ep['observations']. astype(np.float32),
                        actions=ep['actions']. astype(np.float32),
                        rewards=ep['rewards'].astype(np. float32),
                        terminated=bool(ep['terminations'][-1]) if len(ep['terminations']) > 0 else True,
                        truncated=ep. get('truncations', np.zeros(len(ep['terminations']), dtype=bool))
                    )
                    d3rlpy_episodes.append(d3rlpy_ep)
                    total_transitions += len(ep['rewards'])
                
                except Exception as e:
                    if self.verbose:
                        print(f"⚠️  Warning: Failed to convert episode {ep_idx}: {str(e)}")
                    failed_episodes += 1
                    continue
            
            if not d3rlpy_episodes: 
                raise ValueError(f"No episodes could be converted to d3rlpy format.  Failed: {failed_episodes}")
            
            if self.verbose and failed_episodes > 0:
                print(f"⚠️  {failed_episodes} episode(s) skipped due to errors")
            
            # Create FIFO replay buffer
            self.buffer = d3rlpy_module.dataset.create_fifo_replay_buffer(
                episodes=d3rlpy_episodes,
                limit=buffer_limit
            )
            
            if self.verbose:
                print(f"\n✅ ReplayBuffer created successfully!")
                print(f"  - Buffer size:  {self.buffer.size()} transitions")
                print(f"  - Episodes: {len(d3rlpy_episodes)}")
                print(f"  - Buffer limit: {buffer_limit}")
                print(f"{'='*80}\n")
            
            return self.buffer
        
        except ImportError as e: 
            print(f"❌ Error:  Could not import d3rlpy. dataset:  {e}")
            raise
        except Exception as e:
            print(f"❌ Error creating replay buffer: {e}")
            import traceback
            traceback.print_exc()
            raise
    
    def get_buffer_info(self) -> Dict[str, Any]:
        """
        Get information about the current replay buffer.
        
        Returns:
            Dictionary with buffer information
        """
        if self.buffer is None:
            return {'status': 'No buffer created yet'}
        
        return {
            'buffer_size': self.buffer.size(),
            'status': 'Created and ready for training'
        }


# ============================================================================
# DATASET PIPELINE
# ============================================================================

class DatasetPipeline:
    """Complete pipeline for dataset creation and replay buffer preparation."""
    
    def __init__(self, verbose: bool = True):
        """
        Initialize pipeline. 
        
        Args:
            verbose: Print status messages
        """
        self.verbose = verbose
        self.dataset_manager = DatasetManager(verbose=verbose)
        self.buffer_manager = ReplayBufferManager(verbose=verbose)
        self.dataset = None
        self.replay_buffer = None
    
    def setup_and_create_buffer(
        self,
        episodes_data: List[Dict[str, np.ndarray]],
        d3rlpy_module: Any,
        dataset_name: str = 'simglucose-offline-dataset',
        compute_stats: bool = True
    ) -> Tuple[Dict[str, Any], Any]:
        """
        Complete pipeline:  create dataset and prepare replay buffer.
        
        Args:
            episodes_data:  Collected episode data
            d3rlpy_module: Imported d3rlpy module
            dataset_name: Name for the dataset
            compute_stats:  Whether to compute and print statistics
        
        Returns: 
            Tuple of (dataset, replay_buffer)
        """
        if self.verbose:
            print(f"\n{'='*80}")
            print("DATASET PIPELINE")
            print(f"{'='*80}")
        
        # Validate inputs
        if d3rlpy_module is None: 
            raise ValueError("d3rlpy_module cannot be None.  Make sure d3rlpy is imported.")
        
        if not episodes_data:
            raise ValueError("episodes_data cannot be empty")
        
        # Step 1: Create dataset
        if self.verbose:
            print("\n[1/3] Creating dataset...")
        self.dataset = self.dataset_manager.create_dataset(
            episodes_data,
            dataset_name=dataset_name
        )
        
        # Step 2: Compute statistics
        if compute_stats:
            if self.verbose:
                print("[2/3] Computing statistics...")
            stats = self.dataset_manager.compute_statistics(episodes_data)
            self.dataset_manager.print_statistics(stats)
        else:
            if self.verbose:
                print("[2/3] Skipping statistics computation...")
        
        # Step 3: Create replay buffer
        if self.verbose:
            print("[3/3] Creating replay buffer...")
        self.replay_buffer = self.buffer_manager.create_replay_buffer(
            episodes_data,
            d3rlpy_module
        )
        
        if self.verbose:
            print(f"✅ Dataset pipeline complete!")
            print(f"{'='*80}\n")
        
        return self.dataset, self.replay_buffer


# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def save_dataset(
    dataset: Dict[str, Any],
    filepath: str = './offline_rl_dataset.pkl',
    verbose: bool = True
) -> str:
    """
    Save dataset to disk.
    
    Args:
        dataset: Dataset dictionary
        filepath: Path to save to
        verbose: Print status
    
    Returns:
        Path to saved file
    """
    if verbose:
        print(f"\nSaving dataset to {filepath}...")
    
    try:
        # Create directory if it doesn't exist
        os.makedirs(os.path. dirname(filepath) or '.', exist_ok=True)
        
        # Save dataset using pickle
        with open(filepath, 'wb') as f:
            pickle.dump(dataset, f)
        
        # Get file size
        file_size = os.path.getsize(filepath) / (1024 * 1024)  # Convert to MB
        
        if verbose:
            print(f"✅ Dataset saved successfully!")
            print(f"  - File:  {filepath}")
            print(f"  - Size: {file_size:.2f} MB")
        
        return filepath
    
    except Exception as e:
        print(f"❌ Error saving dataset: {e}")
        raise


def load_dataset(
    filepath: str = './offline_rl_dataset.pkl',
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Load dataset from disk. 
    
    Args:
        filepath: Path to load from
        verbose: Print status
    
    Returns:
        Dataset dictionary
    """
    if verbose:
        print(f"\nLoading dataset from {filepath}...")
    
    try:
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Dataset file not found: {filepath}")
        
        with open(filepath, 'rb') as f:
            dataset = pickle.load(f)
        
        if verbose: 
            print(f"✅ Dataset loaded successfully!")
            print(f"  - Name: {dataset.get('name', 'Unknown')}")
            print(f"  - Episodes: {dataset.get('num_episodes', 'Unknown')}")
            print(f"  - Creation time: {dataset.get('creation_time', 'Unknown')}")
        
        return dataset
    
    except FileNotFoundError as e:
        print(f"❌ Dataset file not found: {filepath}")
        raise
    except Exception as e: 
        print(f"❌ Error loading dataset: {e}")
        raise


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main(
    episodes_data: List[Dict[str, np.ndarray]],
    d3rlpy_module: Any,
    dataset_name: str = 'simglucose-offline-v1',
    save_path: str = './simglucose_offline_dataset.pkl'
) -> Tuple[Dict[str, Any], Any]:
    """
    Complete dataset management pipeline.
    
    Args:
        episodes_data:  Collected episode data
        d3rlpy_module: Imported d3rlpy module
        dataset_name: Name for the dataset
        save_path: Path to save dataset
    
    Returns: 
        Tuple of (dataset, replay_buffer)
    """
    try:
        # Create pipeline
        pipeline = DatasetPipeline(verbose=True)
        
        # Run pipeline
        dataset, replay_buffer = pipeline.setup_and_create_buffer(
            episodes_data=episodes_data,
            d3rlpy_module=d3rlpy_module,
            dataset_name=dataset_name,
            compute_stats=True
        )
        
        # Save dataset
        save_dataset(dataset, filepath=save_path, verbose=True)
        
        return dataset, replay_buffer
    
    except Exception as e:
        print(f"❌ Pipeline execution failed: {e}")
        import traceback
        traceback.print_exc()
        raise


if __name__ == "__main__": 
    """
    Example usage - uncomment and modify as needed:
    
    # Make sure these are defined from previous cells: 
    # - episodes_data (from data collection)
    # - d3rlpy (imported module)
    
    dataset, replay_buffer = main(
        episodes_data=episodes_data,
        d3rlpy_module=d3rlpy,
        dataset_name='simglucose-offline-v1',
        save_path='./simglucose_offline_dataset. pkl'
    )
    """
    print("\n" + "="*80)
    print("Dataset Management Module")
    print("="*80)
    print("\nTo use this module, call main() with:")
    print("  - episodes_data: List of collected episodes")
    print("  - d3rlpy_module: Imported d3rlpy")
    print("\nExample:")
    print("  dataset, replay_buffer = main(episodes_data, d3rlpy)")
    print("="*80 + "\n")


Dataset Management Module

To use this module, call main() with:
  - episodes_data: List of collected episodes
  - d3rlpy_module: Imported d3rlpy

Example:
  dataset, replay_buffer = main(episodes_data, d3rlpy)



In [7]:
# ============================================================================
# OFFLINE RL ALGORITHM CONFIGURATION
# ============================================================================

class OfflineRLAlgorithm:
    """Factory for creating offline RL algorithms."""

    ALGORITHM_CONFIGS = {
        'cql': {
            'name': 'Conservative Q-Learning (CQL)',
            'description': 'Best for general offline RL',
            'config_class': d3rlpy.algos.CQLConfig,
            'params': {
                'actor_learning_rate': 1e-4,
                'critic_learning_rate': 3e-4,
                'batch_size': 256,
                'gamma': 0.99,
                'tau': 5e-3,
                'alpha_learning_rate': 1e-4,
                'conservative_weight': 10.0,
                'n_action_samples': 10,
            }
        },
        'iql': {
            'name':  'Implicit Q-Learning (IQL)',
            'description': 'Avoids querying unseen actions',
            'config_class':  d3rlpy.algos.IQLConfig,
            'params': {
                'actor_learning_rate': 3e-4,
                'critic_learning_rate': 3e-4,
                'batch_size': 256,
                'gamma': 0.99,
                'tau': 5e-3,
                'expectile':  0.7,
                'weight_temp': 3.0,
                'max_weight': 100.0,
            }
        },
        'bc': {
            'name': 'Behavioral Cloning (BC)',
            'description': 'Simple imitation learning baseline',
            'config_class':  d3rlpy.algos.BCConfig,
            'params': {
                'learning_rate': 1e-4,
                'batch_size': 256,
            }
        },
    }

    @classmethod
    def list_algorithms(cls) -> None:
        """List available algorithms."""
        print(f"\n{'='*80}")
        print("Available Offline RL Algorithms")
        print(f"{'='*80}")
        for algo_type, config in cls.ALGORITHM_CONFIGS.items():
            print(f"\n  {algo_type.upper()}:\n    Name: {config['name']}\n    Description: {config['description']}")

    @classmethod
    def create(cls, algo_type: str = 'cql', device: str = 'cpu:0', **custom_params):
        """Create an offline RL algorithm."""
        algo_type = algo_type.lower()

        if algo_type not in cls.ALGORITHM_CONFIGS:
            raise ValueError(f"Unknown algorithm:  {algo_type}")

        config_info = cls.ALGORITHM_CONFIGS[algo_type]
        params = config_info['params'].copy()
        params.update(custom_params)

        print(f"\n{'='*80}")
        print(f"Creating {config_info['name']}")
        print(f"{'='*80}")
        print(f"Device: {device}")
        print(f"\nHyperparameters:")
        for key, value in params.items():
            print(f"  - {key}: {value}")

        config = config_info['config_class'](**params)
        algo = config.create(device=device)

        print(f"OK {config_info['name']} created successfully!")
        return algo


# ============================================================================
# TRAINING
# ============================================================================

class OfflineRLTrainer:
    """Trainer for offline RL algorithms."""

    @staticmethod
    def train(
        algo,
        replay_buffer,
        n_steps:  int = 50000,
        save_interval: int = 5000,
        verbose: bool = True
    ) -> Dict[str, Any]:
        """Train offline RL algorithm."""
        print("\n" + "="*80)
        print("Starting Offline RL Training")
        print("="*80)
        print("Training Configuration:")
        print(f"  - Algorithm: {algo.__class__.__name__}")
        print(f"  - Total Steps: {n_steps}")
        # ✅ Use . size() instead of len()
        print(f"  - Replay Buffer Size: {replay_buffer.size()}")
        print()

        start_time = time.time()

        try:
            # Train the algorithm
            algo.fit(
                replay_buffer,
                n_steps=n_steps,
                show_progress=verbose
            )

            training_time = time.time() - start_time

            if verbose:
                print(f"\nOK Training complete!")
                print(f"  - Training Time: {training_time:.2f}s") # Fixed f-string format
                print(f"  - Steps per Second: {n_steps / training_time:.2f}")

            return {
                'n_steps': n_steps,
                'training_time': training_time,
                'steps_per_second': n_steps / training_time
            }

        except Exception as e:
            if verbose:
                print(f"ERROR during training: {e}")
            raise


In [8]:
# ============================================================================
# EVALUATION
# ============================================================================

class PolicyEvaluator:
    """Handler for evaluating trained policies."""

    @staticmethod
    def evaluate(
        algo,
        env,
        n_episodes: int = 5,
        max_steps:  Optional[int] = None,
        verbose: bool = True
    ) -> Dict[str, float]:
        """Evaluate a trained policy."""
        if verbose:
            print(f"\n{'='*80}")
            print(f"Evaluating Policy ({n_episodes} episodes)")
            print(f"{'='*80}")

        episode_returns = []
        episode_lengths = []

        for episode_num in range(n_episodes):
            obs, _ = env.reset()
            total_return = 0.0
            steps = 0

            done = False
            while not done:
                action = algo.predict(np.expand_dims(obs, axis=0))[0]
                obs, reward, terminated, truncated, _ = env.step(action)
                total_return += reward
                steps += 1

                done = terminated or truncated
                if max_steps and steps >= max_steps:
                    done = True

            episode_returns.append(total_return)
            episode_lengths.append(steps)

            if verbose:
                print(f"  Episode {episode_num + 1:3d}: Return = {total_return:8.2f}, Steps = {steps:3d}")

        mean_return = float(np.mean(episode_returns))
        std_return = float(np.std(episode_returns))
        mean_length = float(np.mean(episode_lengths))

        stats = {
            'mean_return': mean_return,
            'std_return': std_return,
            'max_return': float(np.max(episode_returns)),
            'min_return': float(np.min(episode_returns)),
            'mean_episode_length': mean_length,
        }

        if verbose:
            print(f"\nOK Evaluation Complete!")
            print(f"  - Mean Return: {mean_return:.4f} +/- {std_return:.4f}")

        return stats


# ============================================================================
# MODEL PERSISTENCE
# ============================================================================

class ModelManager:
    """Handler for saving and loading models."""

    @staticmethod
    def save_model(algo, save_path: str = "./offline_rl_model") -> None:
        """Save a trained model."""
        print(f"\nSaving model to {save_path}...")
        algo.save_model(save_path)
        print(f"OK Model saved successfully!")

    @staticmethod
    def load_model(
        algo_type: str,
        save_path: str = "./offline_rl_model",
        device: str = "cpu:0"
    ):
        """Load a trained model."""
        print(f"\nLoading model from {save_path}...")
        config_class = OfflineRLAlgorithm.ALGORITHM_CONFIGS[algo_type.lower()]['config_class']
        algo = config_class().create(device=device)
        algo.load_model(save_path)
        print(f"OK Model loaded successfully!")
        return algo


In [9]:
# ============================================================================
# COMPLETE PIPELINE
# ============================================================================

def main(
    episodes_data:      List[Dict],  # Accept episodes data directly
    algo_type:  str = 'cql',
    n_steps:  int = 100000,
    n_eval_episodes: int = 5,
    device:  str = 'cpu:  0',
    save_model_flag:  bool = True
) -> Dict[str, Any]:
    """Complete offline RL pipeline."""
    print("\n" + "="*80)
    print("OFFLINE DEEP REINFORCEMENT LEARNING PIPELINE")
    print("="*80)

    results = {}

    OfflineRLAlgorithm.  list_algorithms()

    # Create replay buffer directly from episodes
    try:
        # ✅ Convert dict episodes to d3rlpy Episode objects
        from d3rlpy.dataset import Episode

        d3rlpy_episodes = []
        for episode_dict in episodes_data:
            # ✅ Episode requires (observations, actions, rewards, terminated:   bool)
            # terminated should be True if episode ended, False otherwise
            episode = Episode(
                observations=episode_dict['observations'],
                actions=episode_dict['actions'],
                rewards=episode_dict['rewards'],
                terminated=True  # ✅ Mark as terminated (episode finished)
            )
            d3rlpy_episodes.append(episode)

        # Now create replay buffer with proper Episode objects
        replay_buffer = d3rlpy.dataset.create_fifo_replay_buffer(
            episodes=d3rlpy_episodes,
            limit=1000000  # 1 million transitions max
        )
        print(f"\nOK ReplayBuffer created successfully!")
        # ✅ Use size() method or get total_transitions
        buffer_size = replay_buffer.size()
        print(f"  - Size: {buffer_size} transitions")
        results['replay_buffer_size'] = buffer_size
    except Exception as e:
        print(f"ERROR creating replay buffer: {e}")
        import traceback
        traceback.print_exc()
        raise

    algo = OfflineRLAlgorithm.  create(algo_type=algo_type, device=device)
    results['algorithm'] = algo_type

    training_stats = OfflineRLTrainer. train(
        algo=algo,
        replay_buffer=replay_buffer,
        n_steps=n_steps,
        save_interval=max(1000, n_steps // 10),
        verbose=True
    )
    results['training_stats'] = training_stats

    try:
        from gymnasium.  wrappers import TimeLimit
        eval_env = TimeLimit(
            SimglucoseGymEnv(patient_name='adolescent#001', seed=42),
            max_episode_steps=480
        )

        eval_stats = PolicyEvaluator.evaluate(
            algo=algo,
            env=eval_env,
            n_episodes=n_eval_episodes,
            max_steps=480,
            verbose=True
        )
        results['evaluation_stats'] = eval_stats
        eval_env.close()

    except Exception as e:
        print(f"\nWARNING Error during evaluation: {e}")

    if save_model_flag:
        model_path = f"./offline_rl_{algo_type}_model"
        ModelManager.save_model(algo, save_path=model_path)
        results['model_path'] = model_path

    print(f"\n{'='*80}")
    print("TRAINING SUMMARY")
    print(f"{'='*80}")
    print(f"OK Pipeline Complete!")
    print(f"\nResults:")
    print(f"  - Algorithm: {results['algorithm']}")
    print(f"  - Training Steps: {results['training_stats']['n_steps']}")
    print(f"  - Training Time: {results['training_stats']['training_time']:.2f}s")
    print(f"  - Replay Buffer Size: {results['replay_buffer_size']}")

    if 'evaluation_stats' in results:
        eval_stats = results['evaluation_stats']
        print(f"  - Evaluation Mean Return: {eval_stats['mean_return']:.4f} +/- {eval_stats['std_return']:.4f}")

    if 'model_path' in results:
        print(f"  - Model Saved:   {results['model_path']}")

    return results


In [10]:

# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    print("=" * 70)
    print("PHASE 1: ENVIRONMENT VERIFICATION")
    print("=" * 70)
    # ...  Phase 1 code unchanged ...

    print("\n" + "=" * 80)
    print("PHASE 2: DATA COLLECTION")
    print("=" * 80)

    episodes_data = None
    try:
        episodes_data = collect_data_and_save(
            num_episodes=10,
            max_steps_per_episode=480,
            patient_name='adolescent#001',
            policy_type='random'
        )
        print(f"OK Collected {len(episodes_data)} episodes")
    except Exception as e:
        print(f"ERROR during data collection: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "=" * 80)
    print("PHASE 3: OFFLINE RL TRAINING")
    print("=" * 80)

    try:
        if episodes_data is not None:
            results = main(
                episodes_data=episodes_data,
                algo_type='cql',
                n_steps=50000,
                n_eval_episodes=3,
                device='cpu:0',
                save_model_flag=True
            )
            print("\n" + "=" * 80)
            print("OK TRAINING COMPLETE!")
            print("=" * 80)
        else:
            print("WARNING: No episodes collected.  Skipping training.")
    except Exception as e:
        print(f"ERROR during training: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "=" * 80)
    print("OK COMPLETE OFFLINE DRL PIPELINE FINISHED!")
    print("=" * 80)

PHASE 1: ENVIRONMENT VERIFICATION

PHASE 2: DATA COLLECTION

SIMGLUCOSE DATA COLLECTION PIPELINE

[1/5] Setting up environment...

ENVIRONMENT INITIALIZATION
Patient:  adolescent#001
Seed: 42
OK Successfully initialized MockT1DEnv
OK SimglucoseGymEnv initialized successfully!


[2/5] Testing environment...

ENVIRONMENT TEST
✅ Reset successful!
Initial Observation: [118.727005]
Observation shape: (1,)
Observation dtype: float32

--- Testing Steps ---
Step 1:
  Action: 22.65 | Glucose: 116.99 | Reward: 1.0000
Step 2:
  Action: 16.71 | Glucose: 119.45 | Reward: 1.0000
Step 3:
  Action: 11.56 | Glucose: 119.12 | Reward: 1.0000
Step 4:
  Action: 1.04 | Glucose: 117.19 | Reward: 1.0000
Step 5:
  Action: 7.13 | Glucose: 110.35 | Reward: 1.0000

--- Episode Summary ---
  Total Steps: 5
  Total Return: 5.0000
  Average Reward: 1.0000
  Mean Glucose: 116.62 mg/dL
  Glucose Range: [110.35, 119.45]


[3/5] Setting up data collection...

Setting up Data Collection
Dataset Name: simglucose_adolescen

Traceback (most recent call last):
  File "/var/folders/n4/5y0dr95d4qn5qvlcvfwg85140000gn/T/ipykernel_65056/36382913.py", line 32, in main
    observations=episode_dict['observations'],
                 ~~~~~~~~~~~~^^^^^^^^^^^^^^^^
TypeError: string indices must be integers, not 'str'
Traceback (most recent call last):
  File "/var/folders/n4/5y0dr95d4qn5qvlcvfwg85140000gn/T/ipykernel_65056/2549507451.py", line 35, in <module>
    results = main(
        episodes_data=episodes_data,
    ...<4 lines>...
        save_model_flag=True
    )
  File "/var/folders/n4/5y0dr95d4qn5qvlcvfwg85140000gn/T/ipykernel_65056/36382913.py", line 32, in main
    observations=episode_dict['observations'],
                 ~~~~~~~~~~~~^^^^^^^^^^^^^^^^
TypeError: string indices must be integers, not 'str'


In [1]:
# Install necessary package if not already installed
!pip install nbdev

# Initialize git repository (if not already done)
!git init

# Configure git user (replace with your information)
!git config --global user.email "monika.kujur25@imperial.ac.uk"
!git config --global user.name "MK25BM"

# Add all files to git
!git add .

# Commit changes
!git commit -m "Initial commit"

# Create a new repository on GitHub first, then connect your local repo
# Replace the URL with your GitHub repository URL
!git remote add origin https://github.com/MK25BM/offline-DRL.git

# Push to GitHub
!git push -u origin master  # or 'main' depending on your default branch name



Collecting nbdev
  Downloading nbdev-2.4.10-py3-none-any.whl.metadata (10 kB)
Collecting fastcore>=1.11.0 (from nbdev)
  Downloading fastcore-1.11.2-py3-none-any.whl.metadata (3.7 kB)
Collecting execnb>=0.1.12 (from nbdev)
  Downloading execnb-0.1.18-py3-none-any.whl.metadata (3.7 kB)
Collecting astunparse (from nbdev)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting ghapi>=1.0.3 (from nbdev)
  Downloading ghapi-1.0.8-py3-none-any.whl.metadata (14 kB)
Collecting build (from nbdev)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting fastgit (from nbdev)
  Downloading fastgit-0.0.1-py3-none-any.whl.metadata (1.9 kB)
Collecting pyproject_hooks (from build->nbdev)
  Downloading pyproject_hooks-1.2.0-py3-none-any.whl.metadata (1.3 kB)
Downloading nbdev-2.4.10-py3-none-any.whl (69 kB)
Downloading execnb-0.1.18-py3-none-any.whl (13 kB)
Downloading fastcore-1.11.2-py3-none-any.whl (88 kB)
Downloading ghapi-1.0.8-py3-none-any.whl (68 kB)
Downl