In [None]:
import numpy as np
from typing import Any
from pydantic import BaseModel, Field
import math
from enum import Enum

class CartPoleMini:
    """Interface skeleton for a minimal, self-written Cart Pole environment.

    Every method is a no-op placeholder that returns ``None``; a later
    definition with the same name subclasses this skeleton and supplies
    the real behaviour.
    """

    def __init__(self, seed, max_steps=5000) -> None:
        """Accept ``seed`` and ``max_steps`` without storing either."""

    def reset(self, seed) -> Any:
        """Placeholder reset: ignores ``seed`` and returns ``None``."""
        return None

    def step(self, action) -> Any:
        """Placeholder step: ignores ``action`` and returns ``None``."""
        return None

class State(BaseModel):
    """Cart-pole state: four scalar fields, each defaulting to 0."""

    # Cart position.
    x: float = Field(0, description="小车位置")
    # Cart velocity.
    x_dot: float = Field(0, description="小车速度")
    # Pole angle.
    theta: float = Field(0, description="杆子角度")
    # Pole angular velocity.
    theta_dot: float = Field(0, description="杆子角速度")
    
class Feature(BaseModel):
    """Hand-crafted feature vector: each state scalar, its square, and a bias."""

    # Cart position and its square.
    x: float = Field(0, description="小车位置")
    x2: float = Field(0, description="小车位置平方")
    # Cart velocity and its square.
    x_dot: float = Field(0, description="小车速度")
    x_dot2: float = Field(0, description="小车速度平方")
    # Pole angle and its square.
    theta: float = Field(0, description="杆子角度")
    theta2: float = Field(0, description="杆子角度平方")
    # Pole angular velocity and its square.
    theta_dot: float = Field(0, description="杆子角速度")
    theta_dot2: float = Field(0, description="杆子角速度平方")
    # Constant bias term.
    bias: float = Field(1.0, description="偏置项")
    
class Step(BaseModel):
    """Result of a single environment transition, as built by ``CartPoleMini.step``."""

    state: State  # state after the transition
    reward: float  # reward granted for this transition
    done: bool  # True when the episode is over (terminated or truncated)
    terminated: bool  # True when cart position or pole angle left its allowed range
    truncated: bool  # True when the step counter reached the episode step limit
    info: dict  # auxiliary data; the environment in this file passes an empty dict

class Action(str, Enum):
    """Discrete actions available to the agent.

    The ``str`` mix-in makes every member a string equal to its value.
    """

    LEFT = "left"  # push the cart to the left
    RIGHT = "right"  # push the cart to the right

class CartPoleConfig(BaseModel):
    """Hyper-parameters and dimensions shared by the environment and the agent.

    The three dimension defaults are computed once, when the class is
    defined, from ``State``, ``Feature`` and ``Action``.
    """

    # Number of scalar fields in ``State``.
    state_dim: int = Field(
        len(State().model_dump()),
        description="状态维度",
    )
    # Number of scalar fields in ``Feature`` (hand-crafted feature size).
    phi_dim: int = Field(
        len(Feature().model_dump()),
        description="手工特征维度",
    )
    # Number of members of ``Action``.
    n_actions: int = Field(
        len(Action),
        description="动作个数",
    )
    # Learning rate.
    alpha: float = Field(
        0.01,
        description="学习率",
    )
    # Discount factor: present value of future reward.
    gamma: float = Field(
        0.99,
        description="折扣因子，未来奖励的当前价值",
    )
    # Initial epsilon of the epsilon-greedy policy (high early exploration).
    eps_start: float = Field(
        1.0,
        description="epsilon-贪婪策略的初始epsilon，初期高探索，快速了解环境和哪些动作好",
    )
    # Final epsilon of the epsilon-greedy policy (mostly exploitation).
    eps_end: float = Field(
        0.01,
        description="epsilon-贪婪策略的最终epsilon值，后期低探索，更多利用已学知识",
    )
    # Number of steps over which epsilon goes from eps_start to eps_end
    # (linearly, per the field description).
    eps_steps: int = Field(
        5000,
        description="epsilon-贪婪策略的epsilon衰减步数，  从eps_start 线性衰减到 eps_end 需要多少步",
    )
    
class LinearQNet:
    """Interface skeleton for a linear Q-function approximator.

    Q(s, a) = w_a^T * phi(s), where phi is the feature map. The intended
    hand-crafted features are [s, s^2, 1]: more expressive than the raw
    state while keeping the model tiny.

    Every method here is a no-op placeholder that returns ``None``.
    """

    def __init__(self, cfg: CartPoleConfig, seed=42) -> None:
        """Accept ``cfg`` and ``seed`` without storing either."""

    def phi(self, s: np.ndarray) -> np.ndarray:  # type: ignore
        """Placeholder for the feature map phi(s); does nothing."""

    def epsilon(self) -> float:  # type: ignore
        """Placeholder for the current exploration rate; does nothing."""

    def q_values(self, s: np.ndarray) -> np.ndarray:  # type: ignore
        """Placeholder for the per-action Q-values of ``s``; does nothing."""

    def act(self, s: np.ndarray) -> int:  # type: ignore
        """Placeholder for action selection in state ``s``; does nothing."""

    def update_td0(self, s, a, r, s_, done):
        """Placeholder for an update from one (s, a, r, s_, done) transition."""

In [None]:
class CartPoleMini(CartPoleMini):
    """Minimal cart-pole environment integrated with explicit Euler steps.

    Subclasses the earlier ``CartPoleMini`` skeleton and replaces its
    placeholder methods. An episode ends when the cart or the pole leaves
    its allowed range (``terminated``) or when ``max_steps`` transitions
    have been taken since the last ``reset`` (``truncated``).
    """

    def __init__(self, seed: int, max_steps=5000) -> None:
        """Set physical constants, failure thresholds and episode bookkeeping.

        Args:
            seed: Seed for the environment's ``numpy.random.RandomState``.
            max_steps: Number of steps after which an episode is truncated.
        """
        super().__init__(seed, max_steps)
        self.gravity = 9.8  # gravitational acceleration
        self.cart_mass = 1.0  # mass of the cart
        self.pole_mass = 0.1  # mass of the pole
        self.total_mass = self.cart_mass + self.pole_mass
        self.pole_length = 0.5  # half of the pole's length
        self.pole_mass_length = self.pole_mass * self.pole_length
        self.force_mag = 1  # magnitude of the force applied to the cart
        # Integration time step. NOTE(review): 10e-3 is 0.01, i.e. a 100 Hz
        # update rate if the unit is seconds; use 1e-3 if 1000 Hz was intended.
        self.tau = 10e-3

        # Failure thresholds: pole tilt of 30 degrees (in radians) and
        # maximum cart displacement from the origin.
        self.theta_threshold_radians: float = 30 * 2 * math.pi / 360
        self.x_threshold: float = 5

        self.np_random = np.random.RandomState(seed)  # environment RNG
        self.max_steps = max_steps  # per-episode step limit
        self.state: State = State()  # current state
        self.steps: int = 0  # steps taken in the current episode

    def reset(self, seed: int) -> State:
        """Start a new episode and return a copy of its initial state.

        Args:
            seed: When not ``None``, re-seeds the environment RNG before the
                initial state is drawn; ``None`` keeps the RNG stream as is.

        Returns:
            A copy of the new state, each component drawn uniformly from
            [-0.05, 0.05).
        """
        if seed is not None:
            self.np_random.seed(seed)

        x, x_dot, theta, theta_dot = self.np_random.uniform(
            low=-0.05, high=0.05, size=(4,)
        )
        self.state = State(x=x, x_dot=x_dot, theta=theta, theta_dot=theta_dot)
        # A new episode must restart the step counter; otherwise truncation
        # would be measured from the environment's creation, not from reset.
        self.steps = 0

        return self.state.model_copy()

    def step(self, action: Action) -> Step:
        """Apply ``action`` for one time step and return the transition.

        Args:
            action: ``Action.LEFT`` or ``Action.RIGHT`` (the matching string
                values ``"left"`` / ``"right"`` are accepted as well).

        Returns:
            A ``Step`` holding a copy of the new state, a reward of 1.0 and
            the ``terminated`` / ``truncated`` / ``done`` flags.

        Raises:
            ValueError: If ``action`` is not a valid ``Action`` value.
        """
        x = self.state.x
        x_dot = self.state.x_dot
        theta = self.state.theta
        theta_dot = self.state.theta_dot

        # Normalise to an enum member and compare members by identity.
        # Comparing ``action.value`` (a plain str) with ``is`` against a
        # member is never true and would always push the cart left.
        push_right = Action(action) is Action.RIGHT
        force = self.force_mag if push_right else -self.force_mag

        cos_theta = math.cos(theta)
        sin_theta = math.sin(theta)

        # Cart-pole equations of motion: angular acceleration of the pole
        # first, then linear acceleration of the cart.
        temp = (
            force + self.pole_mass_length * theta_dot ** 2 * sin_theta
        ) / self.total_mass
        theta_acc = (self.gravity * sin_theta - cos_theta * temp) / (
            self.pole_length
            * (4.0 / 3.0 - self.pole_mass * cos_theta ** 2 / self.total_mass)
        )
        x_acc = temp - self.pole_mass_length * theta_acc * cos_theta / self.total_mass

        # Explicit Euler: positions advance with the old velocities,
        # velocities with the accelerations computed above.
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * x_acc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * theta_acc

        self.state = State(x=x, x_dot=x_dot, theta=theta, theta_dot=theta_dot)
        self.steps += 1

        terminated = bool(
            x < -self.x_threshold
            or x > self.x_threshold
            or theta < -self.theta_threshold_radians
            or theta > self.theta_threshold_radians
        )
        truncated = self.steps >= self.max_steps
        done = terminated or truncated
        reward = 1.0  # constant reward for every step taken

        return Step(
            state=self.state.model_copy(),
            reward=reward,
            done=done,
            terminated=terminated,
            truncated=truncated,
            info={},
        )
        
class LinearQNet(LinearQNet):
    """
    Q(s,a) = w_a^T * phi(s)  （phi: 特征映射）
    这里使用简单的手工特征： [s, s^2, 1] 以提高表达力，但仍保持超小型。
    """
    def __init__(self, cfg: CartPoleConfig, seed=42) -> None:
        super().__init__(cfg, seed)
        
        self.cfg = cfg
        self.rng = np.random.RandomState(seed)
        