REINFORCEMENT LEARNING - REG NO:RA2412049015036 - DATE:22.02.2025

In [None]:
# Lab 4 - Cart Pole Balancing with a Random Policy; Unified Notation for Episodic and
# Continuing Tasks; Policies and Value Functions; Optimal Policies and Optimal Value Functions; Optimality and Approximation

In [None]:
import gymnasium as gym

def run_cartpole_random_policy(episodes=10, seed=42, render_mode="human"):
    """Run CartPole-v1 with a uniformly random policy and print each episode's return.

    Args:
        episodes: Number of episodes to play.
        seed: Base seed. Episode ``i`` (0-based) resets the environment with
            ``seed + i``; the action-space sampler is seeded once with ``seed``
            so the sequence of random actions is repeatable as well.
        render_mode: Passed straight to ``gym.make``. Keep ``"human"`` to watch
            the episodes, or pass ``None`` to run headless.

    Returns:
        None. One line per episode is printed to stdout.
    """
    env = gym.make("CartPole-v1", render_mode=render_mode)

    # reset(seed=...) only seeds the environment dynamics. The action space has
    # its own RNG, so it must be seeded separately for reproducible actions.
    env.action_space.seed(seed)

    try:
        for episode in range(episodes):
            env.reset(seed=seed + episode)  # Unique, deterministic seed per episode
            total_reward = 0
            done = False

            # No explicit env.render() here: per the Gymnasium docs the "human"
            # render mode draws the window from within step()/reset().
            while not done:
                action = env.action_space.sample()  # Random action (0 or 1)
                _, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated  # Pole fell / cart left track, or time limit hit
                total_reward += reward

            print(f"Episode {episode + 1}: Total Reward = {total_reward}")
    finally:
        # Always release the render window, even if an episode raises.
        env.close()

# Script entry point: only runs the demo when this code is executed directly,
# not when it is imported from another module.
if __name__ == "__main__":
    run_cartpole_random_policy(episodes=5)  # Run 5 episodes (seed keeps its default of 42)

Episode 1: Total Reward = 18.0
Episode 2: Total Reward = 10.0
Episode 3: Total Reward = 14.0
Episode 4: Total Reward = 9.0
Episode 5: Total Reward = 18.0


In [None]:
# Lab 5 - Policy Iteration on the GridWorld problem. Dynamic Programming:
# Policy Evaluation (Prediction), Policy Improvement, Policy Iteration,
# Value Iteration, Asynchronous Dynamic Programming, Generalized Policy Iteration.

In [None]:
import numpy as np

# Define the GridWorld environment
class GridWorld:
    """Deterministic rectangular grid with one absorbing goal cell.

    States are ``(row, col)`` tuples. Entering any cell yields that cell's
    reward: -1 everywhere except the goal, which yields 0. Moves that would
    leave the grid are clamped to the edge.
    """

    def __init__(self, grid_size=(4, 4), goal=(3, 3)):
        self.grid_size = grid_size
        self.goal = goal
        self.actions = ['up', 'down', 'left', 'right']
        # Row-major enumeration of every cell.
        self.states = [
            (row, col)
            for row in range(grid_size[0])
            for col in range(grid_size[1])
        ]
        # Reward is keyed by the cell being entered.
        self.rewards = dict.fromkeys(self.states, -1)
        self.rewards[goal] = 0
        self.terminals = [goal]

    def step(self, state, action):
        """Apply ``action`` in ``state``; return ``(next_state, reward, done)``."""
        # The goal is absorbing: nothing moves and nothing is earned.
        if state in self.terminals:
            return state, 0, True

        # action -> (coordinate index to change, direction of change)
        moves = {
            'up': (0, -1),
            'down': (0, 1),
            'left': (1, -1),
            'right': (1, 1),
        }

        cell = list(state)
        if action in moves:
            axis, direction = moves[action]
            shifted = cell[axis] + direction
            if direction < 0:
                cell[axis] = max(0, shifted)
            else:
                cell[axis] = min(self.grid_size[axis] - 1, shifted)
        # An unrecognised action leaves the agent where it is.

        landed = tuple(cell)
        return landed, self.rewards[landed], landed in self.terminals

# Policy Iteration Algorithm
def policy_iteration(grid, gamma=0.9, theta=1e-6):
    """Find a greedy-optimal deterministic policy for ``grid`` by policy iteration.

    Alternates in-place iterative policy evaluation with greedy policy
    improvement until no state's action can be improved by more than ``theta``.

    Args:
        grid: Environment exposing ``states`` (iterable of hashable states),
            ``actions`` (sequence of hashable actions) and a deterministic
            ``step(state, action)`` returning ``(next_state, reward, done)``.
        gamma: Discount factor. Evaluation of a policy that never reaches a
            terminal state only converges for ``gamma < 1``.
        theta: Convergence tolerance, used both for the evaluation sweeps and
            for deciding whether an action change is a real improvement.

    Returns:
        ``(policy, V)``: ``policy`` maps each state to an action and ``V`` maps
        each state to the value estimate from the last evaluation phase.
    """
    states = grid.states
    actions = grid.actions

    # Random initial policy. Picking by index (instead of np.random.choice on
    # the action list) keeps the original action objects and also works when
    # actions are tuples or other non-scalar values.
    policy = {s: actions[np.random.randint(len(actions))] for s in states}
    V = {s: 0 for s in states}  # Initialize value function

    def backup(s, a):
        # One-step lookahead for a deterministic transition.
        next_state, reward, _ = grid.step(s, a)
        return reward + gamma * V[next_state]

    while True:
        # Policy Evaluation (in-place sweeps until the largest change < theta)
        while True:
            delta = 0
            for s in states:
                v = V[s]
                V[s] = backup(s, policy[s])
                delta = max(delta, abs(v - V[s]))
            if delta < theta:
                break

        # Policy Improvement
        policy_stable = True
        for s in states:
            old_action = policy[s]
            action_values = {a: backup(s, a) for a in actions}
            best_action = max(action_values, key=action_values.get)
            policy[s] = best_action
            # Compare values, not action labels: switching between actions that
            # are equally good (within theta) is not an improvement and must
            # not keep the outer loop running.
            if action_values[old_action] < action_values[best_action] - theta:
                policy_stable = False

        if policy_stable:
            break

    return policy, V

# Run Policy Iteration on a GridWorld built with its constructor defaults
# (4x4 grid, goal at (3, 3)) and the default gamma/theta of policy_iteration.
grid = GridWorld()
policy, V = policy_iteration(grid)
# policy: dict mapping each (row, col) state to an action name.
print("Optimal Policy:", policy)
# V: dict mapping each (row, col) state to its estimated value.
print("Value Function:", V)

Optimal Policy: {(0, 0): 'down', (0, 1): 'down', (0, 2): 'down', (0, 3): 'down', (1, 0): 'down', (1, 1): 'down', (1, 2): 'down', (1, 3): 'down', (2, 0): 'down', (2, 1): 'down', (2, 2): 'down', (2, 3): 'down', (3, 0): 'right', (3, 1): 'right', (3, 2): 'right', (3, 3): 'up'}
Value Function: {(0, 0): -4.0951, (0, 1): -3.439, (0, 2): -2.71, (0, 3): -1.9, (1, 0): -3.439, (1, 1): -2.71, (1, 2): -1.9, (1, 3): -1.0, (2, 0): -2.71, (2, 1): -1.9, (2, 2): -1.0, (2, 3): 0.0, (3, 0): -1.9, (3, 1): -1.0, (3, 2): 0.0, (3, 3): 0.0}


In [None]:
# Lab 6 - Value iteration on the gambler's problem.

In [None]:
def _stake_values(V, s, goal, p_win, gamma):
    """Return (stakes, values): every legal stake at capital ``s`` and its expected value."""
    stakes = np.arange(1, min(s, goal - s) + 1)  # Bet at least 1, never more than needed
    values = p_win * (gamma * V[s + stakes]) + (1 - p_win) * (gamma * V[s - stakes])
    return stakes, values


def value_iteration_gamblers(goal=100, p_win=0.4, gamma=1.0, theta=1e-6):
    """Solve the gambler's problem by in-place value iteration.

    The gambler holds capital ``s`` and stakes ``a`` in ``1..min(s, goal - s)``
    on a coin flip: with probability ``p_win`` the capital becomes ``s + a``,
    otherwise ``s - a``. Capital 0 (ruin) and ``goal`` (success) are terminal.

    Success is encoded as a terminal value: ``V[goal]`` is fixed at 1 and
    ``V[0]`` at 0, so with ``gamma == 1`` each ``V[s]`` is the probability of
    reaching ``goal`` from ``s``. Without that terminal value every backup is
    zero and no stake can be preferred over another.

    Args:
        goal: Target capital (positive integer).
        p_win: Probability of winning a single bet.
        gamma: Discount factor applied to the successor state's value.
        theta: Stop when the largest per-sweep value change falls below this.

    Returns:
        ``(policy, V)``: two float arrays of length ``goal + 1``.
        ``policy[s]`` is the greedy stake at capital ``s`` (0 at the terminal
        states) and ``V[s]`` is the value of capital ``s``.
    """
    V = np.zeros(goal + 1)  # Value function
    if goal > 0:
        V[goal] = 1.0  # Reaching the goal is worth 1; ruin (s == 0) stays 0
    policy = np.zeros(goal + 1)  # Policy

    # Value iteration over the non-terminal capitals 1..goal-1.
    while True:
        delta = 0.0
        for s in range(1, goal):
            _, values = _stake_values(V, s, goal, p_win, gamma)
            best = values.max()
            delta = max(delta, abs(best - V[s]))
            V[s] = best  # In-place update: later states in this sweep see it
        if delta < theta:
            break

    # Extract the greedy policy once, from the converged values. Many stakes
    # are tied up to floating-point noise; rounding before argmax makes the
    # choice deterministic (smallest stake among the ties).
    for s in range(1, goal):
        stakes, values = _stake_values(V, s, goal, p_win, gamma)
        policy[s] = stakes[np.argmax(np.round(values, 5))]

    return policy, V

# Run Value Iteration with the function's defaults (goal=100, p_win=0.4, gamma=1.0, theta=1e-6).
# NOTE: this rebinds the module-level names `policy` and `V`.
policy, V = value_iteration_gamblers()
# policy[s]: stake selected at capital s (array of length goal + 1).
print("Optimal Policy:", policy)
# V[s]: value estimate for capital s (array of length goal + 1).
print("Value Function:", V)

Optimal Policy: [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0.]
Value Function: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]
