<a href="https://colab.research.google.com/github/Krishnan-Raghavan/Packt/blob/main/GameTheoryChapter9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random

In [2]:
class GridWorld:
    """Rectangular grid environment with a fixed start, a goal and random obstacles.

    The agent starts in the top-left cell ``(0, 0)`` and the goal is the
    bottom-right cell. Obstacles do not block movement: entering an obstacle
    cell yields a reward of -1 and the episode continues. Entering the goal
    yields +1 and ends the episode; every other cell yields 0.

    Attributes:
        grid_size: ``(rows, cols)`` of the grid.
        num_obstacles: Number of obstacle cells that were placed.
        start_state: Cell the agent is put on by ``reset()``.
        state: Current ``(row, col)`` position of the agent.
        goal_state: Terminal cell (bottom-right corner).
        actions: Action names; an action's index in this list is its integer id.
        rewards: Array of shape ``grid_size`` with the reward for entering each cell.
        obstacle_states: List of ``(row, col)`` obstacle cells.
    """

    def __init__(self, grid_size=(50, 50), num_obstacles=100):
        """Build the grid and randomly place the obstacles.

        Args:
            grid_size: ``(rows, cols)`` of the grid.
            num_obstacles: How many distinct obstacle cells to place. Must be
                between 0 and the number of cells other than start and goal.

        Raises:
            ValueError: If ``num_obstacles`` does not fit on the grid.
        """
        self.grid_size = grid_size
        self.num_obstacles = num_obstacles
        self.start_state = (0, 0)  # Start state
        self.state = self.start_state
        self.goal_state = (grid_size[0] - 1, grid_size[1] - 1)  # Goal state at the bottom-right corner
        self.actions = ['up', 'down', 'left', 'right']
        self.rewards = np.zeros(grid_size)
        self.rewards[self.goal_state] = 1  # Reward for reaching the goal
        self.obstacle_states = self._place_obstacles()
        for obstacle in self.obstacle_states:
            self.rewards[obstacle] = -1  # Penalty for hitting the obstacle

    def _place_obstacles(self):
        """Pick ``num_obstacles`` distinct cells, never the start or the goal.

        Cells are drawn without replacement from the list of free cells, so
        the call always terminates (rejection sampling would spin forever when
        more obstacles are requested than there are free cells).

        Returns:
            List of ``(row, col)`` tuples.

        Raises:
            ValueError: If ``num_obstacles`` is negative or exceeds the number
                of free cells.
        """
        rows, cols = self.grid_size
        reserved = {self.start_state, self.goal_state}
        free_cells = [
            (row, col)
            for row in range(rows)
            for col in range(cols)
            if (row, col) not in reserved
        ]
        if not 0 <= self.num_obstacles <= len(free_cells):
            raise ValueError(
                f"num_obstacles must be between 0 and {len(free_cells)} "
                f"for a {rows}x{cols} grid, got {self.num_obstacles}"
            )
        return random.sample(free_cells, self.num_obstacles)

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply one move and return the transition.

        Moves that would leave the grid are clamped, i.e. the agent stays on
        the edge cell.

        Args:
            action: Integer index into ``self.actions`` (0 up, 1 down, 2 left,
                3 right) or the action name itself.

        Returns:
            Tuple ``(state, reward, done)``: the new ``(row, col)`` position,
            the reward stored for that cell, and whether it is the goal.

        Raises:
            ValueError: If ``action`` is neither a known index nor a known name.
        """
        if isinstance(action, str):
            if action not in self.actions:
                raise ValueError(f"Unknown action: {action!r}")
            action = self.actions.index(action)

        row, col = self.state
        if action == 0:  # Up
            row = max(0, row - 1)
        elif action == 1:  # Down
            row = min(self.grid_size[0] - 1, row + 1)
        elif action == 2:  # Left
            col = max(0, col - 1)
        elif action == 3:  # Right
            col = min(self.grid_size[1] - 1, col + 1)
        else:
            raise ValueError(f"Unknown action: {action!r}")

        self.state = (row, col)
        reward = float(self.rewards[self.state])
        done = self.state == self.goal_state
        return self.state, reward, done

    def print_grid(self):
        """Print the grid as text, one row per line.

        Legend: ``S`` is the agent's current position, ``G`` the goal and
        ``X`` an obstacle. Markers are drawn in that order, so a later marker
        overwrites an earlier one that shares the same cell.
        """
        grid = np.full(self.grid_size, ' ')
        grid[self.state] = 'S'
        grid[self.goal_state] = 'G'
        for obstacle in self.obstacle_states:
            grid[obstacle] = 'X'

        print('\n'.join(' '.join(row) for row in grid))

In [3]:
def q_learning(env, num_episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1, max_steps=None):
    """Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Args:
        env: Environment exposing ``grid_size`` (the state-space shape),
            ``actions`` (its length is the number of actions), ``reset()``
            returning the start state as an index tuple, and ``step(action)``
            returning ``(next_state, reward, done)`` for an integer action.
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a uniformly random action.
        max_steps: Optional cap on the number of steps per episode. ``None``
            (the default) lets an episode run until ``done`` is returned.

    Returns:
        Array of shape ``(*env.grid_size, len(env.actions))`` holding the
        learned action values.

    Raises:
        ValueError: If ``max_steps`` is given and is smaller than 1.
    """
    if max_steps is not None and max_steps < 1:
        raise ValueError(f"max_steps must be at least 1, got {max_steps}")

    num_actions = len(env.actions)
    q_table = np.zeros((*env.grid_size, num_actions))

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and (max_steps is None or steps < max_steps):
            if random.uniform(0, 1) < epsilon:
                action = random.randrange(num_actions)  # Explore: random action
            else:
                # Exploit: best action based on Q-table. Ties are broken at
                # random; np.argmax would always return the first action, which
                # with a zero-initialised table pins the agent to one direction
                # and leaves all progress to the rare exploration steps.
                action_values = q_table[state]
                best_actions = np.flatnonzero(action_values == action_values.max())
                action = random.choice(best_actions.tolist())

            next_state, reward, done = env.step(action)
            steps += 1

            old_value = q_table[state][action]
            # No future return is available after a terminal transition.
            next_max = 0.0 if done else np.max(q_table[next_state])

            # Update Q-value using the Bellman equation
            q_table[state][action] = old_value + alpha * (reward + gamma * next_max - old_value)

            state = next_state

    return q_table

In [4]:
# Initialize the environment with a 5x5 grid and 5 obstacles.
# NOTE: no RNG seed is set, so obstacle placement and the learned Q-values differ
# between runs; call random.seed(<int>) before this cell for reproducible results.
grid_size = (5, 5)
num_obstacles = 5
env = GridWorld(grid_size=grid_size, num_obstacles=num_obstacles)

# Print the initial grid (S = agent at the start, G = goal, X = obstacle)
print("Initial Grid:")
env.print_grid()

# Run Q-learning with the default hyperparameters
q_table = q_learning(env)

# Print the Q-table: one row of action values (up, down, left, right) per grid cell
print("Q-Table:")
print(q_table)

Initial Grid:
S       X
X   X    
  X      
         
      X G
Q-Table:
[[[ 0.32296569 -0.67303051  0.42231948  0.4782969 ]
  [ 0.44361408  0.32552745  0.40736329  0.531441  ]
  [ 0.49998303 -0.60276012  0.4463094   0.59049   ]
  [ 0.54225082  0.6561      0.50848162 -0.39952455]
  [-0.39901955  0.728302    0.         -0.5017291 ]]

 [[ 0.41460793  0.         -0.40951     0.0340037 ]
  [ 0.45275674 -0.61257951 -0.45708339 -0.47850578]
  [ 0.52767649  0.          0.02414001  0.        ]
  [ 0.52847381  0.50691927 -0.54991743  0.729     ]
  [-0.37483899  0.81        0.63615757  0.60807109]]

 [[-0.3439      0.          0.         -0.1       ]
  [ 0.039154    0.          0.          0.        ]
  [-0.1         0.          0.          0.        ]
  [ 0.65561482  0.00713231  0.          0.1538071 ]
  [ 0.61957728  0.9         0.51712337  0.78205593]]

 [[ 0.          0.          0.          0.        ]
  [-0.1         0.          0.          0.        ]
  [ 0.          0.          0.       

In [5]:
# Greedy policy: for every cell keep the index of its highest-valued action,
# then render each index as an arrow (0 up, 1 down, 2 left, 3 right).
policy_symbols = {0: '↑', 1: '↓', 2: '←', 3: '→'}
policy = q_table.argmax(axis=2)

print("\nPolicy:")
for row_actions in policy:
    arrows = [policy_symbols[best_action] for best_action in row_actions]
    print(' '.join(arrows))


Policy:
→ → → ↓ ↓
↑ ↑ ↑ → ↓
↓ ↑ ↓ ↑ ↓
↑ ↓ ↑ ↑ ↓
↑ ↑ ↑ ↑ ↑


In [6]:
# Example usage of the learned policy: follow the greedy (argmax) action from the start.
# The environment and the greedy policy are both deterministic, so a rollout either
# reaches the goal without revisiting a cell or cycles forever. Capping the rollout at
# the number of cells therefore never cuts off a successful run.
max_rollout_steps = env.grid_size[0] * env.grid_size[1]

for _ in range(5):
    state = env.reset()
    path = [state]
    done = False
    steps_taken = 0
    while not done and steps_taken < max_rollout_steps:
        action = np.argmax(q_table[state])
        next_state, reward, done = env.step(action)
        path.append(next_state)
        state = next_state
        steps_taken += 1
    print(f"Path taken: {path}")
    # Report the real outcome instead of assuming success.
    if done:
        print("Goal reached!\n")
    else:
        print(f"Goal NOT reached within {max_rollout_steps} steps.\n")

Path taken: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (1, 4), (2, 4), (3, 4), (4, 4)]
Goal reached!

Path taken: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (1, 4), (2, 4), (3, 4), (4, 4)]
Goal reached!

Path taken: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (1, 4), (2, 4), (3, 4), (4, 4)]
Goal reached!

Path taken: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (1, 4), (2, 4), (3, 4), (4, 4)]
Goal reached!

Path taken: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (1, 4), (2, 4), (3, 4), (4, 4)]
Goal reached!

