In [None]:
import numpy as np  
import random  

# --- Grid world -------------------------------------------------------------
# Cells are addressed as (row, col) tuples on a square grid_size x grid_size
# board; row 0 is the top row because the "up" action decreases the row index.
grid_size = 3
fire_state = (1, 2)  # Hazard cell (penalised by the reward function)
goal_state = (0, 2)  # Flag cell; reaching it ends an episode

# --- Q-learning hyperparameters ---------------------------------------------
# Q[row, col, action] holds the current value estimate of taking `action`
# in cell (row, col). Action codes: 0 = up, 1 = down, 2 = left, 3 = right.
Q = np.zeros(shape=(grid_size, grid_size, 4), dtype=np.float64)
alpha, gamma = 0.1, 0.9  # Learning rate and discount factor
epsilon, epsilon_decay = 1.0, 0.99  # Exploration rate and its per-episode decay multiplier
num_episodes = 1000  # Number of training episodes

# Reward function  
def get_reward(state):
    """Return the reward for arriving in *state*.

    Args:
        state: Cell to score; compared by equality against the module-level
            ``goal_state`` and ``fire_state``.

    Returns:
        10 for the goal cell, -10 for the fire cell, and -1 for every other
        cell (a per-step cost).
    """
    if state == goal_state:
        return 10
    if state == fire_state:
        return -10
    return -1

# Choose action  
def choose_action(state):
    """Pick an action for *state* using an epsilon-greedy rule.

    Reads the module-level ``epsilon`` and ``Q`` at call time.

    Args:
        state: Current cell as a ``(row, col)`` pair.

    Returns:
        An action code in ``0..3``: a uniformly random one with probability
        ``epsilon``, otherwise the index of the highest Q-value for the cell
        (``np.argmax`` returns the first index when values tie).
    """
    exploring = random.random() < epsilon
    if exploring:
        return random.choice((0, 1, 2, 3))

    # Greedy branch: best-known action for this cell.
    row, col = state
    return np.argmax(Q[row, col])

# Move robot based on action  
def apply_action(state, action):
    """Return the cell reached by taking *action* from *state*.

    Args:
        state: Current cell as a ``(row, col)`` pair.
        action: Action code (0 = up, 1 = down, 2 = left, 3 = right).

    Returns:
        The neighbouring ``(row, col)`` tuple, or *state* itself when the
        move would leave the grid or the action code is not recognised.
    """
    row, col = state

    if action == 0:  # Up: blocked on the top row
        return (row - 1, col) if row > 0 else state
    if action == 1:  # Down: blocked on the bottom row
        return (row + 1, col) if row < grid_size - 1 else state
    if action == 2:  # Left: blocked on the first column
        return (row, col - 1) if col > 0 else state
    if action == 3:  # Right: blocked on the last column
        return (row, col + 1) if col < grid_size - 1 else state

    # Unknown action code: the robot stays where it is.
    return state

# Training Loop  
for episode in range(num_episodes):
    # Every episode begins in the middle-left cell and runs until the goal
    # cell is reached.
    # NOTE(review): entering the fire cell is penalised by the reward but does
    # not end the episode - confirm this is intended.
    state = (1, 0)
    while state != goal_state:
        action = choose_action(state)
        new_state = apply_action(state, action)
        reward = get_reward(new_state)

        # One-step Q-learning update: nudge Q(s, a) towards the target
        # reward + gamma * max_a' Q(s', a') by a fraction alpha.
        best_next_value = Q[new_state].max()
        td_target = reward + gamma * best_next_value
        visited = (state[0], state[1], action)
        Q[visited] += alpha * (td_target - Q[visited])

        state = new_state

    # Explore less as training progresses.
    epsilon *= epsilon_decay

print("Training Completed.")