In [1]:
import numpy as np
import random

In [2]:
# Define the road-crossing environment
class RoadCrossingEnv:
    """One-dimensional road that the agent crosses from left to right.

    The road is a row of ``width`` cells indexed ``0 .. width - 1``. The agent
    starts in cell 0 and an episode ends as soon as it stands on the last cell.

    Actions: 0 = left, 1 = right, 2 = stay.
    Rewards: -1 for every step, replaced by +10 on a step that ends on the goal.
    """

    # Named action codes; the integer values are what step() expects.
    LEFT = 0
    RIGHT = 1
    STAY = 2

    def __init__(self, width=5):
        """Create a road with ``width`` cells.

        Args:
            width: Number of cells on the road; must be at least 1.

        Raises:
            ValueError: If ``width`` is smaller than 1. With no cells the goal
                index would be negative and could never be reached, so any
                loop of the form ``while not done`` would never terminate.
        """
        if width < 1:
            raise ValueError(f"width must be at least 1, got {width}")
        self.width = width
        self.agent_pos = 0
        self.goal_pos = width - 1

    def reset(self):
        """Put the agent back on cell 0 and return that position."""
        self.agent_pos = 0
        return self.agent_pos

    def step(self, action):
        """Apply one action and return ``(position, reward, done)``.

        Moves that would leave the road are ignored (the agent stays put).
        Any action other than LEFT or RIGHT, including STAY, leaves the
        position unchanged.

        Args:
            action: Action code (0 = left, 1 = right, 2 = stay).

        Returns:
            Tuple of the new agent position, the reward for this step and a
            flag that is True once the agent is on the goal cell.
        """
        if action == self.LEFT and self.agent_pos > 0:
            self.agent_pos -= 1
        elif action == self.RIGHT and self.agent_pos < self.width - 1:
            self.agent_pos += 1

        reward = -1  # step penalty
        done = False

        # The goal reward overrides the step penalty on the terminal step.
        if self.agent_pos == self.goal_pos:
            reward = 10
            done = True

        return self.agent_pos, reward, done

    def get_possible_actions(self):
        """Return a fresh list of every action code the environment accepts."""
        return [self.LEFT, self.RIGHT, self.STAY]


In [3]:
# Define the Q-learning agent
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table has one row per environment position and one column per
    action. Actions are used directly as column indices, so the environment's
    action codes are expected to be ``0 .. n_actions - 1``.
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
        """Create an agent with an all-zero Q-table.

        Args:
            env: Environment exposing ``width`` (number of states) and
                ``get_possible_actions()``.
            learning_rate: Step size applied to each temporal-difference update.
            discount_factor: Weight given to the best next-state value.
            epsilon: Probability of picking a random action in choose_action.
        """
        self.env = env
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        # Size the table from the environment instead of hard-coding 3 actions.
        n_actions = len(env.get_possible_actions())
        self.q_table = np.zeros((env.width, n_actions))

    def choose_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a random action from the environment is
        returned; otherwise the action with the highest Q-value is returned
        (ties resolve to the lowest action index, as numpy.argmax does).

        Returns:
            The chosen action as a plain Python int on the greedy branch, or
            whatever element the environment's action list holds on the
            exploration branch.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(self.env.get_possible_actions())
        # Cast so both branches hand back the same kind of value
        # (np.argmax alone returns a numpy integer).
        return int(np.argmax(self.q_table[state]))

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition.

        The target is ``reward`` plus, for non-terminal transitions, the
        discounted maximum Q-value of ``next_state``.
        """
        predict = self.q_table[state, action]
        target = reward
        if not done:
            # Terminal transitions do not bootstrap from the next state.
            target += self.gamma * np.max(self.q_table[next_state])
        self.q_table[state, action] += self.lr * (target - predict)


In [4]:
# Train the agent
def train_agent(episodes=500, width=5, seed=None, log_every=50):
    """Train a Q-learning agent on a fresh road-crossing environment.

    Args:
        episodes: Number of training episodes to run.
        width: Number of cells on the road passed to RoadCrossingEnv.
        seed: If not None, seeds Python's global ``random`` module before
            training so the exploration sequence is repeatable. The default
            (None) leaves the global RNG state untouched.
        log_every: Print the episode's total reward every this many episodes;
            0 or None disables progress printing.

    Returns:
        Tuple ``(agent, env)`` with the trained agent and its environment.
    """
    if seed is not None:
        random.seed(seed)

    env = RoadCrossingEnv(width=width)
    agent = QLearningAgent(env)

    for ep in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0

        # Run one episode until the environment reports the goal was reached.
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        if log_every and (ep + 1) % log_every == 0:
            print(f"Episode {ep + 1}: total reward = {total_reward}")

    print("Training complete!")
    return agent, env

agent, env = train_agent()


Episode 50: total reward = 7
Episode 100: total reward = 7
Episode 150: total reward = 7
Episode 200: total reward = 7
Episode 250: total reward = 7
Episode 300: total reward = 6
Episode 350: total reward = 7
Episode 400: total reward = 7
Episode 450: total reward = 7
Episode 500: total reward = 5
Training complete!


In [5]:
# Test the trained agent
# Safety cap so the loop ends even if the learned policy never reaches the goal.
MAX_TEST_STEPS = 20

state = env.reset()
done = False
steps = 0
print("Testing trained agent...")

while not done and steps < MAX_TEST_STEPS:
    # Evaluate the learned (greedy) policy directly from the Q-table.
    # agent.choose_action() is epsilon-greedy and would inject random
    # exploratory moves into the test run.
    action = int(np.argmax(agent.q_table[state]))
    action_name = ['left', 'right', 'stay'][action]
    print(f"Step {steps}: Position = {state}, Action = {action_name}")
    state, reward, done = env.step(action)
    steps += 1

print(f"Final position: {state}")
print(f"Goal reached: {'Yes' if done else 'No'}")


Testing trained agent...
Step 0: Position = 0, Action = right
Step 1: Position = 1, Action = right
Step 2: Position = 2, Action = right
Step 3: Position = 3, Action = right
Final position: 4
Goal reached: Yes
