# Q-Learning: From Theory to Implementation

> **"Q-Learning is the foundation of modern reinforcement learning."**

## Learning Objectives
- Understand the Q-Learning algorithm and its mathematical foundation
- Implement Q-Learning from scratch for grid world environments
- Learn about exploration strategies (ε-greedy, UCB)
- Master hyperparameter tuning and convergence analysis
- Apply Q-Learning to solve real-world problems


## 1. Q-Learning Algorithm

### Q-Function
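Q(s,a) represents the expected cumulative discounted reward for taking action a in state s and following the policy thereafter; Q-Learning estimates this value for the optimal (greedy) policy.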

### Q-Learning Update Rule
**Q(s,a) ← Q(s,a) + α[r + γ max_a' Q(s',a') - Q(s,a)]**

Where:
- α: learning rate
- r: immediate reward
- γ: discount factor
- s': next state
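- a': a candidate action in the next state s' (the max is taken over all of them, not only the action actually taken next)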


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import random

# Set style
# NOTE(review): the 'seaborn-v0_8' style name is only available in
# matplotlib >= 3.6 (older releases call it 'seaborn') — confirm the
# installed version before running this cell.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default colour cycle for later plots.
sns.set_palette("husl")
# Seed NumPy's global RNG: the agent's epsilon-greedy draws go through
# np.random, so this makes training runs repeatable.
np.random.seed(42)

print("Libraries imported successfully!")


In [None]:
class QLearningAgent:
    """Tabular Q-Learning agent with an epsilon-greedy behaviour policy.

    Action values are stored in a ``(n_states, n_actions)`` NumPy array that
    starts at zero and is refined with the one-step Q-Learning update.

    Args:
        n_states: Number of discrete states (rows of the Q-table).
        n_actions: Number of discrete actions (columns of the Q-table).
        learning_rate: Step size (alpha) applied to each update.
        discount_factor: Discount (gamma) applied to the best next-state value.
        epsilon: Initial probability of picking a random action while training.
        epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
        epsilon_min: Floor that decaying never pushes ``epsilon`` below.
    """

    def __init__(self, n_states, n_actions, learning_rate=0.1, discount_factor=0.9,
                 epsilon=0.1, epsilon_decay=0.995, epsilon_min=0.01):
        self.n_states = n_states
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Initialize Q-table with zeros
        self.q_table = np.zeros((n_states, n_actions))

    def choose_action(self, state, training=True):
        """Choose an action for ``state`` using the epsilon-greedy policy.

        Args:
            state: Index of the current state (row of the Q-table).
            training: When true, a random action is taken with probability
                ``epsilon``; when false the choice is always greedy.

        Returns:
            The index of the selected action.
        """
        if training and np.random.random() < self.epsilon:
            # Explore: choose random action
            return np.random.choice(self.n_actions)
        # Exploit: choose best action (np.argmax picks the lowest index on ties)
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state, done):
        """Apply one Q-Learning update for the observed transition.

        Args:
            state: Index of the state the action was taken in.
            action: Index of the action taken.
            reward: Immediate reward received.
            next_state: Index of the resulting state.
            done: True when the transition ended the episode; the target then
                excludes any bootstrapped next-state value.
        """
        if done:
            # Terminal transition: no future value to bootstrap from
            target = reward
        else:
            # Non-terminal transition: bootstrap from the best next action
            target = reward + self.discount_factor * np.max(self.q_table[next_state])

        # Q-learning update rule: move the estimate towards the target
        self.q_table[state, action] += self.learning_rate * (target - self.q_table[state, action])

    def decay_epsilon(self):
        """Decay the exploration rate without dropping below ``epsilon_min``.

        An ``epsilon`` that is already at or below ``epsilon_min`` is left
        unchanged.
        """
        if self.epsilon > self.epsilon_min:
            # Clamp: a plain multiply could overshoot the floor on the last step
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

print("Q-Learning agent class defined successfully!")


In [None]:
# Grid World Environment
class GridWorld:
    """Square grid environment with a fixed start cell and goal cell.

    The agent starts in the top-left cell and is rewarded for reaching the
    bottom-right cell. States are numbered row by row from 0 to
    ``size * size - 1``.
    """

    def __init__(self, size=5):
        # Moves as (row delta, column delta), indexed by action:
        # 0=Up, 1=Down, 2=Left, 3=Right
        self.action_effects = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        self.n_actions = 4

        # Grid dimensions and backing array (all cells start empty)
        self.size = size
        self.n_states = size * size
        self.grid = np.zeros((size, size))

        # Fixed corners: start at top-left, goal at bottom-right
        self.start_pos = (0, 0)
        self.goal_pos = (size - 1, size - 1)
        self.current_pos = self.start_pos

    def reset(self):
        """Move the agent back to the start cell and return its state index."""
        self.current_pos = self.start_pos
        return self._pos_to_state(self.current_pos)

    def _pos_to_state(self, pos):
        """Map a (row, column) position to its row-major state index."""
        row, col = pos[0], pos[1]
        return row * self.size + col

    def _state_to_pos(self, state):
        """Map a row-major state index back to a (row, column) position."""
        return divmod(state, self.size)

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        A move that would leave the grid keeps the agent in place. Standing
        on the goal yields reward 100 and ``done=True``; any other outcome
        yields reward -1 and ``done=False``.
        """
        d_row, d_col = self.action_effects[action]
        row = self.current_pos[0] + d_row
        col = self.current_pos[1] + d_col

        # Only commit the move when the target cell lies inside the grid
        inside = 0 <= row < self.size and 0 <= col < self.size
        if inside:
            self.current_pos = (row, col)

        if self.current_pos != self.goal_pos:
            # Ordinary step: small penalty, episode continues
            return self._pos_to_state(self.current_pos), -1, False
        # Goal reached: large reward, episode ends
        return self._pos_to_state(self.current_pos), 100, True

    def render(self):
        """Print the grid with the agent and goal marked, plus both positions."""
        view = self.grid.copy()
        view[self.current_pos] = 2  # agent marker
        view[self.goal_pos] = 3     # goal marker (overrides the agent on the goal cell)

        report = (
            "Grid World:",
            "0: Empty, 2: Agent, 3: Goal",
            view,
            f"Current position: {self.current_pos}",
            f"Goal position: {self.goal_pos}",
        )
        for line in report:
            print(line)

print("Grid World environment defined successfully!")


In [None]:
# Train Q-Learning agent
def train_q_learning(agent, env, episodes=1000, max_steps=100):
    """Train a Q-Learning agent on an environment and collect statistics.

    Args:
        agent: Object providing ``choose_action(state, training=True)``,
            ``update_q_table(state, action, reward, next_state, done)``,
            ``decay_epsilon()`` and an ``epsilon`` attribute.
        env: Object providing ``reset()`` (returns the initial state) and
            ``step(action)`` (returns ``(next_state, reward, done)``).
        episodes: Number of training episodes to run.
        max_steps: Upper bound on steps per episode, so an episode that never
            reaches a terminal state is still cut off. Defaults to 100.

    Returns:
        A tuple ``(episode_rewards, episode_lengths)`` of two lists holding,
        for each episode, the summed reward and the number of steps taken.
    """
    episode_rewards = []
    episode_lengths = []

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        steps = 0

        while steps < max_steps:
            # Choose action
            action = agent.choose_action(state, training=True)

            # Take step
            next_state, reward, done = env.step(action)

            # Update Q-table
            agent.update_q_table(state, action, reward, next_state, done)

            # Update state and tracking variables
            state = next_state
            total_reward += reward
            steps += 1

            # Check if episode is done
            if done:
                break

        # Decay exploration rate once per episode
        agent.decay_epsilon()

        # Store episode statistics
        episode_rewards.append(total_reward)
        episode_lengths.append(steps)

        # Print progress every 100 episodes (average over up to the last 100)
        if episode % 100 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode}, Average Reward: {avg_reward:.2f}, Epsilon: {agent.epsilon:.3f}")

    return episode_rewards, episode_lengths

# Create environment and agent
# A 5x5 grid gives 25 states and 4 actions, which size the agent's Q-table.
env = GridWorld(size=5)
# Exploration starts high (epsilon=0.9) and is multiplied by 0.995 after each
# episode; decay stops once epsilon is no longer above epsilon_min=0.01.
agent = QLearningAgent(n_states=env.n_states, n_actions=env.n_actions, 
                      learning_rate=0.1, discount_factor=0.9, 
                      epsilon=0.9, epsilon_decay=0.995, epsilon_min=0.01)

print("Training Q-Learning agent...")
print("=" * 50)

# Train the agent
# Returns one total reward and one step count per episode.
episode_rewards, episode_lengths = train_q_learning(agent, env, episodes=1000)

# Plot training progress
# Two side-by-side panels: rewards on the left, episode lengths on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Plot episode rewards
# Drawn semi-transparent so the moving average overlaid below stays visible.
ax1.plot(episode_rewards, alpha=0.6)
ax1.set_xlabel('Episode')
ax1.set_ylabel('Total Reward')
ax1.set_title('Episode Rewards During Training')
ax1.grid(True, alpha=0.3)

# Plot moving average
# mode='valid' yields len(episode_rewards) - window + 1 points, so the first
# average is plotted at x = window - 1, the episode completing the first window.
window = 50
if len(episode_rewards) >= window:
    moving_avg = np.convolve(episode_rewards, np.ones(window)/window, mode='valid')
    ax1.plot(range(window-1, len(episode_rewards)), moving_avg, 'r-', linewidth=2, label=f'Moving Average ({window})')
    ax1.legend()

# Plot episode lengths
ax2.plot(episode_lengths, alpha=0.6)
ax2.set_xlabel('Episode')
ax2.set_ylabel('Episode Length')
ax2.set_title('Episode Lengths During Training')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics: both averages cover at most the last 100 episodes.
print(f"\nTraining completed!")
print(f"Final average reward (last 100 episodes): {np.mean(episode_rewards[-100:]):.2f}")
print(f"Final average episode length: {np.mean(episode_lengths[-100:]):.2f}")
print(f"Final epsilon: {agent.epsilon:.3f}")
