In [99]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [100]:
import numpy as np
import pandas as pd
import gym
import random
import os

In [101]:
import warnings
warnings.filterwarnings("ignore")

# Part 1 Frozen Lake

Frozen Lake is a classic environment in Reinforcement Learning (RL) used to demonstrate fundamental concepts such as value iteration, policy iteration, and Q-learning.

Frozen Lake is an environment provided by OpenAI Gym (now Gymnasium) where an agent must learn to navigate a frozen lake to reach a goal without falling into holes.

### Environment Description
The environment is typically a 4x4 or 8x8 grid:

S – Start (initial state)

F – Frozen (safe)

H – Hole (terminal state, game over)

G – Goal (terminal state, success)

### RL Problem Setup
States (S): Grid cells (e.g., 16 states in a 4x4 map)

Actions (A): Left, Right, Up, Down

Reward:

+1 for reaching the Goal (G)

0 for every other state

Transition Probabilities: With `is_slippery=True` (the default) the environment is stochastic: the agent might slip and not move in the intended direction. With `is_slippery=False` the transitions are deterministic.


### Types of Frozen Lake
is_slippery=True: Default; stochastic transitions

is_slippery=False: Deterministic transitions (easier)


### Goal
The agent must learn an optimal policy (π*) to maximize reward by safely reaching the goal, avoiding holes, and handling uncertainty due to slippage.

### Solving Frozen Lake
You can apply:

- Dynamic Programming
    - Policy Iteration
    - Value Iteration

- Model-Free Methods
    - Q-learning
    - SARSA



# 4x4 Map, Non-Slippery

## Step 1: Create and understand FrozenLake environment 


In [102]:
# Create the FrozenLake environment: 4x4 map, non-slippery version (is_slippery=False)
env = gym.make("FrozenLake-v1", map_name='4x4', is_slippery=False) 

In [103]:
env.reset() # This resets the environment to its initial state.

0

**Agent has 4 discrete actions**

| Action Index | Action Name |
| ------------ | ----------- |
| 0            | Left        |
| 1            | Down        |
| 2            | Right       |
| 3            | Up          |


In [104]:
# Show how many discrete actions exist and draw one random example.
action_space_info = env.action_space
print('\n ____Action Space____ \n')
print('Action Space Shape: ', action_space_info.n)
print('Action Space Sample: ', action_space_info.sample())


 ____Action Space____ 

Action Space Shape:  4
Action Space Sample:  3


### Frozen Lake Rewards
**In the FrozenLake-v1 environment**:

| State Type    | Reward | Description                       |
| ------------- | ------ | --------------------------------- |
| Safe Tile (F) | 0      | No reward for just moving         |
| Hole (H)      | 0      | Episode ends, no reward (failure) |
| Goal (G)      | 1      | Success! You reached the goal     |


In [105]:
# Read the sizes of the discrete observation and action spaces, then report them.
state_space, action_space = env.observation_space.n, env.action_space.n
print(f'There are {state_space} possible states. \n')
print(f'There are {action_space} possible actions. \n')

There are 16 possible states. 

There are 4 possible actions. 



## Step 2: Create and Initialize the Q-table

In [106]:
# lets create QTable of size(state_space, action_space) & Initialize each value at 0 using np.zeros

def initialize_q_table(state_space, action_space):
    """Return a zero-filled Q-table with one row per state and one column per action."""
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)


In [107]:
qTable_frozenLake = initialize_q_table(state_space, action_space)

In [108]:
qTable_frozenLake

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Step 3: Define the Epsilon-Greedy Policy

The epsilon-greedy policy helps the agent balance between:

- Exploration (try new actions)
- Exploitation (use the best-known action)

**Parameters**:
| Parameter      | Description                                   |
| -------------- | --------------------------------------------- |
| `q_table`      | The Q-table (state × action values)           |
| `state`        | Current state of the agent                    |
| `epsilon`      | Exploration rate (e.g., 0.1 means 10% random) |
| `action_space` | Number of possible actions                    |


### Epsilon Greedy Policy (Acting Policy)

In [109]:
def epsilon_greedy_policy(qTable, state, epsilon, action_space=None):
    """Pick an action for ``state`` with the epsilon-greedy rule.

    A number is drawn uniformly from [0, 1]. If it is greater than ``epsilon``
    the action with the highest Q-value for ``state`` is returned
    (exploitation); otherwise a uniformly random action index is returned
    (exploration).

    Parameters
    ----------
    qTable : array indexed as ``qTable[state][action]``
    state : index of the current state (row of ``qTable``)
    epsilon : exploration probability in [0, 1]
    action_space : int, optional
        Number of possible actions. Defaults to the number of columns in the
        Q-table row for ``state``, so the function no longer depends on a
        global ``env`` being bound to the right environment.

    Returns
    -------
    Index of the chosen action.
    """
    # Exploitation: take the action with the highest Q-value
    if random.uniform(0, 1) > epsilon:
        return np.argmax(qTable[state])

    # Exploration: take a uniformly random action
    if action_space is None:
        action_space = len(qTable[state])
    return random.randrange(action_space)
        

### On-Policy vs Off-Policy in Reinforcement Learning
These terms refer to how the agent learns from experience. Specifically, whether it learns from the same policy it uses to act or from a different one.

- Off-Policy: using a different policy for acting and for updating
    - Example: Q-Learning
- On-Policy: using the same policy for acting and for updating
    - Example: SARSA


Remember:
- Epsilon Greedy Policy (Acting Policy)
- Greedy Policy (Updating Policy)


## Step 4: Define the greedy policy 

### Greedy Policy (Updating Policy)

In [110]:
def greedy_policy(qTable, state):
    """Return the index of the highest-valued action for ``state`` (pure exploitation)."""
    state_action_values = qTable[state]
    return np.argmax(state_action_values)

## Step 5: Define Hyperparameters 

In [111]:
# --- Training ---
n_training_episodes = 10_000   # number of training episodes
learning_rate = 0.7            # step size of each Q-value update

# --- Evaluation ---
n_eval_episodes = 100          # number of test episodes
eval_seed = []                 # evaluation seed(s) of the environment

# --- Environment ---
env_id = "FrozenLake-v1"       # name of the environment
max_steps = 99                 # cap on the number of steps per episode
# Discount factor: values close to 1 weight future rewards heavily,
# values close to 0 make the agent prioritize immediate rewards.
gamma = 0.95

# --- Exploration (epsilon-greedy) ---
max_epsilon = 1.0              # exploration probability at the start
min_epsilon = 0.05             # minimum exploration probability
epsilon = 1.0                  # exploration rate
decay_rate = 0.0005            # exponential decay rate of the exploration probability

## Step 6: Create the Training Loop Method

In [112]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, qTable,
          alpha=None, discount=None):
    """Train a tabular Q-learning agent, updating ``qTable`` in place.

    Parameters
    ----------
    n_training_episodes : number of episodes to run
    min_epsilon, max_epsilon, decay_rate :
        Exploration schedule; epsilon decays exponentially with the episode
        index from ``max_epsilon`` towards ``min_epsilon``.
    env : environment exposing ``reset()`` and ``step(action)``
    max_steps : maximum number of steps per episode
    qTable : array indexed as ``qTable[state][action]``; modified in place
    alpha : float, optional
        Learning rate. When omitted, the notebook-level ``learning_rate`` is used.
    discount : float, optional
        Discount factor. When omitted, the notebook-level ``gamma`` is used.

    Returns
    -------
    The same ``qTable`` object that was passed in, after training.
    """
    # Explicit arguments win; otherwise fall back to the notebook-level settings.
    alpha = learning_rate if alpha is None else alpha
    discount = gamma if discount is None else discount

    for episode in range(n_training_episodes):
        # Reduce epsilon over time (less and less exploration is needed)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

        # Reset the environment. Newer Gym/Gymnasium versions return
        # (observation, info); older versions return the observation alone.
        reset_result = env.reset()
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            state = reset_result[0]
        else:
            state = reset_result

        for _ in range(max_steps):
            # Choose an action with the epsilon-greedy (acting) policy
            action = epsilon_greedy_policy(qTable, state, epsilon)

            # Take the action and observe the next state and the reward.
            # Newer APIs return 5 values (terminated and truncated are separate).
            step_result = env.step(action)
            if len(step_result) == 5:
                new_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                new_state, reward, done, _ = step_result

            # Q-learning update: move Q(s, a) towards r + discount * max_a' Q(s', a')
            qTable[state][action] = qTable[state][action] + alpha * (
                reward + discount * np.max(qTable[new_state]) - qTable[state][action]
            )

            # If the episode is over, stop stepping
            if done:
                break

            # The next state becomes the current state
            state = new_state

    return qTable

## Step 7: Train the Q-Learning Agent

In [113]:
# train() updates the Q-table it receives in place and returns that same array,
# so q_Table_frozenlake and qTable_frozenLake refer to one object afterwards.
q_Table_frozenlake = train(
    n_training_episodes=n_training_episodes,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    env=env,
    max_steps=max_steps,
    qTable=qTable_frozenLake,
)

## Step 8: Let's see the Q-Learning Table

In [114]:
qTable_frozenLake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

## Step 9: Define the Evaluation Method

## Step 10: Evaluate our Q-Learning Agent

In [117]:
def evaluate_agent(env, qTable, n_eval_episodes, max_steps):
    """Run the greedy policy derived from ``qTable`` and summarize the rewards.

    Parameters
    ----------
    env : environment exposing ``reset()`` and ``step(action)``
    qTable : array indexed as ``qTable[state][action]``
    n_eval_episodes : number of evaluation episodes to run
    max_steps : maximum number of steps per episode

    Returns
    -------
    (avg_reward, std_reward) : mean and standard deviation of the total
    reward collected per episode.
    """
    rewards_per_episode = []

    for _ in range(n_eval_episodes):
        # Newer Gym/Gymnasium versions return (observation, info) from reset();
        # older versions return the observation alone.
        reset_result = env.reset()
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            state = reset_result[0]
        else:
            state = reset_result

        episode_reward = 0

        for _ in range(max_steps):
            action = np.argmax(qTable[state])  # Always exploit (greedy)

            # Newer APIs return 5 values (terminated and truncated are separate).
            step_result = env.step(action)
            if len(step_result) == 5:
                new_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                new_state, reward, done, _ = step_result

            episode_reward += reward

            if done:
                break

            state = new_state

        rewards_per_episode.append(episode_reward)

    # Mean and standard deviation of the per-episode total reward
    avg_reward = np.mean(rewards_per_episode)
    std_reward = np.std(rewards_per_episode)

    return avg_reward, std_reward


In [122]:
# Evaluate the trained agent using the configured evaluation settings
# (n_eval_episodes and max_steps from the hyperparameter cell) instead of repeating literals.
mean_reward, std_reward = evaluate_agent(env, q_Table_frozenlake, n_eval_episodes=n_eval_episodes, max_steps=max_steps)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

Mean Reward: 1.0 +/- 0.0


# 8x8 Map, Slippery

In [129]:
# Create the FrozenLake environment: 8x8 map with is_slippery=True (stochastic transitions)
env = gym.make("FrozenLake-v1", map_name='8x8', is_slippery=True) 

In [130]:
env.reset() # This resets the environment to its initial state.

0

In [131]:
# Show how many discrete actions exist and draw one random example.
action_space_info = env.action_space
print('\n ____Action Space____ \n')
print('Action Space Shape: ', action_space_info.n)
print('Action Space Sample: ', action_space_info.sample())


 ____Action Space____ 

Action Space Shape:  4
Action Space Sample:  2


In [132]:
# Read the sizes of the discrete observation and action spaces, then report them.
state_space, action_space = env.observation_space.n, env.action_space.n
print(f'There are {state_space} possible states. \n')
print(f'There are {action_space} possible actions. \n')

There are 64 possible states. 

There are 4 possible actions. 



In [133]:
# lets create QTable of size(state_space, action_space) & Initialize each value at 0 using np.zeros

def initialize_q_table(state_space, action_space):
    """Return a zero-filled Q-table with one row per state and one column per action.

    Note: this repeats the definition given earlier in the notebook, with the same behavior.
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)


In [134]:
qTable_frozenLake = initialize_q_table(state_space, action_space)

In [135]:
qTable_frozenLake

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],


In [140]:
# --- Training ---
n_training_episodes = 100_000  # number of training episodes
learning_rate = 0.7            # step size of each Q-value update

# --- Evaluation ---
n_eval_episodes = 100          # number of test episodes
eval_seed = []                 # evaluation seed(s) of the environment

# --- Environment ---
env_id = "FrozenLake-v1"       # name of the environment
max_steps = 99                 # cap on the number of steps per episode
# Discount factor: values close to 1 weight future rewards heavily,
# values close to 0 make the agent prioritize immediate rewards.
gamma = 0.95

# --- Exploration (epsilon-greedy) ---
max_epsilon = 1.0              # exploration probability at the start
min_epsilon = 0.05             # minimum exploration probability
epsilon = 1.0                  # exploration rate
decay_rate = 0.0005            # exponential decay rate of the exploration probability

In [141]:
# train() updates the Q-table it receives in place and returns that same array,
# so q_Table_frozenlake and qTable_frozenLake refer to one object afterwards.
q_Table_frozenlake = train(
    n_training_episodes=n_training_episodes,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    env=env,
    max_steps=max_steps,
    qTable=qTable_frozenLake,
)

In [142]:
qTable_frozenLake

array([[1.81363345e-02, 1.80563345e-02, 2.30345571e-02, 2.00733203e-02],
       [1.94641801e-02, 2.99256040e-02, 2.17363368e-02, 2.01546397e-02],
       [2.76213574e-02, 2.31521188e-02, 2.93919006e-02, 2.21842464e-02],
       [3.20111985e-02, 3.06481745e-02, 3.18897461e-02, 3.79085643e-02],
       [4.14165645e-02, 4.17103057e-02, 4.33028097e-02, 3.65310493e-02],
       [4.42770779e-02, 4.71268699e-02, 5.27674247e-02, 4.83307021e-02],
       [5.25656555e-02, 5.47252267e-02, 6.21033347e-02, 5.44383942e-02],
       [5.60557451e-02, 5.34360518e-02, 5.98815844e-02, 5.43024625e-02],
       [1.68839894e-02, 1.70022841e-02, 1.73344822e-02, 1.88616813e-02],
       [1.73236423e-02, 1.58165986e-02, 2.10693816e-02, 1.77630947e-02],
       [1.98732689e-02, 2.04758864e-02, 2.61888758e-02, 2.06865151e-02],
       [1.30096149e-02, 5.52828272e-03, 8.76824808e-03, 3.68543437e-02],
       [3.51152354e-02, 3.06177701e-02, 3.05471278e-02, 5.18237891e-02],
       [4.89040633e-02, 5.04186428e-02, 5.17815002e

In [144]:
# Evaluate the trained agent using the configured evaluation settings
# (n_eval_episodes and max_steps from the hyperparameter cell) instead of repeating literals.
mean_reward, std_reward = evaluate_agent(env, q_Table_frozenlake, n_eval_episodes=n_eval_episodes, max_steps=max_steps)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

Mean Reward: 0.07 +/- 0.25514701644346144


# Part 2: Taxi-v3 🚖

### Step 1: Create and understand [Taxi-v3 🚕](https://www.gymlibrary.ml/environments/toy_text/taxi/)
---

💡 A good habit when you start to use an environment is to check its documentation

👉 https://www.gymlibrary.ml/environments/toy_text/taxi/

---

In Taxi-v3 🚕, there are four designated locations in the grid world indicated by R(ed), G(reen), Y(ellow), and B(lue). When the episode starts, the taxi starts off at a random square and the passenger is at a random location. The taxi drives to the passenger’s location, picks up the passenger, drives to the passenger’s destination (another one of the four specified locations), and then drops off the passenger. Once the passenger is dropped off, the episode ends.


| Action                          | Reward  |
| ------------------------------- | ------- |
| Each time step (move)           | **–1**  |
| Illegal pickup/dropoff          | **–10** |
| Successful dropoff (goal state) | **+20** |

### What is a “Good” Average Reward?

| Avg. Reward per Episode | Rating        | Comment                                              |
| ----------------------- | ------------- | ---------------------------------------------------- |
| **+8 to +20**           | ⭐ Excellent   | Fast and optimal routes with no illegal moves        |
| **0 to +7**             | ✅ Acceptable  | A few wrong turns or minor delays                    |
| **–10 to 0**            | ⚠️ Needs Work | Likely illegal pickups/dropoffs or inefficient moves |
| **< –10**               | ❌ Poor        | Many penalties, aimless wandering                    |



In [145]:
# Create the Taxi-v3 environment; this rebinds the notebook-level `env` used by the cells below.
env = gym.make("Taxi-v3")

In [146]:
env.reset() # This resets the environment to its initial state.

209

In [147]:
# Show how many discrete actions exist and draw one random example.
action_space_info = env.action_space
print('\n ____Action Space____ \n')
print('Action Space Shape: ', action_space_info.n)
print('Action Space Sample: ', action_space_info.sample())


 ____Action Space____ 

Action Space Shape:  6
Action Space Sample:  4


In [148]:
# Read the sizes of the discrete observation and action spaces, then report them.
state_space, action_space = env.observation_space.n, env.action_space.n
print(f'There are {state_space} possible states. \n')
print(f'There are {action_space} possible actions. \n')

There are 500 possible states. 

There are 6 possible actions. 



In [149]:
# lets create QTable of size(state_space, action_space) & Initialize each value at 0 using np.zeros

def initialize_q_table(state_space, action_space):
    """Return a zero-filled Q-table with one row per state and one column per action.

    Note: this repeats the definition given earlier in the notebook, with the same behavior.
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)


In [150]:
qTable_taxi = initialize_q_table(state_space, action_space)

In [151]:
qTable_taxi

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [157]:
# --- Training ---
n_training_episodes = 500_000  # number of training episodes
learning_rate = 0.7            # step size of each Q-value update

# --- Evaluation ---
n_eval_episodes = 100          # number of test episodes
eval_seed = []                 # evaluation seed(s) of the environment

# --- Environment ---
env_id = "Taxi-v3"             # name of the environment
max_steps = 99                 # cap on the number of steps per episode
# Discount factor: values close to 1 weight future rewards heavily,
# values close to 0 make the agent prioritize immediate rewards.
gamma = 0.95

# --- Exploration (epsilon-greedy) ---
max_epsilon = 1.0              # exploration probability at the start
min_epsilon = 0.05             # minimum exploration probability
epsilon = 1.0                  # exploration rate
decay_rate = 0.0005            # exponential decay rate of the exploration probability

In [158]:
# train() updates the Q-table it receives in place and returns that same array.
qTable_taxi = train(
    n_training_episodes=n_training_episodes,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    env=env,
    max_steps=max_steps,
    qTable=qTable_taxi,
)

In [159]:
qTable_taxi

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 2.75200369,  3.94947757,  2.75200369,  3.94947757,  5.20997639,
        -5.05052243],
       [ 7.93349184,  9.40367562,  7.93349184,  9.40367562, 10.9512375 ,
         0.40367562],
       ...,
       [10.9512375 , 12.58025   , 10.9512375 ,  9.40367562,  1.9512375 ,
         1.9512375 ],
       [ 5.20997639,  6.53681725,  5.20997639,  6.53681725, -3.79002361,
        -3.79002361],
       [16.1       , 14.295     , 16.1       , 18.        ,  7.1       ,
         7.1       ]])

In [161]:
# Evaluate the trained agent using the configured evaluation settings
# (n_eval_episodes and max_steps from the hyperparameter cell) instead of repeating literals.
mean_reward, std_reward = evaluate_agent(env, qTable_taxi, n_eval_episodes=n_eval_episodes, max_steps=max_steps)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

Mean Reward: 7.84 +/- 2.667283262047734
