In [None]:
!py -m pip install cmake "gym[atari]" scipy
!py -m pip install gym[atari]
!pip install gym[toy_text]
!pip install gym[accept-rom-license]
!pip install ale-py

In [2]:
import gym
from IPython.display import clear_output
from ale_py import ALEInterface
import numpy as np
import random
from time import sleep

  logger.warn(


## Setup Environment

In [3]:
def set_env(name):
    """
    This function is for creating the envirnment
    
    Input:
        name (string) : The environment name
    
    Output:
        env
    """
    env = gym.make(name)
    return env

## Training

In [4]:
def model_training(env,alpha=0.1,gamma=0.6,epsilon=0.1):
    """
    
    This function is for training the model
    
    Inputs:
        env
        alpha (float) : Learning rate --> Hyper parameter
        gamma (float) : Discount Factor --> Hyper parameter
        epsilon (float) : Exploration-Exploitation Factor --> Hyper parameter
    
    Output: 
        Q-table (list)
        
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    # For plotting metrics
    all_epochs = []
    all_penalties = []

    for i in range(1, 100001):
        state = env.reset()

        epochs, penalties, reward, = 0, 0, 0
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(q_table[state]) # Exploit learned values

            next_state, reward, done, info = env.step(action) 

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            q_table[state, action] = new_value

            if reward == -10:
                penalties += 1

            state = next_state
            epochs += 1

        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")
    return q_table

## Evaluation

In [5]:
def model_evaluation(env,q_table):
    """
    
    This function is for model evaluation
    Input: 
    
        q_table (list): from the training function
    
    Output
    
    frames (list of dictionaries) : For animating the result
    average_timesteps (float) : For time steps
    average_penalties (int) : For penalities
    

    """
    frames = [] # for animation
    
    total_epochs, total_penalties = 0, 0
    episodes = 100

    for _ in range(episodes):
        state = env.reset()
        epochs, penalties, reward = 0, 0, 0

        done = False

        while not done:
            action = np.argmax(q_table[state])
            state, reward, done, info = env.step(action)

            if reward == -10:
                penalties += 1
            # Put each rendered frame into dict for animation
            frames.append({
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward
                }
            )
            epochs += 1

        total_penalties += penalties
        total_epochs += epochs
    average_timesteps = total_epochs / episodes
    average_penalties = total_penalties / episodes
    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {average_timesteps}")
    print(f"Average penalties per episode: {average_penalties}")
    
    return frames, average_timesteps , average_penalties

## Plotting Frames

In [6]:
def print_frames(frames):
    """
    This funtion for plotting the frames of the episodes
    
    Input:
        frames (list): list of frames
    
    """
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)

#### 1) Turn this code into a module of functions that can use multiple environments

In [7]:
def model_setup(name_="Taxi-v3",alpha=0.1,gamma=0.6,epsilon=0.1):
    """
    This model is for gathering all functions of setuping the environmen, training and evaluation in one function
    
    Inputs:
    
        name (string): the name of environment
        alpha (float) : Learning rate --> Hyper parameter
        gamma (float) : Discount Factor --> Hyper parameter
        epsilon (float) : Exploration-Exploitation Factor --> Hyper parameter
        
    Output:
        
        frames (list of dictionaries) : For animating the result
        average_timesteps (float) : For time steps
        average_penalties (int) : For penalities
    
    """
    env_ = set_env(name=name_)
    q_table_ = model_training(env=env_,alpha=alpha,gamma=gamma,epsilon=epsilon)
    frames_, average_timesteps,average_penalties = model_evaluation(env=env_,q_table=q_table_)
    return frames_,average_timesteps,average_penalties

In [19]:
# "Taxi-v3" Environment
frames,average_timesteps,average_penalties = model_setup(name_="Taxi-v3",alpha=0.1,gamma=0.6,epsilon=0.1)

Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 12.89
Average penalties per episode: 0.0


In [20]:
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep: 1289
State: 410
Action: 5
Reward: 20


In [21]:
# "FrozenLake-v1" Environment
frames,average_timesteps,average_penalties = model_setup(name_="FrozenLake-v1",alpha=0.1,gamma=0.6,epsilon=0.1)

Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 11.78
Average penalties per episode: 0.0


In [22]:
print_frames(frames)

  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG

Timestep: 1178
State: 5
Action: 1
Reward: 0.0


In [9]:
# "CliffWalking-v1" Environment
frames,average_timesteps,average_penalties = model_setup(name_="CliffWalking-v0",alpha=0.1,gamma=0.6,epsilon=0.1)

Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 13.0
Average penalties per episode: 0.0


In [10]:
print_frames(frames)

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  x


Timestep: 1300
State: 47
Action: 2
Reward: -1


#### 2) Tune alpha, gamma, and/or epsilon using a decay over episodes

In [29]:
def tuning_model_training(env,alpha=0.3,gamma=0.4,epsilon=0.7):
    """
    This function is for training the model
    
    Inputs:
        env
        alpha (float) : Learning rate --> Hyper parameter
        gamma (float) : Discount Factor --> Hyper parameter
        epsilon (float) : Exploration-Exploitation Factor --> Hyper parameter
    
    Output: 
        Q-table (list)
        
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    # For plotting metrics
    all_epochs = []
    all_penalties = []
    episodes = 100001
    
    for i in range(1, episodes): 
        if i > episodes//4 and i < episodes//2:
            alpha, gamma, epsilon=alpha-0.05, gamma-0.05, epsilon-0.05
        elif i > episodes//2 and i < episodes//(4/3):
            alpha, gamma, epsilon=alpha-0.05, gamma-0.05, epsilon-0.05
        elif i > episodes// (4/3):
            alpha, gamma, epsilon=alpha-0.05, gamma-0.05, epsilon-0.05
            
        state = env.reset()

        epochs, penalties, reward, = 0, 0, 0
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(q_table[state]) # Exploit learned values

            next_state, reward, done, info = env.step(action) 

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            q_table[state, action] = new_value

            if reward == -10:
                penalties += 1

            state = next_state
            epochs += 1

        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")
    return q_table

In [30]:
def model_tuning_setup(name_="Taxi-v3",alpha=0.2,gamma=0.4,epsilon=0.5):
    
    """
    This model is for gathering all functions of setuping the environmen, training and evaluation in one function
    
    Inputs:
    
        name (string): the name of environment
        alpha (float) : Learning rate --> Hyper parameter
        gamma (float) : Discount Factor --> Hyper parameter
        epsilon (float) : Exploration-Exploitation Factor --> Hyper parameter
        
    Output:
        
        frames (list of dictionaries) : For animating the result
        average_timesteps (float) : For time steps
        average_penalties (int) : For penalities
    
    """
    env_ = set_env(name=name_)
    q_table_ = tuning_model_training(env=env_,alpha=alpha,gamma=gamma,epsilon=epsilon)
    frames_, average_timesteps,average_penalties = model_evaluation(env=env_,q_table=q_table_)
    return frames_,average_timesteps,average_penalties

In [31]:
frames,average_timesteps,average_penalties = model_tuning_setup(name_="Taxi-v3",alpha=0.3,gamma=0.4,epsilon=0.7)

Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 200.0
Average penalties per episode: 0.0


We decreased alpha by 0.05 and epsilon by 0.1 and increased gamma by 0.1 on each quarter of episodes and we realized that the time of episodes till completing is increased due to decreasing the learning rate step.

In [None]:
print_frames(frames)

#### 3) Implement a grid search to discover the best hyperparameters

In [37]:
def Grid_search(env_name="Taxi-v3",param=None):
    """
    This function is for finding the best parameters 
    
    Inputs:
        env_name (string): environment name
        param (Dict): {'alpha':None, 'gamma':None, 'epsilon':None}
        
    Outputs:
        best_params (Dict): {'alpha':None,'gamma':None,'epsilon':None,'penalty':None,'time step':None}
    
    """
    temp_timestep=10000  # any large number
    temp_penalties = 10000 # any large number
    parameters = param
    for alpha in parameters['alpha']:
        for gamma in parameters['gamma']:
            for epsilon in parameters['epsilon']:
                frames,average_timesteps,average_penalties = model_setup(name_=env_name,alpha=alpha,gamma=gamma,epsilon=epsilon)
                if average_penalties <= temp_penalties:
                    if average_timesteps <= temp_timestep :
                        temp_penalties = average_penalties
                        temp_timestep = average_timesteps
                        best_params = {'alpha':alpha,'gamma':gamma,'epsilon':epsilon,'penalty':temp_penalties,'time step':temp_timestep}
    clear_output(wait=True)
    print(f"Best parameters are: {best_params}")
    return best_params;

In [38]:
parameters = {'alpha': [0.3,0.2,0.1],'gamma':[0.3,0.2,0.1],'epsilon':[0.4,0.3,0.2]}
best_params = Grid_search(env_name="Taxi-v3",param=parameters)
best_params

Best parameters are: {'alpha': 0.1, 'gamma': 0.2, 'epsilon': 0.4, 'penalty': 0.0, 'time step': 12.49}


{'alpha': 0.1,
 'gamma': 0.2,
 'epsilon': 0.4,
 'penalty': 0.0,
 'time step': 12.49}

In [41]:
frames,average_timesteps,average_penalties = model_setup(name_="Taxi-v3",alpha=0.1,gamma=0.2,epsilon=0.4)

Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 12.59
Average penalties per episode: 0.0


In [42]:
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Timestep: 1259
State: 475
Action: 5
Reward: 20
