# Reinforcement Learning

Outline
1. Path finding example
2. Getting started with Gym
3. Designing AI agent
4. Exercise
5. References

### 1. Path finding example

In [None]:
from locale import currency
import numpy as np

# Reward matrix, indexed as R[state, action]. Because Q[action,] is later
# used as the "next state" row, an action here is the index of the state
# being moved to.
#   -1  -> move not allowed (filtered out by available_actions, which keeps >= 0)
#    0  -> allowed move with no reward
#  100  -> allowed move with reward; all such entries sit in column 5, which
#          is the goal state used in the testing section
# NOTE(review): row 4 is not symmetric with column 4 (e.g. R[0,4] and R[3,4]
# are 0 while R[4,0] and R[4,3] are -1) -- confirm this is the intended graph.
R = np.matrix([[-1,-1,-1,-1,0,-1],
               [-1,-1,-1,0,-1,100],
               [-1,-1,-1,0,-1,-1],
               [-1,0,0,-1,0,-1],
               [-1,0,0,-1,-1,100],
               [-1,0,-1,-1,0,100]])

# Q-table with the same 6x6 layout as R, initialised to all zeros.
Q=np.matrix(np.zeros([6,6]))

# Factor applied to the best next-state Q value in update(); despite the
# name, it is used there as a discount factor rather than a learning rate.
alpha = 0.8

# State used for the single demonstration update before training.
initial_state=1

def available_actions(state):
    """Return the column indices of the moves allowed from ``state``.

    A move is allowed when its entry in row ``state`` of the global reward
    matrix ``R`` is non-negative.
    """
    _, allowed_columns = np.where(R[state,] >= 0)
    return allowed_columns

# Moves allowed from the initial state (kept in a module-level name).
available_act = available_actions(initial_state)

def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from the given candidates.

    Args:
        available_actions_range: non-empty sequence or array of candidate
            action indices.

    Returns:
        The chosen action as a plain ``int``.

    Raises:
        ValueError: if ``available_actions_range`` is empty (raised by
            ``np.random.choice``).
    """
    # Sample from the argument itself, not from a module-level variable, so
    # the result depends only on what the caller passes in.
    return int(np.random.choice(available_actions_range))

# Randomly pick one of the moves allowed from the initial state.
action = sample_next_action(available_act)

def update(current_state, action, alpha):
    """Write one Q-table entry for taking ``action`` in ``current_state``.

    The row ``Q[action,]`` is treated as the Q values of the state reached
    by the action. The new entry is the immediate reward from the global
    ``R`` plus ``alpha`` times the largest value in that row, so ``alpha``
    acts as a discount factor here. The global ``Q`` is modified in place;
    nothing is returned.
    """
    next_row = Q[action,]
    best_columns = np.where(next_row == np.max(next_row))[1]

    # Several columns may share the maximum; break the tie at random.
    if best_columns.shape[0] > 1:
        chosen = int(np.random.choice(best_columns))
    else:
        chosen = best_columns[0]

    # Q learning formula
    best_next_value = Q[action, chosen]
    Q[current_state, action] = R[current_state, action] + alpha * best_next_value

# Single demonstration update for the initial state before training starts.
update(initial_state, action, alpha)

# TRAINING
# Perform 10000 independent one-step updates: each iteration draws a random
# state, picks a random allowed move from it and refreshes that Q entry.
for i in range (10000):
    # randint's upper bound is exclusive, so this covers every row of Q.
    current_state = np.random.randint(0,int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, alpha)

# Show Q rescaled so that its largest entry becomes 100.
print("Trained Q matrix")
print(Q/np.max(Q)* 100)

# TESTING
# Starting from state 2, repeatedly move to the column holding the largest
# Q value in the current row until the goal state is reached.
goal_state = 5
current_state = 2
steps = [current_state]

while current_state != goal_state:
    q_row = Q[current_state,]
    best_columns = np.where(q_row == np.max(q_row))[1]
    # Ties between equally good moves are broken at random.
    next_step_index = (
        int(np.random.choice(best_columns))
        if best_columns.shape[0] > 1
        else best_columns[0]
    )
    steps.append(next_step_index)
    current_state = next_step_index

print("Selected path")
print(steps)

### 2. Getting started with Gym

In this part, we install the necessary packages and <br>
1) create an environment <br>
2) draw the current state (observation) <br>
3) create a default loop to interact with the environment


In [None]:
# install the library
!pip install gymnasium
!pip install pygame
!pip install matplotlib

In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt 
import random

# Initialize environment
# render_mode='rgb_array' is requested so that later render() calls produce
# a frame that can be shown with plt.imshow.
env = gym.make('CartPole-v1', render_mode = 'rgb_array')

# Observation and action space
# Printed so the reader can see what the environment exposes.
obs_space = env.observation_space
action_space = env.action_space
print("The observation space: {}".format(obs_space))
print("The action space: {}".format(action_space))

In [None]:
# Reset the environment and look at the initial observation.
obs, info = env.reset()
print("The initial observation is {}".format(obs))

# Draw the current state (observation).
env_screen = env.render()

# The environment is intentionally NOT closed here: the same `env` is
# stepped and rendered again in the following cells, and close() is meant
# to be called only once the environment is no longer needed.
plt.imshow(env_screen)

In [None]:
# Main (default) loop of the simulation: 300 steps with random actions.
for _ in range(300):
    # get sample action from action space
    action = env.action_space.sample()

    # apply selected action
    # get new observation and reward
    # status if terminated or truncated, and other debug info
    observation, reward, terminated, truncated, info = env.step(action)

    # Once the episode has ended for either reason, start a new one so the
    # loop can keep stepping.
    if terminated or truncated:
        observation, info = env.reset()
        

In [None]:
# Draw the current state (observation) reached after the loop above.
env_screen = env.render()
# This environment is not used again below, so it is released here.
env.close()

plt.imshow(env_screen)

### 3. Designing AI Agent

In this part, we create a class for our Agent that is not learning anything yet, <br>
but selects its actions with a simple hand-written rule based on the current state (the sign of the pole angle).

In [None]:
from time import sleep
import gymnasium as gym
import random

# Initialize environment
# A fresh CartPole environment for the agent defined below.
env = gym.make('CartPole-v1', render_mode = 'rgb_array')

class Agent:
    """Non-learning agent that acts on the sign of the pole angle."""

    def __init__(self, env):
        """Store the number of discrete actions offered by ``env``.

        Args:
            env: environment object exposing ``action_space.n``.
        """
        self.action_size = env.action_space.n

    def get_action(self, state):
        """Return the action for ``state`` using a fixed rule.

        Args:
            state: indexable observation whose element 2 is the pole angle.

        Returns:
            0 (push cart to the left) when the pole angle is negative,
            otherwise 1 (push cart to the right).
        """
        # The random action that used to be drawn here was always
        # overwritten by the rule below, so it has been dropped.

        # extract the pole angle from state
        pole_angle = state[2]

        # select action based on the pole angle
        # 0 - Push cart to the left
        # 1 - Push cart to the right
        return 0 if pole_angle < 0 else 1

# Run the agent for at most 100 steps of a single episode.
agent = Agent(env)
state, _ = env.reset()
print("Reset state: ", state)

for i in range(100):
    action = agent.get_action(state)
    state, reward, done, trunc, info = env.step(action)
    print("State at %d:" % i, state)
    print(reward,state,done)

    # Stop as soon as the episode is over, whether it was terminated or
    # truncated. No reset is needed because the environment is closed right
    # after the loop.
    if done or trunc:
        break

    env.render()
    # Pause briefly and stop if a step yields less than the full reward.
    if reward<1:
        sleep(3)
        break

# Single cleanup point for every exit path of the loop.
env.close()

### 4. Exercise
Implement Q-learning for the CartPole task in Gym.<br>

There is a sample implementation (note that it uses outdated packages): <br>
https://github.com/nnqomariyah/aima-python/blob/master/gym-cartpole-qlearning.py

### 5. References

1. Slides for week 7 - Reinforcement Learning
2. https://github.com/aimacode/aima-python
3. https://github.com/nnqomariyah/aima-python (forked)
