In [104]:
import numpy as np
import random

In [193]:
# Implementing the environment

from enum import Enum
from lib2to3.pgen2.token import N_TOKENS

class Action(Enum):
    """The four moves an agent can request on the grid.

    ``Environment.step`` turns each member into a one-cell shift of the
    ``(row, column)`` state.
    """
    right = 0  # column + 1
    down = 1   # row + 1
    left = 2   # column - 1
    up = 3     # row - 1

class Environment:
    """A 3x4 grid world with a goal cell, a penalty cell and one wall.

    States are ``(row, column)`` tuples; ``Action.up`` decreases the row and
    ``Action.right`` increases the column. The agent starts at
    ``(height-1, 0)``.

    Attributes:
        height, width: grid dimensions (3 rows, 4 columns).
        start: state the agent is placed in by ``__init__`` and ``reset``.
        wall: the single cell that can never be entered, ``(1, 1)``.
        rewards: ``height x width`` float array with the reward for entering
            each cell: +1 at ``(0, width-1)``, -1 at ``(1, width-1)``, NaN at
            the wall and 0 everywhere else.
        state: the agent's current ``(row, column)`` position.
    """

    def __init__(self):
        self.height = 3
        self.width = 4
        self.start = (self.height-1, 0)
        self.wall = (1, 1)
        self.rewards = np.zeros((self.height, self.width))
        self.rewards[0, self.width-1] = 1
        self.rewards[1, self.width-1] = -1
        # The wall has no reward. Store NaN explicitly instead of assigning
        # None and relying on numpy silently coercing it to NaN.
        self.rewards[self.wall] = np.nan
        self.state = self.start

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.state = self.start
        return self.state

    def checkOutBound(self, state):
        """Return True when ``state`` lies outside the grid."""
        rowInside = 0 <= state[0] <= self.height-1
        colInside = 0 <= state[1] <= self.width-1
        return not (rowInside and colInside)

    def checkWall(self, state):
        """Return True when ``state`` is the wall cell."""
        return state == self.wall

    def checkDone(self, state):
        """Return True when ``state`` is terminal (its reward is +1 or -1)."""
        return self.rewards[state] in (1, -1)

    def step(self, action):
        """Apply ``action`` to the current state.

        A move that would leave the grid or enter the wall keeps the agent
        where it is.

        Args:
            action: a member of ``Action``.

        Returns:
            ``(newState, reward, done)``: the state after the move, the reward
            stored for that cell, and whether that cell is terminal.

        Raises:
            ValueError: if ``action`` is not a member of ``Action``. The
                state is left unchanged in that case.
        """
        moves = {
            Action.right: (0, 1),
            Action.down: (1, 0),
            Action.left: (0, -1),
            Action.up: (-1, 0),
        }
        try:
            rowShift, colShift = moves[action]
        except (KeyError, TypeError):
            # Previously an unknown action fell through the if/elif chain and
            # crashed later with an UnboundLocalError on newState.
            raise ValueError("unknown action: {!r}".format(action)) from None

        newState = (self.state[0] + rowShift, self.state[1] + colShift)
        if self.checkOutBound(newState) or self.checkWall(newState):
            newState = self.state

        self.state = newState
        return (newState, self.rewards[newState], self.checkDone(newState))

In [210]:
# Training configuration

# Exploration and the initial value table are random; seed both generators
# so that a fresh "Restart & Run All" reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

episodes = 1000  # number of training episodes
gamma = 0.9      # discount applied to the next state's value
maxStep = 100    # upper bound on steps per episode

# Epsilon-greedy exploration schedule
epsilon = 1.0         # current probability of taking a random action
maxEpsilon = 1.0      # epsilon at the start of training
minEpsilon = 0.1      # floor that epsilon decays towards
decay = 1 / episodes  # rate of the exponential decay, per episode

In [211]:
# Create the grid world and the value table. Despite its name, qTable has
# one entry per grid cell (shape height x width), not one per (state, action)
# pair. Every entry starts as a uniform random number in [0, 1).
env = Environment()
qTable = np.random.rand(env.height, env.width)

In [212]:
def getAction(state):
    """Return the greedy action for ``state``.

    Each in-grid neighbour is scored as ``qTable[cell] + env.rewards[cell]``
    and the action leading to the best-scoring neighbour is returned. Ties go
    to the first candidate in the order right, down, left, up.

    Neighbours whose score is NaN are skipped. The wall cell carries a NaN
    reward and ``np.argmax`` returns the position of a NaN, so without this
    filter the greedy policy would always choose to walk into the wall when
    standing next to it.

    Args:
        state: ``(row, column)`` position of the agent.

    Returns:
        The ``Action`` member with the highest score.

    Raises:
        ValueError: if no neighbour of ``state`` is a valid target.
    """
    row, col = state[0], state[1]
    lastRow, lastCol = qTable.shape[0]-1, qTable.shape[1]-1
    candidates = (
        (Action.right, row, col+1),
        (Action.down, row+1, col),
        (Action.left, row, col-1),
        (Action.up, row-1, col),
    )

    actions = []
    scores = []
    for action, nextRow, nextCol in candidates:
        if not (0 <= nextRow <= lastRow and 0 <= nextCol <= lastCol):
            continue
        score = qTable[nextRow, nextCol] + env.rewards[nextRow, nextCol]
        if np.isnan(score):
            continue
        actions.append(action)
        scores.append(score)

    if not actions:
        raise ValueError("no valid action from state {!r}".format(state))
    return actions[np.argmax(scores)]

def getRandomA(state):
    """Pick a uniformly random action that keeps the agent inside the grid.

    The grid extent is taken from ``qTable.shape``. Candidates are considered
    in the order right, down, left, up; one of the in-grid candidates is drawn
    with ``np.random.randint``.

    Args:
        state: ``(row, column)`` position of the agent.

    Returns:
        One ``Action`` member whose target cell lies inside the grid.
    """
    row, col = state[0], state[1]
    lastRow = qTable.shape[0]-1
    lastCol = qTable.shape[1]-1
    options = (
        (col+1 <= lastCol, Action.right),
        (row+1 <= lastRow, Action.down),
        (col-1 >= 0, Action.left),
        (row-1 >= 0, Action.up),
    )
    inGrid = [move for allowed, move in options if allowed]
    return inGrid[np.random.randint(0, len(inGrid))]

In [None]:
# Train the value table with epsilon-greedy exploration.
# Each step overwrites the value of the cell the agent left with
# reward + gamma * value(next cell); epsilon is decayed once per episode.
for episode in range(episodes):
    env.reset()
    for step in range(maxStep):
        state = env.state
        # Explore with probability epsilon, otherwise act greedily.
        tradeOff = random.uniform(0, 1)
        if tradeOff > epsilon:
            action = getAction(state)
        else:
            action = getRandomA(state)
        newState, reward, done = env.step(action)
        if np.isnan(reward):
            # The wall's reward is stored as NaN (a numpy float, never None,
            # so the old `reward == None` test could not fire). Never write
            # NaN into the table.
            continue
        # A terminal cell has no future return. Its table entry is never
        # updated and still holds its random initial value, so do not
        # bootstrap from it.
        nextValue = 0.0 if done else qTable[newState]
        qTable[state] = reward + gamma * nextValue

        if done:
            break
    epsilon = minEpsilon + (maxEpsilon - minEpsilon) * np.exp(-decay*episode)
# Show the learned table once; printing it on every step produced up to
# episodes * maxStep copies of the table in the cell output.
print(qTable)