# SARSA & Q-learning

This assignment implements the SARSA and Q-learning algorithms from reinforcement learning.

In [19]:
# Python 2D Array operations - https://www.tutorialspoint.com/python/python_2darray.htm

def showMatrix(matrix):
    """Print ``matrix`` to standard output, one row per line.

    Every cell is written using its ``str()`` form followed by a tab
    character (the last cell of a row included), and each row is ended
    with a newline.
    """
    for cells in matrix:
        # Build the whole row first, then let print() add the newline.
        print(''.join(str(cell) + '\t' for cell in cells))


def createMatrix(matrix_size):
    """Return a new ``matrix_size`` x ``matrix_size`` list of lists filled with 0.

    Each row is a separate list object, so assigning to a cell of one row
    leaves the other rows untouched.
    """
    # Build every row independently to avoid aliased rows:
    # https://www.geeksforgeeks.org/python-using-2d-arrays-lists-the-right-way/
    return [[0] * matrix_size for _ in range(matrix_size)]

def prettifyMatrix(matrix):
    """Round every entry of a 2D matrix to 4 decimal places, in place.

    The rows of ``matrix`` are modified directly; the same ``matrix``
    object is returned so the call can be chained.
    """
    for row in matrix:
        for col_index, value in enumerate(row):
            row[col_index] = round(value, 4)
    return matrix

def prettify3dMatrix(matrix):
    """Round every entry of a 3-level nested list to 2 decimal places, in place.

    The innermost lists of ``matrix`` are modified directly; the same
    ``matrix`` object is returned so the call can be chained.
    """
    for plane in matrix:
        for cell in plane:
            for depth, value in enumerate(cell):
                cell[depth] = round(value, 2)
    return matrix

def arbitrarilyInitializeMatrix(matrix, initial_q_range):
    """Overwrite every cell of ``matrix`` with four random integer Q values.

    Each cell becomes a list of four integers drawn with
    ``randint(0, initial_q_range)`` (both ends inclusive), laid out in the
    action order ['→', '↓', '←', '↑'].

    The column count is taken from each row itself rather than from the
    number of rows, so non-square (and ragged) matrices are fully
    initialized as well. For square matrices the cells are visited in the
    same row-major order as before.

    Parameters:
        matrix: list of lists whose cells are replaced in place.
        initial_q_range: inclusive upper bound for the random Q values.

    Returns:
        The same ``matrix`` object, now holding a 4-element list per cell.
    """
    # Import function to generate random integers
    # https://www.geeksforgeeks.org/python-randint-function/
    from random import randint

    for row in matrix:
        for col_index in range(len(row)):
            # https://stackoverflow.com/questions/16655089/python-random-numbers-into-a-list/16655135#16655135
            row[col_index] = [randint(0, initial_q_range) for _ in range(4)]
    return matrix

def nextStateCoordinates(current_row, current_col, direction, grid_size):
    """Return the (row, col) reached by moving one step in ``direction``.

    The grid is ``grid_size`` x ``grid_size`` with row 0 at the top and
    column 0 on the left. A move that would leave the grid is blocked and
    the current coordinates are returned unchanged.

    Parameters:
        current_row: row index of the current state.
        current_col: column index of the current state.
        direction: one of '→', '↓', '←', '↑'.
        grid_size: number of rows (and columns) of the grid.

    Returns:
        Tuple ``(row, col)`` of the next state. An unrecognized
        ``direction`` leaves the position unchanged.
    """
    i = current_row
    j = current_col
    # Select the branch by direction first, then apply the bounds check.
    # Previously the bounds check was part of each branch condition and the
    # last branch did not test the direction at all, so a blocked '←', '↑'
    # or '→' move fell through and moved the agent down instead.
    if direction == '←':
        if current_col > 0:
            j -= 1
    elif direction == '↑':
        if current_row > 0:
            i -= 1
    elif direction == '→':
        if current_col < grid_size - 1:
            j += 1
    elif direction == '↓':
        if current_row < grid_size - 1:
            i += 1
    return (i, j)

In [20]:
from random import uniform as probability
from random import choice as randomChoice

def findNextAction(qListOfCurrentState, epsilon):
    """Choose an action for a state using an ε-greedy rule.

    A number is drawn with ``probability(0, 1)``; when it is below
    ``epsilon`` a random action is returned (exploration), otherwise the
    action whose Q value is largest is returned (exploitation). Ties are
    resolved in favour of the earliest action in ['→', '↓', '←', '↑'].

    Parameters:
        qListOfCurrentState: Q values of the state, ordered as
            ['→', '↓', '←', '↑'].
        epsilon: exploration threshold compared against the drawn number.

    Returns:
        One of the strings '→', '↓', '←', '↑'.
    """
    actions = ('→', '↓', '←', '↑')

    # https://stackoverflow.com/questions/33359740/random-number-between-0-and-1-in-python/33359801#33359801
    explore = probability(0, 1) < epsilon
    if explore:
        # Exploration: pick any action at random.
        # https://www.geeksforgeeks.org/random-numbers-in-python/
        return randomChoice(actions)

    # Exploitation: greedy action, i.e. position of the first maximum Q value.
    # https://stackoverflow.com/questions/2474015/getting-the-index-of-the-returned-max-or-min-item-using-max-min-on-a-list/2474030#2474030
    best_q = max(qListOfCurrentState)
    return actions[qListOfCurrentState.index(best_q)]

def sarsa(grid_size, startState, goalState, reward, discount_factor, step_size, epsilon, initial_q_range, episode_range, seed=None):
    """Run tabular SARSA on a square grid world and return the Q table.

    The grid and the randomly initialized Q table are printed before
    learning starts. Every episode begins in ``startState`` and ends as
    soon as the agent reaches ``goalState``; there is no cap on the number
    of steps per episode. After each step the Q value of the taken action
    is updated with

        Q(s, a) <- Q(s, a) + step_size * (reward + discount_factor * Q(s', a') - Q(s, a))

    where both ``a`` and ``a'`` are selected by ``findNextAction``.

    Parameters:
        grid_size: number of rows (and columns) of the grid.
        startState: (row, col) where every episode starts.
        goalState: (row, col) of the terminal state; its Q values are set
            to 0 before learning.
        reward: reward applied to every transition.
        discount_factor: weight of the next state-action value.
        step_size: learning rate of the update.
        epsilon: exploration threshold forwarded to ``findNextAction``.
        initial_q_range: inclusive upper bound of the random initial Q values.
        episode_range: number of episodes to run.
        seed: optional value used to seed the global ``random`` generator
            before any random number is drawn, which makes a run
            repeatable. ``None`` (default) leaves the generator untouched.

    Returns:
        ``grid_size`` x ``grid_size`` nested list; each cell holds the four
        Q values ordered as ['→', '↓', '←', '↑'], rounded to 2 decimals.
    """
    if seed is not None:
        from random import seed as seedRandom
        seedRandom(seed)

    # nextStateCoordinates returns tuples; normalize the caller's states so
    # the termination test below also works when they are given as lists
    # (a list never compares equal to a tuple, so the loop would not end).
    startState = tuple(startState)
    goalState = tuple(goalState)

    actionList = ['→', '↓', '←', '↑']

    qGrid = createMatrix(grid_size)
    qGrid[startState[0]][startState[1]] = 's'
    qGrid[goalState[0]][goalState[1]] = 'x'

    print("Matrix representation of given grid world :")
    showMatrix(qGrid)

    qGrid = arbitrarilyInitializeMatrix(qGrid, initial_q_range)

    # Set Q values of goal state to 0
    qGrid[goalState[0]][goalState[1]] = [0,0,0,0]

    print("\nArbitrarily initialized state-action Q values in the format ['→', '↓', '←', '↑'] :")
    showMatrix(qGrid)

    for episode_count in range(episode_range):
        currentState = startState
        current_action = findNextAction(qGrid[currentState[0]][currentState[1]], epsilon)

        # Simulate do-while on Python: at least one step is taken per episode.
        # https://www.javatpoint.com/python-do-while-loop
        while(True):
            nextState = nextStateCoordinates(currentState[0], currentState[1], current_action, grid_size)
            # On-policy: the action used in the update target is the one
            # that is actually taken on the next step.
            next_action = findNextAction(qGrid[nextState[0]][nextState[1]], epsilon)
            current_action_index = actionList.index(current_action)
            next_action_index = actionList.index(next_action)
            currentQ = qGrid[currentState[0]][currentState[1]][current_action_index]
            nextQ = qGrid[nextState[0]][nextState[1]][next_action_index]
            qGrid[currentState[0]][currentState[1]][current_action_index] = currentQ + step_size * (reward + discount_factor * nextQ - currentQ)
            currentState = nextState
            current_action = next_action
            if(currentState == goalState):
                break

    return prettify3dMatrix(qGrid)

In [21]:
print("SARSA\n=====\n")

# 5x5 grid world, start at (3, 1), goal at (2, 3); keyword arguments make
# the role of every hyper-parameter explicit.
stateActionMatrix = sarsa(
    grid_size=5,
    startState=(3, 1),
    goalState=(2, 3),
    reward=-1,
    discount_factor=0.625,
    step_size=0.75,
    epsilon=0.3,
    initial_q_range=4,
    episode_range=10,
)

# Q table as learned after the episodes above.
print("\nOptimal state-action Q values :")
showMatrix(stateActionMatrix)


SARSA
=====

Matrix representation of given grid world :
0	0	0	0	0	
0	0	0	0	0	
0	0	0	x	0	
0	s	0	0	0	
0	0	0	0	0	

Arbitrarily initialized state-action Q values in the format ['→', '↓', '←', '↑'] :
[1, 1, 2, 1]	[3, 4, 1, 0]	[2, 3, 0, 4]	[2, 3, 3, 3]	[1, 2, 2, 1]	
[3, 2, 0, 3]	[3, 0, 1, 1]	[1, 0, 2, 3]	[3, 3, 2, 0]	[4, 3, 4, 0]	
[4, 1, 4, 4]	[4, 3, 1, 3]	[0, 4, 2, 4]	[0, 0, 0, 0]	[2, 4, 1, 3]	
[2, 0, 4, 4]	[0, 0, 3, 2]	[4, 3, 2, 2]	[2, 0, 3, 2]	[2, 3, 3, 0]	
[2, 3, 3, 2]	[2, 3, 4, 1]	[0, 0, 0, 2]	[0, 0, 3, 0]	[3, 1, 0, 3]	

Optimal state-action Q values :
[-0.54, -1.19, -0.88, -0.64]	[-0.52, -0.91, -0.33, -1.12]	[-0.37, -0.05, -0.76, -0.48]	[0.22, -1.1, -0.09, -0.67]	[0.28, 1.62, 2, 0.91]	
[-1.41, -1.5, -1.41, -1.11]	[-1.24, -1.46, -1.19, -0.98]	[-1.2, -1.32, -0.59, -0.31]	[-0.9, -0.94, -1.54, -0.85]	[0.46, 0.94, -0.98, -0.23]	
[-1.2, -1.21, -1.51, -1.48]	[-1.19, -1.52, -1.26, -1.38]	[-0.94, -1.35, -0.78, -1.25]	[0, 0, 0, 0]	[-0.75, -0.11, -0.5, 0.09]	
[-1.61, -1.6, -1.44, -1.59]	[-1.56, 