# SARSA & Q-learning

This assignment implements the SARSA and Q-learning algorithms from reinforcement learning.

In [1]:
# Python 2D Array operations - https://www.tutorialspoint.com/python/python_2darray.htm

def showMatrix(matrix):
    """Print a 2-D matrix, one row per line.

    Every cell is followed by a tab character (including the last cell of a
    row), and every row is terminated by a newline.
    """
    for row in matrix:
        # Build the whole line first, then emit it with a single print call.
        line = ''.join(str(cell) + '\t' for cell in row)
        print(line)

def prettifyMatrix(matrix):
    """Round every entry of a 2-D matrix to 4 decimal places, in place.

    The rows of `matrix` are mutated directly and the same `matrix` object
    is returned for convenience.
    """
    for row in matrix:
        for col_idx, value in enumerate(row):
            row[col_idx] = round(value, 4)
    return matrix

def prettify3dMatrix(matrix):
    """Round every entry of a 3-D nested list to 2 decimal places, in place.

    The innermost lists are mutated directly and the same `matrix` object is
    returned for convenience.
    """
    for plane in matrix:
        for cell in plane:
            for k, value in enumerate(cell):
                cell[k] = round(value, 2)
    return matrix

def createMatrix(matrix_size):
    """Return a matrix_size x matrix_size matrix (list of row lists) filled with 0.

    Each row is a distinct list object, so writing to one row never affects
    another.
    https://www.geeksforgeeks.org/python-using-2d-arrays-lists-the-right-way/
    """
    grid = []
    for _ in range(matrix_size):
        # A fresh list per iteration keeps the rows independent of each other.
        grid.append([0] * matrix_size)
    return grid

def nextStateCoordinates(current_row, current_col, direction, grid_size):
    """Return the (row, col) reached by taking one step on a square grid.

    Args:
        current_row: row index of the current cell.
        current_col: column index of the current cell.
        direction: one of '←', '↑', '→', '↓'. Any other value is treated
            as '↓', which is how the function has always behaved.
        grid_size: side length of the grid; valid indices are
            0 .. grid_size - 1.

    Returns:
        A (row, col) tuple. If the step would leave the grid, the
        coordinates are returned unchanged.
    """
    i = current_row
    j = current_col
    # Dispatch on the direction first and check the wall inside each branch.
    # Testing both in one condition let a blocked '←', '↑' or '→' fall
    # through to the final branch and move the agent down instead of
    # leaving it in place.
    if direction == '←':
        if current_col > 0:
            j -= 1
    elif direction == '↑':
        if current_row > 0:
            i -= 1
    elif direction == '→':
        if current_col < grid_size - 1:
            j += 1
    elif current_row < grid_size - 1:
        # '↓' (and any unrecognised direction)
        i += 1
    return (i, j)

In [2]:
from random import uniform as probability
from random import choice as randomChoice

def findNextAction(qListOfCurrentState, epsilon):
    """Choose an action with an epsilon-greedy rule.

    Args:
        qListOfCurrentState: list of four Q values ordered
            ['→', '↓', '←', '↑'].
        epsilon: threshold compared against a uniform draw from (0, 1);
            when the draw is below it a random action is returned.

    Returns:
        One of '→', '↓', '←', '↑'. In the greedy case ties are resolved in
        favour of the earliest action in the order above.
    """
    actionList = ['→', '↓', '←', '↑']

    # Explore: the uniform draw landed below epsilon.
    # https://stackoverflow.com/questions/33359740/random-number-between-0-and-1-in-python/33359801#33359801
    draw = probability(0, 1)
    if draw < epsilon:
        return randomChoice(actionList)

    # Exploit: return the action whose Q value is largest.
    # https://stackoverflow.com/questions/2474015/getting-the-index-of-the-returned-max-or-min-item-using-max-min-on-a-list/2474030#2474030
    best_q = max(qListOfCurrentState)
    return actionList[qListOfCurrentState.index(best_q)]

def sarsa(grid_size, startState, goalState, reward, discount_factor, step_size, epsilon, initial_q_range, episode_range):
    """Run tabular SARSA on a square grid world and return the Q table.

    Args:
        grid_size: side length N of the N x N grid.
        startState: (row, col) sequence where every episode begins.
        goalState: (row, col) sequence of the terminal cell. A list or a
            tuple is accepted.
        reward: reward applied to every transition.
        discount_factor: weight on the next state-action value in the update.
        step_size: learning rate of the update.
        epsilon: exploration threshold passed to findNextAction.
        initial_q_range: Q values start as random integers in
            [0, initial_q_range]; the goal cell starts at [0, 0, 0, 0].
        episode_range: number of episodes to run.

    Returns:
        A grid_size x grid_size nested list whose cells each hold four
        Q values ordered ['→', '↓', '←', '↑'], passed through
        prettify3dMatrix before being returned.

    Side effects:
        Prints the grid layout and the initial Q table.
    """
    # Import function to generate random integers
    # https://www.geeksforgeeks.org/python-randint-function/
    from random import randint

    # Single source for the Q-value ordering; same order findNextAction uses.
    actionOrder = ['→', '↓', '←', '↑']

    # nextStateCoordinates returns tuples, and a tuple never compares equal
    # to a list. Normalising the goal keeps the termination test below
    # working when the caller passes e.g. [2, 3] instead of (2, 3).
    goal = (goalState[0], goalState[1])

    qGrid = createMatrix(grid_size)
    qGrid[startState[0]][startState[1]] = 's'
    qGrid[goal[0]][goal[1]] = 'x'

    print("Matrix representation of given grid world :")
    showMatrix(qGrid)

    # Arbitrarily initialize Q values in the format ['→', '↓', '←', '↑']
    for i in range(grid_size):
        for j in range(grid_size):
            # https://stackoverflow.com/questions/16655089/python-random-numbers-into-a-list/16655135#16655135
            qGrid[i][j] = [randint(0, initial_q_range) for count in range(4)]

    # Set Q values of goal state to 0
    qGrid[goal[0]][goal[1]] = [0, 0, 0, 0]

    print("\nArbitrarily initialized state-action Q values in the format ['→', '↓', '←', '↑'] :")
    showMatrix(qGrid)

    for episode_count in range(episode_range):
        currentState = startState
        current_action = findNextAction(qGrid[currentState[0]][currentState[1]], epsilon)

        # Simulate do-while on Python: at least one step is taken before
        # the goal test runs.
        # https://www.javatpoint.com/python-do-while-loop
        while True:
            nextState = nextStateCoordinates(currentState[0], currentState[1], current_action, grid_size)
            # On-policy: the action used in the update target is the one
            # that is actually taken on the next iteration.
            next_action = findNextAction(qGrid[nextState[0]][nextState[1]], epsilon)
            current_action_index = actionOrder.index(current_action)
            next_action_index = actionOrder.index(next_action)
            currentQ = qGrid[currentState[0]][currentState[1]][current_action_index]
            nextQ = qGrid[nextState[0]][nextState[1]][next_action_index]
            # Q(s,a) <- Q(s,a) + step_size * (reward + discount * Q(s',a') - Q(s,a))
            qGrid[currentState[0]][currentState[1]][current_action_index] = currentQ + step_size * (reward + discount_factor * nextQ - currentQ)
            currentState = nextState
            current_action = next_action
            if currentState == goal:
                break

    return prettify3dMatrix(qGrid)

In [3]:
# Banner for the SARSA run
print("SARSA\n=====\n")

# Keyword arguments make each hyper-parameter of the run explicit.
stateActionMatrix = sarsa(
    grid_size=5,
    startState=(3, 1),
    goalState=(2, 3),
    reward=-1,
    discount_factor=0.625,
    step_size=0.75,
    epsilon=0.3,
    initial_q_range=4,
    episode_range=10,
)

# Show the Q table returned by sarsa
print("\nOptimal state-action Q values :")
showMatrix(stateActionMatrix)


SARSA
=====

Matrix representation of given grid world :
0	0	0	0	0	
0	0	0	0	0	
0	0	0	x	0	
0	s	0	0	0	
0	0	0	0	0	

Arbitrarily initialized state-action Q values in the format ['→', '↓', '←', '↑'] :
[3, 2, 2, 1]	[3, 3, 4, 2]	[3, 2, 4, 3]	[2, 2, 0, 2]	[1, 3, 3, 3]	
[1, 2, 0, 1]	[4, 0, 4, 2]	[1, 3, 0, 3]	[0, 1, 2, 3]	[2, 3, 3, 2]	
[4, 4, 3, 3]	[2, 4, 0, 4]	[3, 0, 2, 1]	[0, 0, 0, 0]	[3, 1, 4, 3]	
[3, 1, 4, 3]	[1, 1, 1, 1]	[0, 0, 3, 1]	[1, 0, 2, 3]	[3, 1, 0, 4]	
[0, 3, 3, 3]	[2, 4, 1, 3]	[0, 3, 4, 1]	[4, 4, 2, 0]	[0, 1, 0, 0]	

Optimal state-action Q values :
[1.41, -1.07, 2, -0.5]	[0.94, -0.56, 1.66, -0.46]	[0.42, -0.04, 0.39, 0.47]	[0.95, -0.7, 0, -0.57]	[-0.56, -0.25, 0.32, 0.88]	
[-0.07, -0.72, -0.97, -0.45]	[-0.88, -0.87, -0.45, -0.4]	[-0.31, -1.17, -0.76, -0.3]	[-0.57, -0.5, -0.44, -0.64]	[-0.88, -1.07, -0.51, -0.23]	
[-0.82, -1.74, -1.04, -0.77]	[-1.3, -1.36, -1.59, -0.9]	[-0.94, -1.41, -1.06, -0.94]	[0, 0, 0, 0]	[-1.32, -1.33, -0.69, -0.89]	
[-0.76, -1.7, -2.06, -1.07]	[-1.54, -1.68, 