In [1]:
# Load what we need: numpy, random, matplotlib, and collections.defaultdict
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import defaultdict

# TODO: set numpy print precision to 2 digits (e.g. np.set_printoptions(precision=2)); no such call is made in this cell yet

# Thanks to Professor Veskler, who helped me outside of class to fully understand the RL agent and really opened my eyes. He also welcomed my small, beginner-level questions in class, so this is a shout-out to him.

In [2]:
class RL:
    """Tabular reinforcement-learning agent for a Tower-of-Hanoi style puzzle.

    The puzzle state is a list of towers; each tower is a list of ring values
    whose last element is the top ring. An action is the tuple
    ``(towerN, ring, toTower)``: take ``ring`` from the top of tower
    ``towerN`` and put it on top of tower ``toTower``.

    A move is treated as legal when the destination tower is empty or its top
    ring value is greater than the value of the ring being moved.

    Attributes:
        learningRate: step size used when updating Q-values.
        potentialToExplore: probability of picking a random legal move
            instead of the best-valued one.
        discountFactor: factor applied to the reward for each step further
            back in the episode trace.
        stateActionQvalues: ``defaultdict(float)`` keyed by
            ``(state, towerN, ring, toTower)`` where ``state`` is a tuple of
            tuples snapshot of the towers.
        trace: list of ``(state, action)`` pairs taken since the last reward.
        numberOfRing: ring count that the last tower must hold to be rewarded.
    """

    def __init__(self, learningRate = .1, potentialToExplore = .1, discountFactor = .8):
        self.learningRate = learningRate
        self.potentialToExplore = potentialToExplore
        self.discountFactor = discountFactor
        self.stateActionQvalues = defaultdict(float)
        self.trace = []
        self.numberOfRing = 0

    @staticmethod
    def _freeze(towers):
        """Return an immutable, hashable snapshot (tuple of tuples) of the towers."""
        return tuple(tuple(tower) for tower in towers)

    @staticmethod
    def _top(tower):
        """Return the top ring of a tower, or None when it has no ring to offer."""
        # The earlier implementation guarded against None entries; an empty
        # tower and a None top are both treated as "nothing here".
        return tower[-1] if tower else None

    def _legalMoves(self, towers, towerWithRing):
        """List every legal ``(towerN, ring, toTower)`` move, in scan order."""
        moves = []
        for towerN in towerWithRing:
            ring = self._top(towers[towerN])
            if ring is None:
                # Stale entry in towerWithRing: nothing to pick up here.
                continue
            # enumerate() gives the real index even when two towers compare equal.
            for toTower, target in enumerate(towers):
                if toTower == towerN:
                    continue
                targetTop = self._top(target)
                if targetTop is None or targetTop > ring:
                    moves.append((towerN, ring, toTower))
        return moves

    def step(self, towers, towerWithRing, numberOfRing):
        """Choose one move, apply it in place, and learn if the goal is reached.

        Args:
            towers: list of towers (lists of ring values); mutated in place.
            towerWithRing: indices of towers the agent may take a ring from;
                mutated in place to follow the move.
            numberOfRing: number of rings the last tower must hold to earn
                the reward.

        Returns:
            The action ``(towerN, ring, toTower)`` that was applied, or None
            when no legal move exists (state is then left untouched).
        """
        self.numberOfRing = numberOfRing
        self.towers = towers
        action = self.rlAction(towers, towerWithRing)
        if action is None:
            return None
        # Record an immutable snapshot: the live list is about to be mutated
        # and must also be hashable to serve as a Q-table key later.
        self.trace.append((self._freeze(towers), action))
        self.updateState(towers, action, towerWithRing)
        # Evaluate the state *after* the move so the move that reaches the
        # goal is the one that gets credited.
        reward = self.currentStateValue(towers)
        if reward:
            self.learn(reward)
        return action

    def rlAction(self, towers, towerWithRing):
        """Pick an action epsilon-greedily among the legal moves.

        With probability ``potentialToExplore`` a legal move is drawn
        uniformly at random; otherwise the legal move with the highest
        Q-value is returned (the first one found wins ties).

        Returns:
            ``(towerN, ring, toTower)``, or None when there is no legal move.
        """
        moves = self._legalMoves(towers, towerWithRing)
        if not moves:
            return None
        if random.random() < self.potentialToExplore:
            return random.choice(moves)
        state = self._freeze(towers)
        # max() keeps the first maximal element, matching a strict ">" scan.
        return max(moves, key=lambda move: self.stateActionQvalues[(state,) + move])

    def currentStateValue( self, towers ):
        """Return 1 when the last tower holds ``numberOfRing`` rings, else 0."""
        if towers and len(towers[-1]) == self.numberOfRing:
            return 1
        return 0

    def updateState(self, towers, action, towerWithRing):
        """Apply ``action`` to ``towers`` and keep ``towerWithRing`` in sync.

        Raises:
            ValueError: if the ring is not present on the source tower.
        """
        towerN, pickIt, toTower = action
        source = towers[towerN]
        if source and source[-1] == pickIt:
            source.pop()
        else:
            # Fallback for callers that pass a ring that is not on top.
            source.remove(pickIt)
        towers[toTower].append(pickIt)
        if toTower not in towerWithRing:
            towerWithRing.append(toTower)
        # An emptied tower can no longer supply a ring.
        if not source and towerN in towerWithRing:
            towerWithRing.remove(towerN)

    def learn(self, reward):
        """Propagate ``reward`` backwards through the trace, then clear it.

        The most recent state-action pair receives the full reward; each
        earlier pair receives the reward multiplied once more by
        ``discountFactor``. Every Q-value moves toward its target by
        ``learningRate``.
        """
        for state, action in reversed(self.trace):
            # Same key layout that rlAction reads; freezing also accepts
            # trace entries whose state is still a list of lists.
            key = (self._freeze(state),) + tuple(action)
            estimate = self.stateActionQvalues[key]
            self.stateActionQvalues[key] = estimate + self.learningRate * (reward - estimate)
            reward *= self.discountFactor
        self.trace = []

In [3]:
# Demo: run a single agent step from a hand-built puzzle state.
# Only tower 0 is offered to the agent as a tower to take a ring from.
towers = [[1, 2, 3], [], [1,2]]
towerWithRing = [0]

rl_agent = RL()
action = rl_agent.step(
    towers=towers,
    towerWithRing=towerWithRing,
    numberOfRing=3,
)
print("Action taken:", action)
print("Updated towers:", towers)

Action taken: (0, 3, 1)
Updated towers: [[1, 2], [3], [1, 2]]
