In [401]:
import gym
import numpy as np
from IPython.display import clear_output
from time import sleep

# Parameters of Q-leanring
ALPHA = .628  # learning rate
GAMMA = .95   # reward discount
EPISODS = 10000
TURN_LIMIT = 100

In [397]:
class Agent:
    def __init__(self, mode="4x4", random_action=True, alpha=ALPHA, gamma=GAMMA, episods=EPISODS):
        # Load Environment and Q-table structure
        if mode == "4x4":
            self.env = gym.make('FrozenLake-v0')
        else:
            self.env = gym.make('FrozenLake8x8-v0')
            
        self.Q = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        self.random_act = random_action
        self.alp = alpha
        self.gam = gamma
        self.episods = episods
        
    # Epsilon-Greedy approach for Exploration and Exploitation of the state-action spaces
    def epsilon_greedy(self, s, i):
        epsilon = 0.03
        p = np.random.uniform(low = 0, high = 1)
        if p > epsilon:
            # initial policy = for each state consider the action having highest
            return np.argmax(self.Q[s, :] + np.random.randn(1, self.env.action_space.n) * (1./(i+1)))                             
      
        return self.env.action_space.sample() # random
        
    # Choose action from Q table
    def choose_action(self, state, i):
        if self.random_act == True:
            return self.env.action_space.sample()
        else: 
            return self.epsilon_greedy(state, i)
    
    # Q-learning Algorithm
    def learn(self):
        print("###### LEARNING #####")
        rewards = 0.0
        
        for i in range(self.episods):
            s = self.env.reset()  # Reset environment
            # The Q-Table learning algorithm
            for j in range(TURN_LIMIT):  
                a = self.choose_action(s, i)
                # Get new state & reward from environment
                s1, r, d, _ = self.env.step(a)
                # Update Q-Table with new knowledge
                self.Q[s, a] = self.Q[s, a] + self.alp *(r + self.gam * np.max(self.Q[s1, :]) - self.Q[s, a])
                # Q <- Q + a(Q' - Q)
                # <=> Q <- (1-a)Q + a(Q')
                s = s1
                if d == True:
                    rewards += r
                    break
                    
        self.print(self.episods, rewards)
        print("   Q - Table   ")
        print(self.Q)
               
    # Test how much learned        
    def test(self):
        print("###### TESTING #####")
        reward_total = 0.0
        testing_episods = 1000
        
        for i in range(testing_episods):
            s = self.env.reset()
            for t in range(TURN_LIMIT):
                action = self.choose_action(s, i)
                s1, reward, done, _ = self.env.step(action)
                state = s1

                if done:
                    reward_total += reward
                
        self.print(testing_episods, reward_total)
    
    # Demonstrate game after learning
    def demonstrate(self):
        s = self.env.reset()
        self.env.render()
        
        while(True):
            clear_output(wait=True)
            sleep(.1)
            action = self.choose_action(s, i)
            s1, r, done, _ = self.env.step(action)
         
            self.env.render()
            state = s1
            
            if done == True:
                break
    
    def print(self, episods, rewards):
        print("episodes      : {}".format(episods))
        print("total reward  : {}".format(rewards))
        print("average reward: {:.2f}".format(rewards / episods))        

In [351]:
agent = Agent(random_action=False)
agent.learn()

###### LEARNING #####
episodes      : 10000
total reward  : 4984.0
average reward: 0.50
   Q - Table   
[[2.80393724e-01 8.04031170e-03 6.66660008e-03 7.45678287e-03]
 [2.13361570e-03 2.01709889e-03 1.99640519e-03 2.04679056e-01]
 [9.78828297e-04 2.06050630e-03 3.26888267e-03 9.04857020e-02]
 [7.67120495e-04 0.00000000e+00 1.02863426e-03 7.25162248e-02]
 [2.66754569e-01 2.75595423e-03 1.51785893e-03 2.74746207e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.51872571e-04 1.23449676e-05 3.92593221e-02 1.41544062e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 3.67787458e-01]
 [0.00000000e+00 5.74793614e-01 2.95252861e-03 0.00000000e+00]
 [6.77863574e-01 1.21204806e-03 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.97466025e-04 1.05474685e-03 7.79856008e-01 1.78607434e-03]
 [0.00000000e+

In [372]:
agent = Agent(random_action=False, alpha=0.9, episods=10000)
agent.learn()

###### LEARNING #####
episodes      : 10000
total reward  : 3060.0
average reward: 0.31
   Q - Table   
[[9.73231451e-02 9.62852451e-02 9.21326226e-02 9.65781507e-02]
 [8.84678104e-03 3.89160759e-02 1.34934070e-03 4.58768965e-02]
 [2.62383895e-03 1.82518404e-03 2.58071375e-03 9.33146908e-02]
 [7.42644413e-05 2.16043660e-03 6.21118546e-04 2.25944571e-02]
 [9.18739203e-02 1.43950139e-02 1.12823793e-02 8.80256646e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.12734401e-05 1.20884300e-04 1.04152518e-01 2.19025521e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.02363883e-05 1.07609050e-01 6.97187252e-04 1.08546220e-01]
 [1.07688049e-02 7.17626108e-02 7.90092633e-03 4.95745590e-03]
 [4.60579013e-02 4.90697051e-04 1.02595785e-03 2.09162909e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.21248223e-03 6.47677861e-02 2.14913548e-01 7.58398250e-03]
 [1.16416237e-

In [373]:
agent = Agent(random_action=False, alpha=0.95, episods=10000)
agent.learn()

###### LEARNING #####
episodes      : 10000
total reward  : 2832.0
average reward: 0.28
   Q - Table   
[[3.05045075e-01 5.78092068e-03 4.51964043e-03 5.20182227e-03]
 [3.12460273e-04 8.94583085e-04 1.92114683e-03 1.12565863e-02]
 [3.46972166e-03 6.15162202e-04 2.43876182e-04 1.24368675e-03]
 [1.17979357e-04 4.16750254e-04 6.01062159e-04 1.24741310e-03]
 [3.37092629e-01 7.63928153e-04 4.98466197e-04 2.30576045e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.70239348e-07 1.56322113e-16 1.56280171e-03 1.78238646e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.59382597e-05 1.24054292e-03 1.32486466e-03 4.45612636e-02]
 [6.89633569e-04 6.63877957e-01 9.53578964e-04 2.79154916e-03]
 [2.98455215e-02 6.64473225e-05 5.30890949e-05 8.52042017e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.68743149e-05 2.11309532e-03 7.07992303e-02 7.05947101e-03]
 [2.97984768e-

In [389]:
agent = Agent(random_action=False, alpha=0.1, gamma=0.97, episods=10000)
agent.learn()

###### LEARNING #####
episodes      : 10000
total reward  : 5491.0
average reward: 0.55
   Q - Table   
[[3.23845731e-01 2.71519700e-01 2.69464508e-01 2.79290815e-01]
 [1.44606944e-01 1.77066076e-01 1.67288256e-01 2.50551215e-01]
 [2.15170728e-01 1.25445672e-01 1.42957390e-01 1.22476720e-01]
 [7.80716677e-02 6.34832121e-10 2.24642991e-04 9.99081312e-03]
 [3.48824730e-01 1.79082516e-01 2.15184890e-01 2.21224845e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.09868940e-02 1.05552472e-01 1.91711515e-01 8.19569684e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.79838811e-01 2.80555527e-01 2.82240779e-01 4.00663131e-01]
 [3.06080851e-01 5.17695852e-01 2.54265408e-01 2.42755121e-01]
 [5.12664919e-01 3.51264279e-01 2.24679386e-01 2.88502604e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.09969362e-01 4.50628707e-01 5.93998334e-01 4.15971044e-01]
 [5.66080715e-

In [390]:
agent.test()

###### TESTING #####
episodes      : 1000
total reward  : 717.0
average reward: 0.72


In [393]:
agent.demonstrate()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [399]:
agent = Agent(random_action=False, alpha=0.1, gamma=0.97, episods=1000)
agent.learn()

###### LEARNING #####
episodes      : 1000
total reward  : 93.0
average reward: 0.09
   Q - Table   
[[0.04703413 0.05987015 0.09002108 0.06308736]
 [0.02650352 0.04580017 0.02939675 0.11724521]
 [0.06053373 0.04074646 0.15197599 0.0415416 ]
 [0.02993932 0.02017863 0.02694279 0.12837577]
 [0.03204682 0.05123584 0.07711284 0.01407081]
 [0.         0.         0.         0.        ]
 [0.15833551 0.02805193 0.02939506 0.00559475]
 [0.         0.         0.         0.        ]
 [0.02816007 0.02753326 0.09501758 0.18660643]
 [0.03936387 0.30596465 0.06376885 0.01913843]
 [0.33746217 0.         0.07629511 0.04312005]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.14118392 0.04297652 0.39744181]
 [0.02720336 0.7120604  0.00597857 0.12316783]
 [0.         0.         0.         0.        ]]


In [400]:
agent = Agent(random_action=False, alpha=0.1, gamma=0.97, episods=100000)
agent.learn()

###### LEARNING #####
episodes      : 100000
total reward  : 56756.0
average reward: 0.57
   Q - Table   
[[0.25137139 0.24256111 0.23523775 0.24728227]
 [0.15308875 0.16662034 0.13860414 0.21039866]
 [0.17796411 0.17791409 0.16543247 0.17855674]
 [0.11805033 0.12207578 0.07838535 0.1683645 ]
 [0.27936572 0.17354904 0.17799836 0.22670118]
 [0.         0.         0.         0.        ]
 [0.09349861 0.17565806 0.11252962 0.101203  ]
 [0.         0.         0.         0.        ]
 [0.21140207 0.27120603 0.20013891 0.34612267]
 [0.24926833 0.43630093 0.29079297 0.26748228]
 [0.34470155 0.26833355 0.26816352 0.19497034]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.2843436  0.52040239 0.56697477 0.31773397]
 [0.61226772 0.81847811 0.61586823 0.62246291]
 [0.         0.         0.         0.        ]]


In [404]:
agent.demonstrate()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [405]:
agentV8_1 = Agent(mode="8x8", random_action=False)
agentV8_1.learn()

###### LEARNING #####
episodes      : 10000
total reward  : 2371.0
average reward: 0.24
   Q - Table   
[[2.12276665e-02 2.31863235e-02 1.91817410e-02 4.93243907e-02]
 [2.75344820e-02 6.57649638e-02 2.12127205e-02 2.18324737e-02]
 [2.51692144e-02 3.64964917e-02 2.52592721e-02 8.64974627e-02]
 [4.82757497e-02 9.56800573e-02 8.17553239e-02 2.84702864e-02]
 [4.43066004e-02 7.11046756e-02 1.05852607e-01 4.38771930e-02]
 [5.63566815e-02 1.45355850e-01 5.64999007e-02 5.63833841e-02]
 [5.64794343e-02 1.16691829e-01 9.51071053e-02 7.62635008e-02]
 [1.17939671e-01 6.67862614e-02 6.97404307e-02 7.06807797e-02]
 [2.08261757e-02 2.41778845e-02 1.82357487e-02 2.80870066e-02]
 [1.99400545e-02 2.31746512e-02 2.05556584e-02 4.01922722e-02]
 [1.67765879e-02 2.42096348e-02 5.34992481e-02 2.22909893e-02]
 [1.48374931e-02 3.75667893e-02 5.10845235e-02 8.46719682e-02]
 [3.80272967e-02 4.11706179e-02 4.24468682e-02 9.67726529e-02]
 [6.54319318e-02 2.12500961e-01 9.43714539e-02 5.64961839e-02]
 [1.17535385e-

In [409]:
agentV8_2 = Agent(mode="8x8", random_action=False, alpha=0.1, gamma=0.97)
agentV8_2.learn()

###### LEARNING #####
episodes      : 10000
total reward  : 3092.0
average reward: 0.31
   Q - Table   
[[1.10804700e-01 1.14024786e-01 1.27455125e-01 1.20543451e-01]
 [1.13670814e-01 1.24442328e-01 1.49924403e-01 1.29706377e-01]
 [1.37163343e-01 1.39901351e-01 1.66923131e-01 1.48083259e-01]
 [1.47980229e-01 1.60266767e-01 1.92771373e-01 1.63815679e-01]
 [1.75359421e-01 1.75023497e-01 2.12666328e-01 1.92697685e-01]
 [1.91960205e-01 2.02250954e-01 2.33851221e-01 1.91243741e-01]
 [2.29363252e-01 2.27145137e-01 2.61684921e-01 2.19549503e-01]
 [2.24736775e-01 2.23070324e-01 2.72529408e-01 2.26535011e-01]
 [1.04231840e-01 1.06319410e-01 1.13948526e-01 1.24066266e-01]
 [1.05334963e-01 1.14358380e-01 1.17420003e-01 1.43229551e-01]
 [1.20641751e-01 1.27477332e-01 1.25850678e-01 1.55254200e-01]
 [1.01176820e-01 9.02802612e-02 1.14198149e-01 1.80574874e-01]
 [1.53752952e-01 1.66851392e-01 2.07507680e-01 1.57785101e-01]
 [1.83374191e-01 1.94020269e-01 2.41626936e-01 2.08170117e-01]
 [2.43674201e-

In [410]:
agentV8_3 = Agent(mode="8x8", random_action=False, alpha=0.1, gamma=0.97, episods=100000)
agentV8_3.learn()

###### LEARNING #####
episodes      : 100000
total reward  : 44530.0
average reward: 0.45
   Q - Table   
[[1.11859094e-01 1.14320484e-01 1.38102169e-01 1.13583785e-01]
 [1.21779004e-01 1.26190352e-01 1.50074996e-01 1.25194282e-01]
 [1.32172029e-01 1.38212538e-01 1.81348654e-01 1.46608300e-01]
 [1.49650817e-01 1.53138108e-01 2.07825574e-01 1.61499572e-01]
 [1.72644503e-01 1.72851584e-01 2.36016119e-01 1.73046741e-01]
 [1.95553096e-01 1.92984991e-01 2.62537268e-01 2.04484343e-01]
 [2.17502490e-01 2.12622220e-01 2.82009369e-01 2.10281772e-01]
 [2.36983038e-01 2.50851072e-01 2.86588233e-01 2.39375389e-01]
 [1.05571002e-01 1.03802042e-01 1.09609170e-01 1.26992552e-01]
 [1.14061505e-01 1.16215346e-01 1.18720177e-01 1.37119399e-01]
 [1.25943514e-01 1.28656474e-01 1.36000525e-01 1.63571767e-01]
 [7.66688670e-02 1.08604793e-01 9.00490219e-02 1.88582501e-01]
 [1.66351046e-01 1.63357539e-01 2.29482835e-01 1.72354701e-01]
 [1.89105294e-01 1.95384795e-01 2.67336903e-01 1.87045498e-01]
 [2.40919775

In [412]:
agentV8_4 = Agent(mode="8x8", random_action=False, alpha=0.1, gamma=0.97, episods=1000000)
agentV8_4.learn()

###### LEARNING #####
episodes      : 1000000
total reward  : 471242.0
average reward: 0.47
   Q - Table   
[[0.1134542  0.1152286  0.11278902 0.11462069]
 [0.12155979 0.12392206 0.12151945 0.12123065]
 [0.12963568 0.13289843 0.13631412 0.1339634 ]
 [0.15036768 0.1547919  0.16058021 0.15561779]
 [0.17423039 0.17821128 0.18806818 0.17702304]
 [0.18831909 0.19906445 0.21221444 0.19620814]
 [0.21487452 0.21459067 0.22894383 0.21534782]
 [0.21909749 0.21838285 0.24941905 0.21657062]
 [0.1082408  0.10523499 0.10525414 0.1138427 ]
 [0.11269678 0.11086408 0.11235446 0.12260458]
 [0.11353235 0.12814595 0.11698989 0.13473796]
 [0.09781179 0.08926837 0.12550139 0.14763337]
 [0.15685633 0.16646592 0.16814217 0.17930178]
 [0.18882433 0.19882001 0.21551154 0.19585378]
 [0.22429025 0.24601843 0.23703688 0.22862208]
 [0.24485614 0.27558399 0.24102083 0.23399889]
 [0.09155877 0.09112577 0.09170104 0.1052215 ]
 [0.09271362 0.09069768 0.08996054 0.10846501]
 [0.10097664 0.0635421  0.08257821 0.07412586]

In [416]:
agentV8_1.test()
agentV8_2.test()
agentV8_3.test()
agentV8_4.test()

###### TESTING #####
episodes      : 1000
total reward  : 310.0
average reward: 0.31
###### TESTING #####
episodes      : 1000
total reward  : 539.0
average reward: 0.54
###### TESTING #####
episodes      : 1000
total reward  : 588.0
average reward: 0.59
###### TESTING #####
episodes      : 1000
total reward  : 439.0
average reward: 0.44


In [417]:
agentV8_1.demonstrate()

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m


In [418]:
agentV8_2.demonstrate()

  (Up)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHF[41mH[0mF
FFFHFFFG


In [419]:
agentV8_3.demonstrate()

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m


In [420]:
agentV8_4.demonstrate()

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m
