In [1]:
import numpy as np
import numpy.testing as npt
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats

import sys
# NOTE(review): machine-specific absolute path (presumably a local checkout of
# gym). On another machine this entry has no effect and `import gym` relies on
# gym being importable some other way -- confirm before sharing the notebook.
sys.path.append('/Users/johannesthurn/gym')
import gym

# Load the FrozenLake environment
envFrozLake = gym.make('FrozenLake-v0' )

[2017-12-07 08:18:56,524] Making new env: FrozenLake-v0


In [152]:
class MonteCarlo:
    """Every-visit Monte Carlo prediction and control for a tabular environment.

    The wrapped environment must provide exactly what the methods below use:
    ``observation_space.n``, ``action_space.n``, ``action_space.sample()``,
    ``reset()`` returning the initial state, and ``step(a)`` returning the
    4-tuple ``(next_state, reward, done, info)``.
    """

    def __init__(self,env):
        """Store the environment that all methods act on."""
        self.env = env

    def _run_episode(self, policy):
        """Play one episode with ``policy`` (indexable by state).

        Returns ``(transitions, rewards)`` where ``transitions[t]`` is the
        tuple ``(state, action, next_state)`` and ``rewards[t]`` is the reward
        received for that step.
        """
        transitions = []
        rewards = []
        s = self.env.reset()
        done = False
        while not done:
            a = policy[s]
            s_prime, reward, done, _ = self.env.step(a)
            transitions.append((s, a, s_prime))
            rewards.append(reward)
            s = s_prime
        return transitions, rewards

    @staticmethod
    def _random_argmax(values):
        """Return the index of a maximal entry, ties broken uniformly at random."""
        best = np.flatnonzero(values == np.max(values))
        return best[np.random.randint(len(best))]

    def _greedy_policy(self, Q_SA):
        """Return one greedy action per state for ``Q_SA`` (random tie-break)."""
        return np.array([self._random_argmax(row) for row in Q_SA], dtype=int)

    def every_visit_monte_carlo_prediction_V(self,policy,gamma=0.9,nb_episodes=10000):
        """Estimate the state values of ``policy`` with every-visit Monte Carlo.

        policy      -- indexable mapping state -> action
        gamma       -- discount rate
        nb_episodes -- number of episodes to sample
        Returns an array with one value estimate per state.
        """
        n_states = self.env.observation_space.n
        N_S = np.zeros(n_states)
        V = np.zeros(n_states)
        for _ in range(nb_episodes):
            # Generate one episode
            transitions, R = self._run_episode(policy)
            X = [s for s, _a, _s_prime in transitions]
            # Update V from the episode: X0, R1, X1, R2, ..., X(T-1), RT
            self.EveryVisitMC_V(X,R,V,gamma,N_S)

        return V

    def EveryVisitMC_V(self,X,R,V,gamma,N_S):
        '''
        Update V in place from one episode (every-visit, running average).

        X, sequence of visited states,
            X[0]  -> start state
            X[-1] -> state at time T-1
        R, rewards,
            R[0]  -> reward at t=1
            R[-1] -> reward at T
        V, array of value estimates per state (modified in place)
        gamma, discount rate
        N_S, array with the number of visits per state (modified in place)

        The step size is alpha = 1/N_S[state], i.e. plain averaging.
            Remark from "Algorithms for Reinforcement Learning":
             ->iterate-averaging is rarely used in practice<- (because of
             non-stationarity); averaging is implemented here as a first version.
        '''
        _sum = 0
        # Walk the episode backwards so the return can be accumulated in O(T).
        for t in reversed(range(len(X))):
            # sum <- R_{t+1} + gamma * sum ; this is the target for X_t
            _sum = R[t] + gamma * _sum
            N_S[X[t]] += 1
            alpha = 1/N_S[X[t]]
            # V[X_t] <- V[X_t] + alpha * (target - V[X_t])
            V[X[t]] = V[X[t]] + alpha * (_sum - V[X[t]])

    def every_visit_monte_carlo_prediction_Q(self,policy,gamma=0.9,nb_episodes=10000):
        """Estimate the action values of ``policy`` with every-visit Monte Carlo.

        Only the (state, action) pairs actually taken by ``policy`` are
        updated; all other entries keep their initial value 0.
        Returns an array of shape (n_states, n_actions).
        """
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        N_SA = np.zeros([n_states, n_actions])
        Q_SA = np.zeros([n_states, n_actions])
        for _ in range(nb_episodes):
            # Generate one episode
            X, R = self._run_episode(policy)
            # Update Q
            self.EveryVisitMC_Q(X,R,Q_SA,gamma,N_SA)

        return Q_SA

    def EveryVisitMC_Q(self,X_SA,R,Q_SA,gamma,N_SA):
        '''
        Update Q_SA in place from one episode (every-visit, running average).

        X_SA, sequence of transitions; each entry starts with (state, action).
            Further elements (e.g. the next state appended by
            MonteCarlo_Control) are accepted and ignored.
            X_SA[0]  -> first transition
            X_SA[-1] -> transition taken at time T-1
        R, rewards,
            R[0]  -> reward at t=1
            R[-1] -> reward at T
        Q_SA, array of action-value estimates (modified in place)
        gamma, discount rate
        N_SA, array with the number of visits per (state, action) pair
            (modified in place)

        The step size is alpha = 1/N_SA[state, action], i.e. plain averaging.
        '''
        _sum = 0
        for t in reversed(range(len(X_SA))):
            # Index instead of unpacking: entries may be (s, a) or (s, a, s').
            s, a = X_SA[t][0], X_SA[t][1]
            # sum <- R_{t+1} + gamma * sum ; this is the target for (s, a)
            _sum = R[t] + gamma * _sum
            N_SA[s,a] += 1
            alpha = 1/N_SA[s,a]
            # Q[s,a] <- Q[s,a] + alpha * (target - Q[s,a])
            Q_SA[s,a] = Q_SA[s,a] + alpha * (_sum - Q_SA[s,a])

    def MonteCarlo_Control(self,gamma=0.9,nb_episodes=10000):
        """Monte Carlo control with a per-episode epsilon-greedy policy.

        Starts from a random policy, and after every episode updates Q with
        every-visit MC and redraws the behaviour policy via ``policyUpdate``.
        Every 1000 episodes the greedy policy is printed-evaluated; if it
        equals the greedy policy of the previous checkpoint the method
        returns early.

        NOTE: ties between equal action values are broken randomly, so two
        checkpoints can differ in tied states even when Q did not change
        there.

        Returns ``(N_SA, Q_SA, greedy_policy)``.
        """
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        N_SA = np.zeros([n_states, n_actions])
        Q_SA = np.zeros([n_states, n_actions])
        policy = np.random.randint(0,n_actions,n_states)
        pol_old = policy
        for e in range(1,nb_episodes+1):
            # Generate one episode
            X, R = self._run_episode(policy)
            # Update Q
            self.EveryVisitMC_Q(X,R,Q_SA,gamma,N_SA)
            # Update the behaviour policy
            policy = self.policyUpdate(Q_SA,e)
            # Report intermediate results
            if e % 1000 == 0:
                print('Expected Reward after ' + str(e) + ' episodes')
                # Draw the greedy policy once so that the policy that is
                # evaluated is the same one that is compared and returned.
                pol_new = self._greedy_policy(Q_SA)
                print(self.evaluate(pol_new))
                if np.array_equal(pol_new,pol_old):
                    print('Policy not changed')
                    return N_SA,Q_SA,pol_new
                pol_old = pol_new
                print('------------------------------------------------')
        return N_SA,Q_SA,self._greedy_policy(Q_SA)

    def policyUpdate(self,Q_SA,k,epsilon=0.1):
        """Draw an epsilon-greedy action for every state.

        Q_SA    -- action-value table, one row per state
        k       -- episode counter; currently unused (kept for a possible
                   epsilon = 1/k schedule)
        epsilon -- probability of picking a random action for a state
        Returns an integer array with one action per state.
        """
        policy = np.zeros(self.env.observation_space.n,dtype=int)
        for s in range(self.env.observation_space.n):
            if np.random.rand() < epsilon:
                policy[s] = self.env.action_space.sample()
            else:
                policy[s] = self._random_argmax(Q_SA[s])
        return policy

    def evaluate(self,policy,trials=100):
        """Return the average undiscounted episode reward of ``policy``.

        Runs ``trials`` episodes and divides the summed reward by ``trials``.
        """
        rewardAll = 0
        for _ in range(trials):
            state = self.env.reset()
            done = False
            rewardTrial = 0
            while not done:
                state, reward, done, _info = self.env.step(policy[state])
                rewardTrial += reward
            rewardAll += rewardTrial
        return rewardAll/trials

In [153]:
# Wrap the FrozenLake environment created above; all later cells use `mc`.
mc = MonteCarlo(envFrozLake)

In [150]:
# Deterministic policy as a state -> action mapping. States 5, 7, 11, 12 and 15
# have no entry (presumably the holes and the goal, where no action is ever
# requested -- confirm against the map).
pi = {0:1, 1:2, 2:1, 3:0, 4:1, 6:1, 8:2, 9:0, 10:1, 13:2, 14:2}
# What is the average performance of the policy, i.e. the average episode
# reward over evaluate's default of 100 trials? (Presumably this equals the
# fraction of episodes in which the agent reaches the goal from the start.)
mc.evaluate(pi)

0.01

In [156]:
# Monte Carlo prediction 1: every-visit MC estimate of V under `pi`,
# undiscounted (gamma=1, 10000 episodes by default). Display the value of
# state 0 (presumably the start state).
v = mc.every_visit_monte_carlo_prediction_V(pi,gamma=1)
v[0]

0.024880640172147344

In [159]:
# Monte Carlo prediction 2: every-visit MC estimate of Q under `pi`
# (default gamma=0.9). Then print the row-wise maximum over actions as V.
q = mc.every_visit_monte_carlo_prediction_Q(pi)
print('Q_SA')
print(q)
print('V')
print(np.max(q,axis=1))

Q_SA
[[ 0.          0.00942032  0.          0.        ]
 [ 0.          0.          0.00997466  0.        ]
 [ 0.          0.02368717  0.          0.        ]
 [ 0.0118944   0.          0.          0.        ]
 [ 0.          0.0120402   0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.05823837  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.02753986  0.        ]
 [ 0.07974976  0.          0.          0.        ]
 [ 0.          0.19516859  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.27971245  0.        ]
 [ 0.          0.          0.58023277  0.        ]
 [ 0.          0.          0.          0.        ]]
V
[ 0.00942032  0.00997466  0.02368717  0.0118944   0.0120402   0.
  0.05823837  0.          0.02753986  0.07974976  0.19516859  0.          0.
  0.27971245  0.58023277  0.      

In [160]:
# Monte Carlo control: returns visit counts, the Q table and a greedy policy.
# Progress is printed every 1000 episodes.
n,q,p = mc.MonteCarlo_Control(nb_episodes=100000)

Expected Reward after 1000 episodes
0.19
------------------------------------------------
Expected Reward after 2000 episodes
0.15
------------------------------------------------
Expected Reward after 3000 episodes
0.25
------------------------------------------------
Expected Reward after 4000 episodes
0.2
------------------------------------------------
Expected Reward after 5000 episodes
0.57
------------------------------------------------
Expected Reward after 6000 episodes
0.8
------------------------------------------------
Expected Reward after 7000 episodes
0.78
------------------------------------------------
Expected Reward after 8000 episodes
0.74
------------------------------------------------
Expected Reward after 9000 episodes
0.76
------------------------------------------------
Expected Reward after 10000 episodes
0.74
------------------------------------------------
Expected Reward after 11000 episodes
0.69
------------------------------------------------
Expected R

Expected Reward after 92000 episodes
0.81
------------------------------------------------
Expected Reward after 93000 episodes
0.75
------------------------------------------------
Expected Reward after 94000 episodes
0.66
------------------------------------------------
Expected Reward after 95000 episodes
0.71
------------------------------------------------
Expected Reward after 96000 episodes
0.73
------------------------------------------------
Expected Reward after 97000 episodes
0.74
------------------------------------------------
Expected Reward after 98000 episodes
0.75
------------------------------------------------
Expected Reward after 99000 episodes
0.72
------------------------------------------------
Expected Reward after 100000 episodes
0.68
------------------------------------------------


In [101]:
# Scratch check of the tie-breaking argmax expression used in
# MonteCarlo_Control: the boolean mask marks every entry equal to its row
# maximum; multiplying by uniform random numbers and taking the argmax picks
# one of the marked columns per row.
b = np.asarray([[1,1,1,1],[0,0,1,1]])
np.argmax(np.random.random(b.shape) * (b.T==b.max(axis=1)).T, axis=1)


array([1, 2])

In [47]:
# Scratch check: draw one random action index per state, the same expression
# MonteCarlo_Control uses for its initial policy.
np.random.randint(0,envFrozLake.action_space.n,envFrozLake.observation_space.n)

array([0, 3, 1, 0, 0, 3, 2, 2, 0, 0, 1, 3, 3, 3, 1, 3])

In [137]:
# Second Monte Carlo control run with the same arguments; overwrites n, q, p.
n,q,p = mc.MonteCarlo_Control(nb_episodes=100000)

Expected Reward after 1000 episodes
0.25
------------------------------------------------
Expected Reward after 2000 episodes
0.17
------------------------------------------------
Expected Reward after 3000 episodes
0.56
------------------------------------------------
Expected Reward after 4000 episodes
0.51
------------------------------------------------
Expected Reward after 5000 episodes
0.69
------------------------------------------------
Expected Reward after 6000 episodes
0.74
------------------------------------------------
Expected Reward after 7000 episodes
0.67
------------------------------------------------
Expected Reward after 8000 episodes
0.65
------------------------------------------------
Expected Reward after 9000 episodes
0.67
------------------------------------------------
Expected Reward after 10000 episodes
0.71
------------------------------------------------
Expected Reward after 11000 episodes
0.54
------------------------------------------------
Expected

Expected Reward after 92000 episodes
0.72
------------------------------------------------
Expected Reward after 93000 episodes
0.63
------------------------------------------------
Expected Reward after 94000 episodes
0.8
------------------------------------------------
Expected Reward after 95000 episodes
0.77
------------------------------------------------
Expected Reward after 96000 episodes
0.72
------------------------------------------------
Expected Reward after 97000 episodes
0.73
------------------------------------------------
Expected Reward after 98000 episodes
0.66
------------------------------------------------
Expected Reward after 99000 episodes
0.77
------------------------------------------------
Expected Reward after 100000 episodes
0.73
------------------------------------------------
