## Monte Carlo control: first-visit and every-visit

In [1]:
import pandas as pd
import numpy as np
import gymnasium as gym
from collections import defaultdict

In [None]:
# Create the CliffWalking environment; render_mode="human" requests on-screen rendering.
env = gym.make("CliffWalking-v0",render_mode = "human")
# Each time step incurs -1 reward, and stepping into the cliff incurs -100 reward.

In [3]:
env.reset()  # returns (state, info); normally unpack it as: state, info = env.reset()

(36, {'prob': 1})

In [4]:
env.observation_space
# 48 discrete states: one per cell of the 4x12 grid

Discrete(48)

In [5]:
env.action_space
# 4 discrete actions -> 0: up, 1: right, 2: down, 3: left

Discrete(4)

In [6]:
# Sample run of the env: play one episode with uniformly random actions
# and print every action taken plus the episode's total reward.
state, info = env.reset()  # start a fresh episode so this cell can be re-run safely
done = False
total_reward = 0
while not done:
    action = env.action_space.sample()
    print(action)
    state, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    # The episode is over when the env reports either termination or truncation.
    done = terminated or truncated
    env.render()
print(total_reward)

2
2
1
3
3
2
3
3
2
1
0
2
2
3
2
3
2
3
1
1
3
3
3
0
3
1
0
3
0
1
2
1
3
0
3
2
2
3
1
0
3
3
0
0
1
3
1
3
0
3
0
0
1
1
2
2
0
1
3
0
0
1
2
2
2
1
2
0
0
2
0
1
0
0
0
0
3
2
1
2
3
2
0
2
0
0
0
3
3
3
0
2
3
0
1
2
0
0
2
3
2
3
2
0
3
1
2
1
0
3
0
0
2
3
2
3
2
0
0
3
1
2
2
2
0
0
2
0
2
2
2
2
0
0
3
0
0
2
3
3
1
2
1
2
0
2
3
2
2
0
1
1
0
0
2
2
3
0
3
0
0
3
3
0
1
1
0
3
2
2
2
3
3
0
0
1
1
2
1
3
3
3
1
2
1
2
1
1
2
1
2
1
2
3
1
2
1
1
2
2
0
1
2
3
1
2
1
1
2
3
3
0
1
1
1
2
0
1
0
2
0
2
0
2
0
2
1
3
1
2
0
0
3
2
1
3
1
3
3
1
0
0
0
3
2
1
0
2
3
0
1
2
0
1
2
3
1
2
1
0
2
2
3
3
3
0
0
3
0
3
1
3
2
3
1
3
0
2
2
2
2
1
1
2
2
1
3
3
1
3
2
0
2
3
3
1
1
1
3
3
0
1
1
3
2
1
1
1
0
2
3
2
1
3
3
1
3
3
2
0
1
3
1
3
3
0
3
0
0
1
2
2
2
0
3
2
3
2
0
0
3
2
1
0
3
1
2
1
0
2
0
2
3
0
3
0
1
0
1
2
2
2
0
1
0
1
1
2
1
3
1
0
0
2
2
0
0
0
1
3
3
0
3
2
0
2
2
0
3
3
0
1
1
3
0
1
1
1
1
1
0
3
3
2
1
0
2
1
0
0
3
2
2
0
2
2
1
2
0
3
0
2
2
1
2
2
1
1
0
1
1
3
1
3
2
2
1
2
0
3
1
1
3
3
0
3
0
2
2
1
0
0
1
2
3
3
3
1
0
3
0
1
2
2
3
0
2
1
0
3
2
0
2
2
3
2
0
3
2
2
1
3
1
1
0
3
2
0
3
2
0
0
0
3
0
2
3
3
3
3


In [7]:
def generate_episode(env,Q,epsilon=0.1,max_steps=None):
    """Roll out one episode using an epsilon-greedy policy derived from Q.

    Args:
        env: Environment exposing ``reset()`` -> ``(state, info)``,
            ``step(action)`` -> ``(state, reward, terminated, truncated, info)``
            and ``action_space.n``.
        Q: Mapping from state to an array of action values; it is indexed
            with ``Q[state]`` whenever the greedy action is chosen.
        epsilon: Probability of picking a uniformly random action instead of
            the greedy one.
        max_steps: Optional cap on the number of steps. ``None`` (the default)
            keeps going until the environment ends the episode.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken, where
        ``state`` is the state the action was taken in.
    """
    episode = []
    state, info = env.reset()
    done = False
    while not done and (max_steps is None or len(episode) < max_steps):
        if np.random.rand() < epsilon:
            # Explore: uniformly random action.
            action = np.random.choice(env.action_space.n)
        else:
            # Exploit: best action under the current estimates.
            action = np.argmax(Q[state])
        # step() reports `terminated` before `truncated`; keep the names in that order.
        next_state, reward, terminated, truncated, info = env.step(action)
        # Record the state the action was taken in, the action and its reward.
        episode.append((state, action, reward))
        state = next_state
        done = terminated or truncated
    return episode

In [12]:
def monte_carlo_first_visit(env,num_episodes,gamma=1.0,epsilon=0.1):
    """Estimate Q(s, a) with first-visit Monte Carlo and return the greedy policy.

    For every episode, the return G that follows the *first* occurrence of
    each (state, action) pair is averaged into Q[state][action]; later
    occurrences of the same pair within that episode are ignored.

    Args:
        env: Environment passed on to ``generate_episode``; its
            ``action_space.n`` sets the number of action values per state.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor used in ``G = gamma * G + reward``.
        epsilon: Exploration rate forwarded to ``generate_episode``.

    Returns:
        Dict mapping each state present in Q to ``np.argmax(Q[state])``.
    """
    n_actions = env.action_space.n
    # Action values start at 0 for every action of a newly seen state.
    Q = defaultdict(lambda: np.zeros(n_actions))
    # Running sum and count of first-visit returns per (state, action);
    # the mean is sum / count, so no list has to be re-averaged on each update.
    return_sums = defaultdict(float)
    return_counts = defaultdict(int)

    for _ in range(num_episodes):
        episode = generate_episode(env, Q, epsilon)

        # Time step at which each (state, action) pair first occurs.
        first_visit = {}
        for t, (state, action, _reward) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        G = 0
        # Walk backwards so G accumulates the discounted return from step t onwards.
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            key = (state, action)
            # Only the earliest occurrence counts; a "seen" set filled during
            # the backward scan would select the last occurrence instead.
            if first_visit[key] != t:
                continue
            return_sums[key] += G
            return_counts[key] += 1
            Q[state][action] = return_sums[key] / return_counts[key]

    # Greedy policy: best estimated action for every state present in Q.
    policy = {}
    for state in Q:
        policy[state] = np.argmax(Q[state])

    return policy

In [None]:
def monte_carlo_every_visit(env,num_episodes,gamma=1.0,epislon=0.1,epsilon=None):
    """Estimate Q(s, a) with every-visit Monte Carlo and return the greedy policy.

    Every occurrence of a (state, action) pair in an episode contributes the
    return G that follows it to the average stored in Q[state][action].

    Args:
        env: Environment passed on to ``generate_episode``; its
            ``action_space.n`` sets the number of action values per state.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor used in ``G = gamma * G + reward``.
        epislon: Exploration rate. The misspelled name is kept so existing
            callers keep working; it is used when ``epsilon`` is not given.
        epsilon: Correctly spelled exploration rate; overrides ``epislon``
            when it is not ``None``.

    Returns:
        Dict mapping each state present in Q to ``np.argmax(Q[state])``.
    """
    explore_rate = epislon if epsilon is None else epsilon
    n_actions = env.action_space.n
    # Action values start at 0 for every action of a newly seen state.
    Q = defaultdict(lambda: np.zeros(n_actions))
    # Running sum and count of returns per (state, action); the mean is
    # sum / count, so no growing list has to be re-averaged on every step.
    return_sums = defaultdict(float)
    return_counts = defaultdict(int)

    for _ in range(num_episodes):
        episode = generate_episode(env, Q, explore_rate)
        G = 0
        # Walk backwards so G accumulates the discounted return from step t onwards.
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            # Every visit counts: update on each occurrence of the pair.
            key = (state, action)
            return_sums[key] += G
            return_counts[key] += 1
            Q[state][action] = return_sums[key] / return_counts[key]

    # Greedy policy: best estimated action for every state present in Q.
    policy = {}
    for state in Q:
        policy[state] = np.argmax(Q[state])

    return policy

In [None]:
def evaluate_policy(policy, env, num_episodes=100, max_steps=None):
    """Run a policy for several episodes and return the average episode reward.

    Args:
        policy: Mapping from state to action.
        env: Environment exposing ``reset()`` -> ``(state, info)``,
            ``step(action)`` -> ``(state, reward, terminated, truncated, info)``
            and ``action_space.sample()``.
        num_episodes: Number of evaluation episodes; the result is the total
            reward divided by this value, so 0 raises ``ZeroDivisionError``.
        max_steps: Optional cap on the steps per episode. ``None`` (the
            default) runs until the environment ends the episode, which never
            happens if a deterministic policy cycles between states.

    Returns:
        Total reward collected over all episodes divided by ``num_episodes``.
    """
    total_reward = 0
    for _ in range(num_episodes):
        state, _info = env.reset()
        done = False
        steps = 0
        while not done and (max_steps is None or steps < max_steps):
            try:
                action = policy[state]
            except KeyError:
                # The policy has no entry for this state (e.g. it was never
                # seen while learning); fall back to a sampled action rather
                # than aborting the evaluation.
                action = env.action_space.sample()
            state, reward, terminated, truncated, _info = env.step(action)
            total_reward += reward
            steps += 1
            done = terminated or truncated
    return total_reward / num_episodes


In [18]:
# Learn a policy with each Monte Carlo variant, then print its average evaluation reward.
mc_first_policy = monte_carlo_first_visit(env,10)  # only 10 episodes, as a quick test
mc_first_policy_reward = evaluate_policy(mc_first_policy,env)
print(mc_first_policy_reward)
mc_every_policy = monte_carlo_every_visit(env,10)
mc_every_policy_reward = evaluate_policy(mc_every_policy,env)
print(mc_every_policy_reward)
# Takes a long time to run.
# NOTE(review): presumably because env was created with render_mode="human" and
# neither training nor evaluation limits the number of steps per episode — confirm.

KeyboardInterrupt: 