## MC Exploring starts and on-policy first visit MC(epsilon soft)

In [8]:
import pandas as pd
import numpy as np
import gymnasium as gym
from collections import defaultdict

In [2]:
env = gym.make("CliffWalking-v0")
state,_ = env.reset()

In [3]:
def generate_episode_ES(env,Q):
    #so for ES we generate by choosing random state and action
    episode = []
    state = np.random.choice(env.observation_space.n)
    action = np.random.choice(env.action_space.n)
    env.unwrapped.s = state
    next_state,reward,terminated,truncated,_ = env.step(action)
    episode.append((state,action,reward))
    state = next_state
    done = terminated,truncated
    
    while not done:
        action = np.argmax(Q[state])#here argmax action choose
        next_state,reward,terminated,truncated,_ = env.step(action)
        episode.append(state,action,reward)
        state = next_state
        done = terminated,truncated
    
    return episode

In [18]:
def exploring_states_mc(env,num_episodes,gamma=1.0):
    Q = defaultdict(lambda:np.zeros(4))
    returns = defaultdict(list)
    policy = np.zeros(env.observation_space.n,dtype =int)#for each state put 0 now
    for i in range(num_episodes):
        episode = generate_episode_ES(env,Q)#not e policy random start now
        G = 0
        visited_set = set()#as it also follow first visit only

        for t in reversed(range(len(episode))):
            state,action,reward = episode[t]
            G = gamma*G+reward
            if (state,action) not in visited_set:
                #first visit so
                returns[(state,action)].append(G)
                Q[state][action] = np.mean(returns[(state,action)])
                visited_set.add((state,action))
                #update policy also here
                policy[state] = np.argmax(Q[state])#so update policy here only
    
    return Q,policy

In [7]:
def generate_episode_epsilon(env,Q,epsilon = 0.1):
    #this is normal where we just gnereate the episode using epsilon greedy only
    episode =[]
    state,_ = env.reset()
    done = False
    while not done:
        if np.random.rand() < epsilon:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(Q[state])

        next_state,reward,terminated,truncated,_ = env.step(action)
        episode.append(state,action,reward)
        state = next_state
        done = terminated or truncated
    
    return episode

In [9]:
def on_policy_fvmc_e_soft(env,num_episodes,gamma=1.0,epsilon=0.1):#same code as of first_visit_mc(if e greedy we use for explore exploit then proper only)
    Q = defaultdict(lambda :np.zeros(4))
    returns = defaultdict(list)
    
    for i in range(num_episodes):
        episode = generate_episode_epsilon(env,Q,epsilon)
        G =0
        visited_set = set()
        for t in reversed(range(len(episode))):
            state,action,reward = episode[t]
            G = gamma*G + reward
            if (state,action) not in visited_set:
                returns[(state,action)].append(G)
                Q[state][action]=np.mean(returns[(state,action)])#basic formula apply returns by count so mean it is
                visited_set.add((state,action))
    policy ={}
    for state in Q:
        #for each state get best action and add to policy
        policy[state] = np.argmax(Q[state])
    
    return Q,policy

In [10]:
def evaluate_policy(policy, env, num_episodes=100):
    total_reward = 0
    for _ in range(num_episodes):
        done = False
        state, _ = env.reset()
        while not done:
            action = policy[state]#when policy learnt use that
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
    return total_reward / num_episodes

In [19]:
Q , exploring_states_mc_policy = exploring_states_mc(env,10)
avg_reward = evaluate_policy(exploring_states_mc_policy,env,10)#sample for 10
print(Q)
print(exploring_states_mc_policy)
print(avg_reward)

KeyboardInterrupt: 

In [None]:
Q , on_policy_fvmc_e_soft_policy = on_policy_fvmc_e_soft(env,10)
avg_reward = evaluate_policy(on_policy_fvmc_e_soft_policy,env,10)#sample for 10
print(Q)
print(on_policy_fvmc_e_soft_policy)
print(avg_reward)