In [None]:
import gym
import numpy as np

# Create the environment
env = gym.make('Blackjack-v1')

# Set the parameters
num_episodes = 100000
gamma = 1.0

# Initialize the empty dictionaries to store state-action values and visit counts
Q = {}
N = {}

# Function to choose an action based on epsilon-greedy strategy
def choose_action(state):
    if state not in Q:
        return env.action_space.sample()
    else:
        return np.argmax(Q[state])

# Monte Carlo algorithm
for episode in range(num_episodes):
    states = []
    actions = []
    rewards = []

    state = env.reset()

    while True:
        action = choose_action(state)
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

        if done:
            break

    G = 0  # Initialize the return

    # Update the state-action values and visit counts
    for t in range(len(states)-1, -1, -1):
        G = gamma * G + rewards[t]
        state = states[t]
        action = actions[t]

        if (state, action) not in N:
            N[(state, action)] = 0

        N[(state, action)] += 1

        if (state, action) not in Q:
            Q[(state, action)] = 0

        Q[(state, action)] += (1 / N[(state, action)]) * (G - Q[(state, action)])

# Print the state-action values
for state, action in Q.keys():
    print(f"State: {state}, Action: {action}, Value: {Q[(state, action)]}")

# Close the environment
env.close()


In [None]:
import gym
import numpy as np

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Set the parameters
num_episodes = 10000
gamma = 0.99

# Initialize the empty dictionaries to store state-action values and visit counts
Q = {}
N = {}

# Monte Carlo algorithm
for episode in range(num_episodes):
    states = []
    actions = []
    rewards = []

    state = env.reset()

    done = False
    while not done:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    G = 0  # Initialize the return
    for t in range(len(states) - 1, -1, -1):
        G = gamma * G + rewards[t]

        state = states[t]
        action = actions[t]

        if (state, action) not in N:
            N[(state, action)] = 0

        N[(state, action)] += 1

        if (state, action) not in Q:
            Q[(state, action)] = 0

        Q[(state, action)] += (1 / N[(state, action)]) * (G - Q[(state, action)])

# Print the learned state-action values
for state, action in Q.keys():
    print(f"State: {state}, Action: {action}, Value: {Q[(state, action)]}")

# Close the environment
env.close()


In [2]:
import gym
import numpy as np

env = gym.make('FrozenLake-v1')

num_episodes = 10000
gamma = 0.99

Q = {}
N = {}

for episode in range(num_episodes):
  states = []
  actions = []
  rewards = []

  state =env.reset()
  done = False
  
  while not done:
    action = env.action_space.sample()
    next_state,reward,done,_ = env.step(action)
    states.append(state)
    actions.append(action)
    rewards.append(reward)

    state = next_state    
    G = 0
    for t in range(len(states)-1,-1,-1):
      G = gamma*G + rewards[t]
      state = states[t]
      action = actions[t]

      if(state,action) not in N:
        N[(state,action)] = 0
      
      N[(state,action)] += 1

      if(state,action) not in Q:
        Q[(state,action)] = 0
      
      Q[(state,action)] += (1/N[(state,action)]) * (G - Q[(state,action)])

    for state,action in Q.keys():
      print(f"State: {state}, Action: {action}, Value: {Q[(state,action)]}")

    env.close

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
State: 0, Action: 2, Value: 0.004569940444982512
State: 0, Action: 0, Value: 0.003290008683036257
State: 0, Action: 1, Value: 0.004365433859509718
State: 0, Action: 3, Value: 0.0028645504478773676
State: 0, Action: 2, Value: 0.004569686946691292
State: 0, Action: 0, Value: 0.0032899550005991295
State: 0, Action: 1, Value: 0.0043653519579720445
State: 0, Action: 3, Value: 0.002864482780265723
State: 0, Action: 2, Value: 0.00456939123422756
State: 0, Action: 0, Value: 0.003289901319913825
State: 0, Action: 1, Value: 0.004365270059507484
State: 0, Action: 3, Value: 0.002864415115850948
State: 0, Action: 2, Value: 0.004569095560033496
State: 0, Action: 0, Value: 0.0032898476409802583
State: 0, Action: 1, Value: 0.004365147217572353
State: 0, Action: 3, Value: 0.002864347454632817
State: 0, Action: 2, Value: 0.004568799924101672
State: 0, Action: 0, Value: 0.0032897671258642286
State: 0, Action: 1, Value: 0.004365024382550753
