In [1]:
import gymnasium as gym
import collections
from torch.utils.tensorboard import SummaryWriter

2024-07-06 21:18:14.547778: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
GAMMA = 0.9
TEST_EPISODES = 2
ENV = "FrozenLake8x8-v1"

In [10]:
class Agent():
    def __init__(self, env=ENV):
        self.env = gym.make(env)
        self.old_obs, _ = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(collections.Counter)
        self.values = collections.defaultdict(float)

    def play_n_random_steps(self, count=1000):
        for i in range(count):
            action = self.env.action_space.sample()
            new_obs, reward, terminated, truncated, info = self.env.step(action)
            self.rewards[(self.old_obs, action, new_obs)] = reward
            self.transits[(self.old_obs, action)][new_obs] += 1
            if (terminated or truncated):
                self.old_obs, _ = self.env.reset()
            else:
                self.old_obs = new_obs

    def calc_action_value(self, state, action):
        target_counts = self.transits[(state, action)]
        total = sum(target_counts.values())
        action_value = 0.0
        for target_state, count in target_counts.items():
            reward = self.rewards[(state, action, target_state)]
            val = reward + GAMMA*self.values[(target_state)]
            action_value += (count/total)*val
        return action_value

    def select_action(self, state):
        best_action, best_value = None, None
        for action in range(self.env.action_space.n):
            action_value = self.calc_action_value(state, action)
            if best_value is None or best_value<action_value:
                best_action = action
                best_value = action_value
        return best_action
    
    def play_episode(self, env):
        total_reward = 0.0
        state, _ = env.reset()
        while True:
            action = self.select_action(state)
            new_state, reward, terminated, truncated, _ = env.step(action)
            self.rewards[(state, action, new_state)] = reward
            self.transits[(state, action)][new_state] += 1
            #act_val = self.calc_action_value(state, action)
            #self.values[state] = max(act_val, self.values[state]) 
            total_reward += reward
            if (terminated or truncated):
                break
            state = new_state
        return total_reward

    def value_iteration(self):
        for state in range(self.env.observation_space.n):
            state_values = [self.calc_action_value(state, action) for action in range(self.env.action_space.n)]
            self.values[state] = max(state_values)

In [17]:
if __name__=='__main__':
    test_env = gym.make(ENV)
    agent = Agent()
    iter_no = 0
    best_reward = 0.0
    agent.play_n_random_steps(100)
    while True:
        iter_no += 1
        agent.play_n_random_steps(100)
        agent.value_iteration()
        reward = 0.0
        for _ in range(TEST_EPISODES):
            reward += agent.play_episode(test_env)
            #agent.value_iteration()
        reward /=  TEST_EPISODES
        print(reward)
        if (reward > best_reward):
            best_reward = reward
            print(f"Best reward updated: {best_reward}")
            print(f"V(S) = {agent.values}")
        
        if (best_reward > 0.9):
            print("WON!")
            print(f"Solved in {iter_no} iterations")
            break
        print(iter_no)

0.0
1
0.0
2
0.0
3
0.0
4
0.0
5
0.0
6
0.0
7
0.0
8
0.0
9
0.0
10
0.0
11
0.0
12
0.0
13
0.0
14
0.0
15
0.0
16
0.0
17
0.0
18
0.0
19
0.0
20
0.0
21
0.0
22
0.0
23
0.0
24
0.0
25
0.0
26
0.0
27
0.0
28
0.0
29
0.0
30
0.0
31
0.0
32
0.0
33
0.0
34
0.0
35
0.0
36
0.0
37
0.0
38
0.0
39
0.0
40
0.0
41
0.0
42
0.0
43
0.0
44
0.0
45
0.0
46
0.0
47
0.0
48
0.0
49
0.0
50
0.0
51
0.0
52
0.0
53
0.0
54
0.0
55
0.0
56
0.0
57
0.0
58
0.0
59
0.0
60
0.0
61
0.0
62
0.0
63
0.0
64
0.0
65
0.0
66
0.0
67
0.0
68
0.0
69
0.0
70
0.0
71
0.0
72
0.0
73
0.0
74
0.0
75
0.0
76
0.0
77
0.0
78
0.0
79
0.0
80
0.0
81
0.0
82
0.0
83
0.0
84
0.0
85
0.0
86
0.0
87
0.0
88
0.0
89
0.0
90
0.0
91
0.0
92
0.0
93
0.0
94
0.0
95
0.0
96
0.0
97
0.0
98
0.0
99
0.0
100
0.0
101
0.0
102
0.0
103
0.0
104
0.0
105
0.0
106
0.0
107
0.0
108
0.0
109
0.0
110
0.0
111
0.0
112
0.0
113
0.0
114
0.0
115
0.0
116
0.0
117
0.0
118
0.0
119
0.0
120
0.0
121
0.0
122
0.0
123
0.0
124
0.0
125
0.0
126
0.0
127
0.0
128
0.0
129
0.0
130
0.0
131
0.0
132
0.0
133
0.0
134
0.0
135
0.0
136
0.0
137
0.0
138
0.0


In [39]:
test_env.close()

In [38]:
class Agent():
    def __init__(self, env=ENV):
        self.env = gym.make(env)
        self.old_obs, _ = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(collections.Counter)
        self.values = collections.defaultdict(float)

    def play_n_random_steps(self, count=1000):
        for i in range(count):
            action = self.env.action_space.sample()
            new_obs, reward, terminated, truncated, info = self.env.step(action)
            self.rewards[(self.old_obs, action, new_obs)] = reward
            self.transits[(self.old_obs, action)][new_obs] += 1
            if (terminated or truncated):
                self.old_obs, _ = self.env.reset()
            else:
                self.old_obs = new_obs

    # def calc_action_value(self, state, action):
    #     target_counts = self.transits[(state, action)]
    #     total = sum(target_counts.values())
    #     action_value = 0.0
    #     for target_state, count in target_counts.items():
    #         reward = self.rewards[(state, action, target_state)]
    #         val = reward + GAMMA*self.values[(target_state)]
    #         action_value += (count/total)*val
    #     return action_value

    def select_action(self, state):
        best_action, best_value = None, None
        for action in range(self.env.action_space.n):
            action_value = self.values[(state, action)]
            if best_value is None or best_value<action_value:
                best_action = action
                best_value = action_value
        return best_action
    
    def play_episode(self, env):
        total_reward = 0.0
        state, _ = env.reset()
        while True:
            action = self.select_action(state)
            new_state, reward, terminated, truncated, _ = env.step(action)
            self.rewards[(state, action, new_state)] = reward
            self.transits[(state, action)][new_state] += 1
            #act_val = self.calc_action_value(state, action)
            #self.values[state] = max(act_val, self.values[state]) 
            total_reward += reward
            if (terminated or truncated):
                break
            state = new_state
        return total_reward

    def value_iteration(self):
        for state in range(self.env.observation_space.n):
            for action in range(self.env.action_space.n):
                target_counts = self.transits[(state, action)]
                total = sum(target_counts.values())
                action_value = 0.0
                for target_state, count in target_counts.items():
                    reward = self.rewards[(state, action, target_state)]
                    best_action = self.select_action(target_state)
                    val = (count / total) * (reward + GAMMA * self.values[(target_state, best_action)])
                    action_value += (count/total)*val
                self.values[(state, action)] = val

In [42]:
TEST_EPISODES = 2
EVAL_EPISODES = 5
ENV = "FrozenLake-v1"
if __name__=='__main__':
    test_env = gym.make(ENV)
    agent = Agent()
    iter_no = 0
    best_reward = 0.0
    while True:
        iter_no += 1
        agent.play_n_random_steps(100)
        agent.value_iteration()
        reward = 0.0
        for _ in range(TEST_EPISODES):
            reward += agent.play_episode(test_env)
            #agent.value_iteration()
        reward /=  TEST_EPISODES
        print(reward)
        if (reward > best_reward):
            best_reward = reward
            print(f"Best reward updated: {best_reward}")
            print(f"V(S) = {agent.values}")
        
        if (best_reward > 0.9):
            print("WON!")
            print(f"Solved in {iter_no} iterations")
            break
        print(iter_no)
    
    test_env.close()
    test_env = gym.make(ENV, render_mode='human')
    reward = 0.0
    for i in range(EVAL_EPISODES):
        reward += agent.play_episode(test_env)
    print(f'EVAL REWARD: {reward/EVAL_EPISODES}')
    
    test_env.close()

0.0
1
0.0
2
0.0
3
0.0
4
0.0
5
0.0
6
0.0
7
0.0
8
0.0
9
0.0
10
0.0
11
0.0
12
0.0
13
0.0
14
0.0
15
0.0
16
0.0
17
0.0
18
0.0
19
0.0
20
0.0
21
0.0
22
0.0
23
0.0
24
0.0
25
0.0
26
0.0
27
0.0
28
0.0
29
0.0
30
0.0
31
0.0
32
0.0
33
0.0
34
0.0
35
0.0
36
0.0
37
0.0
38
0.0
39
0.0
40
0.0
41
0.0
42
0.0
43
0.0
44
0.0
45
0.0
46
0.5
Best reward updated: 0.5
V(S) = defaultdict(<class 'float'>, {(0, 0): 0.0, (0, 1): 0.0, (0, 2): 0.0, (0, 3): 0.0, (4, 0): 0.0, (4, 1): 0.0, (4, 2): 0.0, (4, 3): 0.0, (1, 0): 0.0, (1, 1): 0.0, (1, 2): 0.0, (1, 3): 0.0, (2, 0): 0.0, (2, 1): 0.0, (2, 2): 0.0, (2, 3): 0.0, (5, 0): 0.0, (5, 1): 0.0, (5, 2): 0.0, (5, 3): 0.0, (6, 0): 0.0075282357752230735, (6, 1): 0.0075282357752230735, (6, 2): 0.0, (6, 3): 0.0, (3, 0): 0.0, (3, 1): 0.0, (3, 2): 0.0, (3, 3): 0.0, (7, 0): 0.0, (7, 1): 0.0, (7, 2): 0.0, (7, 3): 0.0, (8, 0): 7.037076420693976e-05, (8, 1): 4.5464257774146474e-05, (8, 2): 0.0002384282910942468, (8, 3): 0.0, (12, 0): 0.0016938530494251915, (12, 1): 0.0016938530494251915