In [26]:
import gymnasium as gym
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from helper_functions import reward_print
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "cpu"
)
device = "cpu"
print(device)
CUDA_LAUNCH_BLOCKING=1

cpu


In [27]:
# AC2 algorithm 
class A2C:
    def __init__(self, env):
        self.actor = Actor(env)
        self.critic = Critic(self.actor.env.observation_space.shape[0])
    
    # Main training loop
    def train(self, episodes, gamma, greedy=False):
        total_reward = [0] * episodes
        for i in range(episodes):
            step = rewards = 0
            done = False     
            state, info = self.actor.env.reset()
            state = torch.tensor(state)
            action_list = []
            while not done:
                # Actor makes decision 
                # Environment returns state and reward
                next_state, reward, done, action = self.actor.act(state, greedy)
                next_state = torch.tensor(next_state)#.todevice
                # Critic evaluates action 
                adv = self.critic.evaluate(state, next_state, reward, gamma)
                # Pass that value to the Actor
                self.actor.evaluation(action, adv, state)
                #action_list += action
                state = next_state
                step += 1
                rewards += reward
                
                # Before it is done
                if done:
                    reward = -10
                    next_state = None
                    adv = self.critic.evaluate(state, next_state, reward, gamma ** step)
                    self.actor.evaluation(action, adv, state)
                
            total_reward[i] = rewards
            print("Episode:", i, " Reward", rewards)
            #print(action_list)
        self.actor.env.close()
        return total_reward

               
    def save(self, filename):
        with open("pickles/" + filename + "actor.pickle", 'wb') as file:
            pickle.dump(self.actor.policy_net.state_dict(), file)
        with open("pickles/" + filename + "critic.pickle", 'wb') as file:
            pickle.dump(self.critic.policy_net.state_dict(), file)

        

In [28]:
# Actor thread
class ActorNet(nn.Module):  
    def __init__(self, obs, act):
        super(ActorNet, self).__init__()
        self.layer1 = nn.Linear(obs, 128)
        self.layer2 = nn.Linear(128, act)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.softmax(self.layer2(x), dim=-1)
        return x
        

class Actor:
    def __init__(self, env):
        self.env_type = env
        self.env = gym.make(env)
        self.policy_net = ActorNet(self.env.observation_space.shape[0], self.env.action_space.shape[0])
        self.optimizer = optim.AdamW(self.policy_net.parameters(), amsgrad=True, lr=.001)
    
    def act(self, state, greedy):
        # Get the weights from the policy net
        weights = self.policy_net(state)
        # if greedy get max-arg 
        if greedy: 
            action = torch.argmax(weights)
        # Use multinomial to select probability / action
        else:
            action = torch.multinomial(weights, 1)
        # Run and return the action 
        state, reward, terminated, truncated, _ = self.env.step(action.item())
        return state, reward, terminated or truncated, action

    
    def evaluation(self, action, advantage, state):
        # Get the weights from the policy 
        weight = self.policy_net(state)
        # Calculate the log probability with the weights of the 
        # current state and action and then use the adv to get the loss 
        prob = torch.distributions.Categorical(weight).log_prob(action)  
        loss = -1 * prob * advantage.detach()
        # back prop
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    
    
    def change_render(self, render):
        if render:
            self.env = gym.make(self.env_type, render_mode="human", max_episode_steps=200)
        else: 
            self.env = gym.make(self.env_type, max_episode_steps=200)

In [29]:
# Critic thread
class CriticNet(nn.Module):  
    def __init__(self, obs):
        super(CriticNet, self).__init__()
        self.layer1 = nn.Linear(obs, 128)
        self.layer2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        return  self.layer2(x)


class Critic:
    def __init__(self, obs):
        
        self.policy_net = CriticNet(obs)
        self.optimizer = optim.AdamW(self.policy_net.parameters(),amsgrad=True, lr=.001)
    
    
    def evaluate(self, state, next_state, reward, gamma):
        # Get Qvalue and next Qvalue from policy         
        Qvalue = self.policy_net(state)
        if next_state is not None:
            next_Qvalue = self.policy_net(next_state)
        else: 
            next_Qvalue = 0
        
        # Calculate the TD and advantage for the next action
        TD = reward + (gamma * next_Qvalue)
        # print(next_Qvalue)
        adv = TD - Qvalue
        TD = torch.tensor([TD])
        loss_function = nn.MSELoss()
        loss = loss_function(Qvalue, TD)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return adv

In [30]:
# AC2 Agent for Cart Pole
environment = 'BipedalWalker-v3'
agent = A2C(environment)

episodes = 10
gamma = .99

agent.actor.change_render(True)
# Main training session
total_rewards = agent.train(episodes, gamma)
print("Best reward: ", max(total_rewards))
reward_print(total_rewards, episodes, "grid world")

# Greedy run 
agent.actor.change_render(True)
total_greedy_rewards = agent.train(11, gamma, greedy=True)
reward_print(total_greedy_rewards, 10, "greedy")

TypeError: 'int' object is not subscriptable

In [20]:
environment = gym.make('BipedalWalker-v3', render_mode="human")

state, info = environment.reset()
print(environment.action_space.shape[0])
done = False
for i in range(100):
    if not done:
        state, reward, terminated, truncated, _  = environment.step(environment.action_space.sample())
        done = terminated or truncated
        print(reward)
    else:
        break
environment.close()


4
-0.21270431207492949
-0.13174029403676474
-0.13460298477610189
-0.07263028202826542
-0.11699600247542181
-0.12238068797190867
-0.13231642260154325
-0.12885902671018998
-0.11081495374441147
-0.10771559218565743
-0.10196031949917593
-0.1442953138748781
-0.1327311413884163
-0.06783421353499214
-0.12090195897221565
-0.1344563980102539
-0.1935158486366272
-0.2191252047518901
-0.21145857246716698
-0.20636485616366188
-0.22125689905881885
-0.10998067883650224
-0.10998010283708573
-0.03342592541376868
-0.001167571504909603
-0.025699895352125166
-0.13610690450171747
-0.17660249789555746
-0.22833079299579184
-0.256558383256197
-0.19351435136795045
-0.04736393974721432
-0.0723199452161789
-0.02540315878391266
0.012657308687765212
0.04901599508523942
0.11854570941627024
0.015809107462566287
0.06425259360174457
-0.04718124775091926
-0.05876371077696601
-0.06533790675798816
-0.04977824911133934
-0.0994478390614168
-0.16975913031896195
-0.12072812155137222
-0.12963821458568414
-0.14022289518515035


In [ ]:
# Add episodes to avoid having to retrain 1000+ episodes
add_episodes = 400
agent.actor.change_render(True)

# Main training session
add_total_rewards = agent.train(add_episodes, gamma)
total_rewards += add_total_rewards
episodes += add_episodes
print("Best reward: ", max(total_rewards))
reward_print(total_rewards, episodes, "Cart Pole")

# Greedy run 
agent.actor.change_render(True)
total_greedy_rewards = agent.train(11, gamma, greedy=True)
reward_print(total_greedy_rewards, 10, "greedy")

In [84]:
# Prints the final graph and saves final weights

print("Average", sum(total_rewards)/ len(total_rewards))
reward_print(total_rewards, episodes, "Bipedal Walker")
reward_print(total_greedy_rewards, 10, "Bipedal Walker")
agent.save("drpreisl_BipedalWalker")

2
