In [1]:
# Install required libraries
# Import required libraries
import os
import pickle
import random

import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from helper_functions import reward_print, print_Qtable
# Ask CUDA to run kernels synchronously so an error is reported at the call
# that caused it. This has to be an environment variable: a bare Python
# assignment such as `CUDA_LAUNCH_BLOCKING=1` only creates an unused global.
# NOTE(review): presumably only effective when set before CUDA is
# initialised -- confirm nothing touches the GPU earlier in the session.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# The notebook is pinned to the CPU for now; set FORCE_CPU = False to let the
# device follow CUDA availability instead.
FORCE_CPU = True
use_cuda = (not FORCE_CPU) and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cpu


In [28]:
class Net(nn.Module):
    """Fully connected network: two 32-unit ReLU hidden layers and a linear head.

    Args:
        obs: Number of input features.
        action: Number of output units.
    """

    def __init__(self, obs, action):
        super().__init__()
        hidden_units = 32
        # Layers are created in the same order as they are applied in forward().
        self.layer1 = nn.Linear(obs, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, action)

    def forward(self, x):
        """Return the raw (un-activated) outputs of the final layer for `x`."""
        for hidden_layer in (self.layer1, self.layer2):
            x = F.relu(hidden_layer(x))
        return self.layer3(x)

In [27]:
# AC2 algorithm: an Actor that picks actions paired with a Critic that scores them
class AC2:
    """Actor-critic agent that drives the actor's environment episode by episode.

    Attributes:
        actor: `Actor` owning the environment (`actor.env`) and the policy network.
        critic: `Critic` whose evaluation is forwarded to the actor after each step.
    """

    def __init__(self, env):
        """Build the actor for environment id `env` and a critic sized to match it."""
        self.actor = Actor(env)
        # Critic is constructed from network input/output sizes, not from the
        # environment id, so derive both from the actor's environment.
        obs_size = self.actor.env.observation_space.shape[0]
        n_actions = self.actor.env.action_space.n
        self.critic = Critic(obs_size, n_actions)

    # Main training loop
    def train(self, episodes, gamma, greedy=False):
        """Run `episodes` episodes and return the total reward of each one.

        Args:
            episodes: Number of episodes to run.
            gamma: Discount factor. Accepted for the caller's convenience but
                not used yet, because `Critic.evaluate` takes no discount.
            greedy: When True, always take the highest-scoring action of the
                policy network and skip the critic/actor evaluation calls.

        Returns:
            List of length `episodes` holding the summed reward per episode.
        """
        total_reward = [0] * episodes
        for i in range(episodes):
            # Every episode must start from a freshly reset environment; the
            # environment is read through the actor each time because
            # change_render() replaces it.
            state, _ = self.actor.env.reset()
            rewards = 0
            done = False
            while not done:
                if greedy:
                    action = self._greedy_action(state)
                    next_state, reward, terminated, truncated, _ = self.actor.env.step(action)
                else:
                    # Actor picks an action and steps the environment itself
                    next_state, reward, terminated, truncated, _ = self.actor.act(state)
                    # Critic evaluates the transition
                    value = self.critic.evaluate(state, next_state, reward)
                    # Pass that value to the Actor
                    self.actor.evaluation(value)
                done = terminated or truncated
                rewards += reward
                state = next_state

            total_reward[i] = rewards
        return total_reward

    def _greedy_action(self, state):
        """Return the index of the largest policy-network output for `state`."""
        with torch.no_grad():
            scores = self.actor.policy_net(torch.as_tensor(state, dtype=torch.float32))
        return int(torch.argmax(scores).item())

    def save(self, filename):
        """Pickle the actor and critic network weights to `pickles/<filename>`.

        The file holds a dict with keys "actor" and "critic", each mapping to
        the corresponding network's `state_dict()`.
        """
        # open() does not create missing directories.
        os.makedirs("pickles", exist_ok=True)
        snapshot = {
            "actor": self.actor.policy_net.state_dict(),
            "critic": self.critic.policy_net.state_dict(),
        }
        with open(os.path.join("pickles", filename), 'wb') as file:
            pickle.dump(snapshot, file)

In [24]:
# Actor: owns the environment and the policy network that chooses actions
class Actor:
    """Wraps a gymnasium environment together with a policy network.

    Attributes:
        env_type: Environment id passed to `gym.make`.
        env: The current environment instance (replaced by `change_render`).
        policy_net: `Net` mapping an observation to one score per action.
        optimizer: AdamW optimizer over `policy_net`'s parameters.
    """

    def __init__(self, env):
        """Create the environment `env` and size the policy network from it."""
        self.env_type = env
        self.env = gym.make(env)
        # One reset is needed to learn the observation length.
        state, info = self.env.reset()
        self.policy_net = Net(len(state), self.env.action_space.n)
        self.optimizer = optim.AdamW(self.policy_net.parameters(), amsgrad=True)

    def act(self, state):
        """Pick an action for `state`, step the environment, and return the result.

        Returns:
            The `(next_state, reward, terminated, truncated, info)` tuple
            produced by `env.step`.
        """
        action = self.pick_action(state)
        next_state, reward, terminated, truncated, info = self.env.step(action)
        return next_state, reward, terminated, truncated, info

    def pick_action(self, state):
        """Sample an action index from the policy network's output for `state`."""
        # The network needs a float tensor; observations may arrive as numpy
        # arrays or plain sequences.
        observation = torch.as_tensor(state, dtype=torch.float32)
        logits = self.policy_net(observation)
        # Raw network outputs can be negative, which torch.multinomial rejects,
        # so turn them into a probability distribution first.
        probabilities = F.softmax(logits, dim=-1)
        action = torch.multinomial(probabilities, 1)
        return action.item()

    def evaluation(self, values):
        """Placeholder for the policy update driven by the critic's evaluation.

        TODO: update the actor's policy from `values`; currently a no-op.
        """
        pass

    def change_render(self, render):
        """Recreate the environment, with on-screen rendering when `render` is true."""
        # Release the environment being replaced so it is not leaked.
        self.env.close()
        if render:
            # gymnasium's keyword is `render_mode`, not `render`.
            self.env = gym.make(self.env_type, render_mode="human")
        else:
            self.env = gym.make(self.env_type)

In [25]:
# Critic: holds the network intended to score the actor's decisions
class Critic:
    """Owns the critic network and its optimizer.

    Attributes:
        policy_net: `Net` with `obs` inputs and `action` outputs.
        optimizer: AdamW optimizer over `policy_net`'s parameters.
    """

    def __init__(self, obs, action):
        """Build the critic network from its input (`obs`) and output (`action`) sizes."""
        self.policy_net = Net(obs, action)
        self.optimizer = optim.AdamW(self.policy_net.parameters(), amsgrad=True)

    def evaluate(self, state, next_state, reward):
        """Placeholder for the critic's evaluation of one transition.

        TODO: generate an evaluation to update the policy (Q value / value
        function). Not implemented yet, so this returns None without touching
        the network. An argument-less `torch.gradient()` call must not be used
        here: it raises TypeError on every invocation.
        """
        return None

In [ ]:
# AC2 Agent for Cart Pole
environment = 'CartPole-v1'
agent = AC2(environment)

episodes = 10
# Used for both running and reporting the greedy evaluation so the two counts
# cannot drift apart.
greedy_episodes = 10
# Discount factor; a value above 1 makes discounted returns grow without bound,
# so keep it within [0, 1].
gamma = 0.99

agent.actor.change_render(True)

# Main training session
total_rewards = agent.train(episodes, gamma)
print("Best reward: ", max(total_rewards))
agent.save("drpreisl_part1_assignment3.pickle")
# NOTE(review): the label says "grid world" although the environment is
# CartPole -- confirm how reward_print uses it before changing.
reward_print(total_rewards, episodes, "grid world")

# Greedy run
agent.actor.change_render(True)
total_greedy_rewards = agent.train(greedy_episodes, gamma, greedy=True)
reward_print(total_greedy_rewards, greedy_episodes, "greedy")

# Release the environment once both runs are finished.
agent.actor.env.close()

In [5]:
# Random-action rollout on BipedalWalker: take up to 100 sampled actions,
# printing each action that is actually sent to the environment.
environment = gym.make("BipedalWalker-v3", render_mode="human")

try:
    environment.reset()
    done = False
    for i in range(100):
        if done:
            break
        # Sample once so the printed action is the one that was executed.
        action = environment.action_space.sample()
        state, reward, terminated, truncated, _ = environment.step(action)
        print(action)
        done = terminated or truncated
finally:
    # Close the environment even if a step raises.
    environment.close()


[ 0.39208955 -0.49994665 -0.8134083  -0.8935776 ]
[ 0.996924   -0.46270904  0.25332296 -0.6199383 ]
[-0.16606212  0.8293581  -0.0554404  -0.9682666 ]
[ 0.13676018  0.23906039 -0.56972533  0.92407334]
[ 0.75992817 -0.8317102  -0.05071194  0.20298636]
[-0.49637958 -0.20431677 -0.7113743  -0.91306967]
[ 0.43331325 -0.9967853  -0.21113028 -0.84131455]
[-0.653226    0.1399365   0.86226684 -0.6065078 ]
[-0.22277948  0.24012399  0.8187429   0.25846824]
[-0.7382522   0.82068515 -0.20854723 -0.5146232 ]
[-0.60003424  0.5194155  -0.44214278 -0.6722367 ]
[-0.94243145  0.7762538   0.21926855  0.17618997]
[ 0.3296068  -0.7945417  -0.68480724 -0.5729555 ]
[-0.20722602 -0.2032666  -0.40193555 -0.9935767 ]
[ 0.8102185  -0.99676305  0.41692215  0.7451281 ]
[-0.66372377 -0.8134068   0.23761183 -0.10493127]
[-0.1900641  -0.29664922  0.47275737 -0.6611509 ]
[-0.395759   -0.5186109  -0.06502593 -0.2105364 ]
[-0.15255351  0.23339204 -0.05762069  0.8820962 ]
[-0.4325765  -0.62420124  0.00941765 -0.729864  ]
