In [16]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym
import matplotlib.pyplot as plt
import torch
import pdb
import copy
import random

class ValueNet(nn.Module):
  """Three-layer fully connected network that outputs a single value per input row.

  The module also builds its own Adam optimizer over its parameters and
  exposes it as ``self.optimizer``.
  """

  def __init__(self, inputShape, lr=0.0005):
    """Build the layers and the optimizer.

    Args:
      inputShape: Sequence unpacked into the leading arguments of the first
        ``nn.Linear`` (e.g. ``[8]`` for 8 input features).
      lr: Learning rate passed to Adam.
    """
    super().__init__()
    hiddenWidth = 256
    # Attribute names l1/l2/l3 are the state_dict keys; keep them stable.
    self.l1 = nn.Linear(*inputShape, hiddenWidth)
    self.l2 = nn.Linear(hiddenWidth, hiddenWidth)
    self.l3 = nn.Linear(hiddenWidth, 1)
    self.optimizer = optim.Adam(self.parameters(), lr)

  def forward(self, input):
    """Apply two ReLU-activated linear layers, then the final linear layer."""
    hidden = input
    for layer in (self.l1, self.l2):
      hidden = F.relu(layer(hidden))
    return self.l3(hidden)
class PolicyNet(nn.Module):
  """Three-layer fully connected network producing one output per action.

  ``forward`` returns the raw output of the last linear layer; no softmax is
  applied inside the module. An Adam optimizer over the module's parameters
  is stored on ``self.optimizer``.
  """

  def __init__(self, inputShape, outputShape, lr):
    """Build the layers and the optimizer.

    Args:
      inputShape: Sequence unpacked into the leading arguments of the first
        ``nn.Linear`` (e.g. ``[8]`` for 8 input features).
      outputShape: Number of output units of the final layer.
      lr: Learning rate passed to Adam.
    """
    super().__init__()
    width = 128
    # Attribute names l1/l2/l3 are the state_dict keys; keep them stable.
    self.l1 = nn.Linear(*inputShape, width)
    self.l2 = nn.Linear(width, width)
    self.l3 = nn.Linear(width, outputShape)
    self.optimizer = optim.Adam(self.parameters(), lr)

  def forward(self, input):
    """Run the input through both hidden layers (ReLU) and the output layer."""
    afterFirst = F.relu(self.l1(input))
    afterSecond = F.relu(self.l2(afterFirst))
    logits = self.l3(afterSecond)
    return logits


class replayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, nextState, done) transitions.

    Transitions are stored in preallocated NumPy arrays. Once ``maxSize``
    transitions have been saved, new ones overwrite the oldest slots.
    """

    def __init__(self, maxSize, stateDim):
        """Preallocate storage.

        Args:
            maxSize: Maximum number of transitions kept at once.
            stateDim: Number of features in one state vector.
        """
        self.state = np.zeros((maxSize, stateDim))
        # int64 rather than int8 so action indices >= 128 are stored intact.
        self.action = np.zeros(maxSize, dtype=np.int64)
        self.reward = np.zeros(maxSize)
        self.done = np.zeros(maxSize, dtype=np.int8)
        self.nextState = np.zeros((maxSize, stateDim))
        self.maxSize = maxSize
        self.curser = 0  # index of the slot the next save() writes to
        self.size = 0    # number of slots currently holding valid data

    def save(self, state, action, reward, nextState, done):
        """Write one transition at the cursor and advance it, wrapping at ``maxSize``."""
        slot = self.curser
        self.state[slot] = state
        self.action[slot] = action
        self.reward[slot] = reward
        self.nextState[slot] = nextState
        self.done[slot] = done
        self.curser = (slot + 1) % self.maxSize
        self.size = min(self.size + 1, self.maxSize)

    def sample(self, batchSize):
        """Draw a uniformly random batch of stored transitions (with replacement).

        Args:
            batchSize: Requested number of transitions. The batch is capped at
                the number of transitions currently stored.

        Returns:
            Tuple ``(state, action, reward, nextState, done)`` of arrays whose
            first dimension is ``min(self.size, batchSize)``. An empty buffer
            yields empty arrays instead of raising.
        """
        count = max(0, min(self.size, batchSize))
        if count == 0:
            indexes = np.zeros(0, dtype=np.int64)
        else:
            # Every filled slot 0 .. size-1 is eligible. randint avoids
            # materialising a Python list of all indices on every call.
            indexes = np.random.randint(0, self.size, size=count)
        return (self.state[indexes], self.action[indexes], self.reward[indexes],
                self.nextState[indexes], self.done[indexes])


class Agent():
    """Actor-critic agent for a discrete action space.

    Holds a policy network (action logits), a value network (state value),
    and a replay buffer. The value network is updated once per saved
    transition in ``save`` and again on a sampled batch in ``learn``; the
    policy network is updated only in ``learn``.
    """

    def __init__(self, inputShape, outputShape, gamma=0.99, lr=5e-3, memorySize=1000000):
        """Create the networks and the replay buffer.

        Args:
            inputShape: One-element sequence holding the state dimension (e.g. ``[8]``).
            outputShape: Number of discrete actions.
            gamma: Discount factor.
            lr: Learning rate used for both networks.
            memorySize: Capacity of the replay buffer.
        """
        self.policyNet = PolicyNet(inputShape, outputShape, lr)
        self.valueNet = ValueNet(inputShape, lr)
        # Take the state dimension from the constructor argument rather than
        # from a global ``env``, so the class works in a fresh kernel.
        self.memory = replayBuffer(memorySize, inputShape[0])
        self.gamma = T.tensor(gamma, dtype=T.float)

    @staticmethod
    def _asBatch(state):
        """Convert one observation into a float tensor of shape (1, stateDim)."""
        # np.asarray first: torch.tensor([ndarray]) takes a slow list-of-arrays path.
        return T.as_tensor(np.asarray(state), dtype=T.float).unsqueeze(0)

    def _tdError(self, state, reward, state_, done):
        """Return ``reward + gamma * (1 - done) * V(state_) - V(state)``.

        The bootstrap value V(state_) is computed without gradient tracking,
        so gradients flow only through V(state).
        """
        with T.no_grad():
            v_ = self.valueNet(state_)
        return (reward + self.gamma * ((1 - done) * v_)) - self.valueNet(state)

    def save(self, state, action, reward, state_, done):
        """Store a transition and take one value-network step on it.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Scalar reward received.
            state_: Observation after the action.
            done: 1 if the episode ended on this transition, else 0.
        """
        self.memory.save(state, action, reward, state_, done)
        self.valueNet.optimizer.zero_grad()
        tdError = self._tdError(self._asBatch(state), reward, self._asBatch(state_), done)
        valueLoss = T.mean(tdError ** 2)
        valueLoss.backward()
        self.valueNet.optimizer.step()

    def chooseAction(self, state):
        """Sample an action index from the policy's softmax distribution for ``state``."""
        with T.no_grad():
            # Explicit dim: softmax over the action axis, not an inferred one.
            probs = F.softmax(self.policyNet(self._asBatch(state)), dim=-1)
        return T.distributions.Categorical(probs).sample().item()

    def learn(self, batchSize):
        """Update both networks on a batch sampled from the replay buffer.

        Does nothing until the buffer holds more than ``batchSize`` transitions.

        Args:
            batchSize: Number of transitions requested from the buffer.
        """
        if self.memory.size <= batchSize:
            return

        state, action, reward, state_, done = self.memory.sample(batchSize)
        state = T.as_tensor(state, dtype=T.float)
        state_ = T.as_tensor(state_, dtype=T.float)
        # Column vectors so they broadcast against the (batch, 1) value outputs.
        reward = T.as_tensor(reward, dtype=T.float).unsqueeze(1)
        done = T.as_tensor(done, dtype=T.float).unsqueeze(1)
        # Action indices are integers; log_prob expects index-like values.
        action = T.as_tensor(action, dtype=T.long)

        # Critic step: minimise the mean squared TD error.
        self.valueNet.optimizer.zero_grad()
        tdError = self._tdError(state, reward, state_, done)
        valueLoss = T.mean(tdError ** 2)
        valueLoss.backward()
        self.valueNet.optimizer.step()

        # Actor step: policy gradient weighted by the detached TD error.
        self.policyNet.optimizer.zero_grad()
        probs = F.softmax(self.policyNet(state), dim=-1)
        logProb = T.distributions.Categorical(probs).log_prob(action)
        advantage = tdError.detach().squeeze(1)
        policyLoss = T.mean(-logProb * advantage)
        policyLoss.backward()
        self.policyNet.optimizer.step()


In [10]:
# Install the Box2D bindings; presumably required by the LunarLander environment created below.
# NOTE(review): this cell should run before the cell that does `import gym` on a fresh kernel.
!pip3 install box2d-py
# NOTE(review): gym's documented extra is spelled `box2d`; pip may not recognise `Box_2D`
# and then install no extra dependencies at all — confirm before relying on this line.
!pip3 install gym[Box_2D]


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/

Collecting box2d-py

  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)

[K     |████████████████████████████████| 448 kB 5.2 MB/s 

[?25hInstalling collected packages: box2d-py

Successfully installed box2d-py-2.3.8

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/









In [None]:
# Train the actor-critic agent on LunarLander and log per-episode returns.
env = gym.make('LunarLander-v2')
agent=Agent(inputShape=[8],outputShape=4,lr=0.001)
# agent.valueNet.load_state_dict(T.load("Vmodel"))
averageRewards=[]  # running mean of episode returns, one entry per episode
totalRewards=[]    # return of every finished episode
batchSize=64
steps=0            # total environment steps across all episodes
for i in range(0,3000):
  done=False
  # Double the batch size every 200 episodes (including episode 0), capped at 256.
  if (i%200)==0:
    batchSize=min(batchSize*2,256)
  state=env.reset()
  rewards=0
  while not done:
    action=agent.chooseAction(state)
    nextState,reward,done,info=env.step(action)
    agent.save(state,action,reward,nextState,int(done))
    agent.learn(batchSize)
    # Was `steps=+1`, which assigns +1 each iteration instead of counting steps.
    steps+=1
    rewards+=reward
    state=nextState
  totalRewards.append(rewards)
  averageRewards.append(sum(totalRewards)/len(totalRewards))
  print(f"episode: {i}   reward: {rewards}  avg so far:{averageRewards[-1]}")



episode: 0   reward: -60.04382383798534  avg so far:-60.04382383798534




episode: 1   reward: -76.84217415801568  avg so far:-68.44299899800052

episode: 2   reward: -464.384729184695  avg so far:-200.42357572689866

episode: 3   reward: -609.6819151665925  avg so far:-302.7381605868221

episode: 4   reward: -655.3919684722229  avg so far:-373.26892216390223

episode: 5   reward: -145.0436599285034  avg so far:-335.23137845800244

episode: 6   reward: -252.04270714153967  avg so far:-323.34728255565057

episode: 7   reward: -172.64575539333703  avg so far:-304.50959166036137

episode: 8   reward: -150.22797495564703  avg so far:-287.36718980428196

episode: 9   reward: -165.243077616287  avg so far:-275.15477858548246

episode: 10   reward: -173.94886642539717  avg so far:-265.9542411163838

episode: 11   reward: -114.50654498731038  avg so far:-253.33359977229438

episode: 12   reward: -110.90467576639779  avg so far:-242.37752869491771

episode: 13   reward: -116.69082575310998  avg so far:-233.39990705621716

episode: 14   reward: -190.2850391014054  avg

In [4]:
# torch.save(agent.valueNet.state_dict(), "Vmodel")

In [None]:
# agent.valueNet.load_state_dict(T.load("Vmodel"))