In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym
# Create the environment once; the spaces printed here give the sizes used
# for the network input (observation) and output (action) layers.
env = gym.make('LunarLander-v2')
print(env.action_space, env.observation_space, sep='\n')

Discrete(4)
Box(-inf, inf, (8,), float32)


In [2]:
class PolicyNetwork(nn.Module):
    """Fully connected network that maps an observation to one raw score per action.

    Two ReLU hidden layers are followed by a linear output layer. No softmax
    is applied here; the output is unnormalised and the caller is expected to
    turn it into a probability distribution.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        input_dims: Number of features in an observation.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Number of outputs (one per action).

    Attributes:
        optimizer: Adam optimizer over this module's parameters.
        device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
    """

    def __init__(self,lr,input_dims,fc1_dims,fc2_dims,n_actions):
        super(PolicyNetwork,self).__init__()
        self.input_dims=input_dims
        self.lr=lr
        self.fc1_dims=fc1_dims
        self.fc2_dims=fc2_dims
        self.n_actions=n_actions

        # Layer names are part of the state_dict keys; keep them stable so
        # previously saved checkpoints still load.
        self.fc1=nn.Linear(input_dims,fc1_dims)
        self.fc2=nn.Linear(fc1_dims,fc2_dims)
        self.fc3=nn.Linear(fc2_dims,n_actions)

        self.device=T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

        # Build the optimizer only after the parameters sit on their final
        # device, as the PyTorch optimizer documentation recommends.
        self.optimizer=optim.Adam(self.parameters(),lr=lr)

    def forward(self,observation):
        """Return the unnormalised action scores for ``observation``.

        Args:
            observation: Array-like, NumPy array or tensor whose last
                dimension has ``input_dims`` entries.

        Returns:
            A float32 tensor on ``self.device`` whose last dimension has
            ``n_actions`` entries.
        """
        # as_tensor accepts lists, arrays of any numeric dtype and tensors on
        # any device, and converts/moves them in a single step.
        observation=T.as_tensor(observation,dtype=T.float32,device=self.device)
        hidden=F.relu(self.fc1(observation))
        hidden=F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [3]:
class Agent():
    """Monte-Carlo policy-gradient (REINFORCE) agent with normalised returns.

    During an episode ``act`` samples actions and records their
    log-probabilities, and ``store_rewards`` records the rewards. ``learn``
    then performs one optimizer step on the whole episode and clears both
    buffers.

    Args:
        lr: Learning rate forwarded to ``PolicyNetwork``.
        input_dims: Observation size forwarded to ``PolicyNetwork``.
        gamma: Discount factor applied to future rewards.
        n_actions: Number of discrete actions.
        l1_size: Width of the policy's first hidden layer.
        l2_size: Width of the policy's second hidden layer.
    """

    def __init__(self,lr,input_dims,gamma=0.99,n_actions=4,l1_size=256,l2_size=256):
        self.gamma=gamma
        self.reward_memory=[]   # one reward per step of the current episode
        self.action_memory=[]   # log-probability of each sampled action
        self.policy=PolicyNetwork(lr,input_dims,l1_size,l2_size,n_actions)

    def act(self,observation):
        """Sample an action for ``observation`` and remember its log-probability.

        Returns:
            The sampled action as a plain Python ``int``.
        """
        # dim is given explicitly: the implicit choice is deprecated and
        # emits a warning on every call.
        probs=F.softmax(self.policy(observation),dim=-1)
        action_probs=T.distributions.Categorical(probs)
        action=action_probs.sample()
        self.action_memory.append(action_probs.log_prob(action))

        return action.item()

    def store_rewards(self,reward):
        """Append the reward received for the most recent action."""
        self.reward_memory.append(reward)

    def _discounted_returns(self):
        """Return G_t = sum_k gamma**(k-t) * r_k for every step t, in one backward pass."""
        returns=np.zeros(len(self.reward_memory),dtype=np.float64)
        running=0.0
        for t in range(len(self.reward_memory)-1,-1,-1):
            running=self.reward_memory[t]+self.gamma*running
            returns[t]=running
        return returns

    def learn(self):
        """Run one policy-gradient update on the stored episode, then clear it.

        The discounted returns are standardised (zero mean, unit standard
        deviation; left unscaled when the deviation is zero) and the loss is
        ``-sum(G_t * log_prob_t)``. Calling this with nothing stored is a
        no-op that just resets the buffers.
        """
        if not self.reward_memory or not self.action_memory:
            # Nothing to pair up; drop leftovers so they cannot be matched
            # with an unrelated episode later.
            self.action_memory=[]
            self.reward_memory=[]
            return

        returns=self._discounted_returns()
        spread=returns.std()
        returns=(returns-returns.mean())/(spread if spread>0 else 1)

        # Pair returns and log-probs positionally, stopping at the shorter
        # buffer.
        steps=min(len(self.reward_memory),len(self.action_memory))
        log_probs=T.stack(self.action_memory[:steps]).reshape(steps,-1)
        weights=T.as_tensor(returns[:steps],dtype=log_probs.dtype,
                            device=log_probs.device).unsqueeze(1)
        loss=-(weights*log_probs).sum()

        self.policy.optimizer.zero_grad()
        loss.backward()
        self.policy.optimizer.step()

        self.action_memory=[]
        self.reward_memory=[]
        
        
    

In [5]:

# Hyper-parameters are passed by keyword so the call documents itself.
agent = Agent(lr=0.00025,input_dims=8,gamma=0.99,n_actions=4,l1_size=256,l2_size=256)

# Optional: continue from previously saved weights instead of a fresh network.
# PATH="Model_Weights/lunar_agent1.pt"
# agent.policy.load_state_dict(T.load(PATH))

score_history=[]   # total reward of every finished episode
score=0
num_ep=2500

for i in range(num_ep):
    done=False
    score=0

    # Older gym releases return the observation alone from reset();
    # gym>=0.26 returns (observation, info).
    reset_out=env.reset()
    observation=reset_out[0] if isinstance(reset_out,tuple) else reset_out

    while not done:
        action=agent.act(observation)

        # Older gym: (obs, reward, done, info).
        # gym>=0.26: (obs, reward, terminated, truncated, info).
        step_out=env.step(action)
        observation,reward=step_out[0],step_out[1]
        if len(step_out)==4:
            done=bool(step_out[2])
        else:
            done=bool(step_out[2] or step_out[3])

        agent.store_rewards(reward)
        score+=reward

    score_history.append(score)
    print('episode',i,'score %.3f' % score)
    # One policy update per finished episode.
    agent.learn()

  # Remove the CWD from sys.path while we load stuff.


episode 0 score 13.324
episode 1 score -0.797
episode 2 score 242.123
episode 3 score 151.705
episode 4 score 150.542
episode 5 score 278.728
episode 6 score 138.147
episode 7 score 13.154
episode 8 score 261.335
episode 9 score 112.215
episode 10 score 137.639
episode 11 score 132.602
episode 12 score 170.373
episode 13 score 119.660
episode 14 score 136.098
episode 15 score 160.104
episode 16 score 194.785
episode 17 score 127.288
episode 18 score 259.441
episode 19 score 144.807
episode 20 score 184.723
episode 21 score 225.352
episode 22 score 156.792
episode 23 score 202.117
episode 24 score 241.180
episode 25 score 139.673
episode 26 score 150.946
episode 27 score 144.497
episode 28 score 119.753
episode 29 score 155.285
episode 30 score -6.979
episode 31 score 166.362
episode 32 score 122.588
episode 33 score 148.438
episode 34 score 136.420
episode 35 score 55.754
episode 36 score 270.505
episode 37 score 55.192
episode 38 score 133.931
episode 39 score 145.088
episode 40 score

KeyboardInterrupt: 

# Test the Agent

In [4]:
# Checkpoint file holding the policy network's state_dict.
PATH="Model_Weights/lunar_agent1.pt"
# Uncomment to write the current training agent's weights to PATH:
# T.save(agent.policy.state_dict(), PATH)

In [5]:
# Fresh agent with the same layer sizes as the saved network; the learning
# rate only configures the optimizer and does not affect the loaded weights.
new_agent = Agent(0.0005,8,0.99,4,256,256)
# map_location places the stored tensors on this machine's device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
# NOTE(review): T.load unpickles the file; only load checkpoints you trust.
new_agent.policy.load_state_dict(T.load(PATH,map_location=new_agent.policy.device))
new_agent.policy.eval()

PolicyNetwork(
  (fc1): Linear(in_features=8, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=4, bias=True)
)

In [11]:

done=False
score=0

# Older gym releases return the observation alone from reset();
# gym>=0.26 returns (observation, info).
reset_out=env.reset()
observation=reset_out[0] if isinstance(reset_out,tuple) else reset_out

# No update follows this rollout, so skip building autograd graphs.
with T.no_grad():
    while not done:
        action=new_agent.act(observation)

        # Older gym: (obs, reward, done, info).
        # gym>=0.26: (obs, reward, terminated, truncated, info).
        step_out=env.step(action)
        observation,reward=step_out[0],step_out[1]
        if len(step_out)==4:
            done=bool(step_out[2])
        else:
            done=bool(step_out[2] or step_out[3])

        env.render()
        score+=reward

# act() stores one log-probability per step for learn(); nothing consumes
# them here, so drop them instead of letting the list grow on every run.
new_agent.action_memory.clear()
# score_history.append(score)
# print(score)

  # Remove the CWD from sys.path while we load stuff.


tensor(2)
tensor(3)
tensor(1)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(2)
tensor(3)
tensor(3)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(2)
tensor(3)
tensor(1)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(2)
tensor(3)
tensor(3)
tensor(2)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(2)
tensor(3)
tensor(1)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(1)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(3)
tensor(3)
tensor(2)
tensor(2)
tensor(1)
tensor(3)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(3)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(3)
tensor(1)
tensor(1)
tensor(2)
tensor(2)
tensor(3)
tensor(2)
tensor(1)
tensor(2)
