In [1]:
import torch
import torch.nn as nn
import gym
import numpy as np
import random
import torch.nn.functional as F
%matplotlib inline
import matplotlib.pyplot as plt
from torch.distributions import Categorical

envs = ['CartPole-v1','Acrobot-v1','MountainCar-v0','Breakout-v0','BipedalWalker-v2','LunarLander-v2']
env_to_use = 0

discrete_actions = True

env = gym.make(envs[env_to_use]).unwrapped
env.seed(1)
if discrete_actions:
    action_size = env.action_space.n
else:
    action_size = env.action_space.shape[0]
    
state_size = 4
gamma = 0.95


In [2]:
class Policy(nn.Module):
    def __init__(self,state_size,action_size):
        super(Policy, self).__init__()
        self.action_size =action_size
        self.state_size = state_size
        
        self.layer1 = nn.Linear(state_size , 6)
        self.layer2 = nn.Linear(6, 6)
        self.layer3 = nn.Linear(6, action_size)
        
    def forward(self, x):
        out = F.relu(self.layer1(x) )
        out = F.relu(self.layer2(out))
        out = F.sigmoid(self.layer3(out))
        return out
    
explore_agent = Policy(state_size,action_size)    
agent_optimizer = torch.optim.Adam(explore_agent.parameters(),lr = 0.01)


In [3]:
for e in range(1000):
    state = env.reset()
    state = torch.from_numpy(state).type("torch.FloatTensor")
    all_rewards = []
    all_actions = []
    steps = 0
    while True:
        steps+=1
        #env.render()
        agent_action = explore_agent(state)
        
        action_distribution = torch.softmax(agent_action,-1)
        
        
        m = Categorical(action_distribution)
        action = m.sample()
        all_actions.append(action_distribution[action])
        
        new_state, reward, done, info = env.step(action.item())
        all_rewards.append(reward)
        new_state = torch.from_numpy(new_state).type("torch.FloatTensor")
        
        
        if done:
            print("Episode : {}  Rewards : {}".format(e+1,np.sum(all_rewards)))
            running_add = 0
            for i in reversed(range(steps)):
                running_add = running_add * gamma + all_rewards[i]
                all_rewards[i] = running_add
               
            all_rewards = (all_rewards-np.mean(all_rewards))/np.std(all_rewards)
            
            all_actions = torch.stack(all_actions)
            all_rewards = torch.from_numpy(np.array(all_rewards)).type("torch.FloatTensor")
            agent_optimizer.zero_grad()

            loss = torch.mean(-torch.log(all_actions) * all_rewards)
            loss.backward()
            agent_optimizer.step() 
            
            break
            
        state = new_state



Episode : 1  Rewards : 24.0
Episode : 2  Rewards : 12.0
Episode : 3  Rewards : 53.0
Episode : 4  Rewards : 12.0
Episode : 5  Rewards : 26.0
Episode : 6  Rewards : 25.0
Episode : 7  Rewards : 19.0
Episode : 8  Rewards : 15.0
Episode : 9  Rewards : 27.0
Episode : 10  Rewards : 27.0
Episode : 11  Rewards : 51.0
Episode : 12  Rewards : 22.0
Episode : 13  Rewards : 22.0
Episode : 14  Rewards : 13.0
Episode : 15  Rewards : 21.0
Episode : 16  Rewards : 14.0
Episode : 17  Rewards : 45.0
Episode : 18  Rewards : 19.0
Episode : 19  Rewards : 14.0
Episode : 20  Rewards : 11.0
Episode : 21  Rewards : 26.0
Episode : 22  Rewards : 13.0
Episode : 23  Rewards : 19.0
Episode : 24  Rewards : 30.0
Episode : 25  Rewards : 35.0
Episode : 26  Rewards : 20.0
Episode : 27  Rewards : 12.0
Episode : 28  Rewards : 20.0
Episode : 29  Rewards : 18.0
Episode : 30  Rewards : 32.0
Episode : 31  Rewards : 26.0
Episode : 32  Rewards : 10.0
Episode : 33  Rewards : 18.0
Episode : 34  Rewards : 16.0
Episode : 35  Rewards :

Episode : 285  Rewards : 56.0
Episode : 286  Rewards : 65.0
Episode : 287  Rewards : 27.0
Episode : 288  Rewards : 13.0
Episode : 289  Rewards : 25.0
Episode : 290  Rewards : 33.0
Episode : 291  Rewards : 17.0
Episode : 292  Rewards : 41.0
Episode : 293  Rewards : 102.0
Episode : 294  Rewards : 15.0
Episode : 295  Rewards : 78.0
Episode : 296  Rewards : 84.0
Episode : 297  Rewards : 28.0
Episode : 298  Rewards : 31.0
Episode : 299  Rewards : 53.0
Episode : 300  Rewards : 18.0
Episode : 301  Rewards : 11.0
Episode : 302  Rewards : 15.0
Episode : 303  Rewards : 13.0
Episode : 304  Rewards : 38.0
Episode : 305  Rewards : 10.0
Episode : 306  Rewards : 13.0
Episode : 307  Rewards : 58.0
Episode : 308  Rewards : 91.0
Episode : 309  Rewards : 31.0
Episode : 310  Rewards : 39.0
Episode : 311  Rewards : 28.0
Episode : 312  Rewards : 17.0
Episode : 313  Rewards : 44.0
Episode : 314  Rewards : 19.0
Episode : 315  Rewards : 96.0
Episode : 316  Rewards : 39.0
Episode : 317  Rewards : 20.0
Episode :

Episode : 558  Rewards : 198.0
Episode : 559  Rewards : 141.0
Episode : 560  Rewards : 22.0
Episode : 561  Rewards : 87.0
Episode : 562  Rewards : 147.0
Episode : 563  Rewards : 189.0
Episode : 564  Rewards : 61.0
Episode : 565  Rewards : 57.0
Episode : 566  Rewards : 77.0
Episode : 567  Rewards : 27.0
Episode : 568  Rewards : 25.0
Episode : 569  Rewards : 33.0
Episode : 570  Rewards : 40.0
Episode : 571  Rewards : 74.0
Episode : 572  Rewards : 160.0
Episode : 573  Rewards : 42.0
Episode : 574  Rewards : 9.0
Episode : 575  Rewards : 94.0
Episode : 576  Rewards : 110.0
Episode : 577  Rewards : 66.0
Episode : 578  Rewards : 12.0
Episode : 579  Rewards : 21.0
Episode : 580  Rewards : 29.0
Episode : 581  Rewards : 17.0
Episode : 582  Rewards : 23.0
Episode : 583  Rewards : 52.0
Episode : 584  Rewards : 41.0
Episode : 585  Rewards : 141.0
Episode : 586  Rewards : 65.0
Episode : 587  Rewards : 42.0
Episode : 588  Rewards : 159.0
Episode : 589  Rewards : 28.0
Episode : 590  Rewards : 17.0
Epi

KeyboardInterrupt: 