In [1]:
#Import required libraries

import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
# Select the compute device once for the whole notebook.
# Fall back to the CPU when no CUDA device is present so that every later
# `.to(device)` call still works on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


cuda:0


In [3]:
# Constants for training
seed = 543          # base seed for the random number generators
log_interval = 10   # NOTE(review): not read by any of the cells visible here
gamma = 0.99        # discount factor applied to future rewards

# Actually apply the seed: without this the policy initialisation and the
# sampled actions differ on every "Restart & Run All".
np.random.seed(seed)
torch.manual_seed(seed)

In [4]:
class Network(nn.Module):
    """Policy network: one 128-unit hidden layer followed by a softmax head.

    Besides the layers, the module carries three per-episode scratch lists
    (`saved_actions`, `episode_rewards`, `episode_states`) that start empty.
    """

    def __init__(self, state_shape=4, action_size=2):
        super().__init__()
        hidden_units = 128
        # Layer creation order is kept (affine1, then action_head) so that
        # seeded initialisation yields the same weights.
        self.affine1 = nn.Linear(state_shape, hidden_units)
        self.action_head = nn.Linear(hidden_units, action_size)
        # Three independent, initially empty buffers.
        self.saved_actions, self.episode_rewards, self.episode_states = [], [], []

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension) for `x`."""
        hidden = torch.relu(self.affine1(x))
        logits = self.action_head(hidden)
        return logits.softmax(dim=-1)


In [5]:
class VNetwork(nn.Module):
    """State-value network: two ReLU hidden layers and a single linear output.

    Args:
        state_size: number of input features.
        seed: value passed to `torch.manual_seed` before the layers are built.
        fc1_units: width of the first hidden layer.
        fc2_units: width of the second hidden layer.
    """

    def __init__(self, state_size, seed, fc1_units=128, fc2_units=64):
        super().__init__()
        # Re-seeds torch's global generator; the returned generator is kept.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=1)

    def forward(self, state):
        """Map `state` to a value estimate whose last dimension has size 1."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [6]:
class REINFORCE_MCWB:
    """REINFORCE (Monte-Carlo policy gradient) with a learned value baseline.

    The policy is updated once per episode from the discounted returns; the
    baseline `vnetwork_local` is trained after every environment step with a
    one-step TD target computed from `vnetwork_target`, which is refreshed
    from the local network at the end of every episode.

    Args:
        env: environment id handed to `gym.make`.
        gamma: discount factor used for both the returns and the TD target.
        episodes: number of episodes run by one call to `train`.
        seed: seed for the value networks and for the first environment reset.
        max_len: upper bound used for the per-episode step loop.
    """

    def __init__(self, env, gamma=0.99, episodes=500, seed=543, max_len=10000):
        env = gym.make(env)
        self.env = env
        self.gamma = gamma
        self.episodes = episodes
        self.max_len = max_len
        # `device` is the notebook-level torch device; keeping a reference on
        # the instance lets every method use the same one.
        self.device = device
        env.reset(seed=seed)
        state_shape = env.observation_space.shape[0]
        no_of_actions = env.action_space.n
        self.policy = Network(state_shape, no_of_actions).to(self.device)
        self.optimizerP = optim.Adam(self.policy.parameters(), lr=1e-3)
        self.vnetwork_local = VNetwork(state_shape, seed).to(self.device)
        self.optimizerV = optim.Adam(self.vnetwork_local.parameters(), lr=3e-2)
        self.vnetwork_target = VNetwork(state_shape, seed).to(self.device)
        self._sync_target()

    def _to_float_tensor(self, value):
        """Convert a scalar/array-like to a float32 tensor on `self.device`."""
        return torch.as_tensor(np.asarray(value, dtype=np.float32), device=self.device)

    def _sync_target(self):
        """Copy the weights of the local value network into the target network."""
        self.vnetwork_target.load_state_dict(self.vnetwork_local.state_dict())

    def learn_Value(self, states, actions, rewards, next_states, dones):
        """Take one TD(0) gradient step on the baseline network.

        The regression target is `rewards + gamma * V_target(next_states) * (1 - dones)`.

        Args:
            states: observation(s) the value is predicted for.
            actions: accepted for interface compatibility; not used.
            rewards: reward(s) received for the transition(s).
            next_states: observation(s) reached after the transition(s).
            dones: truthy where the next state is terminal (no bootstrapping).
        """
        states = self._to_float_tensor(states)
        next_states = self._to_float_tensor(next_states)
        with torch.no_grad():
            V_targets_next = self.vnetwork_target(next_states)
        # Reshape so that scalars and per-sample vectors broadcast
        # element-wise against the network output.
        rewards = self._to_float_tensor(rewards).reshape(V_targets_next.shape)
        not_done = 1.0 - self._to_float_tensor(dones).reshape(V_targets_next.shape)
        V_targets = rewards + self.gamma * V_targets_next * not_done
        V_expected = self.vnetwork_local(states)
        loss = F.mse_loss(V_expected, V_targets)
        self.optimizerV.zero_grad()
        loss.backward()
        # Element-wise gradient clipping to [-1, 1].
        torch.nn.utils.clip_grad_value_(self.vnetwork_local.parameters(), 1.0)
        self.optimizerV.step()

    def select_action(self, state):
        """Sample an action index from the current policy for one observation."""
        state = self._to_float_tensor(state)
        # No graph is needed here: `update` recomputes the log-probabilities.
        with torch.no_grad():
            probs = self.policy(state)
        return Categorical(probs).sample().item()

    def update(self, rewards, states, actions):
        """Apply one policy-gradient step from a finished episode.

        Walks the episode backwards to accumulate the discounted return G and
        uses `G - V(state)` as the advantage. The three episode buffers stored
        on `self.policy` are emptied afterwards.

        Args:
            rewards: per-step rewards of the episode.
            states: per-step observations (objects supporting `.reshape`-able
                conversion to a 1-D float vector).
            actions: per-step action indices.
        """
        G = 0.0
        self.optimizerP.zero_grad()
        for i in reversed(range(len(rewards))):
            state = self._to_float_tensor(states[i]).reshape(1, -1)
            action = torch.as_tensor(
                actions[i], dtype=torch.long, device=self.device).view(-1, 1)
            log_prob = torch.log(self.policy(state)).gather(1, action)
            G = self.gamma * G + rewards[i]
            # The baseline is a constant w.r.t. the policy loss; computing it
            # without a graph keeps policy gradients out of the value network.
            with torch.no_grad():
                baseline = self.vnetwork_local(state)
            advantage = G - baseline
            loss = (-log_prob * advantage).sum()
            loss.backward()  # gradients accumulate over the episode
        self.optimizerP.step()
        del self.policy.episode_rewards[:]
        del self.policy.saved_actions[:]
        del self.policy.episode_states[:]

    def train(self):
        """Run `self.episodes` episodes and return the smoothed reward curve.

        Returns:
            list with one exponentially smoothed episode reward per episode.
        """
        avg_reward = []
        running_reward = 10  # starting point of the moving average
        for i_episode in range(self.episodes):
            # reset environment and episode reward
            state, _ = self.env.reset()
            ep_reward = 0
            for t in range(1, self.max_len):
                # select action from policy
                self.policy.episode_states.append(state)
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                # Only a true termination stops bootstrapping; a time-limit
                # truncation still has a meaningful next-state value.
                self.learn_Value(state, action, reward, next_state, terminated)
                self.policy.episode_rewards.append(reward)
                self.policy.saved_actions.append(action)
                state = next_state
                ep_reward += reward
                if terminated or truncated:
                    break
            self.update(self.policy.episode_rewards,
                        self.policy.episode_states,
                        self.policy.saved_actions)
            # Refresh the TD-target network once per episode.
            self._sync_target()
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            avg_reward.append(running_reward)
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        return avg_reward

    def PerformExpmt(self, num_expmt):
        """Call `train` `num_expmt` times and plot mean +/- std of the curves.

        The figure is saved as 'Rewards' and shown.

        NOTE(review): the networks and optimizers are not re-created between
        experiments, so each run continues from the previous one; confirm that
        this is intended before reading the band as run-to-run variance.

        Args:
            num_expmt: number of training runs; must be at least 1.

        Returns:
            (mean, std) arrays with one entry per episode.

        Raises:
            ValueError: if `num_expmt` is smaller than 1.
        """
        if num_expmt < 1:
            raise ValueError("num_expmt must be at least 1")
        reward_avgs = []
        for i in range(num_expmt):
            print("Experiment: %d" % (i + 1))
            rewards = self.train()
            reward_avgs.append(np.asarray(rewards))
        reward_avgs = np.array(reward_avgs)
        reward_avgs_mean = np.mean(reward_avgs, axis=0)
        reward_avgs_std = np.std(reward_avgs, axis=0)
        episodes = range(self.episodes)
        plt.figure(figsize=(10, 6))
        plt.plot(episodes, reward_avgs_mean, label='Reward Avg', color='blue')
        plt.fill_between(episodes,
                         reward_avgs_mean - reward_avgs_std,
                         reward_avgs_mean + reward_avgs_std,
                         alpha=0.3, color='blue')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.legend()
        plt.savefig('Rewards')
        plt.show()
        return reward_avgs_mean, reward_avgs_std
       

In [7]:
# Build the agent for Acrobot-v1 and run one training session; `avg_reward`
# receives whatever `train()` returns and is plotted in the next cell.
reinforce = REINFORCE_MCWB(env="Acrobot-v1")
avg_reward = reinforce.train()



In [None]:
import matplotlib.pyplot as plt

# Plot the reward curve produced by the training cell, with labels so the
# figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_reward)
ax.set_xlabel('Episode')
ax.set_ylabel('Running average reward')
ax.set_title('REINFORCE with baseline on Acrobot-v1')
plt.show()