In [None]:
#Import required libraries

import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [None]:
# Training constants for the notebook.
# NOTE(review): `gamma` and `log_interval` are not read by any code visible in
# this section of the notebook -- confirm whether they are still needed.
gamma = 0.99
log_interval = 10
seed = 543

# Seed PyTorch's global RNG so weight initialisation and action sampling
# start from a fixed state.
torch.manual_seed(seed)


In [None]:
class Network(nn.Module):
    """Two-layer policy network mapping a state vector to action probabilities.

    Args:
        state_shape: Number of input features (size of the state vector).
        action_size: Number of discrete actions (size of the output).
    """

    def __init__(self, state_shape=4, action_size=2):
        super().__init__()
        hidden_units = 128
        # Layers are created in this order so parameter initialisation draws
        # from the RNG in the same sequence as before.
        self.affine1 = nn.Linear(state_shape, hidden_units)
        self.action_head = nn.Linear(hidden_units, action_size)
        # Scratch lists that training code may use to stage one episode's
        # transitions; this class itself never reads them.
        self.saved_actions, self.episode_rewards, self.episode_states = [], [], []

    def forward(self, x):
        """Return a probability distribution over actions (softmax on the last dim)."""
        hidden = torch.relu(self.affine1(x))
        scores = self.action_head(hidden)
        return torch.softmax(scores, dim=-1)


In [None]:
import matplotlib.pyplot as plt

In [None]:

class REINFORCE_MCWB:
    """REINFORCE policy-gradient agent using Monte-Carlo returns.

    After every episode the policy is updated with the loss
    ``-sum_t log pi(a_t | s_t) * G_t`` where ``G_t`` is the discounted return
    from step ``t``. No baseline is subtracted from the returns.

    Args:
        env: A Gym environment id (e.g. ``'CartPole-v1'``) or an already
            constructed environment exposing the ``reset() -> (obs, info)`` and
            ``step(a) -> (obs, reward, terminated, truncated, info)`` API.
            ``None`` falls back to ``'CartPole-v1'``.
        episodes: Number of episodes per call to :meth:`train`.
        gamma: Discount factor used for the returns.
        lr: Adam learning rate.
        max_len: Hard cap on the number of steps in one episode.
    """

    def __init__(self, env, episodes=500, gamma=0.99, lr=1e-3, max_len=10000):
        # Honour the caller's environment instead of silently replacing it
        # with CartPole-v1.
        if env is None:
            env = 'CartPole-v1'
        if isinstance(env, str):
            env = gym.make(env)
        self.env = env
        self.episodes = episodes
        self.gamma = gamma
        self.lr = lr
        self.max_len = max_len
        self._state_dim = env.observation_space.shape[0]
        self._n_actions = env.action_space.n
        self._build_policy()

    def _build_policy(self):
        """Create a freshly initialised policy network and its Adam optimizer."""
        self.model = Network(self._state_dim, self._n_actions)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    def select_action(self, state):
        """Sample an action index from the current policy for ``state``."""
        observation = torch.as_tensor(state, dtype=torch.float)
        # Log-probabilities are recomputed in `update`, so no autograd graph
        # is needed while acting.
        with torch.no_grad():
            probs = self.model(observation)
        return Categorical(probs).sample().item()

    def update(self, rewards, states, actions):
        """Apply one policy-gradient step from a finished episode.

        Args:
            rewards: Per-step rewards ``r_0 .. r_{T-1}``.
            states: States in which the actions were taken; only the first
                ``len(rewards)`` entries are used, so a trailing terminal
                state is tolerated.
            actions: Action indices ``a_0 .. a_{T-1}``.
        """
        n_steps = len(rewards)
        if n_steps > 0:
            # Discounted returns, accumulated backwards: G_t = r_t + gamma * G_{t+1}.
            returns = [0.0] * n_steps
            G = 0.0
            for i in reversed(range(n_steps)):
                G = self.gamma * G + rewards[i]
                returns[i] = G

            # One batched forward/backward pass; the summed loss yields the
            # same gradient as accumulating a backward pass per step.
            state_batch = torch.as_tensor(
                np.asarray(states[:n_steps], dtype=np.float32).reshape(n_steps, -1))
            action_batch = torch.as_tensor(
                actions[:n_steps], dtype=torch.long).view(-1, 1)
            return_batch = torch.tensor(returns, dtype=torch.float)

            log_probs = torch.log(
                self.model(state_batch).gather(1, action_batch)).squeeze(1)
            loss = -(log_probs * return_batch).sum()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # Kept for callers that still stage transitions on the model object:
        # those per-episode lists are emptied after every update.
        for name in ('episode_rewards', 'saved_actions', 'episode_states'):
            staged = getattr(self.model, name, None)
            if staged is not None:
                del staged[:]

    def train(self):
        """Run ``self.episodes`` episodes, updating the policy after each one.

        Returns:
            list: Undiscounted total reward of every episode, in order.
        """
        total_reward = []
        running_reward = 10
        for i_episode in range(self.episodes):
            state, _ = self.env.reset()
            # Episode buffers are local so an interrupted episode cannot leak
            # stale transitions into the next call.
            states, actions, rewards = [], [], []
            ep_reward = 0
            for _ in range(self.max_len):
                action = self.select_action(state)
                states.append(state)
                state, reward, terminated, truncated, _ = self.env.step(action)
                actions.append(action)
                rewards.append(reward)
                ep_reward += reward
                # A time-limit truncation ends the episode just like a
                # terminal state does.
                if terminated or truncated:
                    break
            total_reward.append(ep_reward)
            self.update(rewards, states, actions)
            # Exponential moving average of the episode reward, for logging only.
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        return total_reward

    def PerformExpmt(self, num_expmt):
        """Run ``num_expmt`` independent training runs and plot the mean reward curve.

        The policy and optimizer are re-initialised before every run so the
        averaged curves come from independent experiments. The figure is shown
        and saved as ``'Rewards'``.

        Args:
            num_expmt: Number of training runs (must be >= 1).

        Returns:
            tuple: ``(mean, std)`` arrays of per-episode reward across runs.

        Raises:
            ValueError: If ``num_expmt`` is smaller than 1.
        """
        if num_expmt < 1:
            raise ValueError("num_expmt must be at least 1")

        reward_avgs = []
        for i in range(num_expmt):
            print("Experiment: %d"%(i+1))
            self._build_policy()
            reward_avgs.append(np.asarray(self.train()))

        reward_avgs = np.array(reward_avgs)
        reward_avgs_mean = np.mean(reward_avgs, axis=0)
        reward_avgs_std = np.std(reward_avgs, axis=0)

        episode_axis = range(self.episodes)
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(episode_axis, reward_avgs_mean, label='Reward Avg', color='blue')
        ax.fill_between(episode_axis,
                        reward_avgs_mean - reward_avgs_std,
                        reward_avgs_mean + reward_avgs_std,
                        alpha=0.3, color='blue')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Total Reward')
        ax.legend()
        fig.savefig('Rewards')
        plt.show()
        return reward_avgs_mean, reward_avgs_std
       

In [None]:
# Build the agent; the string is handed to REINFORCE_MCWB.__init__ as its `env` argument.
reinforce = REINFORCE_MCWB("Acrobot-v1")
# One stand-alone training pass; the per-episode reward list it returns is discarded.
reinforce.train()
# Two further training runs on the same agent object. PerformExpmt plots the
# mean/std reward curve, saves it as 'Rewards', and its returned (mean, std)
# pair is displayed as the cell's last expression.
reinforce.PerformExpmt(2)
