In [1]:
import os
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from tqdm import tqdm_notebook

In [2]:
MAX_REWARD_REACH = 175
BATCH_SIZE=256
GAMMA = 0.99 #discount factor for future utilities
env = gym.make('CartPole-v1') #Make environment
NUM_EPISODES = 3000 #number of episodes to run
MAX_STEPS = 10000 #max steps per episode
SOLVED_SCORE = 195 #score agent needs for environment to be solved
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" #device to run model on 
Model_Name = "Reinforce_Base_line"
LR = 1e-3

ENV_Name='CartPole-v0'

  deprecation(
  deprecation(


In [3]:
torch.manual_seed(2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
env = gym.make(ENV_Name)
env.seed(0)

  logger.warn(
  deprecation(


[0]

In [4]:
#Using a neural network to learn our policy parameters
class PolicyNetwork(nn.Module):
    
    #Takes in observations and outputs actions
    def __init__(self, observation_space, action_space):
        super(PolicyNetwork, self).__init__()
        self.input_layer = nn.Linear(observation_space, 128)
        self.output_layer = nn.Linear(128, action_space)
    
    #forward pass
    def forward(self, x):
        #input states
        x = self.input_layer(x)
        
        #relu activation
        x = F.relu(x)
        
        #actions
        actions = self.output_layer(x)
        
        #get softmax for a probability distribution
        action_probs = F.softmax(actions, dim=1)
        
        return action_probs

In [5]:
#Using a neural network to learn state value
class StateValueNetwork(nn.Module):
    
    #Takes in state
    def __init__(self, observation_space):
        super(StateValueNetwork, self).__init__()
        
        self.input_layer = nn.Linear(observation_space, 128)
        self.output_layer = nn.Linear(128, 1)
        
    def forward(self, x):
        #input layer
        x = self.input_layer(x)
        
        #activiation relu
        x = F.relu(x)
        
        #get state value
        state_value = self.output_layer(x)
        
        return state_value

In [6]:
def select_action(network, state):
    ''' Selects an action given current state
    Args:
    - network (Torch NN): network to process state
    - state (Array): Array of action space in an environment
    
    Return:
    - (int): action that is selected
    - (float): log probability of selecting that action given state and network
    '''
    
    #convert state to float tensor, add 1 dimension, allocate tensor on device
    state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
    
    #use network to predict action probabilities
    action_probs = network(state)
    state = state.detach()
    
    #sample an action using the probability distribution
    m = Categorical(action_probs)
    action = m.sample()
    
    #return action
    return action.item(), m.log_prob(action)

In [7]:
def process_rewards(rewards):
    ''' Converts our rewards history into cumulative discounted rewards
    Args:
    - rewards (Array): array of rewards 
    
    Returns:
    - G (Array): array of cumulative discounted rewards
    '''
    #Calculate Gt (cumulative discounted rewards)
    G = []
    
    #track cumulative reward
    total_r = 0
    
    #iterate rewards from Gt to G0
    for r in reversed(rewards):
        
        #Base case: G(T) = r(T)
        #Recursive: G(t) = r(t) + G(t+1)^DISCOUNT
        total_r = r + total_r * GAMMA
        
        #add to front of G
        G.insert(0, total_r)
    
    #whitening rewards
    G = torch.tensor(G).to(DEVICE)
    G = (G - G.mean())/G.std()
    
    return G

In [8]:
def train_policy(deltas, log_probs, optimizer):
    ''' Update policy parameters
    Args:
    - deltas (Array): difference between predicted stateval and actual stateval (Gt)
    - log_probs (Array): trajectory of log probabilities of action taken
    - optimizer (Pytorch optimizer): optimizer to update policy network parameters
    '''
    
    #store updates
    policy_loss = []
    
    #calculate loss to be backpropagated
    for d, lp in zip(deltas, log_probs):
        #add negative sign since we are performing gradient ascent
        policy_loss.append(-d * lp)
    
    #Backpropagation
    optimizer.zero_grad()
    sum(policy_loss).backward()
    optimizer.step()
    

In [9]:
def train_value(G, state_vals, optimizer):
    ''' Update state-value network parameters
    Args:
    - G (Array): trajectory of cumulative discounted rewards 
    - state_vals (Array): trajectory of predicted state-value at each step
    - optimizer (Pytorch optimizer): optimizer to update state-value network parameters
    '''
    
    #calculate MSE loss
    val_loss = F.mse_loss(state_vals, G)
        
    #Backpropagate
    optimizer.zero_grad()
    val_loss.backward()
    optimizer.step()

In [10]:
def train(iteration_no):
    ## rest env ###
    random_number = np.random.randint(1, 100)
    torch.manual_seed(random_number)
    env.seed(random_number)
    env.reset()
    Reward_max = 0

    #Init network
    policy_network = PolicyNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
    stateval_network = StateValueNetwork(env.observation_space.shape[0]).to(DEVICE)


    #Init optimizer
    policy_optimizer = optim.Adam(policy_network.parameters(), lr=LR)
    stateval_optimizer = optim.Adam(stateval_network.parameters(), lr=LR)

    scores = []
    Average_reward = []
    #recent 100 scores
    recent_scores = deque(maxlen=MAX_STEPS)

    #iterate through episodes
    for episode in tqdm_notebook(range(NUM_EPISODES)):

        #reset environment, initiable variables
        state = env.reset()
        trajectory = []
        score = 0
        rew_buffer = deque(maxlen=MAX_STEPS)

        #generate episode
        for step in range(MAX_STEPS):
            #env.render()

            #select action
            action, lp = select_action(policy_network, state)

            #execute action
            new_state, reward, done, _ = env.step(action)

            #track episode score
            score += reward

            #store into trajectory
            trajectory.append([state, action, reward, lp])

            #end episode
            if done:
                rew_buffer.append(score)
                break

            #move into new state
            state = new_state

        #append score
        scores.append(score)
        recent_scores.append(score)

        # # If we solved it lets just watch it play, put in last
        if episode % 100 == 0:
            if np.mean(rew_buffer) > MAX_REWARD_REACH:
                if np.mean(rew_buffer)> Reward_max : 
                    print(f'current_Reward_max {Reward_max}')
                    print(f'Trained_Reward_max {np.mean(rew_buffer)}')
                    Reward_max = np.mean(rew_buffer)

                    if np.mean(rew_buffer) >= MAX_REWARD_REACH:

                        checkpoint = {
                            "ENV_Name":ENV_Name,
                            "state_dict": policy_network.state_dict(),
                            "GAMMA":GAMMA,
                            "BATCH_SIZE":BATCH_SIZE,
                            "MAX_REWARD_REACH" :MAX_REWARD_REACH,
                            "LR" :LR}
                        model_name=f'iteration_{iteration_no}_{ENV_Name}_{Model_Name}_best'
                        save_checkpoint(checkpoint,model_name)


        #get items from trajectory
        states = [step[0] for step in trajectory]
        actions = [step[1] for step in trajectory]
        rewards = [step[2] for step in trajectory]
        lps = [step[3] for step in trajectory]

        #get discounted rewards
        G = process_rewards(rewards)
        #G = torch.tensor(G).to(DEVICE)

        #calculate state values and train statevalue network
        state_vals = []
        for state in states:
            state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
            state_vals.append(stateval_network(state))

        state_vals = torch.stack(state_vals).squeeze()

        train_value(G, state_vals, stateval_optimizer)

        #calculate deltas and train policy network
        deltas = [gt - val for gt, val in zip(G, state_vals)]
        deltas = torch.tensor(deltas).to(DEVICE)

        train_policy(deltas, lps, policy_optimizer)


        if episode % 10 == 0:
            if episode !=0:
#                 print(f'length--> {len(rew_buffer)}')
#                 print(f'average reward {rew_buffer[0]}')
                Average_reward.append(rew_buffer[0])
    
        if episode % 250 == 0:
            if episode !=0:
#                 print(f'length--> {len(rew_buffer)}')
                print(f'average reward {rew_buffer[0]}')
    
    checkpoint = {
        "ENV_Name":ENV_Name,
        "state_dict": policy_network.state_dict(),
        "GAMMA":GAMMA,
        "BATCH_SIZE":BATCH_SIZE,
        "MAX_REWARD_REACH" :MAX_REWARD_REACH,
        "LR" :LR}
    model_name=f'iteration_{iteration_no}_{ENV_Name}_{Model_Name}_last'
    save_checkpoint(checkpoint,model_name)
    
    return Average_reward,policy_network
    
def save_checkpoint(state,model_name):
    filename=f'./All_ckpt/{model_name}_checkpoint__.pth'
#     print("=> Saving checkpoint")
    torch.save(state, filename)    
    
    

In [None]:
Iterations =5
total_reward_list = []
for k in range(Iterations):
    print(f'\n\n\n  ITERATION   {k} \n\n ' )
    Average_reward,policy_network = train(k)
    Average_reward = np.array(Average_reward)
    total_reward_list.append(Average_reward)
    file_name = f'./npy/Q2_itertaion_{k}_{Model_Name}_{ENV_Name}_'
    np.save(f'{file_name}.npy', Average_reward)




  ITERATION   0 

 


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for episode in tqdm_notebook(range(NUM_EPISODES)):
  self.comm = Comm(**args)


  0%|          | 0/3000 [00:00<?, ?it/s]

  if not isinstance(terminated, (bool, np.bool8)):


average reward 124.0
current_Reward_max 0
Trained_Reward_max 200.0
average reward 200.0
average reward 200.0
average reward 200.0
average reward 200.0
average reward 183.0
average reward 200.0
average reward 200.0
average reward 165.0
average reward 200.0


In [None]:
Mean_Flag = True
mean_Plot_array = []
for k in range(Iterations):
    file_name = f'./npy/Q2_itertaion_{k}_{Model_Name}_{ENV_Name}_'
    loaded_array = np.load(f'{file_name}.npy')
    mean_Plot_array.append(loaded_array)

mean_scrs = np.mean(mean_Plot_array,axis = 0)

plt.plot(mean_scrs,label=f'{ENV_Name}')
plt.legend()
plt.ylabel ('Total Reward')
plt.xlabel('Episodes')
plt.title(f'{ENV_Name} with {Model_Name}(Average over {Iterations} runs)')
plt.savefig(f'{ENV_Name} with {Model_Name}.png')
plt.show()

In [None]:
obs = env.reset()
episode_length = 0
for i in range(5000):
    print(f'episode_length {episode_length}')
    episode_length +=1
#     obs = torch.as_tensor(obs, dtype=torch.float32).to(device)
    action,_ = select_action(policy_network, obs)
#     policy_network.act(obs)

    obs, _, done, _ = env.step(action)
    env.render()
    if done: 
        env.reset()
        episode_length = 0

KeyboardInterrupt: 