In [134]:
import numpy as np 
import torch 
import torch.nn as nn
import matplotlib.pyplot as plt
import gym
import torch.optim as optim
import plotly.graph_objects as go
import pandas as pd


Pandas requires version '1.3.4' or newer of 'bottleneck' (version '1.3.2' currently installed).



In [112]:
class Policy(nn.Module):
    def __init__(self, STATE_DIMENSION, ACTION_DIMENSION):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(STATE_DIMENSION, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, ACTION_DIMENSION)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.softmax(self.fc3(x), dim=0)
        return x
        

In [113]:
class StateValue(nn.Module):
    def __init__(self, STATE_DIMENSION):
        super(StateValue, self).__init__()
        self.fc1 = nn.Linear(STATE_DIMENSION, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [114]:
# env = gym.make("MountainCar-v0")
#Make environment
env = gym.make('CartPole-v1')

In [116]:
STATE_DIMENSION = env.observation_space.shape[0]
ACTION_DIMENSION = env.action_space.n
ACTIONS = np.arange(ACTION_DIMENSION)

In [125]:
REPLAY_BUFFER = []
gamma = 0.99
alpha_w = 0.001
alpha_theta = 0.001
MAX_EPISODES = 1000
MAX_EPISODES_STEPS =10000

In [126]:
policy = Policy(STATE_DIMENSION, ACTION_DIMENSION)
state_value = StateValue(STATE_DIMENSION)

In [127]:
policy_optimizer = optim.SGD(policy.parameters(), lr = alpha_w)
statevalue_optimizer = optim.SGD(state_value.parameters(), lr= alpha_theta)

In [128]:
def initialize():
    factor =1
    net_reward = []
    current_state, info = env.reset()
    terminated = False
    current_state = torch.cat([torch.tensor([current_state]), torch.tensor([0])])

In [129]:
def episode():
    factor =1
    net_reward = []
    current_state = env.reset()
    terminated = False
    # current_state = torch.cat([torch.tensor([current_state]), torch.tensor([0])])
    steps = 0
    while terminated!=  True: 
        if steps > MAX_EPISODES_STEPS:
            break
        actionProbabilities = policy(torch.tensor(current_state))
        action = np.random.choice(ACTIONS, p=actionProbabilities.detach().numpy())
        next_state, reward, terminated, truncated = env.step(action)
        REPLAY_BUFFER.append([current_state, action, reward, next_state, terminated])
        delta = reward + gamma * state_value(torch.tensor(next_state)) - state_value(torch.tensor(current_state))
        # Critic update
        policy_loss = -torch.log(actionProbabilities[action]) * delta
        critic_loss = delta**2
        policy_loss *= factor
        critic_loss *= factor
        torch.autograd.set_detect_anomaly(True)
        
        policy_optimizer.zero_grad()
        policy_loss.backward(retain_graph=True)
        policy_optimizer.step()
        statevalue_optimizer.zero_grad()
        critic_loss.backward()
        statevalue_optimizer.step()
        current_state = next_state
        factor *= gamma
        net_reward.append(reward)
        steps +=1
    return net_reward

In [130]:
def main():
    episode_rewards = []
    for episodes in range(MAX_EPISODES):
        net_reward = episode()
        if episodes % 100 == 0:
            print("Episode: ", episodes, "Reward: ", sum(net_reward))
        episode_rewards.append(sum(net_reward))
    return episode_rewards

In [131]:
episode_rewards = main()

Episode:  0 Reward:  19.0
Episode:  100 Reward:  12.0
Episode:  200 Reward:  21.0
Episode:  300 Reward:  44.0
Episode:  400 Reward:  31.0
Episode:  500 Reward:  64.0
Episode:  600 Reward:  75.0
Episode:  700 Reward:  42.0
Episode:  800 Reward:  52.0
Episode:  900 Reward:  37.0


In [135]:
window_size = 7  
moving_avg = pd.Series(episode_rewards).rolling(window=window_size).mean()

epochs = list(range(len(episode_rewards)))
fig = go.Figure()

fig.add_trace(go.Scatter(x=epochs, y=episode_rewards, mode='lines', name='Actor Critic Rewards'))

fig.add_trace(go.Scatter(x=epochs, y=moving_avg, mode='lines', name='Moving Average', line=dict(color='red')))

fig.update_layout(
    title='Loss vs. Epochs',
    xaxis=dict(title='Epochs'),
    yaxis=dict(title='Loss'),
    legend=dict(x=0.7, y=1.0),
    margin=dict(l=20, r=20, t=40, b=20),
    hovermode='x unified' 
)
fig.show()

In [136]:
epochs = np.array(epochs)
episode_rewards = np.array(episode_rewards)
slope, intercept = np.polyfit(epochs, episode_rewards, 1)
fitted_line = slope * epochs + intercept

fig = go.Figure()

fig.add_trace(go.Scatter(x=epochs, y=episode_rewards, mode='lines', name='Actor Critic Rewards'))

fig.add_trace(go.Scatter(x=epochs, y=fitted_line, mode='lines', name='Fitted Line', line=dict(color='red')))

fig.update_layout(
    title='Loss vs. Epochs with Fitted Line',
    xaxis=dict(title='Epochs'),
    yaxis=dict(title='Loss'),
    legend=dict(x=0.7, y=1.0),
    margin=dict(l=20, r=20, t=40, b=20),
    hovermode='x unified' 
)
fig.show()