In [171]:
import numpy as np
import gym 
from tiles3 import tiles, IHT
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [172]:
maxSize = 2048
iht = IHT(maxSize)
weights = [0]*maxSize
numTilings = 8
stepSize = 0.1/numTilings

In [173]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_size, HIDDEN1, HIDDEN2, HIDDEN3, output_size):
        super(PolicyNetwork, self).__init__()
        
        self.fc1 = nn.Linear(input_size, HIDDEN1)
        self.fc2 = nn.Linear(HIDDEN1, HIDDEN2)
        self.fc3 = nn.Linear(HIDDEN2, HIDDEN3)
        self.fc4 = nn.Linear(HIDDEN3, output_size)
        self.relu = nn.ReLU()
        
    def forward(self,x):
        x = self.fc1(x)
        x=self.relu(x)
        x = self.fc2(x)
        x=self.relu(x)
        x= self.fc3(x)
        x=self.relu(x)
        x=self.fc4(x)
        x=F.softmax(x, dim=-1)
        return x
            

In [174]:
class ValueNetwork(nn.Module):
    def __init__(self, input_size, HIDDEN1, output_size):
        super(ValueNetwork, self).__init__()
        
        self.fc1 = nn.Linear(input_size, HIDDEN1)
        self.fc2 = nn.Linear(HIDDEN1, output_size)
        self.relu = nn.ReLU()
        
        def forward(self,x):
            x = self.fc1(x)
            x=self.relu(x)
            x = self.fc2(x)
            return x
      
            

In [175]:
actor = PolicyNetwork(8,12,24,32,3)
#critic = CriticNetwork(8,2,1)

In [176]:
print("The actor is ", actor)
print("\n")
print("The critic is ", critic)

The actor is  PolicyNetwork(
  (fc1): Linear(in_features=8, out_features=12, bias=True)
  (fc2): Linear(in_features=12, out_features=24, bias=True)
  (fc3): Linear(in_features=24, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=3, bias=True)
  (relu): ReLU()
)


The critic is  CriticNetwork(
  (fc1): Linear(in_features=8, out_features=2, bias=True)
  (fc2): Linear(in_features=2, out_features=1, bias=True)
  (relu): ReLU()
)


In [177]:
env = gym.make("MountainCar-v0")

In [186]:
observation, info = env.reset(seed=42)

In [187]:
action_space=[0,1,2]

In [188]:
def one_hot_action(action):
    one_hot_action = [0 for x in range(len(action_space))]
    one_hot_action[action]=1
    return one_hot_action

In [189]:
def features_policy(observations):
    tiling = tiles(iht, 8, observation)
    return tiling

In [190]:
# state = features_policy(observation)

In [191]:
# state = torch.tensor(state, dtype=torch.float32)

In [192]:
# action_probs = actor(state)

In [193]:
def sample_action(action_probs):
    action_probs = action_probs.detach().numpy()
    return np.random.choice(len(action_probs), p=action_probs)

In [194]:
for name, param in actor.named_parameters():
    if param.requires_grad:
        print(f"Parameter name: {name}")
        print(param.data)

Parameter name: fc1.weight
tensor([[ 0.1664,  0.1724,  0.1305,  0.1466, -0.2943, -0.1886,  0.2937,  0.0431],
        [ 0.3075,  0.2769, -0.2911, -0.0188,  0.1752, -0.2420,  0.1845,  0.2821],
        [ 0.2622,  0.2769, -0.1848, -0.3252,  0.0236, -0.2848,  0.3487, -0.0610],
        [ 0.2426, -0.1397, -0.0342, -0.1581, -0.1702, -0.0833,  0.0546, -0.0761],
        [-0.0570,  0.1548, -0.0181,  0.3204, -0.1521,  0.1709,  0.2561,  0.1764],
        [ 0.0640,  0.0749,  0.0649, -0.2897,  0.3161,  0.2509, -0.2524,  0.0157],
        [-0.0936, -0.1448, -0.3402, -0.1857, -0.0855, -0.1587,  0.2765, -0.0953],
        [ 0.1113, -0.1412,  0.0884,  0.1899,  0.0036, -0.1467, -0.3080,  0.0378],
        [-0.1546, -0.1727,  0.0036, -0.0638,  0.0581,  0.1744,  0.1564, -0.2835],
        [-0.1450, -0.1706, -0.3420,  0.2604,  0.1544,  0.1911, -0.2761,  0.2599],
        [ 0.3279, -0.0319,  0.3531,  0.2443, -0.1564, -0.2169, -0.1068, -0.3306],
        [ 0.0239,  0.0303,  0.2806,  0.2057, -0.1311, -0.2790, -0.0977,

In [130]:
observation, info = env.reset(seed=42)

In [195]:
terminated = False
trajectories=[]
steps=0
prob_log=[]
while terminated !=True:
    steps+=1
    observation=torch.tensor(features_policy(observation))
    observation=observation.float()
    action_probs = actor(observation)
    log_probs = torch.log(action_probs)
    current_action = sample_action(action_probs)
    prob_log.append(log_probs[current_action])
    next_state, reward, terminated, truncated, info = env.step(current_action)
    trajectories.append([observation,current_action,reward])
    if terminated !=True:
        observation=next_state
    else:
        break
    if steps>10000:
        break

In [196]:
print(f"The length of the episode was {len(trajectories)}")

The length of the episode was 10001


In [197]:
gamma = 0.99
alpha = 0.001

In [210]:
cumulative_reward=[]
net_reward = 0
for j in range(len(trajectories)):
    returns = 0
    for i in range(j,len(trajectories)):
        returns += (gamma**(i-1)) * trajectories[i][-1]
    cumulative_reward.append(returns)
for i in range(len(prob_log)):
    net_reward += prob_log[i] * cumulative_reward[i]

In [211]:
log_probs_tensor = torch.tensor(prob_log, requires_grad=True)
cumulative_reward_tensor = torch.tensor(cumulative_reward, requires_grad=True)
loss = -torch.sum(log_probs_tensor * cumulative_reward_tensor)
loss.backward()
optimizer.step()
optimizer.zero_grad()

  log_probs_tensor = torch.tensor(prob_log, requires_grad=True)
