In [18]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy" #required to run pygame without rendering
#must be set before importing pygame 
import game
import numpy as np
import random
import sys
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [19]:
# game = game.Game(user_control = False, render = False)

In [20]:
#d = DDPGAgent(2,2)

In [21]:
#d.select_action([0.2,0.5])

In [22]:
# a = Actor(2,2)
# a(torch.FloatTensor([0.2,0.5]))

In [23]:
# agent = DDPGAgent(2, 2)
# replay_buffer = ReplayBuffer(100)
# env = game.Game()

In [24]:
# np.mean([2,3,4])

In [25]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Add a new experience to the replay buffer."""
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Sample a batch of experiences from the replay buffer."""
        batch = random.sample(self.buffer, batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        return (
            np.stack(state_batch),
            np.stack(action_batch),
            np.stack(reward_batch),
            np.stack(next_state_batch),
            np.stack(done_batch)
        )

    def __len__(self):
        """Return the current size of the replay buffer."""
        return len(self.buffer)

class Actor(nn.Module):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, action_size)
        self.tanh = nn.Tanh()

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        output = torch.relu(self.fc2(x))
        return self.tanh(self.fc3(output))
#         print(output)
#         output[:, 0] = custom_activation1(output[:, 0])  # First neuron
#         output[:, 1] = custom_activation2(output[:, 1])  # Second neuron
#         return self.tanh(self.fc3(x))  

class Critic(nn.Module):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.fcs1 = nn.Linear(state_size, 400)
        self.fc2 = nn.Linear(400 + action_size, 300)
        self.fc3 = nn.Linear(300, 1)

    def forward(self, state, action):
        xs = torch.relu(self.fcs1(state))

        x = torch.cat((xs, action), dim=1)

        x = torch.relu(self.fc2(x))
        return torch.relu(self.fc3(x))

class DDPGAgent:
    def __init__(self, state_size, action_size):
        self.actor = Actor(state_size, action_size)
        self.critic = Critic(state_size, action_size)
        self.target_actor = Actor(state_size, action_size)
        self.target_critic = Critic(state_size, action_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # Initialize target networks with same weights as the original networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        
    def convert_action(self, action):  # Add 'self' here
        action[0] *= 5
        action[1] *= 20
        action[1] = np.abs(action[1])
        return action
        # Other parameters and buffers here...
    def select_action(self, state):
        self.actor.eval()
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state)  # Convert to tensor and add batch dimension
            #print(state_tensor)
            #state_tensor = self.convert_state(state_tensor)
            action = self.actor(state_tensor).cpu().data.numpy().flatten()
            action = self.convert_action(action)
            #add noise
            action += np.random.normal(0, scale=2, size=2)
        self.actor.train()
        return action
    
    def learn(self, replay_buffer, batch_size, gamma,printer = False):
        if len(replay_buffer) < batch_size:
            return  # Not enough samples in the replay buffer
        
        
        #Part 1: Sample stuff

        # Sample a batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = replay_buffer.sample(batch_size)

        # Convert numpy arrays to PyTorch tensors
        #all of these are 10 each because of batch_size of 10
        state_batch = torch.FloatTensor(state_batch) #[x,y] tensor

        action_batch = torch.FloatTensor(action_batch) #[delta x,delta y] tensor

        reward_batch = torch.FloatTensor(reward_batch) #[reward] tensor

        next_state_batch = torch.FloatTensor(next_state_batch)  #10x2[x,y] tensor of next states
        done_batch = torch.FloatTensor(done_batch) #[10 if_dones]

        
        reward_batch = reward_batch.view(-1, 1)  # Reshape to [batch_size, 1]
        done_batch = done_batch.view(-1, 1)      # Reshape to [batch_size, 1]
        
        #the target gets the actions
        
        
        #Part 2, get next actions
        # Compute target Q values using the target critic network
        with torch.no_grad():
            next_actions = self.target_actor(next_state_batch) #10 x 2 input, 10x2 output (all -1,-1?)
            target_q_values = self.target_critic(next_state_batch, next_actions)
#           

            #10,1 * 10,1 * 10,1
            target_q_values = reward_batch + gamma * (1 - done_batch) * target_q_values 
        #only place reward is
        
        ##############################################
        ###########################################
        #part 3, learn
        # Update the critic network
        current_q_values = self.critic(state_batch, action_batch) #makes sense
        critic_loss = F.mse_loss(current_q_values, target_q_values) 

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update the actor network
        predicted_actions = self.actor(state_batch)
        actor_loss = -self.critic(state_batch, predicted_actions).mean()
        if printer:
            #print('predicted_actions')
            #print(predicted_actions)
            print('critic_loss')
            print(critic_loss)
            print('actor_loss')
            print(actor_loss)

            print('reward batch avg')
            print(torch.mean(reward_batch).item())
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update the target networks
        self.soft_update(self.target_actor, self.actor)
        self.soft_update(self.target_critic, self.critic)

    def soft_update(self, target, source, tau=0.01):
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)



In [26]:
agent = DDPGAgent(2, 2)
replay_buffer = ReplayBuffer(1000)
env = game.Game()

In [27]:
for episode in range(10):
    state = env.reset()
    done = False
    steps = 0
    #for debugging
    rewards = []
    while not done:
        
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)
        #print(next_state)
        rewards.append(reward)
        if steps > 3000:
            print(episode)
            print('broken')
            print(np.mean(rewards))
            break
        if done:
            print('done!')
            print(episode,steps)
            print(np.mean(rewards))
            rewards = []
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        steps += 1
        batch_size = 10
        if len(replay_buffer) > batch_size:
            printer = False
            if steps % 100 == 0:
                printer = False #switch here
            agent.learn(replay_buffer,batch_size,0.1,printer)


done!
0 1304
-92235.80153256706
done!
1 659
-101840.86818181818
done!
2 739
-98914.94324324324
done!
3 470
-90173.59023354565
done!
4 282
-92042.33215547704
done!
5 266
-101255.85018726591
done!
6 274
-88117.08363636363
done!
7 1315
-92081.73480243162
done!
8 371
-104789.65053763441
9
broken
-115693.11292471686


In [28]:
#get action_list
state = env.reset()
done = False
episode_reward = 0
actions = []
steps = 0 
while not done and steps < 4000:
    # Select action based on the current state
    action = agent.select_action(state)
    actions.append(action)
    # Step through the environment with the selected action
    next_state, reward, done = env.step(action)

    # Update state and accumulate reward
    state = next_state
    episode_reward += reward
    steps +=1
    if done:
        print(f"Episode: {episode + 1}, Reward: {episode_reward}")
        break


Episode: 10, Reward: -36021876


In [29]:
len(actions)

422

In [30]:
import pickle

# Sample Python list

# Save the list to a file
with open('actions.pkl', 'wb') as file:
    pickle.dump(actions, file)
