In [60]:
import pygame
import numpy as np
import random
import sys
import time
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

<h1>Assets

In [62]:
# Set the size of the window and some basic parameters
size = width, height = 640, 480
speed = [0, 0]
gravity = 0.5

# Colors
black = (0, 0, 0)
blue = (0, 0, 255)
red = (255, 0, 0)
green = (0, 255, 0)
white = (255, 255, 255)
gold = (255, 215, 0)  # Color for the goal

# Create a window
screen = pygame.display.set_mode(size)

# Set the title of the window
pygame.display.set_caption('Pygame Ball Rolling Example')

# Load an image (square)
square = pygame.Surface((50, 50))
square.fill(red)
square_rect = square.get_rect()
print(square_rect)
# Define platforms as a list of rectangles
platforms = [
    pygame.Rect(0, height - 20, width, 20),  # Add ground platform
    pygame.Rect(height, 300, 200, 20), #x of top left, y of top left, width, height
    pygame.Rect(350, 200, 150, 20),
    pygame.Rect(200, 100, 200, 20)
    #pygame.Rect(0,0,20,height),
    #pygame.Rect(width-20,0,20,height),
    #pygame.Rect(0,0,width,20),

]
goal = pygame.Rect(platforms[-1].centerx - 25, platforms[-1].y - 50, 50, 50)
ground_platform_top = platforms[0].top  # The top of the ground platform
square_rect = square.get_rect(midbottom=(width // 2, ground_platform_top))


<rect(0, 0, 50, 50)>


<h1>Functions

In [63]:
# Function to draw platforms
def draw_platforms():
    for p in platforms:
        pygame.draw.rect(screen, green, p)

def check_square_collisions():
    global speed
    for platform in platforms:
        if square_rect.colliderect(platform):
            # Check if falling (i.e., moving downwards)
            if speed[1] > 0 and square_rect.bottom <= platform.top:
                square_rect.bottom = platform.top
                speed[1] = 0
                print(platform.top, square_rect.bottom, platform.bottom)
            # Check if moving upwards
            elif speed[1] < 0 and square_rect.top >= platform.bottom:
                square_rect.top = platform.bottom
                speed[1] = 0
            # Check if hitting from the side

# elif speed[0] != 0:
#     if square_rect.left < platform.right:
#         square_rect.left = platform.right
#         speed[0] = 0  # Stop horizontal movement or change direction as needed
#     elif square_rect.right > platform.left:
#         square_rect.right = platform.left
#         speed[0] = 0  # Stop horizontal movement or change direction as needed

In [64]:
def sample():
    x_scale = (np.random.rand()+1)*20
    y_scale = (np.random.rand()+1) * 10
    x = np.random.rand() * random.sample([-1,1],1)[0] * x_scale
    y = np.random.rand() * random.sample([-1,1],1)[0] * y_scale
    return [x,y]

<h1>RL Functions</h1>

In [65]:
class ValueNetwork(nn.Module):
    def __init__(self):
        super(ValueNetwork, self).__init__()
        # you may change this if you want, but you shouldn't need to
        self.layer1 = nn.Linear(2, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, 1) #need a way to output variable amount

    def forward(self, state : torch.Tensor, is_terminal : bool) -> torch.Tensor:
        # is_terminal is required for the gym interface
        if is_terminal:
            return torch.tensor([0])
        hidden = F.relu(self.layer1(state))
        hidden = F.relu(self.layer2(hidden))
        return self.layer3(hidden)

In [66]:
def custom_activation1(x):
    # Scales output to range -5 to 5
    return torch.tanh(x) * 5

def custom_activation2(x):
    # Scales output to range 0 to 20
    return torch.sigmoid(x) * 20

class CustomNet(nn.Module):
    def __init__(self):
        super(CustomNet, self).__init__()
        # Define your network layers here
        self.layer1 = nn.Linear(4, 128)  # Example layer
        self.layer2 = nn.Linear(128,128)
        self.output_layer = nn.Linear(128, 1)  # Output layer with 1 neuron

    def forward(self, x):
        x = F.relu(self.layer1(x))  # Example activation
        x = F.relu(self.layer2(x))
        output = F.relu(self.output_layer(x))
#         output = self.output_layer(x)
#         output = F.relu(output)
#         # Apply the custom activation functions to each output neuron
#         output[:, 0] = custom_activation1(output[:, 0])  # First neuron
#         output[:, 1] = custom_activation2(output[:, 1])  # Second neuron
        return output
model = CustomNet()
# Proceed with model training, evaluation, etc.


In [67]:
with torch.no_grad():
    print(model.forward(torch.tensor([[2.0,3,4,2]])))

tensor([[0.]])


Use recurrent neural network to generate action sequences 

In [68]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, action_size)
        self.tanh = nn.Tanh()

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        output = torch.relu(self.fc2(x))
        output[:, 0] = custom_activation1(output[:, 0])  # First neuron
        output[:, 1] = custom_activation2(output[:, 1])  # Second neuron
        return self.tanh(self.fc3(x))  # Assuming action space is normalized between -1 and 1


class Critic(nn.Module):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.fcs1 = nn.Linear(state_size, 400)
        self.fc2 = nn.Linear(400 + action_size, 300)
        self.fc3 = nn.Linear(300, 1)

    def forward(self, state, action):
        xs = torch.relu(self.fcs1(state))
        x = torch.cat((xs, action), dim=1)
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

class DDPGAgent:
    def __init__(self, state_size, action_size):
        self.actor = Actor(state_size, action_size)
        self.critic = Critic(state_size, action_size)
        self.target_actor = Actor(state_size, action_size)
        self.target_critic = Critic(state_size, action_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # Initialize target networks with same weights as the original networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Other parameters and buffers here...
    def select_action(self, state):
        self.actor.eval()
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)  # Convert to tensor and add batch dimension
            action = self.actor(state_tensor).cpu().data.numpy().flatten()
        self.actor.train()
        return action


    # Implement methods for training, updating networks, etc.
#to train,
#get action
# agent = DDPGAgent(state_size, action_size)
# replay_buffer = ReplayBuffer(capacity)

# for episode in range(num_episodes):
#     state = env.reset()
#     while not done:
#         action = agent.select_action(state)
#         next_state, reward, done = env.step(action)  
#         replay_buffer.push(state, action, reward, next_state, done)
#         state = next_state

#         if len(replay_buffer) > batch_size:
#             agent.learn(replay_buffer)


In [69]:
#Loop
#s = current state (x,y,abc), do abc later
#take action a:
    #if random, take random action
    #else, take 


In [70]:
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((np.array(point1) - np.array(point2))**2))

<h1>Main Game Loop

In [71]:
# time.sleep(5)
# print('hi')

In [59]:
pygame.init()


# Main game loop
running = True
while running:
    # Handle events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
            
            
    curr_state = [square_rect.x, square_rect.y]       
            
            
            
    samp = sample()
    user_control = True
    if user_control:
    # Move the square with arrow keys
        keys = pygame.key.get_pressed()
        if keys[pygame.K_LEFT]:
            square_rect.x -= 2
        if keys[pygame.K_RIGHT]:
            square_rect.x += 2
    else:
     #random action
        square_rect.x += samp[0]
    
    #prevent off screen
    if square_rect.left < 0:
        square_rect.left = 0
    if square_rect.right > width:
        square_rect.right = width

    # Apply gravity
    speed[1] += gravity

    # Assume we're in the air unless we collide with a platform
    on_ground = False
    
#     for platform in platforms:
#         # Predict the next position
#         next_rect = square_rect.move(speed)

#         if next_rect.colliderect(platform):
#             # Check if falling (i.e., moving downwards)
#             if speed[1] > 0 and next_rect.bottom > platform.top:
#                 # Place the square on top of the platform
#                 square_rect.bottom = platform.top
#                 speed[1] = 0
#                 on_ground = True
#                 break  # No need to check other platforms
#             # Check if moving upwards and collides with the bottom of the platform
#             elif speed[1] < 0 and next_rect.top < platform.bottom:
#                 # Place the square just below the platform
#                 square_rect.top = platform.bottom
#                 speed[1] = 0
#                 break  # No need to check other platforms

    # Apply the vertical speed to the square's position if not on the ground
    if not on_ground:
        square_rect.y += int(speed[1])
            
   # Allow jumping if on the ground
    if user_control:
        if on_ground and keys[pygame.K_SPACE]:
            speed[1] -= 20
    else:
        if on_ground:
            speed[1] = -samp[1]

        
        
    # Check if the square has reached the goal
    if square_rect.colliderect(goal):
        running = False  # End the game
    
    
    
    #All actions and effects completed
    next_state = [square_rect.x,square_rect.y]
    #get distance from goal
    #print(goal.x)
    
    distance = (square_rect.x - goal.x)**2 + (square_rect.y - goal.y)**2
    print(distance)
    
    
    
    print(curr_state, next_state)
    time.sleep(0.2)
    
    
    
    ##DRAW FUNCTIONS
    ##DRAW FUNCTIONS
    ##DRAW FUNCTIONS
    ##DRAW FUNCTIONS
    
    
    # Fill the screen with a color
    screen.fill(blue)
                                                    
    # Draw the platforms
    draw_platforms()

    # Draw the goal
    pygame.draw.rect(screen, gold, goal)
    # Draw the square
    screen.blit(square, square_rect)

    # Update the display
    pygame.display.flip()

    # Cap the frame rate
    pygame.time.Clock().tick(60)

# Quit Pygame
pygame.quit()
sys.exit()

137700
[365, 411] [365, 410]


error: display Surface quit

In [14]:
# pygame.init()


# # Main game loop
# running = True
# while running:
#     # Handle events
#     for event in pygame.event.get():
#         if event.type == pygame.QUIT:
#             running = False
            
            
#     curr_state = [square_rect.x, square_rect.y]       
            
            
            
#     samp = sample()
#     user_control = True
#     if user_control:
#     # Move the square with arrow keys
#         keys = pygame.key.get_pressed()
#         if keys[pygame.K_LEFT]:
#             square_rect.x -= 2
#         if keys[pygame.K_RIGHT]:
#             square_rect.x += 2
#     else:
#      #random action
#         square_rect.x += samp[0]
    
#     #prevent off screen
#     if square_rect.left < 0:
#         square_rect.left = 0
#     if square_rect.right > width:
#         square_rect.right = width

#     # Apply gravity
#     speed[1] += gravity

#     # Assume we're in the air unless we collide with a platform
#     on_ground = False
    
#     for platform in platforms:
#         # Predict the next position
#         next_rect = square_rect.move(speed)

#         if next_rect.colliderect(platform):
#             # Check if falling (i.e., moving downwards)
#             if speed[1] > 0 and next_rect.bottom > platform.top:
#                 # Place the square on top of the platform
#                 square_rect.bottom = platform.top
#                 speed[1] = 0
#                 on_ground = True
#                 break  # No need to check other platforms
#             # Check if moving upwards and collides with the bottom of the platform
#             elif speed[1] < 0 and next_rect.top < platform.bottom:
#                 # Place the square just below the platform
#                 square_rect.top = platform.bottom
#                 speed[1] = 0
#                 break  # No need to check other platforms

#     # Apply the vertical speed to the square's position if not on the ground
#     if not on_ground:
#         square_rect.y += int(speed[1])
            
#    # Allow jumping if on the ground
#     if user_control:
#         if on_ground and keys[pygame.K_SPACE]:
#             speed[1] -= 20
#     else:
#         if on_ground:
#             speed[1] = -samp[1]

        
        
#     # Check if the square has reached the goal
#     if square_rect.colliderect(goal):
#         running = False  # End the game
    
    
    
#     #All actions and effects completed
#     next_state = [square_rect.x,square_rect.y]
#     #get distance from goal
#     #print(goal.x)
    
#     distance = (square_rect.x - goal.x)**2 + (square_rect.y - goal.y)**2
#     print(distance)
    
    
    
#     print(curr_state, next_state)
#     time.sleep(0.2)
    
    
    
#     ##DRAW FUNCTIONS
#     ##DRAW FUNCTIONS
#     ##DRAW FUNCTIONS
#     ##DRAW FUNCTIONS
    
    
#     # Fill the screen with a color
#     screen.fill(blue)
                                                    
#     # Draw the platforms
#     draw_platforms()

#     # Draw the goal
#     pygame.draw.rect(screen, gold, goal)
#     # Draw the square
#     screen.blit(square, square_rect)

#     # Update the display
#     pygame.display.flip()

#     # Cap the frame rate
#     pygame.time.Clock().tick(60)

# # Quit Pygame
# pygame.quit()
# sys.exit()

NameError: name 'square_rect' is not defined

In [56]:


# Set SDL to use the dummy NULL video driver, so it doesn't need a windowing system.
#os.environ["SDL_VIDEODRIVER"] = "dummy"
class Game:
    def __init__(self):
        pygame.init()
        # Initialize other game components here (like screen, square_rect, platforms, etc.)
        # Set the size of the window and some basic parameters
        self.size = self.width, self.height = 640, 480
        self.speed = [0, 0]
        self.gravity = 0.5

        # Colors
        self.black = (0, 0, 0)
        self.blue = (0, 0, 255)
        self.red = (255, 0, 0)
        self.green = (0, 255, 0)
        self.white = (255, 255, 255)
        self.gold = (255, 215, 0)  # Color for the goal

        # Create a window
        self.screen = pygame.display.set_mode(self.size)

        # Set the title of the window
        pygame.display.set_caption('Pygame Ball Rolling Example')

        # Load an image (square)
        self.square = pygame.Surface((50, 50))
        self.square.fill(self.red)
        self.square_rect = self.square.get_rect()
        #print(square_rect)
        # Define platforms as a list of rectangles
        self.platforms = [
            pygame.Rect(0, self.height - 20, self.width, 20),  # Add ground platform
            pygame.Rect(self.height, 300, 200, 20), #x of top left, y of top left, width, height
            pygame.Rect(350, 200, 150, 20),
            pygame.Rect(200, 100, 200, 20)
            #pygame.Rect(0,0,20,height),
            #pygame.Rect(width-20,0,20,height),
            #pygame.Rect(0,0,width,20),

        ]
        self.goal = pygame.Rect(self.platforms[-1].centerx - 25, self.platforms[-1].y - 50, 50, 50)
        self.ground_platform_top = self.platforms[0].top  # The top of the ground platform
        self.square_rect = self.square.get_rect(midbottom=(self.width // 2, self.ground_platform_top))
        self.init_state = self.square_rect.x , self.square_rect.y
        
    def reset(self):
        self.speed = [0, 0]
        self.gravity = 0.5
        self.square_rect.x = self.init_state[0]
        self.square_rect.y = self.init_state[1]
        print(self.square_rect.x, self.square_rect.y)
        return (self.square_rect.x, self.square_rect.y)

    def handle_events(self):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                return False
        return True
    
    def action_sample(self):
        x_scale = (np.random.rand()+1)*20
        y_scale = (np.random.rand()+1) * 10
        x = np.random.rand() * random.sample([-1,1],1)[0] * x_scale
        y = np.random.rand() * random.sample([-1,1],1)[0] * y_scale
        return [x,y]

    def update_game_state(self,action):
        # Sample code from your script
        curr_state = [self.square_rect.x, self.square_rect.y]
        print(curr_state)
        #samp = sample()
        user_control = False

        if user_control:
            keys = pygame.key.get_pressed()
            if keys[pygame.K_LEFT]:
                self.square_rect.x -= 2
            if keys[pygame.K_RIGHT]:
                self.square_rect.x += 2
        else:
            self.square_rect.x += action[0]

        if self.square_rect.left < 0:
            self.square_rect.left = 0
        if self.square_rect.right > self.width:
            self.square_rect.right = self.width

        self.speed[1] += self.gravity
        on_ground = False

        for platform in self.platforms:
            next_rect = self.square_rect.move(self.speed)
            if next_rect.colliderect(platform):
                        # Check if falling (i.e., moving downwards)
                        if speed[1] > 0 and next_rect.bottom > platform.top:
                            # Place the square on top of the platform
                            square_rect.bottom = platform.top
                            speed[1] = 0
                            on_ground = True
                            break  # No need to check other platforms
                        # Check if moving upwards and collides with the bottom of the platform
                        elif speed[1] < 0 and next_rect.top < platform.bottom:
                            # Place the square just below the platform
                            square_rect.top = platform.bottom
                            speed[1] = 0
                            break  # No need to check other platforms            # Rest of your collision and platform logic...

        if not on_ground:
            self.square_rect.y += int(self.speed[1])

        if user_control and on_ground and keys[pygame.K_SPACE]:
            self.speed[1] -= 20
        elif not user_control and on_ground:
            self.speed[1] = -action[1]
        
        #If goal
        done = False
        if self.square_rect.colliderect(self.goal):
            self.running = False
            done = True

        next_state = [self.square_rect.x, self.square_rect.y]
        distance = (self.square_rect.x - self.goal.x) ** 2 + (self.square_rect.y - self.goal.y) ** 2
        print(distance)
        print(curr_state, next_state)
        #time.sleep(0.2)
        reward = -distance
        return next_state, reward, done

    def render(self):
        print('rendered')
        self.screen.fill(self.blue)
        for p in self.platforms:
            pygame.draw.rect(self.screen, self.green, p)
        pygame.draw.rect(self.screen, self.gold, self.goal)
        self.screen.blit(self.square, self.square_rect)
        pygame.display.flip()

    def step(self,action):
        running = self.handle_events()
        if not running:
            return False

        next_state, reward, done = self.update_game_state(action)
        
        self.render()

        pygame.time.Clock().tick(60)
        return next_state,reward,done

    def run(self):
        self.running = True
        while self.running:
            self.running = self.step(sample())

        pygame.quit()
        sys.exit()
        
game = Game()
game.run()

# agent = DDPGAgent(2, 2)
# replay_buffer = ReplayBuffer(300)
# num_episodes = 1
# env = Game()
# for episode in range(num_episodes):
#     state = env.reset()
#     done = False
#     while not done:
#         action = agent.select_action(state)
#         next_state, reward, done = env.step(action)  


# #Have part above, now just need to have it do n step learning and look into wtf replay buffer is 
#         replay_buffer.push(state, action, reward, next_state, done)
#         print(replay_buffer)
#         state = next_state

#         if len(replay_buffer) > batch_size:
#             agent.learn(replay_buffer,batch_size,0.001)

[295, 410]
130176
[295, 410] [299, 410]
rendered
[299, 410]
132002
[299, 410] [316, 411]
rendered
[316, 411]
131573
[316, 411] [298, 412]
rendered
[298, 412]
132497
[298, 412] [274, 414]
rendered
[274, 414]
134181
[274, 414] [260, 416]
rendered
[260, 416]
136197
[260, 416] [281, 419]
rendered
[281, 419]
139060
[281, 419] [301, 422]
rendered
[301, 422]
141425
[301, 422] [268, 426]
rendered
[268, 426]
144724
[268, 426] [293, 430]
rendered
[293, 430]
150074
[293, 430] [318, 435]
rendered
[318, 435]
153189
[318, 435] [308, 440]
rendered
[308, 440]
158665
[308, 440] [318, 446]
rendered
[318, 446]
165960
[318, 446] [341, 452]
rendered
[341, 452]
168437
[341, 452] [309, 459]
rendered
[309, 459]
174905
[309, 459] [318, 466]
rendered
[318, 466]
184537
[318, 466] [344, 474]
rendered
[344, 474]
191113
[344, 474] [342, 482]
rendered
[342, 482]
197617
[342, 482] [331, 491]
rendered
[331, 491]
204804
[331, 491] [323, 500]
rendered
[323, 500]
212384
[323, 500] [303, 510]
rendered
[303, 510]
221576
[3

[134, 7216]
52570202
[134, 7216] [126, 7299]
rendered
[126, 7299]
53787465
[126, 7299] [104, 7382]
rendered
[104, 7382]
55029097
[104, 7382] [96, 7466]
rendered


SystemExit: 

In [18]:
game = Game()
game.run()

[295, 410]
129744
[295, 410] [287, 410]
[287, 410]
130517
[287, 410] [289, 411]
[289, 411]
131240
[289, 411] [261, 412]
[261, 412]
132497
[261, 412] [276, 414]
[276, 414]
134077
[276, 414] [264, 416]
[264, 416]
136282
[264, 416] [286, 419]
[286, 419]
139408
[286, 419] [307, 422]
[307, 422]
143876
[307, 422] [325, 426]
[325, 426]
145769
[325, 426] [312, 430]
[312, 430]
149594
[312, 430] [312, 435]
[312, 435]
155581
[312, 435] [334, 440]
[334, 440]
164560
[334, 440] [363, 446]
[363, 446]
168004
[363, 446] [355, 452]
[355, 452]
176306
[355, 452] [370, 459]
[370, 459]
180452
[370, 459] [361, 466]
[361, 466]
188425
[361, 466] [368, 474]
[368, 474]
196228
[368, 474] [373, 482]
[373, 482]
207477
[373, 482] [389, 491]
[389, 491]
222100
[389, 491] [415, 500]
[415, 500]
238169
[415, 500] [438, 510]
[438, 510]
251176
[438, 510] [449, 520]
[449, 520]
260945
[449, 520] [447, 531]
[447, 531]
278545
[447, 531] [466, 542]
[466, 542]
296041
[466, 542] [480, 554]
[480, 554]
316881
[480, 554] [500, 566]


[139, 10112]
103262821
[139, 10112] [145, 10211]
[145, 10211]
105289504
[145, 10211] [127, 10310]
[127, 10310]
107348921
[127, 10310] [136, 10410]
[136, 10410]
109434704
[136, 10410] [123, 10510]
[123, 10510]
111554602
[123, 10510] [134, 10611]
[134, 10611]
113707828
[134, 10611] [103, 10712]
[103, 10712]
115882465
[103, 10712] [138, 10814]
[138, 10814]
118089837
[138, 10814] [134, 10916]
[134, 10916]
120336122
[134, 10916] [144, 11019]
[144, 11019]
122602180
[144, 11019] [161, 11122]
[161, 11122]
124925777
[161, 11122] [124, 11226]
[124, 11226]
127260601
[124, 11226] [126, 11330]
[126, 11330]
129644794
[126, 11330] [112, 11435]
[112, 11435]
132037261
[112, 11435] [144, 11540]
[144, 11540]
134483345
[144, 11540] [148, 11646]
[148, 11646]
136950493
[148, 11646] [158, 11752]
[158, 11752]
139462885
[158, 11752] [173, 11859]
[173, 11859]
141997780
[173, 11859] [193, 11966]
[193, 11966]
144586180
[193, 11966] [177, 12074]
[177, 12074]
147197305
[177, 12074] [166, 12182]
[166, 12182]
1498628

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [131]:
for i in range(25):
    game.step(game.action_sample())
    
print("*************")
game.reset()
print('*************')
for i in range(5):
    game.step(game.action_sample())

[295, 410]
130756
[295, 410] [309, 410]
[309, 410]
130682
[309, 410] [294, 411]
[294, 411]
131080
[294, 411] [269, 412]
[269, 412]
133585
[269, 412] [242, 414]
[242, 414]
135112
[242, 414] [241, 416]
[241, 416]
136737
[241, 416] [251, 419]
[251, 419]
139540
[251, 419] [241, 422]
[241, 422]
141700
[241, 422] [257, 426]
[257, 426]
144409
[257, 426] [272, 430]
[272, 430]
148709
[272, 430] [253, 435]
[253, 435]
152101
[253, 435] [276, 440]
[276, 440]
157441
[276, 440] [300, 446]
[300, 446]
162760
[300, 446] [309, 452]
[309, 452]
170306
[309, 452] [330, 459]
[330, 459]
177152
[330, 459] [339, 466]
[339, 466]
181145
[339, 466] [312, 474]
[312, 474]
187353
[312, 474] [302, 482]
[302, 482]
194650
[302, 482] [288, 491]
[288, 491]
202600
[288, 491] [265, 500]
[265, 500]
212129
[265, 500] [252, 510]
[252, 510]
221096
[252, 510] [261, 520]
[261, 520]
231365
[261, 520] [273, 531]
[273, 531]
242233
[273, 531] [288, 542]
[288, 542]
254212
[288, 542] [289, 554]
[289, 554]
268657
[289, 554] [324, 566]


In [92]:
# max_moves = 3000
# batch_size = 30
# agent = DDPGAgent()
# for episode in range(num_episodes):
#     state = env.reset()
#     counter = 0 
#     while not done or counter < max_moves :
#         action = agent.select_action(state)
#         next_state, reward, done = env.step(action)  
#         replay_buffer.push(state, action, reward, next_state, done)
#         state = next_state
#         counter += 1
#         if len(replay_buffer) > batch_size:
#             agent.learn(replay_buffer)

In [142]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Add a new experience to the replay buffer."""
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Sample a batch of experiences from the replay buffer."""
        batch = random.sample(self.buffer, batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        return (
            np.stack(state_batch),
            np.stack(action_batch),
            np.stack(reward_batch),
            np.stack(next_state_batch),
            np.stack(done_batch)
        )

    def __len__(self):
        """Return the current size of the replay buffer."""
        return len(self.buffer)
    
class Actor(nn.Module):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, action_size)
        self.tanh = nn.Tanh()

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        output = torch.relu(self.fc2(x))
        output[:, 0] = custom_activation1(output[:, 0])  # First neuron
        output[:, 1] = custom_activation2(output[:, 1])  # Second neuron
        return self.tanh(self.fc3(output))  # Assuming action space is normalized between -1 and 1


class Critic(nn.Module):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.fcs1 = nn.Linear(state_size, 400)
        self.fc2 = nn.Linear(400 + action_size, 300)
        self.fc3 = nn.Linear(300, 1)

    def forward(self, state, action):
        xs = torch.relu(self.fcs1(state))
        x = torch.cat((xs, action), dim=1)
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

class DDPGAgent:
    def __init__(self, state_size, action_size):
        self.actor = Actor(state_size, action_size)
        self.critic = Critic(state_size, action_size)
        self.target_actor = Actor(state_size, action_size)
        self.target_critic = Critic(state_size, action_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # Initialize target networks with same weights as the original networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Other parameters and buffers here...
    def select_action(self, state):
        self.actor.eval()
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)  # Convert to tensor and add batch dimension
            action = self.actor(state_tensor).cpu().data.numpy().flatten()
        self.actor.train()
        return action
    
    def learn(self, replay_buffer, batch_size, gamma):
        if len(replay_buffer) < batch_size:
            return  # Not enough samples in the replay buffer

        # Sample a batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = replay_buffer.sample(batch_size)

        # Convert numpy arrays to PyTorch tensors
        state_batch = torch.FloatTensor(state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        done_batch = torch.FloatTensor(done_batch)

        # Compute target Q values using the target critic network
        with torch.no_grad():
            next_actions = self.target_actor(next_state_batch)
            target_q_values = self.target_critic(next_state_batch, next_actions)
            target_q_values = reward_batch + gamma * (1 - done_batch) * target_q_values

        # Update the critic network
        current_q_values = self.critic(state_batch, action_batch)
        critic_loss = F.mse_loss(current_q_values, target_q_values)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update the actor network
        predicted_actions = self.actor(state_batch)
        actor_loss = -self.critic(state_batch, predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update the target networks
        self.soft_update(self.target_actor, self.actor)
        self.soft_update(self.target_critic, self.critic)

    def soft_update(self, target, source, tau=0.001):
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)



    # Implement methods for training, updating networks, etc.
#to train,
#get action
torch.autograd.set_detect_anomaly(True)
agent = DDPGAgent(2, 2)
replay_buffer = ReplayBuffer(300)
num_episodes = 1
env = Game()
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)  


#Have part above, now just need to have it do n step learning and look into wtf replay buffer is 
        replay_buffer.push(state, action, reward, next_state, done)
        print(replay_buffer)
        state = next_state

        if len(replay_buffer) > batch_size:
            agent.learn(replay_buffer,batch_size,0.001)


295 410
[295, 410]
129961
[295, 410] [294, 410]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[294, 410]
130645
[294, 410] [293, 411]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[293, 411]
131333
[293, 411] [292, 412]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[292, 412]
132752
[292, 412] [291, 414]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[291, 414]
134181
[291, 414] [290, 416]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[290, 416]
136357
[290, 416] [289, 419]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[289, 419]
138553
[289, 419] [288, 422]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[288, 422]
141520
[288, 422] [287, 426]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[287, 426]
144521
[287, 426] [286, 430]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[286, 430]
148325
[286, 430] [285, 435]
<__main__.ReplayBuffer object at 0x000001CAF90A7730>
[285, 435]
152181
[285, 435] [284, 440]
<__main__.ReplayBuffer

  critic_loss = F.mse_loss(current_q_values, target_q_values)
  File "C:\Users\mikef\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\mikef\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\mikef\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\mikef\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "C:\Users\mikef\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
    self.io_loop.start()
  File "C:\Users\mikef\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\mikef\anaconda3\lib\asyncio\base_events.py", line 596, in run_forever
    self._run_once()
  File "C:\Users\mikef\anaconda3\lib\asyncio\base_events.py", line 1890, in _run_once
    ha

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [30, 300]], which is output 0 of ReluBackward0, is at version 2; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [96]:
a = DDPGAgent(2,2)
s = torch.tensor([0.5,0.2])
a.select_action(s)

array([0.48983067, 0.26474813], dtype=float32)