In [1]:
!pip install pygame
!pip install gym

Defaulting to user installation because normal site-packages is not writeable
Collecting pygame
  Downloading pygame-2.6.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading pygame-2.6.1-cp39-cp39-macosx_11_0_arm64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.6.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m1.1 MB/s

In [None]:
import pygame
import numpy as np
import gym
from gym import spaces
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt

class TankEnv(gym.Env):
    def __init__(self, render_mode=None):
        super(TankEnv, self).__init__()
        
        self.map_width = 800
        self.map_height = 600
        self.tank_size = 20
        self.target_size = 15
        self.render_mode = render_mode
        
        self.max_steps = 500
        self.current_step = 0
        
        if self.render_mode == 'human':
            pygame.init()
            self.screen = pygame.display.set_mode((self.map_width, self.map_height))
            pygame.display.set_caption("Tank Reinforcement Learning")
            self.clock = pygame.time.Clock()
        
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Box(
            low=np.array([0, 0, -180, 0, -180, 0, 0, 0, 0], dtype=np.float32),
            high=np.array([self.map_width, self.map_height, 180, 
                          math.sqrt(self.map_width**2 + self.map_height**2), 
                          180, self.map_width, self.map_height, self.map_width, self.map_height], 
                         dtype=np.float32)
        )
        
        self.obstacles = [
            pygame.Rect(100, 100, 50, 200),
            pygame.Rect(300, 400, 200, 50),
            pygame.Rect(600, 200, 50, 300),
            pygame.Rect(200, 300, 150, 50)
        ]
        
        self.reset()
    
    def reset(self):
        self.tank_pos = np.array([
            random.randint(self.tank_size, self.map_width - self.tank_size),
            random.randint(self.tank_size, self.map_height - self.tank_size)
        ])
        self.tank_angle = random.randint(0, 359)
        
        while True:
            self.target_pos = np.array([
                random.randint(self.target_size, self.map_width - self.target_size),
                random.randint(self.target_size, self.map_height - self.target_size)
            ])
            target_rect = pygame.Rect(
                self.target_pos[0] - self.target_size//2, 
                self.target_pos[1] - self.target_size//2,
                self.target_size, self.target_size
            )
            
            collision = False
            for obstacle in self.obstacles:
                if target_rect.colliderect(obstacle):
                    collision = True
                    break
            
            if not collision and np.linalg.norm(self.tank_pos - self.target_pos) > 200:
                break
        
        self.current_step = 0
        self.bullet_pos = None
        self.bullet_speed = 10
        self.bullet_direction = 0
        
        return self._get_state()
    
    def _get_state(self):
        distance_to_target = np.linalg.norm(self.tank_pos - self.target_pos)
        
        dx = self.target_pos[0] - self.tank_pos[0]
        dy = self.target_pos[1] - self.tank_pos[1]
        angle_to_target = math.degrees(math.atan2(dy, dx)) - self.tank_angle
        angle_to_target = (angle_to_target + 180) % 360 - 180
        
        obstacle_distances = []
        for angle_offset in [0, 180, 90, -90]:
            angle = (self.tank_angle + angle_offset) % 360
            obstacle_distances.append(self._get_distance_to_obstacle(angle))
        
        bullet_x = self.bullet_pos[0] if self.bullet_pos is not None else 0
        bullet_y = self.bullet_pos[1] if self.bullet_pos is not None else 0
        
        return np.array([
            self.tank_pos[0], self.tank_pos[1], self.tank_angle,
            distance_to_target, angle_to_target,
            *obstacle_distances,
            bullet_x, bullet_y
        ], dtype=np.float32)
    
    def _get_distance_to_obstacle(self, angle):
        step = 5
        distance = 0
        rad_angle = math.radians(angle)
        
        while distance < 300:
            distance += step
            x = self.tank_pos[0] + distance * math.cos(rad_angle)
            y = self.tank_pos[1] + distance * math.sin(rad_angle)
            
            if x < 0 or x >= self.map_width or y < 0 or y >= self.map_height:
                return distance
            
            point_rect = pygame.Rect(x-2, y-2, 4, 4)
            for obstacle in self.obstacles:
                if point_rect.colliderect(obstacle):
                    return distance
        
        return distance
    
    def step(self, action):
        reward = -1
        done = False
        info = {}
        
        if action == 0:
            self._move_tank(5)
        elif action == 1:
            self._move_tank(-3)
        elif action == 2:
            self.tank_angle = (self.tank_angle - 5) % 360
        elif action == 3:
            self.tank_angle = (self.tank_angle + 5) % 360
        elif action == 4:
            if self.bullet_pos is None:
                self.bullet_pos = np.array(self.tank_pos)
                self.bullet_direction = self.tank_angle
        
        if self.bullet_pos is not None:
            rad_angle = math.radians(self.bullet_direction)
            self.bullet_pos[0] += self.bullet_speed * math.cos(rad_angle)
            self.bullet_pos[1] += self.bullet_speed * math.sin(rad_angle)
            
            if (self.bullet_pos[0] < 0 or self.bullet_pos[0] >= self.map_width or
                self.bullet_pos[1] < 0 or self.bullet_pos[1] >= self.map_height):
                self.bullet_pos = None
            
            if self.bullet_pos is not None:
                bullet_rect = pygame.Rect(self.bullet_pos[0] - 2, self.bullet_pos[1] - 2, 4, 4)
                target_rect = pygame.Rect(
                    self.target_pos[0] - self.target_size//2, 
                    self.target_pos[1] - self.target_size//2,
                    self.target_size, self.target_size
                )
                
                if bullet_rect.colliderect(target_rect):
                    reward += 100
                    done = True
                    info['result'] = 'target_hit'
                    self.bullet_pos = None
        
        tank_rect = pygame.Rect(
            self.tank_pos[0] - self.tank_size//2, 
            self.tank_pos[1] - self.tank_size//2,
            self.tank_size, self.tank_size
        )
        
        for obstacle in self.obstacles:
            if tank_rect.colliderect(obstacle):
                reward -= 10
                done = True
                info['result'] = 'obstacle_hit'
                break
        
        prev_distance = np.linalg.norm(self.tank_pos - self.target_pos)
        new_distance = np.linalg.norm(self.tank_pos - self.target_pos)
        
        if new_distance < prev_distance:
            reward += 10 * (prev_distance - new_distance) / prev_distance
        else:
            reward -= 5 * (new_distance - prev_distance) / prev_distance
        
        self.current_step += 1
        if self.current_step >= self.max_steps:
            done = True
            info['result'] = 'timeout'
        
        if self.render_mode == 'human':
            self.render()
        
        return self._get_state(), reward, done, info
    
    def _move_tank(self, distance):
        rad_angle = math.radians(self.tank_angle)
        new_pos = np.array([
            self.tank_pos[0] + distance * math.cos(rad_angle),
            self.tank_pos[1] + distance * math.sin(rad_angle)
        ])
        
        if (self.tank_size <= new_pos[0] <= self.map_width - self.tank_size and
            self.tank_size <= new_pos[1] <= self.map_height - self.tank_size):
            
            tank_rect = pygame.Rect(
                new_pos[0] - self.tank_size//2, 
                new_pos[1] - self.tank_size//2,
                self.tank_size, self.tank_size
            )
            
            collision = False
            for obstacle in self.obstacles:
                if tank_rect.colliderect(obstacle):
                    collision = True
                    break
            
            if not collision:
                self.tank_pos = new_pos
    
    def render(self):
        if self.render_mode != 'human':
            return
        
        self.screen.fill((255, 255, 255))
        
        for obstacle in self.obstacles:
            pygame.draw.rect(self.screen, (100, 100, 100), obstacle)
        
        pygame.draw.circle(
            self.screen, (255, 0, 0),
            (int(self.target_pos[0]), int(self.target_pos[1])),
            self.target_size
        )
        
        tank_center = (int(self.tank_pos[0]), int(self.tank_pos[1]))
        tank_rect = pygame.Rect(
            tank_center[0] - self.tank_size//2,
            tank_center[1] - self.tank_size//2,
            self.tank_size, self.tank_size
        )
        pygame.draw.rect(self.screen, (0, 0, 255), tank_rect)
        
        end_pos = (
            tank_center[0] + self.tank_size * math.cos(math.radians(self.tank_angle)),
            tank_center[1] + self.tank_size * math.sin(math.radians(self.tank_angle))
        )
        pygame.draw.line(self.screen, (0, 255, 0), tank_center, end_pos, 3)
        
        if self.bullet_pos is not None:
            pygame.draw.circle(
                self.screen, (255, 165, 0),
                (int(self.bullet_pos[0]), int(self.bullet_pos[1])),
                3
            )
        
        pygame.display.flip()
        self.clock.tick(30)
    
    def close(self):
        if self.render_mode == 'human':
            pygame.quit()

class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=10000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state)
        act_values = self.model(state)
        return torch.argmax(act_values).item()
    
    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        
        minibatch = random.sample(self.memory, batch_size)
        states = torch.FloatTensor(np.array([t[0] for t in minibatch]))
        actions = torch.LongTensor(np.array([t[1] for t in minibatch]))
        rewards = torch.FloatTensor(np.array([t[2] for t in minibatch]))
        next_states = torch.FloatTensor(np.array([t[3] for t in minibatch]))
        dones = torch.FloatTensor(np.array([t[4] for t in minibatch]))
        
        current_q = self.model(states).gather(1, actions.unsqueeze(1))
        next_q = self.model(next_states).max(1)[0].detach()
        target = rewards + (1 - dones) * self.gamma * next_q
        
        loss = F.mse_loss(current_q.squeeze(), target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def save(self, filename):
        torch.save(self.model.state_dict(), filename)
    
    def load(self, filename):
        self.model.load_state_dict(torch.load(filename))
        self.model.eval()

def train_agent(env, agent, episodes=1000, batch_size=32, render_every=100):
    rewards_history = []
    
    for e in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        
        render = (e % render_every == 0) and (env.render_mode == 'human')
        
        while not done:
            if render:
                env.render()
            
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            
            if done:
                rewards_history.append(total_reward)
                print(f"Episode: {e+1}/{episodes}, Total reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")
                
            agent.replay(batch_size)
    
    return rewards_history

def test_agent(env, agent, episodes=5):
    total_rewards = []
    
    for e in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        
        while not done:
            env.render()
            action = agent.act(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            
            if done:
                print(f"Test Episode {e+1}, Total reward: {total_reward}")
        
        total_rewards.append(total_reward)
    
    return total_rewards

def main():
    env = TankEnv(render_mode='human')
    
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    
    try:
        print("Starting training...")
        rewards_history = train_agent(env, agent, episodes=500, render_every=50)
        
        plt.figure(figsize=(10, 5))
        plt.plot(rewards_history)
        plt.title('Rewards per Episode During Training')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.grid(True)
        plt.show()
        
        print("Testing trained agent...")
        test_rewards = test_agent(env, agent, episodes=5)
        print(f"Average test reward: {np.mean(test_rewards):.2f}")
        
    finally:
        env.close()

main()


pygame 2.6.1 (SDL 2.28.4, Python 3.9.6)
Hello from the pygame community. https://www.pygame.org/contribute.html
Starting training...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x11 and 9x64)