In [70]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

import random
from einops.layers.torch import Rearrange
from einops import rearrange

from typing import Any, Dict, Tuple, Optional
from game_mechanics import (
    ChooseMoveCheckpoint,
    ShooterEnv,
    checkpoint_model,
    choose_move_randomly,
    human_player,
    load_network,
    play_shooter,
    save_network,
)
from tqdm.notebook import tqdm

from functools import partial
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from copy import deepcopy
from functools import partial

from utils import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
TEAM_NAME = "Hristo"  # <---- Enter your team name here!
assert TEAM_NAME != "Team Name", "Please change your TEAM_NAME!"

In [80]:
device = 'cuda:0'

In [71]:
def choose_move(state, neural_network: nn.Module) -> int:
    probs = neural_network(state)
    probs = probs.cpu().detach().numpy()
    move = np.random.choice(range(6), p=probs)
    return int(move)

In [72]:
policy = nn.Sequential(
    nn.Linear(24, 100),
    nn.LeakyReLU(),
    nn.Linear(100, 100),
    nn.LeakyReLU(),
    nn.Linear(100, 6),
    nn.Softmax(dim=-1)
)

V = nn.Sequential(
    nn.Linear(24, 100),
    nn.LeakyReLU(),
    nn.Linear(100, 1)
)



gamma = 0.99
lamda = 0.99
erm = EpisodeReplayMemory(gamma, lamda)
optimizer_policy = torch.optim.Adam(policy.parameters(), lr=0.001)
optimizer_value = torch.optim.Adam(V.parameters(), lr=0.001)

episodes_per_stage = 2
batch_size = 10  
n_stages = 1
gradient_steps = 5
env = ShooterEnv(opponent_choose_move=choose_move_randomly)

In [81]:
for stage in range(n_stages):
    opponent = deepcopy(policy)
    env = ShooterEnv(opponent_choose_move=partial(choose_move, neural_network = opponent), game_speed_multiplier=100_000)
    for episode in range(episodes_per_stage):
        old_observation, reward, done, info = env.reset()
        old_value = V(old_observation)
        while not done:
            probs = policy(old_observation)
            chosen_move = np.random.choice(range(0,6), p=probs.detach().numpy())
            observation, reward, done, info = env.step(int(chosen_move))
            value = V(observation)
            
            erm.append({
                'old_observation': [old_observation],
                'observation': [observation],
                'reward': reward,
                'done': done,
                'chosen_move': chosen_move,
                'value': value.item(),
                'old_value': old_value.item()
            })
            old_value = value
            
        if len(erm) >= batch_size:
            data = erm.sample_with_remove(batch_size)
            states = data['old_observation'].to(device)
            old_probs = policy(states)
            old_values = V(states)
            
            # value function
            loss_v = F.smooth_l1_loss(old_values[:,0], data['gae'][0])
            optimizer_value.zero_grad()
            loss_v.backward()
            optimizer_value.step()
            
            
            # policy
            epsilon = 0.01
            old_probs = old_probs[range(batch_size), data['chosen_move'].long()].detach()
            for step in range(gradient_steps):
                print('========')
                print('making a grad step...')
                new_probs = policy(data['old_observation'])#[0]
                new_probs = new_probs[range(batch_size), data['chosen_move'].long()]
                print(f'unclipped prob_ratio: {new_probs / old_probs}')
                prob_ratio = new_probs / old_probs
                prob_ratio = torch.clamp(prob_ratio, 1-epsilon, 1+epsilon)
                print(f'after clipping: {prob_ratio}')
                loss_policy = (- prob_ratio * data['gae']).sum()
                print(f'loss: {loss_policy.item()}')
                optimizer_policy.zero_grad()
                loss_policy.backward()
                optimizer_policy.step()
                old_probs = new_probs.detach()

AssertionError: Torch not compiled with CUDA enabled

In [69]:
len(erm)

1214

In [67]:
new_probs

tensor([0.1775, 0.1418, 0.1555, 0.1769, 0.1842, 0.1641],
       grad_fn=<SelectBackward>)

In [18]:
np.atan2(state[2], state[3])

AttributeError: module 'numpy' has no attribute 'atan2'

In [None]:
state.move()

In [4]:
state

tensor([ 8.0000e-01,  0.0000e+00, -1.0000e+00, -1.8370e-16, -8.0000e-01,
         0.0000e+00, -1.0000e+00, -1.8370e-16, -1.0000e+00, -1.0000e+00,
         0.0000e+00,  1.0000e+00, -1.0000e+00, -1.0000e+00,  0.0000e+00,
         1.0000e+00, -1.0000e+00, -1.0000e+00,  0.0000e+00,  1.0000e+00,
        -1.0000e+00, -1.0000e+00,  0.0000e+00,  1.0000e+00])

In [17]:
state

tensor([ 8.0000e-01,  0.0000e+00, -1.0000e+00, -1.8370e-16, -8.0000e-01,
         0.0000e+00, -1.0000e+00, -1.8370e-16, -1.0000e+00, -1.0000e+00,
         0.0000e+00,  1.0000e+00, -1.0000e+00, -1.0000e+00,  0.0000e+00,
         1.0000e+00, -1.0000e+00, -1.0000e+00,  0.0000e+00,  1.0000e+00,
        -1.0000e+00, -1.0000e+00,  0.0000e+00,  1.0000e+00])

In [11]:
torch.round?

In [2]:
play_shooter(
    your_choose_move=human_player,
    opponent_choose_move=choose_move_randomly,
    game_speed_multiplier=1,
    render=True,
    include_barriers=False,
    half_game_size=False,
)

1.0