In [None]:
from mss import mss
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.backends.cudnn as cudnn
from tensorflow.keras.utils import to_categorical
from PIL import Image
#import pyautogui
import keyboard
import mouse
import pytesseract
from time import sleep
import winsound
from re import sub
import pickle
import copy

In [None]:
# Run every tensor/model on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Turn on cuDNN's benchmark mode (auto-selection of convolution algorithms).
cudnn.benchmark = True

In [None]:
class Dataset(torch.utils.data.Dataset):
    '''
    Screen grabber, command decoder and Study-Mode data container for Hakisa.
    Based on NLP/Classic RL approach.

        command_type = list of command types (rightclick, click, keyboard).
        actions1 = list of actions1 (up, down, press, X_coordinate). X_coordinate is for mouse actions.
        actions2 = list of actions2 (keyboard_key, Y_coordinate). The keyboard key must be lowered.
        top, left, width, height = denotes the capture space for the frame capture.
        resize = optional size tuple used to shrink captured frames and consume less memory.
            It is handed unchanged to PIL's Image.resize, which reads it as (Width, Height).
    '''

    # Sound played by record_gameplay() when the capture starts and when it ends.
    _NOTIFICATION_SOUND = 'D:/Python/Audio/English/chiara_hacking_1_en.wav'

    def __init__(
        self,
        command_type=None,
        actions1=None,
        actions2=None,
        top=0,
        left=0,
        width=1920,
        height=1080,
        resize=None
    ):

        # Window resolutions for the screen grabber
        self.top = top
        self.left = left
        self.width = width
        self.height = height

        self.resize = resize # For reducing the images. Passed as-is to PIL's Image.resize.

        self.command_type = command_type
        self.actions1 = actions1
        self.actions2 = actions2

        # For Study Mode. Filled by create_data().
        self.data = None
        self.encoded_command_type = None
        self.encoded_actions1 = None
        self.encoded_actions2 = None


    # Pytorch's Dataset functions will only be used in Study mode
    def __getitem__(self, idx):
        '''
        Returns (frame, one-hot command type, one-hot action1, one-hot action2) for sample idx.
        Only valid after create_data() has been called.
        '''

        frames = self.data[idx]
        command_type = self.encoded_command_type[idx]
        action1 = self.encoded_actions1[idx]
        action2 = self.encoded_actions2[idx]

        return frames, command_type, action1, action2


    def __len__(self):
        '''Number of stored Study-Mode samples. Only valid after create_data() has been called.'''

        return len(self.data)


    @staticmethod
    def _frame_to_tensor(frame):
        '''
        Converts an image/array laid out as (Height, Width, Channels) into a float32 tensor
        of shape (1, Channels, Height, Width).

        The axes are moved with permute(). A plain view()/reshape() to the target shape would
        keep the memory order and therefore mix pixels of different channels and rows together.
        '''

        pixels = torch.from_numpy(np.array(frame, dtype=np.float32)) # (Height, Width, Channels)

        return pixels.permute(2, 0, 1).unsqueeze(0).contiguous()


    def _grab_frame(self):
        '''
        Captures the configured screen region and returns it as a tensor of shape
        (1, Channels, Height, Width) placed on `device`. Pixel values are left unscaled.

        Keep the layout conversion used here identical to the one used to build the Study-Mode frames.
        '''
        # Unfortunately, this whole operation takes about 0.6 seconds, so we'll probably have to deal with a single frame each 1~3 seconds.
        with mss() as sct:
            shot = sct.grab(monitor={"top": self.top, "left": self.left, "width": self.width, "height": self.height})
            frame = Image.frombytes("RGB", shot.size, shot.bgra, 'raw', 'BGRX')

        if self.resize:
            frame = frame.resize(self.resize)

        return self._frame_to_tensor(frame).to(device) # (Batch, Channels, Height, Width)


    def get_command(self, command_type_idx, action1_idx, action2_idx):
        '''
        Hakisa output for true commands = (command_type, action1, action2)

        Translates three integer indices into the matching entries of the command lists.
        Remember to use int(command_idx.detach().cpu().item()). before passing the inputs.
        '''

        command_type = self.command_type[command_type_idx]
        action1 = self.actions1[action1_idx]
        action2 = self.actions2[action2_idx]

        return (command_type, action1, action2)

    def get_consequences(self, top, left, width, height, togray=False, threshold=False, thresh_gauss=171, thresh_C=13, tesseract_config='--psm 8'):
        '''
        Used after Hakisa performed an input, in order to get its consequences(ex: score change, bombs, kills, deaths...).
        Returns a string according to Tesseract's OCR, stripped of every character outside A-Z, a-z, 0-9, '/' and '.'.

        With a reward model, this function might be used to generate an input for the reward model.

            top, left, width, height = screen region to read.
            togray = when True, the capture is converted to 8-bit grayscale before OCR.
            threshold = when True (and togray is True), OpenCV's Gaussian adaptive threshold is applied.
            thresh_gauss, thresh_C = block size and constant handed to cv2.adaptiveThreshold.
            tesseract_config = configuration string handed to pytesseract.
        '''

        with mss() as sct:
            shot = sct.grab(monitor={"top": top, "left": left, "width": width, "height": height})

            consequence = Image.frombytes("RGB", shot.size, shot.bgra, 'raw', 'BGRX')

        if togray is True:

            # "L" is PIL's grayscale mode. ("P" is a colour palette: its pixel values are palette indices, not intensities.)
            consequence = consequence.convert("L") # Sometimes, simply converting to grayscale is enough

            if threshold is True:
                # OpenCV is only needed for this optional step, so it is imported on demand.
                from cv2 import adaptiveThreshold, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY

                binarized = adaptiveThreshold(np.array(consequence), 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, thresh_gauss, thresh_C)
                consequence = Image.fromarray(binarized)

        text = pytesseract.image_to_string(consequence, config=tesseract_config)

        # OCR adds some strange characters(even with the whitelist function). Let's remove them.
        # Attention: 0, 1 and 8 can be seen as O, l and B.
        return sub('[^A-Za-z0-9/.]', '', text)

    def record_gameplay(self, number_of_screenshots, screenshot_delay, grayscale=False, resize=False, path=None):
        '''
        Saves `number_of_screenshots` captures of the configured screen region as "<path>/<index>.png",
        waiting `screenshot_delay` seconds after each one. A sound is played when capture starts and ends.

            grayscale = when truthy, frames are converted to grayscale before saving.
            resize = when truthy, frames are resized to self.resize before saving.
            path = existing directory that receives the screenshots.

        Raises ValueError when `path` is missing, or when `resize` is requested but the dataset has no resize size.
        '''

        # Resizing and grayscaling isn't really necessary here, but can save you some time later.
        # Both saving you from writing more code and from making your hardware having to process more and more data at once.

        # Fail before the countdown instead of in the middle of a play session.
        if path is None:
            raise ValueError("record_gameplay needs `path`, the directory where the screenshots are saved.")

        if resize and not self.resize:
            raise ValueError("resize was requested, but this Dataset was created without a `resize` size.")

        print(f"Ok. Screenshot capture will begin in 5 seconds")

        sleep(5)

        winsound.PlaySound(self._NOTIFICATION_SOUND, winsound.SND_FILENAME) # Just to know if everything's ok

        region = {"top": self.top, "left": self.left, "width": self.width, "height": self.height}

        # A single grabber is reused for the whole session instead of creating one per screenshot.
        with mss() as sct:

            for i in range(number_of_screenshots):

                shot = sct.grab(monitor=region)
                frame = Image.frombytes("RGB", shot.size, shot.bgra, 'raw', 'BGRX')

                if grayscale:
                    frame = frame.convert('L')

                if resize:
                    frame = frame.resize(self.resize)

                frame.save(f"{path}/{i}.png")

                sleep(screenshot_delay)

        print("Screenshot capture finished!")

        winsound.PlaySound(self._NOTIFICATION_SOUND, winsound.SND_FILENAME)

    def create_data(self, data, commands):
        '''
        If you'd like, you can also generate a dataset for dataloader for Study Mode.

        data = tensor of size (N_samples, Channels, Height, Width) containing game frames.
            The range of pixel values must be the same range
            that will be used during Reinforcement Learning, that is, if you use scaled images here, you must also use the same scaling during RL.
            Unscaled data in Regression tasks are prone to exploding gradients. However, since we're using HuberLoss and clippings, this won't be a problem.
            There wasn't any problem during tests with the ClassicRL model.

            Remarks: PPO Atari version scales the input frames from [0, 255] to [0, 1]. It's not clear whether Ruo-Ze et al used scaled data or not.

        commands = a list of tuples with length (N_samples), with each sample being a tuple composed of (command_type, action1, action2), where:

            command_type: the command type index, within range [0, len(command_type) - 1].
            action1: the action1 index, within range [0, len(actions1) - 1].
            action2: the action2 index, within range [0, len(actions2) - 1].

            Providing a reward is optional and up to you during the Study Mode. The main focus on Study Mode is to train the Policy.

        Each index is one-hot encoded and the results are stored, on `device`, in
        self.encoded_command_type, self.encoded_actions1 and self.encoded_actions2,
        each with shape (N_samples, number_of_classes).
        '''

        # HierNet uses data in sequences, but this might be too costly and I don't know how much this would improve performance.
        # Let's just stick to normal TD-Learning.

        def encode(index, n_classes):
            # One-hot row of shape (1, n_classes), ready to be concatenated along the sample axis.
            return torch.from_numpy(to_categorical(index, n_classes)).unsqueeze(0).to(device)

        self.data = data

        encoded_command_type = []
        encoded_actions1 = []
        encoded_actions2 = []

        for sample in commands:

            encoded_command_type.append(encode(sample[0], len(self.command_type)))
            encoded_actions1.append(encode(sample[1], len(self.actions1)))
            encoded_actions2.append(encode(sample[2], len(self.actions2)))

        self.encoded_command_type = torch.cat(encoded_command_type, 0)
        self.encoded_actions1 = torch.cat(encoded_actions1, 0)
        self.encoded_actions2 = torch.cat(encoded_actions2, 0)

        print("All done! Use Hakisa in the Study Mode to properly train her Policy Network(mapping states to certain actions)")

In [None]:
# Jigoku Kisetsukan: keyboard-only action space.

command_types = ['key']
actions1 = ['Down', 'Up']
actions2 = ['up', 'down', 'left', 'right', 'z', 'x', 'shift']

# Frames are shrunk to 200x200 to save memory.
dataset = Dataset(command_type=command_types, actions1=actions1, actions2=actions2, resize=(200, 200))

In [None]:
def preprocess_Jigoku(score):
    '''
    Turns the OCR string read from Jigoku Kisetsukan: Sense of the Seasons into a float score.

    Letters that Tesseract commonly confuses with digits are mapped back to digits and dots are dropped.
    When the result still isn't a valid number, 1.0 is returned.

    Not recommended: Prefer training your own OCR model specifically for this game.
    '''

    # Single-character substitutions; None removes the character.
    ocr_fixes = str.maketrans({
        'S': '5', 's': '8', 'e': '2', 'O': '0', 'B': '8',
        'o': '4', 'b': '4', 'I': '1', 'l': '1', '.': None,
    })

    cleaned = score.translate(ocr_fixes)

    try:
        return float(cleaned)

    except ValueError:
        return 1.0

In [None]:
# Bullet Heaven: mouse-only action space, one class per screen coordinate.

command_types = ['move', 'click', 'rightclick']

# X coordinates. Attention: Discarding (0,0) might cause trouble in tf.to_categorical()
actions1 = list(range(1, 1919))

# Y coordinates. In this case, use range(0, 1079) and do the appropriate modification in SL function
actions2 = list(range(1, 1079))

dataset = Dataset(command_type=command_types, actions1=actions1, actions2=actions2, resize=(200, 200))

In [None]:
def preprocess_BH2(score):
    '''
    Turns the OCR string read from Bullet Heaven 2 into a float score.

    Letters that Tesseract commonly confuses with digits are mapped back to digits, then everything
    that is not a digit is removed. When nothing usable is left, 1.0 is returned.

    Also not recommended: Prefer training your own OCR model specifically for this game.
    '''

    # Single-character substitutions; None removes the character.
    ocr_fixes = str.maketrans({
        'S': '5', 's': '8', 'e': '2', 'O': '0', 'B': '8', 'o': '0', '.': None,
    })

    digits_only = sub('[^0-9]', '', score.translate(ocr_fixes))

    try:
        return float(digits_only)

    except ValueError:
        return 1.0

In [None]:
# If you'd like to optimize memory, use the effective X and Y action space, that is, the screen area where the actions really take place.

# Load a sample screenshot from disk and display it.
# NOTE(review): absolute local path; this cell fails on machines without that file.
example = plt.imread("D:/SerpentAI/datasets/current/bullet_heaven_reduced.png")
plt.imshow(example)
plt.show()

In [None]:
class Hakisa(torch.nn.Module):

    '''
    Hakisa itself, properly optimized to use probability distribution of actions instead of vectors.

    In order to avoid pollution, Hakisa will have a single mode.

    She will receive as inputs game frames of shape (Batch, 3, 200, 200) and the previous cumulative
    reward of shape (Batch, 1) and, in the end, will generate 3 outputs:

        output 1: a tuple of probability distributions (possible_command_types, possible_actions1, possible_actions2),
        each one with sizes (Batch, len(command)). When there is a single command type,
        possible_command_types is a (Batch, 1) tensor of zeros.

        output 2: a tuple (true_command_type, true_action1, true_action2) holding, for each head, the
        probability assigned to the option that was selected to be executed. Each one has size (Batch, 1).

        output 3: the predicted reward, size (Batch, 1):

            predicted_reward = reward(command_type head) + reward(action1 head) + reward(action2 head)

            Each reward head receives the bottleneck features, the previous cumulative reward and the
            probability distribution of its policy head.
            Note that summing three independent heads is not mathematically the same as predicting the
            reward for every possible (command_type, action1, action2) combination.

    Selection is epsilon-greedy: with probability epsilon the option is sampled from the distribution,
    otherwise the most probable option is taken.
    '''

    def __init__(self, command_types, actions1, actions2, epsilon):

        super(Hakisa, self).__init__()

        self.command_types = len(command_types)
        self.actions1 = len(actions1)
        self.actions2 = len(actions2)
        self.epsilon = epsilon # Used to determine whether to explore or simply select the best action.
        # This method is more used in Q-Learning, but can be used in Actor-Critic as well.

        # Sticking to the traditional approach first. We might use Attention Layers if those are indeed effective.

        self.conv1 = torch.nn.Conv2d(3, 100, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm1 = torch.nn.BatchNorm2d(100)
        self.conv2 = torch.nn.Conv2d(100, 100, kernel_size=3, stride=1, padding=1, bias=False) # 200x200
        self.batchnorm2 = torch.nn.BatchNorm2d(100)
        # Add pool 2x2 ---> 100x100
        self.conv3 = torch.nn.Conv2d(100, 200, kernel_size=3, stride=1, padding=1, bias=False) # 100x100
        self.batchnorm3 = torch.nn.BatchNorm2d(200)
        self.conv4 = torch.nn.Conv2d(200, 200, kernel_size=3, stride=1, padding=1, bias=False) # 100x100
        self.batchnorm4 = torch.nn.BatchNorm2d(200)
        # Add pool 2x2 ---> 50x50
        self.conv5 = torch.nn.Conv2d(200, 400, kernel_size=3, stride=1, padding=1, bias=False) # 50x50
        self.batchnorm5 = torch.nn.BatchNorm2d(400)
        self.conv6 = torch.nn.Conv2d(400, 400, kernel_size=3, stride=1, padding=1, bias=False) # 50x50
        self.batchnorm6 = torch.nn.BatchNorm2d(400)
        # Add pool 2x2 ---> 25x25
        self.conv7 = torch.nn.Conv2d(400, 800, kernel_size=4, stride=1, bias=False) # 22x22
        self.batchnorm7 = torch.nn.BatchNorm2d(800)
        self.conv8 = torch.nn.Conv2d(800, 800, kernel_size=3, stride=1, bias=False) # 20x20
        self.batchnorm8 = torch.nn.BatchNorm2d(800)
        # Add pool 2x2 ---> 10x10
        self.conv9 = torch.nn.Conv2d(800, 1000, kernel_size=3, stride=1, padding=1, bias=False) # 10x10
        self.batchnorm9 = torch.nn.BatchNorm2d(1000)
        self.conv10 = torch.nn.Conv2d(1000, 1000, kernel_size=3, stride=1, padding=1, bias=False) # 10x10
        self.batchnorm10 = torch.nn.BatchNorm2d(1000)
        # Add pool 2x2 ---> 5x5

        self.neuron_in = torch.nn.Linear(1000*5*5, 100, bias=False) # Bottleneck layer.

        if self.command_types > 1:

            self.neuron_command_type = torch.nn.Linear(100, self.command_types, bias=False)

            # Considering the command type that has been predicted, what should be action1 and action2?

            self.neuron_action1 = torch.nn.Linear(100+1, self.actions1, bias=False)
            self.neuron_action2 = torch.nn.Linear(100+1, self.actions2, bias=False)

        else: # The command type index is always 0

            self.neuron_action1 = torch.nn.Linear(100, self.actions1, bias=False)
            self.neuron_action2 = torch.nn.Linear(100, self.actions2, bias=False)

        # Reward heads: bottleneck features (100) + previous cumulative reward (1) + head distribution.
        self.pred_reward_command_type = torch.nn.Linear(100+1+self.command_types, 1, bias=False)
        self.pred_reward_action1 = torch.nn.Linear(100+1+self.actions1, 1, bias=False)
        self.pred_reward_action2 = torch.nn.Linear(100+1+self.actions2, 1, bias=False)

        self.pool2x2 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        self.LRelu = torch.nn.LeakyReLU(0.25)
        self.softmax = torch.nn.Softmax(-1)

    @staticmethod
    def _select_action(probabilities, epsilon):
        '''
        Epsilon-greedy selection over a (Batch, n_options) probability tensor.

        With probability `epsilon` one option per row is sampled from the distribution,
        otherwise the most probable option of each row is taken.

        Returns (one_hot, selected_probability):
            one_hot: (Batch, n_options) mask with a 1 at the selected option of each row.
            selected_probability: (Batch, 1), the probability of the selected option.
                It is computed as sum(probabilities * one_hot) so that it stays attached to the graph
                (indexing through argmax alone is not differentiable).
        '''

        if torch.rand((1,)) < epsilon:
            index = torch.multinomial(probabilities, 1, replacement=True) # (Batch, 1)

        else:
            index = probabilities.argmax(-1, keepdim=True) # (Batch, 1)

        # scatter_ writes one 1 per row, so this also holds for Batch > 1.
        one_hot = torch.zeros_like(probabilities).scatter_(-1, index, 1.)
        selected_probability = torch.sum(probabilities * one_hot, dim=-1, keepdim=True)

        return one_hot, selected_probability

    def forward(self, input_frame, previous_cumulative_reward):
        '''
        input_frame: (Batch, 3, 200, 200) frames.
        previous_cumulative_reward: (Batch, 1) tensor.

        Returns (possible_actions, true_action, predicted_reward). See the class docstring.
        '''

        # Five stages of conv -> batchnorm -> conv -> batchnorm -> LeakyReLU -> 2x2 max-pool.
        # Every convolution is followed by its own, same-numbered batchnorm.
        stages = (
            (self.conv1, self.batchnorm1, self.conv2, self.batchnorm2),
            (self.conv3, self.batchnorm3, self.conv4, self.batchnorm4),
            (self.conv5, self.batchnorm5, self.conv6, self.batchnorm6),
            (self.conv7, self.batchnorm7, self.conv8, self.batchnorm8),
            (self.conv9, self.batchnorm9, self.conv10, self.batchnorm10),
        )

        x = input_frame

        for conv_a, norm_a, conv_b, norm_b in stages:
            x = norm_a(conv_a(x))
            x = norm_b(conv_b(x))
            x = self.pool2x2(self.LRelu(x))

        x = x.view(x.size(0), -1)

        x = self.neuron_in(x) # (Batch, 100). Since we're capturing a single frame at time, our Batch = 1.
        # If we were running more game instances in parallel, our Batch would be equal to the number of game instances.

        if self.command_types > 1:

            possible_command_types = self.softmax(self.neuron_command_type(x)) # (Batch, n_command_types)

            # Selecting the command type first, since it conditions the actions
            _, true_command_type = self._select_action(possible_command_types, self.epsilon)

            y = torch.cat((x, true_command_type), -1) # (Batch, 100+1)
            possible_actions1 = self.softmax(self.neuron_action1(y))
            possible_actions2 = self.softmax(self.neuron_action2(y))

        else:
            # Single command type: its index is always 0 and there is no command-type head.
            # A zero placeholder keeps the shapes expected by the command-type reward head.
            possible_command_types = x.new_zeros((x.size(0), 1))
            true_command_type = torch.zeros_like(possible_command_types)
            possible_actions1 = self.softmax(self.neuron_action1(x))
            possible_actions2 = self.softmax(self.neuron_action2(x))

        # Predicted reward of each head.
        # The input of a reward head (features, previous reward, head distribution) does not depend on
        # which option is evaluated, so the head is applied once per sample instead of once per option.
        # NOTE(review): if a distinct estimate per option is wanted, the head should receive a one-hot
        # of the evaluated option instead of the whole distribution.
        predicted_reward_ct = self.pred_reward_command_type(
            torch.cat((x, previous_cumulative_reward, possible_command_types), -1)
        ) # (Batch, 1)
        predicted_reward_a1 = self.pred_reward_action1(
            torch.cat((x, previous_cumulative_reward, possible_actions1), -1)
        )
        predicted_reward_a2 = self.pred_reward_action2(
            torch.cat((x, previous_cumulative_reward, possible_actions2), -1)
        )

        # Now, selecting the actions1 and actions2
        _, true_action1 = self._select_action(possible_actions1, self.epsilon)
        _, true_action2 = self._select_action(possible_actions2, self.epsilon)

        predicted_reward = predicted_reward_ct + predicted_reward_a1 + predicted_reward_a2

        possible_actions = (possible_command_types, possible_actions1, possible_actions2)

        true_action = (true_command_type, true_action1, true_action2)

        return possible_actions, true_action, predicted_reward

    def execute_command(self, command):
        '''
        Command must be a tuple(command_type, action1, action2), where:

            command_type: key(keyboard) or move, rightclick, click(mouse)
            action1: Up, Down, press(keyboard), X coordinate(mouse) or None(no mouse movement)
            action2: 'a', 'z', 'shift'...(keyboard), Y coordinate(mouse) or None(no mouse movement)

        Make sure all key actions(action2) are lowered.

        Have in mind that Hakisa might output command_type 'key' and action1 that is equivalent to a mouse action.
        In this case, the command is ignored.

        Raises ValueError when command_type is none of the known types.
        '''

        command_type, action1, action2 = command[0], command[1], command[2]

        if "key" in command_type:

            # Best effort: a keyboard command paired with a mouse action (or an unknown key) is ignored.
            # Only Exception is caught, so Ctrl+C still interrupts the agent.
            try:

                if "Up" in action1:
                    keyboard.release(action2)

                elif "Down" in action1:
                    keyboard.press(action2)

                elif "press" in action1:
                    keyboard.send(action2) # Some games won't work with pyautogui.press(), so use keyboard module, since we'll import it for Play Mode.

            except Exception:
                pass

        elif "move" in command_type:

            try:
                mouse.move(action1, action2, duration=0.1)

            except Exception:
                pass # If Hakisa predict a mouse command, but outputs a keyboard action, she won't do anything.

        elif "rightclick" in command_type: # Must be tested before "click", which is a substring of it.

            try:
                mouse.move(action1, action2, duration=0.1)
                mouse.right_click()

            except Exception:
                pass

        elif "click" in command_type:

            try:
                mouse.move(action1, action2, duration=0.1)
                mouse.click() # Same case as press. Use mouse module.

            except Exception:
                pass

        else:

            raise ValueError(f"Unknown command type: {command_type!r}") # It was probably you who made a mistake.

In [None]:
# Build the agent for the currently active action space and move it to `device`.
# epsilon=0.5: each selection is sampled from the policy about half of the time and greedy otherwise.
hakisa = Hakisa(command_types, actions1, actions2, epsilon=0.5).to(device)

In [None]:
# Initializing Hakisa weights: https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/ - Item 2
#
# Ruo-Ze Liu didn't use this. My experiments indicate that this may actually sabotage the model rather
# than helping it. It makes the model more prone to vanishing gradients. In SL, the loss gets more resilient.
#
# Gains: sqrt(2) for the bottleneck, 0.01 for the policy heads, 1 for the reward heads.
torch.nn.init.orthogonal_(hakisa.neuron_in.weight, float(np.sqrt(2)))

# The command-type head only exists when there is more than one command type.
if hasattr(hakisa, "neuron_command_type"):
    torch.nn.init.orthogonal_(hakisa.neuron_command_type.weight, 0.01)

torch.nn.init.orthogonal_(hakisa.neuron_action1.weight, 0.01)
torch.nn.init.orthogonal_(hakisa.neuron_action2.weight, 0.01)
torch.nn.init.orthogonal_(hakisa.pred_reward_command_type.weight, 1)
torch.nn.init.orthogonal_(hakisa.pred_reward_action1.weight, 1)
torch.nn.init.orthogonal_(hakisa.pred_reward_action2.weight, 1)

In [None]:
# Preparing data for Study Phase
# To label your data, consider Supervised Learning + Self-Learning.
# For really big datasets, consider using Google Colabs.
# Its RAM supports around 42,000 200x200 images (personal experience with Cocogoat dataset)

import os

GAMEPLAY_DIR = "D:/Python/Projects/Hakisa/Hakisa/BH_gameplay"
N_TEST_SAMPLES = 10 # 10 samples for testing

# Collect every "<number>.png" together with the path it was actually found at, so that files
# in sub-directories are opened from the right place and unrelated files are skipped.
numbered_frames = []

for directory, _, files in os.walk(GAMEPLAY_DIR):

    for file in files:

        stem, extension = os.path.splitext(file)

        if extension.lower() == ".png" and stem.isdecimal():
            numbered_frames.append((int(stem), os.path.join(directory, file)))

if not numbered_frames:
    raise FileNotFoundError(f"No numbered .png screenshots found under {GAMEPLAY_DIR}")

# Sorting by the integer index gives capture order. Sorting the names as strings would compare them
# character by character, which puts "1000" before "2".
numbered_frames.sort()

images_by_order = [index for index, _ in numbered_frames]

images_data = []

for _, frame_path in numbered_frames[:N_TEST_SAMPLES]:

    # The context manager closes the file even if decoding fails.
    with Image.open(frame_path) as image:
        # Force 3 channels (the network's first convolution takes 3) and shrink to 200x200.
        resized = image.convert("RGB").resize((200, 200))
        array = np.array(resized, dtype=np.float32)

    array = array/255 # Note that the data must be within [0, 1] for matplotlib.
    images_data.append(array)

images_data = np.stack(images_data, 0) # (N_samples, Height, Width, Channels)

In [None]:
# (N_samples, Height, Width, Channels) array ---> (N_samples, Channels, Height, Width) tensor.
# The axes have to be moved with permute(): view() only reinterprets the same memory with another shape,
# which would mix pixels from different channels and rows.
# Keep this conversion identical to the one applied to the frames captured at play time.
images_data = torch.from_numpy(images_data)
images_data = images_data.permute(0, 3, 1, 2).contiguous()
print(images_data.size())

In [None]:
# Load the recorded commands, a list of (command_type, action1, action2) index tuples.
# SECURITY: pickle.load can execute arbitrary code. Only open pickle files you created yourself.
# The `with` block already closes the file, no explicit close() is needed.
with open("D:/Python/Projects/Hakisa/Preprocessing/commands_05000.pkl", 'rb') as f:
    labels = pickle.load(f)

# Manually add the label of the third sample.
# NOTE(review): this mutates `labels` in place, so running this cell twice inserts the entry twice.
labels.insert(2, (0, 860, 550))

In [None]:
# Attach the frames and their (command_type, action1, action2) labels to the dataset for Study Mode.
# NOTE(review): the slice keeps at most 5000 frames; presumably `labels` must have exactly as many
# entries as frames are kept — confirm before training.
dataset.create_data(data=images_data[:5000], commands=labels)

In [None]:
# Study Phase, or Supervised Learning Phase. What matters here is making Hakisa correlate states to commands

'''
"In SL training, we found a learning rate of 1e-4 and 10 training epochs achieve the
best result. The best model achieves a 0.15 win rate against the level-1 built-in AI. Note
that though this result is not as good as that we acquire in the HRL method, the training
here faces 564 actions, thus is much difficult. The 1e-4 learning rate is also selected by
experiments and is different from the default 1e-3 in the AlphaStar pseudocodes. We find
that training more than 10 epochs will easily fall in overfitting, making the agent can't do
any meaningful things." - Liu, Ruo-Ze et al. On Efficient Reinforcement Learning for Full-length Game of StarCraft II
'''

# Since our batch size is 1, beware of vanishing gradients and overfitting.

# One sample per step, reshuffled every epoch.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

# Using a lr lower than 1e-4 mitigates vanishing gradients. Trick used in Generative Models.
optimizer = torch.optim.Adam(hakisa.parameters(), lr=1e-6, eps=1e-8)
# TO CONSIDER: Using weight decay to prevent overfitting.

# Training history. NOTE(review): nothing in the visible training cell appends to `losses`.
losses = []
action_grads = []
# NOTE(review): CrossEntropyLoss applies log-softmax to its input by itself, while Hakisa.forward
# already returns softmax probabilities — make sure the loss is not fed plain probabilities.
policy_loss = torch.nn.CrossEntropyLoss()

# Placeholder for previous_cumulative_reward, shaped (Batch=1, 1).
dummy_reward = torch.zeros((1, 1), device=device) # Don't worry about the reward. It won't be used during the Study Phase.

# NOTE(review): grad_clip and save_path are not read anywhere in the visible cells.
grad_clip = None
save_path = None
EPOCHS = 10

In [None]:
# Supervised training of the policy heads: one optimizer step per (frame, command) sample.

# Hakisa returns softmax probabilities, while CrossEntropyLoss applies log-softmax to its input.
# Feeding log-probabilities makes that internal log-softmax a no-op (the probabilities already sum to 1),
# so the loss is the true cross-entropy. Feeding the probabilities directly would squash them a second time.
# The floor avoids log(0).
PROBABILITY_FLOOR = 1e-8

for epoch in range(EPOCHS):

    epoch_loss = 0.

    for i, (frame, encoded_command_type, encoded_action1, encoded_action2) in enumerate(dataloader):

        hakisa.zero_grad()

        frame = frame.to(device)
        encoded_command_type = encoded_command_type.to(device)
        # NOTE(review): dropping column 0 only matches the policy output if the one-hot targets were
        # built with one class more than the model has (coordinates starting at 0) — confirm.
        encoded_action1 = encoded_action1[:, 1:].to(device) # Removing index 0 ---> Coordinate 0,0 in mouse
        encoded_action2 = encoded_action2[:, 1:].to(device) # Removing index 0 ---> Coordinate 0,0 in mouse

        # Only the policy distributions matter here. Remember that our goal is to pretrain the Policy("Vectorizer")
        possible_actions, _, _ = hakisa(frame, dummy_reward)

        log_command_types = torch.log(possible_actions[0].clamp_min(PROBABILITY_FLOOR))
        log_actions1 = torch.log(possible_actions[1].clamp_min(PROBABILITY_FLOOR))
        log_actions2 = torch.log(possible_actions[2].clamp_min(PROBABILITY_FLOOR))

        command_type_loss = policy_loss(log_command_types, encoded_command_type)
        action1_loss = policy_loss(log_actions1, encoded_action1)
        action2_loss = policy_loss(log_actions2, encoded_action2)

        total_loss = command_type_loss + action1_loss + action2_loss

        total_loss.backward()

        epoch_loss += total_loss.item()

        # Stored as a plain float so the history does not keep one tensor alive per step.
        action_grads.append(torch.mean(hakisa.neuron_in.weight.grad).item())

        optimizer.step()


        #if i % 100 == 0: # On GTX 1650 Ti, 5000 iterations(1 epoch) = +- 66 minutes.

    # Epoch report: last-step losses, average loss of the epoch and last bottleneck gradient mean.
    print(f"{epoch}/{EPOCHS}\nCurrent Loss: {total_loss.item()}\tTotal Epoch Loss: {epoch_loss/(i+1)}\tGradients Average: {action_grads[-1]}")
    print(f"Command Type Loss: {command_type_loss.item()}\nAction 1 Loss: {action1_loss.item()}\nAction 2 Loss: {action2_loss.item()}")

If you'd like to use continuous rewards, it might be necessary to use a Reward Model.

In this case, you'll have to recreate the Reward Model here.

**Remember that Neural Networks can be seen as functions with learning parameters. Thus, they can make good reward functions.**

Be careful with overfitting and memory issues.

In [None]:
class ResidualBlock(torch.nn.Module):
    '''
    Residual unit: conv -> batch norm -> PReLU -> conv -> batch norm, added back to the input.

        input_channels = number of channels of the input; both convolutions keep that number.
        kernel_size, strides, padding = forwarded to both convolutions.

    The skip connection adds the input to the convolved tensor, so the convolutions must
    preserve the spatial size (true for kernel_size=3 with the default strides=1, padding=1).
    '''

    def __init__(self, input_channels, kernel_size, strides=1, padding=1):

        super().__init__()

        # Both convolutions share the exact same configuration (no bias: batch norm follows).
        conv_settings = dict(
            in_channels=input_channels,
            out_channels=input_channels,
            kernel_size=kernel_size,
            stride=strides,
            padding=padding,
            bias=False,
        )

        # Attribute names and registration order define the state_dict layout. Do not rename.
        self.convA = torch.nn.Conv2d(**conv_settings)
        self.batchnormA = torch.nn.BatchNorm2d(input_channels)
        self.convB = torch.nn.Conv2d(**conv_settings)
        self.batchnormB = torch.nn.BatchNorm2d(input_channels)

        self.PRelu = torch.nn.PReLU()

    def forward(self, input):

        # Residual branch; the activation is only applied between the two convolutions.
        residual = self.PRelu(self.batchnormA(self.convA(input)))
        residual = self.batchnormB(self.convB(residual))

        return input + residual

In [None]:
class PapezCircuit(torch.nn.Module):
    '''
    Reward model, based on ResNet architecture.

    Maps a batch of 3-channel images (N, 3, H, W) to one value per sample (N, 1).

    The spatial size is halved five times (the average pool is applied twice, plus the
    stride-2 convolutions conv4, conv9 and conv14) and the last stage has 512 channels.
    The flattened feature vector therefore holds 512 * (H // 32) * (W // 32) values.

        flat_features = number of inputs of the final linear layer. The default
        (18432 = 512 * 6 * 6) fits e.g. a 192x192 input. Pass the value given by the
        formula above when the model is fed a different capture size.
    '''

    def __init__(self, flat_features=18432):

        super(PapezCircuit, self).__init__()

        # Attribute names define the state_dict keys of the saved checkpoints. Do not rename.
        self.conv_in = torch.nn.Conv2d(3, 64, 3, 1, 1, bias=False)

        self.resblock1 = ResidualBlock(64, 3, 1, 1)
        self.resblock2 = ResidualBlock(64, 3, 1, 1)
        self.resblock3 = ResidualBlock(64, 3, 1, 1)
        self.conv4 = torch.nn.Conv2d(64, 128, 4, 2, 1, bias=False)
        self.conv5 = torch.nn.Conv2d(128, 128, 3, 1, 1, bias=False)
        self.resblock6 = ResidualBlock(128, 3, 1, 1)
        self.resblock7 = ResidualBlock(128, 3, 1, 1)
        self.resblock8 = ResidualBlock(128, 3, 1, 1)
        self.conv9 = torch.nn.Conv2d(128, 256, 4, 2, 1, bias=False)
        self.conv10 = torch.nn.Conv2d(256, 256, 3, 1, 1, bias=False)
        self.resblock11 = ResidualBlock(256, 3, 1, 1)
        self.resblock12 = ResidualBlock(256, 3, 1, 1)
        self.resblock13 = ResidualBlock(256, 3, 1, 1)
        self.conv14 = torch.nn.Conv2d(256, 512, 4, 2, 1, bias=False)
        self.conv15 = torch.nn.Conv2d(512, 512, 3, 1, 1, bias=False)
        self.resblock16 = ResidualBlock(512, 3, 1, 1)
        self.resblock17 = ResidualBlock(512, 3, 1, 1)
        self.resblock18 = ResidualBlock(512, 3, 1, 1)

        # Previously hard-coded to 18432, which only fits inputs where (H // 32) * (W // 32) == 36.
        self.neuron_out = torch.nn.Linear(flat_features, 1, bias=False)

        self.pool = torch.nn.AvgPool2d(2, 2)
        self.dropout = torch.nn.Dropout(0.35)

        self.LRelu = torch.nn.LeakyReLU(0.25)

    def forward(self, input):

        # Stem: convolution, activation, 2x downsampling.
        x = self.conv_in(input)
        x = self.LRelu(x)
        x = self.pool(x)
        x = self.dropout(x)

        # Each stage = three residual blocks, followed (except for the last stage)
        # by a stride-2 convolution and a same-size convolution that double the channels.
        stages = (
            ((self.resblock1, self.resblock2, self.resblock3), (self.conv4, self.conv5)),
            ((self.resblock6, self.resblock7, self.resblock8), (self.conv9, self.conv10)),
            ((self.resblock11, self.resblock12, self.resblock13), (self.conv14, self.conv15)),
            ((self.resblock16, self.resblock17, self.resblock18), None),
        )

        for resblocks, transition in stages:

            for resblock in resblocks:
                x = resblock(x)
            x = self.dropout(x)

            if transition is not None:
                # No activation between the two transition convolutions; LeakyReLU comes after both.
                for conv in transition:
                    x = conv(x)
                x = self.LRelu(x)
                x = self.dropout(x)

        x = self.pool(x)

        # Flatten everything but the batch dimension for the linear head.
        x = x.view(x.size(0), -1)

        output = self.neuron_out(x)

        return output

In [None]:
class Sensei(torch.nn.Module):
    '''
    Bundles three pretrained reward models, one per on-screen metric (score, power, life).

        save_pathA = path to the saved state_dict of the score model.
        save_pathB = path to the saved state_dict of the power model.
        save_pathC = path to the saved state_dict of the life model.

    forward(score, power, life) feeds each image batch to its own model and returns
    the three outputs as a tuple (score_reward, power_reward, life_reward).
    '''

    def __init__(self, save_pathA, save_pathB, save_pathC):

        super(Sensei, self).__init__()

        # Different metrics can require different reward models architectures.
        # Don't be afraid to use simpler and shallow models.

        self.score_model = PapezCircuit()
        self.power_model = PapezCircuit()
        self.life_model = PapezCircuit()

        # map_location="cpu" lets checkpoints saved from a GPU be loaded on a machine without CUDA.
        # The freshly created models live on the CPU anyway; the caller moves Sensei with .to(device).
        # NOTE: torch.load unpickles the file. Only load checkpoints from a trusted source.
        self.score_model.load_state_dict(torch.load(save_pathA, map_location="cpu"))
        self.power_model.load_state_dict(torch.load(save_pathB, map_location="cpu"))
        self.life_model.load_state_dict(torch.load(save_pathC, map_location="cpu"))

    def forward(self, score, power, life):

        score_reward = self.score_model(score)
        power_reward = self.power_model(power)
        life_reward = self.life_model(life)

        return score_reward, power_reward, life_reward

In [None]:
# Reward models, moved to the active device and switched to inference mode (eval()).
sensei = Sensei(
    save_pathA="D:/Python/Projects/Hakisa/Hakisa/Sensei_JK_Score.tar",
    save_pathB="D:/Python/Projects/Hakisa/Hakisa/Sensei_JK_Power.tar",
    save_pathC="D:/Python/Projects/Hakisa/Hakisa/Sensei_JK_Life.tar"
).to(device).eval()

# Screen rectangles of the three metrics, presumably as (left, top, right, bottom) — TODO confirm.
# NOTE(review): these tuples are not read by any cell shown here; capture_regions() hard-codes its own rectangles.
# NOTE(review): read as (left, top, right, bottom), score_region is the rectangle capture_regions() grabs
# as "power" and power_region is the one it grabs as "score" — confirm which naming is the intended one.
score_region = (1640, 790, 1640+280, 790+290)

power_region = (1620, 0, 1620+200, 30)

life_region = (1115, 915, 1115+210, 915+130)

In [None]:
def _grab_region_as_tensor(sct, monitor):
    '''
    Grabs one screen rectangle and returns it as a float32 tensor scaled to [0, 1].

        sct = an open mss screen grabber.
        monitor = dict with "top", "left", "width" and "height" of the rectangle.
    '''

    shot = sct.grab(monitor=monitor)
    image = Image.frombytes("RGB", shot.size, shot.bgra, 'raw', 'BGRX')

    # PIL.Image only works with uint8 arrays(integers, 0 to 255); the models use floats from 0 to 1.
    pixels = np.array(image, dtype=np.float32) / 255
    batch = torch.from_numpy(pixels).unsqueeze(0)

    # NOTE(review): view() relabels the (1, H, W, 3) buffer as (1, 3, H, W) without moving any data,
    # which is not the same as permute(0, 3, 1, 2). Kept as is because the pretrained reward models
    # may have been trained with this exact layout — confirm before changing.
    return batch.view(batch.size(0), batch.size(3), batch.size(1), batch.size(2))


def capture_regions():
    '''
    Captures the three screen regions fed to the reward models.

    Returns a tuple (score, power, life) of float32 tensors scaled to [0, 1],
    each with shape (1, 3, height, width) of its region.

    ATTENTION: Remember that scaling is crucial.
    '''

    with mss() as sct:

        score = _grab_region_as_tensor(sct, {"top": 0, "left": 1620, "width": 200, "height": 30})
        power = _grab_region_as_tensor(sct, {"top": 790, "left": 1640, "width": 280, "height": 290})
        life = _grab_region_as_tensor(sct, {"top": 915, "left": 1115, "width": 210, "height": 130})

    return score, power, life

In [None]:
# Playthrough Phase, or Reinforcement Learning Phase, where the magic really happens.
'''
"The Supervised Learning trained model has a natural “domain shift” to the RL environment.
The model of high Supervised Learning accuracy may not behave well in the RL domain"
'''

reward = torch.zeros((1, 1), device=device) # Cumulative reward
advantage = [] # One entry appended per loop step
steps = 0 # Number of completed loop steps
save_point = 10 # Also optimization point
action_grads = [] # Mean gradient of hakisa.neuron_in.weight, one entry per step
reward_grads = [] # Mean gradient of hakisa.pred_reward_command_type.weight, one entry per step
previous_predicted_reward = None # BEWARE: this will be the basis for backpropagation (TD-Learning)

optimizer = torch.optim.Adam(hakisa.parameters(), lr=1e-4, eps=1e-8)
# 1e-4 is a common LR. But 1e-6 is also used by RainbowDQN and it's the best one for HierNet. Change it as needed.
# Note: PPO also uses epsilon = 1e-5, but it seems that eps=1e-7 or 1e-8 are better.

old_policy = copy.deepcopy(hakisa) # For Surrogate Loss. Creating here to reserve some space in memory
old_policy.eval()

# StepLR multiplies the learning rate by gamma=0.1 every 10000 calls to scheduler.step().
# This is a step decay, not a linear one. scheduler.step() is only called every save_point loop steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10000, gamma=0.1)

value_criterion = torch.nn.MSELoss() # Loss of the reward prediction (value head)

The PPO implementation below is based on Ruo-Ze Liu's HierNet-SC2:
https://github.com/liuruoze/HierNet-SC2/blob/main/algo/ppo.py

In [None]:
def _greedy_probability(distribution):
    '''
    Probability that a policy head assigns to its most likely option.

        distribution = output of one policy head, expected to have shape (1, N): only row 0 is marked.

    Returns a tensor of shape (1,). Gradients only flow through the selected entry.
    '''

    one_hot = torch.zeros_like(distribution, device=device)
    one_hot[0, distribution.argmax(-1)] = 1.

    return torch.sum(distribution * one_hot, dim=-1)


sleep(5) # Time to switch to the game window

winsound.PlaySound(f'D:/Python/Audio/English/chiara_craftEpic_1_en', winsound.SND_FILENAME)


while not keyboard.is_pressed('esc'): # Exit loop when Esc is pressed

    frame = dataset._grab_frame()
    frame = frame/255 # Scaling data, since we did so in the Supervised Learning

    reward_input = reward.clone() # To avoid issues with inplace operations(optimizer)

    if previous_predicted_reward is None:

        # Very first step: there is no prediction from a previous step yet.
        _, _, previous_predicted_reward = hakisa(frame, reward_input)

    possible_actions, true_action, predicted_reward = hakisa(frame, reward_input)

    # For Surrogate Loss

    with torch.no_grad():

        previous_possible_actions, _, _ = old_policy(frame, reward_input)

    # Column index, inside each policy head, of the value hakisa returned as the chosen action.
    true_command_type = (possible_actions[0] == true_action[0]).nonzero(as_tuple=True)[1][0].item()
    true_action1 = (possible_actions[1] == true_action[1]).nonzero(as_tuple=True)[1][0].item()
    true_action2 = (possible_actions[2] == true_action[2]).nonzero(as_tuple=True)[1][0].item()

    command = dataset.get_command(true_command_type, true_action1, true_action2)

    hakisa.execute_command(command)

    #score = reward_BH2() # Change this function as you'd like.

    score, power, life = capture_regions()

    with torch.no_grad():

        score_reward, power_reward, life_reward = sensei(score.to(device), power.to(device), life.to(device))

    # Reward of this step: weighted sum of the three reward models.
    scoring = score_reward + (power_reward * 1) + (life_reward * 2.0)

    reward += scoring

    # TD error: r + gamma * V(current) - V(previous), with gamma = 0.9995.
    delta = scoring + (0.9995 * predicted_reward.item()) - previous_predicted_reward.item()
    advantage.append(delta)

    # NOTE(review): advantage[-1], the only entry used by the losses below, is the raw TD error;
    # this loop only rewrites the OLDER entries. It does so again on every step (so the older
    # entries keep accumulating) and its cost grows with the length of the history.
    for t in reversed(range(len(advantage) - 1)):

        advantage[t] = advantage[t] + 0.9995 * 0.9995 * advantage[t+1]

    # NOTE(review): each probability below is the one of the policy's OWN most likely option.
    # The PPO ratio is defined on the action that was actually executed, for both the
    # current and the old policy — confirm whether the true_* indices should be used here.
    possible_command_type = _greedy_probability(possible_actions[0])
    previous_possible_command_type = _greedy_probability(previous_possible_actions[0])

    possible_action1 = _greedy_probability(possible_actions[1])
    previous_possible_action1 = _greedy_probability(previous_possible_actions[1])

    possible_action2 = _greedy_probability(possible_actions[2])
    # Fixed: this used to sum previous_possible_action1, so the action2 ratio was computed
    # against the old probability of action1.
    previous_possible_action2 = _greedy_probability(previous_possible_actions[2])

    # Since we're dealing with a probability distribution, using exp(log) is more mathematically correct(and stable)
    # In practice, we're doing a KL-Divergence. To make it less computationally expensive, applying clip.

    # Fixed: the clamped action1/action2 values used to be stored under misspelled names
    # (possible_possible_action*), leaving the probabilities used in the logs below unclamped.
    possible_command_type = torch.clamp(possible_command_type, 1e-10, 1.0)
    previous_possible_command_type = torch.clamp(previous_possible_command_type, 1e-10, 1.0)
    possible_action1 = torch.clamp(possible_action1, 1e-10, 1.0)
    previous_possible_action1 = torch.clamp(previous_possible_action1, 1e-10, 1.0)
    possible_action2 = torch.clamp(possible_action2, 1e-10, 1.0)
    previous_possible_action2 = torch.clamp(previous_possible_action2, 1e-10, 1.0)

    ratio_command_type = torch.exp(torch.log(possible_command_type) - torch.log(previous_possible_command_type))
    ratio_action1 = torch.exp(torch.log(possible_action1) - torch.log(previous_possible_action1))
    ratio_action2 = torch.exp(torch.log(possible_action2) - torch.log(previous_possible_action2))

    clipped_ratio_command_type = torch.clamp(ratio_command_type, min=0.8, max=1.2)
    clipped_ratio_action1 = torch.clamp(ratio_action1, min=0.8, max=1.2)
    clipped_ratio_action2 = torch.clamp(ratio_action2, min=0.8, max=1.2)

    # Clipped surrogate objective, negated because the optimizer minimizes.
    surrogate_loss_command_type = -torch.mean(torch.minimum(torch.mul(ratio_command_type, advantage[-1]), torch.mul(clipped_ratio_command_type, advantage[-1])))
    surrogate_loss_action1 = -torch.mean(torch.minimum(torch.mul(ratio_action1, advantage[-1]), torch.mul(clipped_ratio_action1, advantage[-1])))
    surrogate_loss_action2 = -torch.mean(torch.minimum(torch.mul(ratio_action2, advantage[-1]), torch.mul(clipped_ratio_action2, advantage[-1])))

    total_surrogate_loss = surrogate_loss_command_type + surrogate_loss_action1 + surrogate_loss_action2

    # We can use both the actual cumulative (discounted) reward, or the predicted reward for target. Following Ruo-Ze Liu's code.
    value_loss = value_criterion(previous_predicted_reward, (scoring + 0.9995 * predicted_reward.detach()))

    total_loss = total_surrogate_loss + (value_loss * 0.5)

    # Gradients accumulate over save_point steps; the optimizer only steps in the branch below.
    total_loss.backward()

    # Performing a single iteration in order to get previous predicted reward for backpropagation

    _, _, previous_predicted_reward = hakisa(frame, reward_input)

    del frame, _

    # Best-effort gradient logging: skipped when a layer is missing or has no gradient yet.
    # Narrowed from a bare except so that interrupts and real errors are no longer swallowed.
    try:
        action_grads.append(torch.mean(hakisa.neuron_in.weight.grad))
        reward_grads.append(torch.mean(hakisa.pred_reward_command_type.weight.grad))

    except (AttributeError, TypeError):
        pass

    steps += 1

    if steps % save_point == 0:

        # Snapshot the weights BEFORE the update: they become the "old policy" of the next steps.
        old_policy = copy.deepcopy(hakisa)
        old_policy.eval()

        optimizer.step()
        scheduler.step()

        hakisa.zero_grad()

        print(f"Current step: {steps}")
        print(f"Current Loss: {total_loss.item()}")
        print(f"Surrogate Loss: {total_surrogate_loss.item()}\tValue Loss: {value_loss.item()}\tAdvantage: {advantage[-1].item()}")
        print(f"Command Type loss: {surrogate_loss_command_type.item()}\tRatio: {ratio_command_type.item()}")
        print(f"Action1 loss: {surrogate_loss_action1.item()}\tRatio: {ratio_action1.item()}")
        print(f"Action2 loss: {surrogate_loss_action2.item()}\tRatio: {ratio_action2.item()}")
        print(f"Predicted Reward: {predicted_reward.item()}\tCurrent Reward: {reward.item()}")
        print(f"Score Reward: {score_reward.item()}\tPower: {power_reward.item()}\tLife: {life_reward.item()}")
        print(command)

        # Avoid saving your model during gameplay, 
        # as this process takes too much time (some seconds).

        '''torch.save({
            'Steps': steps,
            'Hakisa_params': hakisa.state_dict(),
            'Hakisa_LR': scheduler.get_last_lr()[0]
        }, f"{save_path}/Hakisa_checkpoint.tar")'''

        winsound.PlaySound(f'D:/Python/Audio/English/chiara_craftEpic_1_en', winsound.SND_FILENAME)

        # Performing a single iteration in order to get previous predicted reward for backpropagation
        # (the weights were just updated, so the prediction is recomputed with the new weights).

        frame = dataset._grab_frame()
        frame = frame/255 # Scaling data, since we did so in the Supervised Learning

        reward_input = reward.clone()

        _, _, previous_predicted_reward = hakisa(frame, reward_input)

In [None]:
# Checking the Generalized Advantage Estimation (GAE)

# The entries are one-element tensors (on the GPU when CUDA is used), which matplotlib
# cannot convert by itself. float() works for any one-element tensor, on any device.
plt.plot([float(step_advantage) for step_advantage in advantage])
plt.show()

In [None]:
# Seeing how the grads behave (first 100 recorded steps)

_, ax = plt.subplots(1, 2)

# The recorded means are tensors (on the GPU when CUDA is used); matplotlib needs plain numbers.
ax[0].plot([float(grad_mean) for grad_mean in action_grads[:100]])
ax[1].plot([float(grad_mean) for grad_mean in reward_grads[:100]])
plt.show()

In [None]:
# Saving Hakisa params

# The checkpoint holds the step counter, the model weights and the last learning rate
# reported by the scheduler. The optimizer and scheduler states themselves are not stored.
torch.save({
            'Steps': steps,
            'Hakisa_params': hakisa.state_dict(),
            'Hakisa_LR': scheduler.get_last_lr()[0]
        }, "D:/Python/Projects/Hakisa/Hakisa/Hakisa_checkpoint.tar")

In [None]:
# Loading Hakisa params

# map_location=device lets a checkpoint saved from a GPU be loaded on a machine without CUDA.
# NOTE: torch.load unpickles the file. Only load checkpoints from a trusted source.
params = torch.load("D:/Python/Projects/Hakisa/Hakisa/Hakisa_checkpoint.tar", map_location=device)

steps = params['Steps']
hakisa.load_state_dict(params['Hakisa_params'])

# The stored learning rate is only displayed; it is not applied to the optimizer or scheduler.
print(f"Last LR: {params['Hakisa_LR']}")