In [1]:
from mss import mss
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.backends.cudnn as cudnn
from tensorflow.keras.utils import to_categorical
from PIL import Image
from sklearn.neighbors import NearestNeighbors as KNN
import pytesseract
from time import sleep
import winsound
from re import sub
import pickle

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cudnn.benchmark = True

In [None]:
# Testing Screen region capture and OCR

from cv2 import adaptiveThreshold, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY

sleep(3)

with mss() as sct:
    monitor = {"top": 180, "left": 1, "width": 249-1, "height": 213-180} # Memory efficiency.
    data = sct.grab(monitor)
    data = Image.frombytes("RGB", data.size, data.bgra, 'raw', 'BGRX')
    data = data.convert("P") # Thresholding only works on grayscaled images.
    data = np.array(data)

winsound.PlaySound('D:/Python/Audio/English/chiara_hacking_1_en.wav', winsound.SND_FILENAME)

plt.imshow(data)
plt.show()

consequence = pytesseract.image_to_string(data, config='--psm 6')
print(consequence)

consequence = sub('[^A-Za-z0-9\/\.]', '', consequence)

print(consequence)

datathresh = adaptiveThreshold(data,255,ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY,63,20)

consequence = pytesseract.image_to_string(datathresh, config='--psm 6')
print(consequence)

#replace w --> 1, f --> / ---> y ---> 1 ---> e --> 2
consequence = sub('[^A-Za-z0-9\/\.]', '', consequence)

print(consequence)

plt.imshow(datathresh)
plt.show()

In [None]:
datathresh = adaptiveThreshold(data,255,ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY,11,5) # 11, 5

consequence = pytesseract.image_to_string(datathresh, config='--psm 8')
print(consequence)

#replace w --> 1, f --> / ---> y ---> 1 ---> e --> 2
consequence = sub('[^A-Za-z0-9\/\.]', '', consequence)

print(consequence)

plt.imshow(datathresh)
plt.show()

In [4]:
class Dataset(torch.utils.data.Dataset):
    '''
    Generates input maps and commands for Hakisa.

    Remember: command_types = list of strings, actions1 and 2 = list of strings(keyboard), X coordinates or None(mouse)
    '''
    
    def __init__(
        self,
        command_types = None,
        actions1 = None,
        actions2 = None,
        top=0,
        left=0,
        width=1920,
        height=1080,
        resize=None
    ):

        # Window resolutions for the screen grabber
        self.top = top
        self.left = left
        self.width = width
        self.height = height

        self.resize = resize # For reducing the images. Must be a tuple (Height, Width)

        self.data = None # This will be created during training. However, it's possible to load a ready-made data for training.

        # Initially, we'll be using lists. After our vector embedding has been properly trained, we'll create a dictionary
        # of input mappings with it.

        self.command_type = command_types
        self.actions1 = actions1
        self.actions2 = actions2

        self.encoded_command_type = to_categorical(np.arange(0, len(command_types)), len(command_types))
        self.encoded_command_type = torch.from_numpy(self.encoded_command_type)

        self.key_actions1 = None # Dictionary of vectors for each action1
        self.key_actions2 = None # Dictionary of vectors for each action2

        self.knn_actions1 = None # Where we'll store our fitted KNN
        self.knn_actions2 = None


    def __getitem__(self, idx):

        frames = self.data[idx]
        encoded_command_type = self.encoded_command_type[idx]
        encoded_actions1 = self.encoded_actions1[idx]
        encoded_actions2 = self.encoded_actions2[idx]

        return frames, encoded_command_type, encoded_actions1, encoded_actions2
        

    def __len__(self):

        return len(self.data)


    def create_commands_dictionary(self, map2vec_model):

        map2vec_model.evaluate = True

        # I don't really know how we could handle vector dimensions

        dictionary_actions1 = {}
        dictionary_actions2 = {}

        empty1 = torch.empty_like(self.encoded_actions1, device=device) # The vectorizer demands both actions as input, but they're vectorized independently.
        empty2 = torch.empty_like(self.encoded_actions2, device=device)

        empty_frame = torch.empty((1, 400*5*5), device=device) # The vectorizer requires a context as input, but for evaluation this isn't necessary.

        empty_type = torch.empty_like(self.encoded_command_type, device=device)

        for i in range(len(self.actions1)):

            with torch.no_grad():

                output, _ = map2vec_model(empty_frame, empty_type[0].unsqueeze(0), self.encoded_actions1[i].unsqueeze(0), empty2[0].unsqueeze(0))

                output = output.view(-1)

                vector = output[torch.argmax(output)].item()
            
            dictionary_actions1[self.actions1[i]] = vector

        self.key_actions1 = dictionary_actions1

        del dictionary_actions1

        for i in range(len(self.actions2)):

            with torch.no_grad():

                _, output = map2vec_model(empty_frame, empty_type[0].unsqueeze(0), empty1[0].unsqueeze(0), self.encoded_actions2[i].unsqueeze(0))

                output = output.view(-1)

                vector = output[torch.argmax(output)].item()

            dictionary_actions2[self.actions2[i]] = vector

        self.key_actions2 = dictionary_actions2

        del dictionary_actions2

        print(f"Dict input maps created successfully!\nActions 1 dict length: {len(self.key_actions1)}\nActions 2 dict length: {len(self.key_actions2)}")

        self.knn_actions1 = self._fit_knn(self.key_actions1)
        self.knn_actions2 = self._fit_knn(self.key_actions2)

        print("All action maps have been properly fitted by their respective KNN algorithm")


    def _fit_knn(self, dictionary):
        
        values = list(dictionary.values())

        values = np.array(values).reshape(-1,1)

        knn = KNN(n_neighbors=1, algorithm='kd_tree').fit(values)

        del values

        return knn


    def record_gameplay(self, number_of_screenshots, screenshot_delay, grayscale=False, resize=False, path=None):

        # Resizing and grayscaling isn't really necessary here, but can save you some time later.
        # Both saving you from writing more code and from making your hardware having to process more and more data at once.

        print(f"Ok. Screenshot capture will begin in 5 seconds")

        sleep(5)

        winsound.PlaySound('D:/Python/Audio/English/chiara_hacking_1_en.wav', winsound.SND_FILENAME) # Just to know if everything's ok

        for i in range(number_of_screenshots):

            with mss() as sct:

                frame = sct.grab(monitor={"top": self.top, "left": self.left, "width": self.width, "height": self.height})
                frame = Image.frombytes("RGB", frame.size, frame.bgra, 'raw', 'BGRX')

            if grayscale:

                frame = frame.convert('L')

            if resize:

                frame = frame.resize(self.resize)

            frame.save(f"{path}/Screenshot_{i}.png")

            sleep(screenshot_delay)
        
        print("Screenshot capture finished!")

        winsound.PlaySound('D:/Python/Audio/English/chiara_hacking_1_en.wav', winsound.SND_FILENAME)


    def create_data(self, data, commands):
        '''
        data: a tensor of size (N_Samples, Channels, Height, Width) containing the game frames. The pixels values must be within range [0., 255.].
        commands: a list of tuples with length (N_samples), with each sample being a tuple composed of (command_type, action1, action2), where:

            command_type: a tensor the action command type index-encoded with indices within range [0, len(command_types)].
            action1: the action1 index-encoded with indices within range [0, len(actions1)].
            action2: the action2 index-encoded with indices within range [0, len(action2)].
        '''

        # We aren't using data in time_steps mode, like we do for gifs, time series and forecasting in general.
        # I thought it might be a good idea to also train Hakisa with that.
        # This might also be the best way to train her in frames forecasting, as the process is probably too slow to be made while playing.

        self.data = data

        encoded_command_type = []
        encoded_actions1 = []
        encoded_actions2 = []

        for sample in commands:

            command_type = to_categorical(sample[0], len(self.command_type))
            command_type = torch.from_numpy(command_type)
            command_type = command_type.unsqueeze(0).to(device) # So you don't have to use [number] for your commands tuple to get a command_type with shape [N_samples, 1]
            encoded_command_type.append(command_type)

            encoded_action1 = to_categorical(sample[1], len(self.actions1))
            encoded_action1 = torch.from_numpy(encoded_action1)
            encoded_action1 = encoded_action1.unsqueeze(0).to(device)
            encoded_actions1.append(encoded_action1)

            encoded_action2 = to_categorical(sample[2], len(self.actions2))
            encoded_action2 = torch.from_numpy(encoded_action2)
            encoded_action2 = encoded_action2.unsqueeze(0).to(device)
            encoded_actions2.append(encoded_action2)

        encoded_command_type = torch.cat(encoded_command_type, 0)
        encoded_actions1 = torch.cat(encoded_actions1, 0)
        encoded_actions2 = torch.cat(encoded_actions2, 0)

        self.encoded_command_type = encoded_command_type
        self.encoded_actions1 = encoded_actions1
        self.encoded_actions2 = encoded_actions2

        print("All done! Train the vectorizer and then use it to generate the input mapping dictionary")

    def save_dicts(self, path, file_name):

        with open(f'{path}/{file_name}_actions1.pkl', 'wb') as f:
            pickle.dump(self.key_actions1, f)
        f.close()
        
        with open(f'{path}/{file_name}_KNNactions1.pkl', 'wb') as f:
            pickle.dump(self.knn_actions1, f)
        f.close()

        with open(f'{path}/{file_name}_actions2.pkl', 'wb') as f:
            pickle.dump(self.key_actions2, f)
        f.close()

        with open(f'{path}/{file_name}_KNNactions2.pkl', 'wb') as f:
            pickle.dump(self.knn_actions2, f)
        f.close()

        print(f"Dictionaries saved at {path}/{file_name} with their respectives KNNs.\nRemember: Each dict must be correctly fit with its respective KNN algorithm.")
        print(f"When using Hakisa, simply use the dataset function '.load_dicts(path, file_name)")

In [5]:
# Jigoku Kisetsukan

command_type = ['key']

actions1 = ['Down', 'Up']

actions2 = ['up', 'down', 'left', 'right', 'z', 'x', 'shift']

In [None]:
# Bullet Heaven

command_types = ['move', 'click', 'rightclick']

actions1 = [i for i in range(1, 1919)] # Avoiding using the extremes so we don't have to shut down PyAutoGUI safety lock.

actions2 = [i for i in range(1, 1079)]

In [6]:
dataset = Dataset(command_types=command_type, actions1=actions1, actions2=actions2, resize=(200, 200))

In [None]:
dataset.record_gameplay(2000, 1, grayscale=False, resize=False, path="Hakisa/JK_gameplay/")

In [7]:
import os

images_path = []

for directory, _, files in os.walk("Hakisa/JK_gameplay/"):

    for file in files:

        images_path.append(directory+"/"+file)

# Problem: for strings, Python considers that 1000 < 2. Maybe something related to how the string is assembled? I don't know how to fix this yet.

images_data = []

for i in images_path[0:10]:

    image = Image.open(i)
    image = image.resize((200, 200))
    array = np.array(image, dtype=np.float32)
    image.close()
    images_data.append(array)

images_data = np.stack(images_data, 0)

In [8]:
# Now, the boring part: visualizing each screenshot and labeling them

image = Image.open(images_path[9]) # Remember that [0] = screenshot 0, but [1] = screenshot 10, due to Python considering 10 < 2 and 100000 < 2 and so on...
image.show()
image.close()

In [9]:
commands = [(0, 0, 3), (0, 0, 4), (0, 0, 6), (0, 0, 2), (0, 0, 0), (0, 0, 0), (0, 1, 0), (0, 0, 2), (0, 0, 6), (0, 0, 3)]

In [10]:
images_data = torch.from_numpy(images_data)
images_data = images_data.view(images_data.size(0), images_data.size(3), images_data.size(1), images_data.size(2))

In [11]:
dataset.create_data(images_data, commands)

All done! Train the vectorizer and then use it to generate the input mapping dictionary


In [12]:
print(dataset.encoded_command_type.size())
print(dataset.encoded_actions1.size())
print(dataset.encoded_actions2.size())

torch.Size([10, 1])
torch.Size([10, 2])
torch.Size([10, 7])


In [13]:
class Action2Vec(torch.nn.Module):

    '''
    The Vectorizer model will assign vectors to each action1 and each action2 according to its context.
    In NLP, the context is determined by the position of certain word according to other words.

    For us, we could determine the context according to the game state(the frame) and the command used in that state.

    But it might be interesting to use other metrics for context, such as HP, MP, Power, Aura, Score...

    In order to correctly get the context, we'll be using feature extraction with Conv2Ds on the frames.
    This context(or the features extracted from the frames) is gonna be used to condition the action vector.


    Game Frame ------> Feature Extraction (Conv2D + MaxPool) ----> Context
    O-H action ------> FCC layer --------------------------------> some output?

    concatenation(Context, some output) ---> FCC layer ----------> Vector
    '''

    def __init__(self, command_type, actions1, actions2, evaluate=False):

        super(Action2Vec, self).__init__()

        self.command_type = len(command_type) # For initialization, the length is what matters.
        self.actions1 = len(actions1)
        self.actions2 = len(actions2)

        self.evaluate = evaluate

        # Considering a frame size 200x200x3

        self.conv1 = torch.nn.Conv2d(3, 100, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm1 = torch.nn.BatchNorm2d(100)
        self.conv2 = torch.nn.Conv2d(100, 200, kernel_size=3, stride=1, padding=1, bias=False) # 200x200
        self.batchnorm2 = torch.nn.BatchNorm2d(200)
        self.pool2x2 = torch.nn.MaxPool2d(kernel_size=2, stride=2) # 100x100
        self.conv3 = torch.nn.Conv2d(200, 400, kernel_size=3, stride=1, padding=1, bias=False) # 100x100
        self.batchnorm3 = torch.nn.BatchNorm2d(400)
        self.conv4 = torch.nn.Conv2d(400, 600, kernel_size=3, stride=1, padding=1, bias=False) # 100x100
        self.batchnorm4 = torch.nn.BatchNorm2d(600)
        # Add pool 2x2 ---> 50x50
        self.conv5 = torch.nn.Conv2d(600, 800, kernel_size=3, stride=1, padding=1, bias=False) # 50x50
        self.batchnorm5 = torch.nn.BatchNorm2d(800)
        self.conv6 = torch.nn.Conv2d(800, 1000, kernel_size=3, stride=1, padding=1, bias=False) # 50x50
        self.batchnorm6 = torch.nn.BatchNorm2d(1000)
        # Add pool 2x2 ---> 25x25
        self.conv7 = torch.nn.Conv2d(1000, 1200, kernel_size=4, stride=1, bias=False) # 22x22
        self.batchnorm7 = torch.nn.BatchNorm2d(1200)
        self.conv8 = torch.nn.Conv2d(1200, 1000, kernel_size=3, stride=1, bias=False) # 20x20
        self.batchnorm8 = torch.nn.BatchNorm2d(1000)
        # Add pool 2x2 ---> 10x10
        self.conv9 = torch.nn.Conv2d(1000, 800, kernel_size=3, stride=1, padding=1, bias=False) # 10x10
        self.batchnorm9 = torch.nn.BatchNorm2d(800)
        self.conv10 = torch.nn.Conv2d(800, 400, kernel_size=3, stride=1, padding=1, bias=False) # 10x10
        self.batchnorm10 = torch.nn.BatchNorm2d(400)
        # Add pool 2x2 ---> 5x5
        self.neuron_frames = torch.nn.Linear(400*5*5, 200*2*2, bias=False)

        self.neuron_command_type1 = torch.nn.Linear(self.command_type, 200*2*2, bias=False) # The command type will be used to condition the actions
        self.neuron_actions1A = torch.nn.Linear(self.actions1, 200*2*2, bias=False)
        self.neuron_actions2A = torch.nn.Linear(self.actions2, 200*2*2, bias=False)

        #self.neuron_actions1B = torch.nn.Linear(200*2*6, self.actions1, bias=False)
        #self.neuron_actions2B = torch.nn.Linear(200*2*6, self.actions2, bias=False)

        self.neuron_actions1B = torch.nn.Linear(200*2*6, self.actions1, bias=False)
        self.neuron_actions2B = torch.nn.Linear(200*2*6, self.actions2, bias=False)

        self.layer_normA = torch.nn.LayerNorm(200*2*6)
        self.layer_normB = torch.nn.LayerNorm(200*2*6)

        self.leakyrelu = torch.nn.LeakyReLU(0.25)
        #self.softmax = torch.nn.LogSoftmax(-1) # Won't be used ----> Already included in Pytorch's Cross Entropy Loss

    def forward(self, game_frame, encoded_command_type, encoded_action1, encoded_action2):

        if self.evaluate == False:

            x = self.conv1(game_frame)
            x = self.leakyrelu(x)
            x = self.batchnorm1(x)
            x = self.conv2(x)
            x = self.leakyrelu(x)
            x = self.batchnorm2(x)

            x = self.pool2x2(x)

            x = self.conv3(x)
            x = self.leakyrelu(x)
            x = self.batchnorm3(x)
            x = self.conv4(x)
            x = self.leakyrelu(x)
            x = self.batchnorm4(x)

            x = self.pool2x2(x)

            x = self.conv5(x)
            x = self.leakyrelu(x)
            x = self.batchnorm5(x)
            x = self.conv6(x)
            x = self.leakyrelu(x)
            x = self.batchnorm6(x)

            x = self.pool2x2(x)

            x = self.conv7(x)
            x = self.leakyrelu(x)
            x = self.batchnorm7(x)
            x = self.conv8(x)
            x = self.leakyrelu(x)
            x = self.batchnorm8(x)

            x = self.pool2x2(x)

            x = self.conv9(x)
            x = self.leakyrelu(x)
            x = self.batchnorm9(x)
            x = self.conv10(x)
            x = self.leakyrelu(x)
            x = self.batchnorm10(x)

            x = self.pool2x2(x)

            x = x.view(x.size(0), -1)

            context = self.neuron_frames(x) # (Batch, 200*2*2)

            encoded_command_type = self.neuron_command_type1(encoded_command_type) # (Batch, 200*2*2)

            context = torch.cat((context, encoded_command_type), -1) # (Batch, 200*2*4)

            x = self.neuron_actions1A(encoded_action1)
            
            x = torch.cat((context, x), -1) # (Batch, 200*2*6)

            x = self.layer_normA(x)

            output1 = self.neuron_actions1B(x)

            x = self.neuron_actions2A(encoded_action2)

            x = torch.cat((context, x), -1)

            x = self.layer_normB(x)

            output2 = self.neuron_actions2B(x)

            del x

            return output1, output2
        
        else:

            context = self.neuron_frames(game_frame) # (Batch, 200*2*2)

            encoded_command_type = self.neuron_command_type1(encoded_command_type) # (Batch, 200*2*2)

            context = torch.cat((context, encoded_command_type), -1) # (Batch, 200*2*4)

            x = self.neuron_actions1A(encoded_action1)
            
            x = torch.cat((context, x), -1) # (Batch, 200*2*6)

            x = self.layer_normA(x)

            output1 = self.neuron_actions1B(x)

            x = self.neuron_actions2A(encoded_action2)

            x = torch.cat((context, x), -1)

            x = self.layer_normB(x)

            output2 = self.neuron_actions2B(x)

            del x

            return output1, output2

In [14]:
action2vec_model = Action2Vec(command_type, actions1, actions2, evaluate=False).to(device)

In [15]:
optimizer = torch.optim.Adam(action2vec_model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1000, gamma=0.1)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
loss = torch.nn.CrossEntropyLoss()

grads = []

#epochs = 10000
epochs = 100

In [16]:
for epoch in range(epochs):
    for i, (frames, encoded_command_type, encoded_actions1, encoded_actions2) in enumerate(dataloader):
        action2vec_model.zero_grad()

        frames = frames.to(device)
        encoded_command_type = encoded_command_type.to(device)
        encoded_actions1 = encoded_actions1.to(device)
        encoded_actions2 = encoded_actions2.to(device)

        output1, output2 = action2vec_model(frames, encoded_command_type, encoded_actions1, encoded_actions2)

        cost1 = loss(output1, encoded_actions1)

        cost2 = loss(output2, encoded_actions2)

        cost = cost1 + cost2

        cost.backward()

        for n, p in action2vec_model.named_parameters():
            if 'neuron_frames.weight' in n:
                grads.append(torch.mean(p.grad))

        optimizer.step()

    scheduler.step()

    if epoch % 10 == 0:
        print(f"{epoch}/{epochs}\tCurrent Loss: {cost.item()}\tCurrent Learning Rate: {scheduler.get_last_lr()[0]}")
        print(f"Gradients Average: {grads[-1]}")

0/100	Current Loss: 2.683117628097534	Current Learning Rate: 1e-05
Gradients Average: -7.03003752278164e-05
10/100	Current Loss: 0.0019489850383251905	Current Learning Rate: 1e-05
Gradients Average: 1.1707386420312105e-06
20/100	Current Loss: 0.005426548887044191	Current Learning Rate: 1e-05
Gradients Average: 8.033214271563338e-07
30/100	Current Loss: 0.0031433142721652985	Current Learning Rate: 1e-05
Gradients Average: -2.583366551789368e-07


KeyboardInterrupt: 

In [17]:
print(encoded_actions1)
print(encoded_actions2)

tensor([[0., 1.],
        [1., 0.]], device='cuda:0')
tensor([[1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.]], device='cuda:0')


In [18]:
print(output1)
print(output2)

tensor([[ 3.1970, -5.3581],
        [ 3.1824, -4.5489]], device='cuda:0', grad_fn=<MmBackward0>)
tensor([[ 6.5323, -2.0989, -0.6335, -0.4155, -1.5959, -2.7090, -1.9323],
        [-2.0638, -2.1036, -0.8067, -0.2807, -0.6650, -3.2912,  7.3846]],
       device='cuda:0', grad_fn=<MmBackward0>)


In [19]:
dataset.create_commands_dictionary(action2vec_model)

Dict input maps created successfully!
Actions 1 dict length: 2
Actions 2 dict length: 7
All action maps have been properly fitted by their respective KNN algorithm


In [20]:
print(dataset.key_actions1)
print(dataset.key_actions2)

{'Down': 2.2056455612182617, 'Up': 2.2056455612182617}
{'up': 1.4485101699829102, 'down': 1.355112075805664, 'left': 1.3761229515075684, 'right': 1.4581246376037598, 'z': 1.3342045545578003, 'x': 1.3342045545578003, 'shift': 1.3342045545578003}
