In [None]:
import math
import os
import pickle
import warnings
import winsound
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from mss import mss
from PIL import Image
from tensorflow.keras.utils import to_categorical

In [None]:
# Enable cuDNN's benchmark mode.
cudnn.benchmark = True

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
class Dataset(torch.utils.data.Dataset):
    '''
    Generates input maps and commands for Hakisa.

    Remember: command_types = list of strings, actions1 and 2 = list of strings(keyboard), X coordinates or None(mouse)

    Each item is a tuple (frame, one-hot command type, one-hot action1, one-hot action2).
    The dataset is empty until `create_data` has been called.
    '''

    def __init__(
        self,
        command_types = None,
        actions1 = None,
        actions2 = None,
        top=0,
        left=0,
        width=1920,
        height=1080,
        resize=None
    ):

        # Window resolutions for the screen grabber
        self.top = top
        self.left = left
        self.width = width
        self.height = height

        # Target size used by `record_gameplay` when resizing is requested.
        # NOTE(review): the original comment asks for (Height, Width), but the tuple is handed
        # unchanged to PIL's Image.resize, which expects (width, height). Confirm for non-square sizes.
        self.resize = resize

        # Filled in by `create_data`. It's also possible to load a ready-made data for training.
        self.data = None
        self.encoded_command_type = None
        self.encoded_actions1 = None
        self.encoded_actions2 = None

        self.command_type = command_types
        self.actions1 = actions1
        self.actions2 = actions2

    def __getitem__(self, idx):

        return (
            self.data[idx],
            self.encoded_command_type[idx],
            self.encoded_actions1[idx],
            self.encoded_actions2[idx],
        )

    def __len__(self):

        # An unpopulated dataset is reported as empty instead of raising TypeError on len(None).
        return 0 if self.data is None else len(self.data)

    def record_gameplay(self, number_of_screenshots, screenshot_delay, grayscale=False, resize=False, path=None):
        '''
        Captures `number_of_screenshots` screenshots of the configured screen region and saves them as
        "<path>/<index>.png", waiting `screenshot_delay` seconds after each capture.

        grayscale: if True, frames are converted to single-channel ('L') images before saving.
        resize: if True, frames are resized to `self.resize` before saving.
        path: directory that receives the screenshots. It is created when missing.

        Raises ValueError if `path` is None, or if `resize` is requested but `self.resize` was never set.
        '''

        # Resizing and grayscaling isn't really necessary here, but can save you some time later.
        # Both saving you from writing more code and from making your hardware having to process more and more data at once.

        # Validate before the countdown so a bad call fails immediately instead of after the capture has started.
        if path is None:
            raise ValueError("record_gameplay requires `path`: the directory where the screenshots will be saved")

        if resize and self.resize is None:
            raise ValueError("resize was requested, but the Dataset was created without a `resize` size")

        os.makedirs(path, exist_ok=True)

        monitor = {"top": self.top, "left": self.left, "width": self.width, "height": self.height}

        print(f"Ok. Screenshot capture will begin in 5 seconds")

        sleep(5)

        winsound.PlaySound('D:/Python/Audio/English/chiara_hacking_1_en.wav', winsound.SND_FILENAME) # Just to know if everything's ok

        # A single grabber is reused for the whole session rather than being rebuilt for every frame.
        with mss() as sct:

            for i in range(number_of_screenshots):

                frame = sct.grab(monitor)
                frame = Image.frombytes("RGB", frame.size, frame.bgra, 'raw', 'BGRX')

                if grayscale:

                    frame = frame.convert('L')

                if resize:

                    frame = frame.resize(self.resize)

                frame.save(f"{path}/{i}.png")

                sleep(screenshot_delay)

        print("Screenshot capture finished!")

        winsound.PlaySound('D:/Python/Audio/English/chiara_hacking_1_en.wav', winsound.SND_FILENAME)

    def create_data(self, data, commands):
        '''
        data: a tensor of size (N_Samples, Channels, Height, Width) containing the game frames. The pixels values must be within range [0., 255.].
        commands: a list of tuples with length (N_samples), with each sample being a tuple composed of (command_type, action1, action2), where:

            command_type: the command type index-encoded with indices within range [0, len(command_types) - 1].
            action1: the action1 index-encoded with indices within range [0, len(actions1) - 1].
            action2: the action2 index-encoded with indices within range [0, len(actions2) - 1].

        Stores `data` and the one-hot encoded commands, each a float32 tensor of shape (N_samples, n_classes)
        placed on the module-level `device`.
        '''

        # We aren't using data in time_steps mode, like we do for gifs, time series and forecasting in general.
        # I thought it might be a good idea to also train Hakisa with that.
        # This might also be the best way to train her in frames forecasting, as the process is probably too slow to be made while playing.

        commands = list(commands)

        if len(commands) != len(data):
            # __len__ follows `data`, so missing commands would only surface later as an IndexError mid-training.
            warnings.warn(
                f"create_data received {len(data)} frames but {len(commands)} commands; "
                "indexing samples without a matching command will fail",
                stacklevel=2
            )

        self.data = data

        self.encoded_command_type = self._one_hot([sample[0] for sample in commands], len(self.command_type))
        self.encoded_actions1 = self._one_hot([sample[1] for sample in commands], len(self.actions1))
        self.encoded_actions2 = self._one_hot([sample[2] for sample in commands], len(self.actions2))

        print("All done! Train the vectorizer and then use it to generate the input mapping dictionary")

    @staticmethod
    def _one_hot(indices, num_classes):
        '''
        One-hot encodes a sequence of integer indices into a float32 tensor of shape (len(indices), num_classes)
        on the module-level `device`. An empty sequence yields a tensor of shape (0, num_classes).
        '''

        indices = torch.tensor([int(index) for index in indices], dtype=torch.long)

        return torch.nn.functional.one_hot(indices, num_classes).float().to(device)

In [None]:
# Jigoku Kisetsukan
# Vocabularies of the three command components the model chooses from.

command_types = ['key']

actions1 = ['Down', 'Up']

actions2 = [
    'up',
    'down',
    'left',
    'right',
    'z',
    'x',
    'shift',
]

# Screen-capture region is left at the Dataset defaults; only the resize target is overridden.
dataset = Dataset(
    command_types=command_types,
    actions1=actions1,
    actions2=actions2,
    resize=(200, 200),
)

In [None]:
# One (command_type, action1, action2) index triple per recorded frame.
commands = [
    (0, 0, 4),
    (0, 0, 6),
    (0, 0, 2),
    (0, 1, 2),
    (0, 0, 3),
]

In [None]:
# Directory holding the recorded frames, named "<index>.png".
frames_directory = "D:/Python/Projects/Hakisa/Hakisa/JK_gameplay"

# Sort the frames by their numeric index: as strings, "1000" sorts before "2" because the
# comparison is done character by character.
# Only "<digits>.png" files directly inside the directory are used, so stray files
# (thumbnails, notes, sub-folders) can no longer crash the int() conversion or redirect the load path.
images_by_order = sorted(
    int(stem)
    for stem, extension in (os.path.splitext(name) for name in os.listdir(frames_directory))
    if extension.lower() == '.png' and stem.isdigit()
)

if not images_by_order:
    raise FileNotFoundError(f"No '<index>.png' frames found in {frames_directory}")

images_data = []

for index in images_by_order:

    # The context manager closes the file even if resizing or conversion fails.
    with Image.open(f"{frames_directory}/{index}.png") as image:
        array = np.array(image.resize((200, 200)), dtype=np.float32)

    # Pixel values are scaled to [0, 1].
    images_data.append(array / 255)

# Stacks the per-frame arrays along a new leading axis (one entry per frame).
images_data = np.stack(images_data, 0)

In [None]:
# Saving data
# The `with` blocks close the files on exit, so no explicit close() is needed afterwards.

with open("D:/Python/Projects/Hakisa/Preprocessing/JK_commands_05000.pkl", 'wb') as commands_file:
    pickle.dump(commands, commands_file)

with open("D:/Python/Projects/Hakisa/Preprocessing/JK_screenshots_05000.pkl", 'wb') as screenshots_file:
    pickle.dump(images_data, screenshots_file)

In [None]:
# Loading data - If you bump into some problems with this, try loading it into chunks of data
# SECURITY: pickle.load can execute arbitrary code. Only load files you created yourself.
# The `with` blocks close the files on exit, so no explicit close() is needed afterwards.

with open("D:/Python/Projects/Hakisa/Preprocessing/JK_commands_05000.pkl", 'rb') as commands_file:
    commands = pickle.load(commands_file)

with open("D:/Python/Projects/Hakisa/Preprocessing/JK_screenshots_05000.pkl", 'rb') as screenshots_file:
    images_data = pickle.load(screenshots_file)

In [None]:
# Convert the stacked frames from channels-last (N, H, W, C) to the channels-first (N, C, H, W)
# layout that Conv2d expects.
# permute() moves the axes. view() would only reinterpret the same memory under a new shape,
# which mixes pixels from different positions and channels and scrambles every image.
# NOTE: this cell is not re-runnable on its own; from_numpy() rejects an input that is already a tensor,
# which also prevents the axes from being permuted twice by accident.
images_data = torch.from_numpy(images_data)
images_data = images_data.permute(0, 3, 1, 2)

In [None]:
# Attach the first 5000 frames and their commands to the dataset.
dataset.create_data(data=images_data[:5000], commands=commands)

## Alternative Architecture

Instead of using the classic Feature Extractor architecture from VGG19, we'll be using Attention Layers in order to assign weights to each feature.

This way, the most relevant features should receive higher weights and will therefore be emphasized by the subsequent Convolution and Linear layers.

Since we'll be working with large inputs (200x200x3 = 120,000 data points per frame), we'll avoid linear layers and matrix multiplications inside the attention blocks,
resorting instead to array multiplications, which are element-wise operations.

For the output, we can simply use Linear layers. Conv2D layers can be placed between the Attention layers and the output layers in order to reduce the amount of data.

 Input Image -------------> MultiHead Attention ---------> Image with weighted features ------------> (Conv2D -----> MultiHead Attention ----->) Linear Layers ------> Output

**Follow the Transformer Encoder pattern.**

In [None]:
class HeadAttention(torch.nn.Module):

    '''
    Attention Layer might be useful to detect most relevant features in the images.

    Adapted in order to be used with element-wise operations directly to images and feature maps.

    There's no pad masks and, instead of sequence of vectors, we'll be dealing with entire feature maps.

    The query/key/value weights are registered as `torch.nn.Parameter`s, so they appear in
    `parameters()` and `state_dict()` and are therefore optimized, saved and moved with the module.
    They have shape (1, channels, height, width) and broadcast over the batch dimension, so the same
    weights are applied to every sample and any batch size is accepted.

    batch_size is kept in the signature for backward compatibility only; it is no longer used.

    NOTE: a state_dict saved while these weights were plain tensors does not contain them;
    load such a checkpoint with `strict=False`.
    '''

    def __init__(self, batch_size, input_channels, input_height, input_width):

        super(HeadAttention, self).__init__()

        # Creating array of weights for element-wise operations
        weights_shape = (1, input_channels, input_height, input_width)

        self.queries_weights = torch.nn.Parameter(torch.randn(weights_shape, device=device))
        self.keys_weights = torch.nn.Parameter(torch.randn(weights_shape, device=device))
        self.values_weights = torch.nn.Parameter(torch.randn(weights_shape, device=device))

        self.batchnorm = torch.nn.BatchNorm2d(input_channels) # To compensate the sqrt(d_key) scaling factor

        self.softmax = torch.nn.Softmax2d() # Softmax across the channels, computed independently at each spatial location

    def forward(self, input):
        '''
        input: tensor of shape (Batch, channels, height, width).

        Returns the attention-weighted values, with the same shape as the input.
        '''

        queries = input * self.queries_weights # (Batch, channels, height, width)
        keys = input * self.keys_weights
        values = input * self.values_weights

        similarity_matrix = queries * keys # (Batch, channels, height, width)

        similarity_matrix = self.batchnorm(similarity_matrix)

        attention_weights = self.softmax(similarity_matrix) # (Batch, channels, height, width)

        attention_output = values * attention_weights # (Batch, channels, height, width)

        return attention_output

In [None]:
class FeatureExtractor(torch.nn.Module):
    '''
    Multi-head block built from element-wise attention heads.

    Every head receives the same input. The head outputs are concatenated along the channel axis,
    reduced back to `n_channels` by a 3x3 convolution, added to the block input (residual connection)
    and then passed through dropout, batch normalization and ReLU.

    Author's rationale: it's similar to the Transformer Encoder, but discards the PositionWise FeedForward
    layers, since those were used to compensate the lack of information about positions in the input.
    Feature maps already carry that information.
    '''

    def __init__(self, n_heads, batch_size, n_channels, height, width):

        super().__init__()

        self.n_heads = n_heads

        # One attention head per requested head, all with the same input geometry.
        heads = [HeadAttention(batch_size, n_channels, height, width) for _ in range(n_heads)]
        self.attention_heads = torch.nn.ModuleList(heads)

        # Shape-preserving convolution that merges the concatenated heads back to n_channels.
        self.conv = torch.nn.Conv2d(n_channels*n_heads, n_channels, kernel_size=3, stride=1, padding=1)
        self.batchnorm = torch.nn.BatchNorm2d(n_channels)

        self.dropout = torch.nn.Dropout(0.3)
        self.Relu = torch.nn.ReLU()

    def forward(self, input):
        '''
        input: tensor of shape (Batch, n_channels, Height, Width).

        Returns a tensor with the same shape as the input.
        '''

        head_outputs = [self.attention_heads[index](input) for index in range(self.n_heads)]

        # (Batch, n_channels*n_heads, Height, Width) -> (Batch, n_channels, Height, Width)
        merged = self.conv(torch.cat(head_outputs, 1))

        # Residual connection around the attention heads.
        merged = input + merged

        return self.Relu(self.batchnorm(self.dropout(merged)))

In [None]:
class Coach(torch.nn.Module):

    '''
    Self-Learning Model that uses modified MultiHead Attention Layers in order to extract most relevant features
    and returns pseudolabels, which should be the best command choices to be made in the state given by the input.

    command_types, actions1, actions2: the command vocabularies; only their lengths are used.
    n_heads: number of attention heads in each FeatureExtractor.
    batch_size: forwarded to each FeatureExtractor.
    frame_height, frame_width: size of the input frames (default 200x200). Each of the two stride-2
    convolutions halves both dimensions (rounding down), so both must be at least 4.

    Warning (original author's note): It appears that using a batch higher than 1 tends to make all outputs
    conditioned by the first batch. I'm not sure, though. It's just my impression.
    '''

    def __init__(self, command_types, actions1, actions2, n_heads, batch_size, frame_height=200, frame_width=200):

        super(Coach, self).__init__()

        if frame_height < 4 or frame_width < 4:
            raise ValueError("frame_height and frame_width must be at least 4 to survive two stride-2 convolutions")

        self.command_types = len(command_types) # For initialization, the length is what matters.
        self.actions1 = len(actions1)
        self.actions2 = len(actions2)

        self.n_heads = n_heads

        # Spatial sizes after each kernel-2 / stride-2 convolution. For 200x200 frames: 100x100, then 50x50.
        half_height, half_width = frame_height // 2, frame_width // 2
        quarter_height, quarter_width = half_height // 2, half_width // 2

        self.feature_extractor1 = FeatureExtractor(n_heads, batch_size, 3, frame_height, frame_width)

        self.conv1 = torch.nn.Conv2d(3, 100, kernel_size=2, stride=2, bias=False)
        self.batchnorm1 = torch.nn.BatchNorm2d(100)

        self.feature_extractor2 = FeatureExtractor(n_heads, batch_size, 100, half_height, half_width)

        self.conv2 = torch.nn.Conv2d(100, 3, kernel_size=2, stride=2, bias=False)
        self.batchnorm2 = torch.nn.BatchNorm2d(3)

        self.feature_extractor3 = FeatureExtractor(n_heads, batch_size, 3, quarter_height, quarter_width)

        # One classification head per command component, fed with the flattened feature maps.
        flattened_features = 3 * quarter_height * quarter_width

        self.neuron_type = torch.nn.Linear(flattened_features, self.command_types, bias=True)
        self.neuron_action1 = torch.nn.Linear(flattened_features, self.actions1, bias=True)
        self.neuron_action2 = torch.nn.Linear(flattened_features, self.actions2, bias=True)

        self.dropout = torch.nn.Dropout(0.3) # Adds randomness and makes the classification task more robust. Essential for Self-Learning
        self.PRelu = torch.nn.PReLU()
        self.softmax = torch.nn.LogSoftmax(-1) # Log-probabilities, as expected by NLLLoss

    def forward(self, input):
        '''
        input: tensor of shape (Batch, 3, frame_height, frame_width).

        Returns a tuple (command_type, action1, action2) of log-probabilities with shapes
        (Batch, n_command_types), (Batch, n_actions1) and (Batch, n_actions2).
        '''

        x = self.feature_extractor1(input) # (Batch, 3, H, W)

        x = self.conv1(x)
        x = self.dropout(x)
        x = self.batchnorm1(x)
        x = self.PRelu(x)

        x = self.feature_extractor2(x) # (Batch, 100, H/2, W/2)

        x = self.conv2(x)
        x = self.dropout(x)
        x = self.batchnorm2(x)
        x = self.PRelu(x)

        x = self.feature_extractor3(x) # (Batch, 3, H/4, W/4)

        x = x.view(x.size(0), -1)

        command_type = self.softmax(self.neuron_type(x)) # (Batch, n_command_types)
        action1 = self.softmax(self.neuron_action1(x))
        action2 = self.softmax(self.neuron_action2(x))

        return (command_type, action1, action2)

In [None]:
# Build the model from the command vocabularies and move it to the selected device.
coach = Coach(
    command_types=command_types,
    actions1=actions1,
    actions2=actions2,
    n_heads=4,
    batch_size=4,
)
coach = coach.to(device)

In [None]:
# Torch Summary uses a batch_size = 2. So this won't work if you've initialized your model with batch_size != 2. Use 2 just to check how things are going here.
# NOTE(review): `coach` above is created with batch_size=4, so per the note above this cell is expected to fail
# unless the model is re-created with batch_size=2 first - confirm before relying on Run All.

from torchsummary import summary

# Prints a layer-by-layer summary of `coach` for a single (3, 200, 200) input frame.
summary(coach, (3, 200, 200))

Mixing Self-Supervised training and Supervised fine-tuning:

https://lilianweng.github.io/posts/2021-12-05-semi-supervised/#consistency-regularization

"Chen et al. (2020) proposed a three-step procedure to merge the benefits of self-supervised pretraining, supervised fine-tuning and self-training together:

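1. Unsupervised or self-supervised pretrain a big model.
2. Supervised fine-tune it on a few labeled examples. It is important to use a big (deep and wide) neural network. **Bigger models yield better performance with fewer labeled samples.**"

(The third step of the procedure in the source, distillation with unlabeled examples using pseudo-labels in self-training, is omitted from the quote above.)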

In [None]:
# --- Schedule ---
epochs = 10000
start_epoch = 0
supervised_learning = 10 # After each N epochs, apply supervised fine-tuning.

# alpha**epoch starts at 1 and decays; the training loop weights the consistency loss by (1 - alpha**epoch).
# Tip: 0.99 decays 10x faster than 0.999
alpha = 0.99

# --- Data ---
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

# --- Optimisation ---
# The Transformer began with lr=5, but we don't really need to be that radical.
optimizer = torch.optim.Adam(coach.parameters(), lr=1e-3)
# Multiplies the learning rate by 0.1 every 1000 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)
loss = torch.nn.NLLLoss()

# --- Bookkeeping ---
best_loss = math.inf
grads = []

In [None]:
# Checkpoint
# Resumes from "AutoLabeler_Checkpoint.tar" when it exists; otherwise training starts from scratch.
# SECURITY: torch.load unpickles the file. Only load checkpoints you created yourself.
# NOTE(review): the checkpoint holds a pickled Optimizer object; recent PyTorch releases may refuse
# to unpickle it unless torch.load is told to allow non-tensor objects - confirm for your version.
# NOTE: the scheduler state is not stored in the checkpoint, so the learning-rate schedule restarts on resume.

checkpoint_path = "AutoLabeler_Checkpoint.tar"

if os.path.exists(checkpoint_path):

    # map_location lets a checkpoint written on the GPU be restored on a CPU-only machine.
    params = torch.load(checkpoint_path, map_location=device)

    start_epoch = params['Epoch']
    coach.load_state_dict(params['Best_Params'])

    # Restore the state INTO the existing optimizer rather than rebinding the name.
    # The unpickled optimizer holds its own copies of the parameters, so stepping it would never
    # update `coach`, and `scheduler` would keep driving the optimizer that was discarded.
    saved_optimizer = params['Optimizer']

    if isinstance(saved_optimizer, torch.optim.Optimizer):
        saved_optimizer = saved_optimizer.state_dict()

    optimizer.load_state_dict(saved_optimizer)

else:

    print(f"No checkpoint found at {checkpoint_path}. Training will start from epoch {start_epoch}")

In [None]:
def _record_mean_gradient(model, history, name_fragment='feature_extractor3.conv.weight'):
    '''
    Appends to `history` the mean gradient of every parameter of `model` whose name contains `name_fragment`.

    The value is stored as a Python float, so the history does not keep device tensors alive.
    Parameters without a gradient are skipped.
    '''

    for name, parameter in model.named_parameters():
        if name_fragment in name and parameter.grad is not None:
            history.append(parameter.grad.mean().item())


def _latest(history):
    '''Returns the most recent entry of `history`, or NaN when nothing has been recorded yet.'''

    return history[-1] if history else float('nan')


print(f"Starting Training from epoch {start_epoch}")

# Dropout must be active: the consistency loss relies on two stochastic forward passes.
# An earlier evaluation cell may have left the model in eval mode.
coach.train()

for epoch in range(start_epoch, epochs):

    epoch_loss = 0.

    # Weight of the unsupervised loss: 0 at epoch 0, approaching 1 as alpha**epoch decays.
    consistency_weight = 1 - (alpha**epoch)

    for frames, _, _, _ in dataloader:
        coach.zero_grad()

        frames = frames.to(device)

        output = coach(frames)

        # The second pass only provides pseudo-labels, so no graph is needed for it.
        with torch.no_grad():
            outputalt = coach(frames)

        command_type_consistency = loss(output[0], torch.argmax(outputalt[0], -1))
        action1_consistency = loss(output[1], torch.argmax(outputalt[1], -1))
        action2_consistency = loss(output[2], torch.argmax(outputalt[2], -1))

        unsupervised_batch_loss = (command_type_consistency + action1_consistency + action2_consistency) * consistency_weight

        unsupervised_batch_loss.backward()

        epoch_loss += unsupervised_batch_loss.item()

        _record_mean_gradient(coach, grads)

        optimizer.step()

    scheduler.step()

    if 0 < epoch_loss < best_loss and epoch > 0:

        best_loss = epoch_loss

    if epoch % 10 == 0:
        print(f"{epoch}/{epochs}\nCurrent Loss: {epoch_loss}\tBest Loss: {best_loss}\tCurrent Learning Rate: {scheduler.get_last_lr()[0]}")
        print(f"Command Type Consistency: {command_type_consistency}\nAction1 Consistency: {action1_consistency}\nAction2 Consistency: {action2_consistency}")
        print(f"Gradients Average: {_latest(grads)}")

        # A failed save is reported and training carries on. Unlike a bare `except: continue`, this
        # neither hides the error nor swallows KeyboardInterrupt, and it does not skip fine-tuning.
        try:
            torch.save({
                        'Epoch': epoch,
                        'Best_Params': coach.state_dict(),
                        'Optimizer': optimizer
                    }, f"AutoLabeler_Checkpoint.tar")

        except Exception as error:
            print(f"Warning: could not save the checkpoint at epoch {epoch}: {error}")

    if epoch % supervised_learning == 0 and epoch > 0: # Beginning supervised learning fine-tuning. Using less labeled data and a single epoch for this.

        print("Beginning Supervised Fine-Tuning")

        supervised_loss = 0.

        # TODO(review): the original loop contained a no-op `if i < 3000: pass`. It probably was meant to
        # restrict fine-tuning to a subset of the batches, but the intent (skip or stop) is unclear,
        # so every batch is still used here.
        for frames, encoded_command_type, encoded_actions1, encoded_actions2 in dataloader:

            coach.zero_grad()

            frames = frames.to(device)
            encoded_command_type = encoded_command_type.to(device)
            encoded_actions1 = encoded_actions1.to(device)
            encoded_actions2 = encoded_actions2.to(device)

            output = coach(frames)

            # NLLLoss expects class indices, so the one-hot labels are converted back with argmax.
            command_type_loss = loss(output[0], torch.argmax(encoded_command_type, -1))
            action1_loss = loss(output[1], torch.argmax(encoded_actions1, -1))
            action2_loss = loss(output[2], torch.argmax(encoded_actions2, -1))

            batch_loss = command_type_loss + action1_loss + action2_loss

            batch_loss.backward()

            supervised_loss += batch_loss.item()

            _record_mean_gradient(coach, grads)

            optimizer.step()

        print(f"Fine-tuning complete!")
        print(f"Last Losses:\nCommand Type: {command_type_loss}\tAction 1: {action1_loss}\tAction 2: {action2_loss}\nTotal Batch Loss: {batch_loss}")
        print(f"Total Loss: {supervised_loss}")
        print(f"Gradients Average: {_latest(grads)}")

print(f"{epoch}/{epochs}\nCurrent Loss: {epoch_loss}\tBest Loss: {best_loss}\tCurrent Learning Rate: {scheduler.get_last_lr()[0]}")
print(f"Command Type Consistency: {command_type_consistency}\nAction1 Consistency: {action1_consistency}\nAction2 Consistency: {action2_consistency}")
print(f"Gradients Average: {_latest(grads)}")

In [None]:
# Sanity-check subset: the first ten samples of the dataset and their one-hot labels.
first_ten = slice(0, 10)

test_images = dataset.data[first_ten]

test_command_type = dataset.encoded_command_type[first_ten]
test_action1 = dataset.encoded_actions1[first_ten]
test_action2 = dataset.encoded_actions2[first_ten]

In [None]:
# Compare the model's predictions (first line of each pair) with the labels (second line).
coach.eval()

# Inference only: no_grad avoids building an autograd graph for every forward pass.
with torch.no_grad():

    for i in range(len(test_images)):
        output = coach(test_images[i].to(device).unsqueeze(0))

        print(output[0][0].argmax(), output[1][0].argmax(), output[2][0].argmax()) # Remember that output[0] will have size (Batch, command_types)
        print(test_command_type[i].argmax(), test_action1[i].argmax(), test_action2[i].argmax())