In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import os
import time
import pickle

In [None]:
# Display the kernel's current working directory (relative paths below resolve against it).
os.getcwd()

In [None]:
# Load the games CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/chess/games.csv")

In [None]:
# Inspect the available column names before selecting the ones used for training.
data.columns

In [None]:
#indexing out only the useful data
# .copy() gives an independent two-column frame, so the in-place column
# assignments in the following cells do not operate on a possible view of the
# originally loaded frame (pandas' SettingWithCopyWarning case).
data = data[["moves", "winner"]].copy()

In [None]:
#preprocessing the data
#white will be 1, everything else will be 0
# NOTE(review): any value other than "white" maps to 0, so a non-decisive
# result (if the column contains one) is labelled the same as a black win.
# NOTE: this overwrites the column in place; re-running the cell on the already
# encoded column turns every label into 0.
data["winner"] = (data["winner"] == "white").astype("int64")

In [None]:
#tokenizing the moves list
# Builds a subword vocabulary from the move strings of every game.
# NOTE(review): `2e15` is a float equal to 2,000,000,000,000,000 — presumably
# `2**15` (32768) was intended. Left unchanged because the resulting vocabulary
# size determines the embedding shape of any previously saved checkpoint; confirm
# before changing.
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(data.moves, target_vocab_size = 2e15)

In [None]:
# Turn every game's move string into a fixed-length row of token ids.
# NOTE: `data.moves` is overwritten in place, so this cell must run exactly once
# per kernel session (re-running it would try to encode already-encoded rows).

def _encode_game(move_text):
    """Encode one game, keep at most its first 100 tokens, then wrap it in start/end ids."""
    # Truncate (rather than drop) long games so the model can train faster.
    token_ids = tokenizer.encode(move_text)[:100]
    # The two ids just past the learned vocabulary serve as start and end markers.
    return [tokenizer.vocab_size] + token_ids + [tokenizer.vocab_size + 1]

data.moves = data.moves.apply(_encode_game)

# Alternative that was tried and left disabled: drop games longer than 100 tokens
# instead of truncating them.
# data = data[data.moves.apply(lambda x : len(x) <= 100)].reset_index(drop = True)

# Right-pad every row with zeros up to the longest row so they fit into the neural network.
padded_rows = tf.keras.preprocessing.sequence.pad_sequences(data.moves, padding = "post")
data.moves = pd.Series(list(padded_rows))

In [None]:
# Display the preprocessed frame (padded token rows and 0/1 labels).
data

In [None]:
#convert to tf dataset to easily fit the NN
# Each element is a (padded token row, 0/1 label) pair.
tf_data = tf.data.Dataset.from_tensor_slices((list(data.moves), list(data.winner)))
# Cache before shuffling so the cached elements are reshuffled on every epoch.
tf_data = tf_data.cache()
# A shuffle buffer of 10 only permutes elements within a tiny sliding window, so
# batches would follow the CSV order almost exactly. A buffer as large as the
# dataset gives a uniform shuffle.
tf_data = tf_data.shuffle(len(data)).padded_batch(50)

In [None]:
#Strategy for the neural network
#Build the Encoder portion of the Transformer so the NN makes its own representation of the board, and then add on top
    #a dense sigmoid layer to output the prediction
#Positional encoding is still performed and a padding mask is applied, but there is no look-ahead masking

In [None]:
#Defining necessary functions to create the Encoder
#Code from https://www.tensorflow.org/tutorials/text/transformer

#positional encoding to give the NN context about the order of words
#Signature: 2DArray, 2DArray, Num
#Effects: return an n x m matrix, where n is the number of pos elements, and m is the number of i elements
def get_angles(pos, i, model_depth):
    """Return the positional-encoding angles pos / 10000^(2*(i//2) / model_depth).

    `pos` and `i` are multiplied with numpy broadcasting, so a column of
    positions and a row of dimension indices yield a full matrix. Indices 2k and
    2k+1 share the same rate because of the floor division.
    """
    exponent = (2 * (i // 2)) / np.float32(model_depth)
    return pos * (1 / np.power(10000, exponent))

#Signature: Num, Num
            #number of positions to encode, width of each position vector
#Effects: Returns the sinusoidal positional encoding as a float32 tensor of shape (1, position, model_depth)
def positional_encoding(position, model_depth):
    """Build the sin/cos positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        model_depth: number of encoding dimensions (columns) per position.

    Returns:
        float32 tensor of shape (1, position, model_depth); the leading axis of
        size 1 lets it broadcast over the batch dimension.
    """
    # Use the `model_depth` argument for the column count. The previous version
    # read a global `d_model` here, which only worked when a global of that name
    # was already defined and happened to equal `model_depth`.
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(model_depth)[np.newaxis, :],
                            model_depth)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

#Padding mask so the model does not treat the padded tokens as input
#Signature: batch of token-id sequences
#Effects: Produces 1.0 where the token is the padding id (0), 0.0 otherwise
def create_padding_mask(seq):
    """Flag padding positions (token id 0) in a batch of sequences.

    Returns a float32 tensor with two singleton axes inserted after the batch
    axis, i.e. (batch_size, 1, 1, seq_len), so it can be broadcast onto the
    attention logits.
    """
    is_padding = tf.math.equal(seq, 0)
    pad_flags = tf.cast(is_padding, tf.float32)
    return pad_flags[:, tf.newaxis, tf.newaxis, :]

#scaled_dot_product_attention
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute scaled dot-product attention and return the attended values.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask must be broadcastable for addition onto the attention logits.

    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k); entries equal to 1 mark positions to
          suppress. Defaults to None (no masking).

    Returns:
    output of shape (..., seq_len_q, depth_v). The attention weights are
    computed internally but not returned.
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk by sqrt(depth) of the keys
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor: masked positions receive a large
    # negative logit, so softmax assigns them (near) zero weight.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output

#Multihead Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention: project q/k/v, attend per head, merge, project again.

    `d_model` must be divisible by `num_heads`; each head works on
    `d_model // num_heads` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        # Feature width handled by each individual head.
        self.depth = d_model // self.num_heads

        # Linear projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied after the heads are merged back together.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from `q` over `k`/`v`; returns a (batch_size, seq_len_q, d_model) tensor."""
        batch_size = tf.shape(q)[0]

        # Project, each result being (batch_size, seq_len, d_model) ...
        q, k, v = self.wq(q), self.wk(k), self.wv(v)
        # ... then split into heads: (batch_size, num_heads, seq_len, depth).
        q, k, v = (self.split_heads(tensor, batch_size) for tensor in (q, k, v))

        # (batch_size, num_heads, seq_len_q, depth)
        attended = scaled_dot_product_attention(q, k, v, mask)

        # Bring the head axis next to depth and fold both back into d_model.
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged = tf.reshape(attended, (batch_size, -1, self.d_model))

        return self.dense(merged)

#Point wise feed forward NN
def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer dense stack: ReLU layer of width `dff`, then a linear layer of width `d_model`."""
    expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])

#Encoder Layer
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation. Dropout is not strictly required, but it randomly zeroes
    activations during training to reduce the chance of overfitting.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Apply the block to `x`; the result has the same shape as `x`."""
        # Self-attention sub-layer: x is used as value, key and query.
        attended = self.dropout1(self.mha(x, x, x, mask), training=training)
        attn_block = self.layernorm1(x + attended)  # (batch_size, input_seq_len, d_model)

        # Feed-forward sub-layer.
        transformed = self.dropout2(self.ffn(attn_block), training=training)
        return self.layernorm2(attn_block + transformed)  # (batch_size, input_seq_len, d_model)

#Encoder
#The input is embedded and then summed with the positional encoding, and then passed onto the N Encoder layers
class Encoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding followed by `num_layers` EncoderLayer blocks.

    `maximum_position_encoding` is the number of positions pre-computed for the
    positional-encoding table; input sequences must not be longer than that.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        # Computed once here; sliced to the actual sequence length on every call.
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode a batch of token ids into (batch_size, input_seq_len, d_model)."""
        seq_len = tf.shape(x)[1]

        # Embed the tokens and scale by sqrt(d_model) before adding the positions.
        embedded = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        embedded = embedded * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = embedded + self.pos_encoding[:, :seq_len, :]

        hidden = self.dropout(embedded, training=training)

        for layer in self.enc_layers:
            hidden = layer(hidden, training, mask)

        return hidden  # (batch_size, input_seq_len, d_model)
    
#The full model
class Model(tf.keras.Model):
    """Transformer encoder topped with a single sigmoid unit.

    The dense head is applied directly to the encoder output, so the model
    returns one value per input position: (batch_size, inp_seq_len, 1).
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 pe_input, rate=0.1):
        super().__init__()

        # `pe_input` is forwarded as the encoder's maximum_position_encoding.
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, rate)

        self.final_layer = tf.keras.layers.Dense(1, activation = "sigmoid")

    def call(self, inp, training, enc_padding_mask):
        """Return per-position sigmoid outputs for a batch of token-id sequences."""
        encoded = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        return self.final_layer(encoded)  # (batch_size, inp_seq_len, 1)


    
#Optimizer, Loss, Accuracy metrics

# Adam with non-default beta_2 and epsilon; the learning rate is left at the Keras default.
optimizer = tf.keras.optimizers.Adam(beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)
# Binary cross-entropy on probabilities (the model already ends in a sigmoid).
# reduction="none" keeps the un-averaged loss values; they are averaged by `train_loss`.
loss_function = tf.keras.losses.BinaryCrossentropy(label_smoothing = 0.05, reduction = "none")
# Running averages that are reset at the start of every epoch by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')


In [None]:
#Hyperparameters
num_layers = 4  # number of stacked encoder layers
d_model = 128  # embedding / hidden width used throughout the encoder
dff = 512  # width of the ReLU layer inside each feed-forward block
num_heads = 8  # attention heads; must divide d_model
input_vocab_size = tokenizer.vocab_size + 2  # +2 for the start and end ids appended during preprocessing
dropout_rate = 0.1

In [None]:
# NOTE(review): the sixth argument (`pe_input`, the number of pre-computed positions)
# is also set to `input_vocab_size`. This works as long as no sequence is longer than
# the vocabulary size, but the maximum sequence length looks like the intended value — confirm.
model = Model(num_layers, d_model, num_heads, dff, input_vocab_size, input_vocab_size, dropout_rate)

In [None]:
def train_step(inp, tar):
    """Run one optimisation step on a batch and update the running metrics.

    Args:
        inp: batch of padded token-id sequences, shape (batch_size, seq_len).
        tar: batch of 0/1 labels, one per sequence, shape (batch_size,).
    """
    enc_padding_mask = create_padding_mask(inp)

    with tf.GradientTape() as tape:
        # One sigmoid output per position: (batch_size, seq_len, 1).
        predictions = model(inp, True, enc_padding_mask)

        # `tar` has one label per game, while the model predicts at every position.
        # Passing the (batch_size,) labels straight to the loss lets them broadcast
        # along the LAST axis of the predictions, pairing every prediction with
        # every label in the batch. Repeat each game's label across its own
        # positions instead, so labels and predictions line up element-wise.
        labels = tf.reshape(tf.cast(tar, predictions.dtype), (-1, 1, 1))
        labels = tf.broadcast_to(labels, tf.shape(predictions))

        # reduction="none" -> un-averaged loss of shape (batch_size, seq_len).
        loss = loss_function(labels, predictions)

    # For a non-scalar target, tape.gradient differentiates the sum of its elements.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)
    
# Creating checkpoint paths
checkpoint_path = "/kaggle/working/"

# Track both the model weights and the optimizer state.
ckpt = tf.train.Checkpoint(model=model,
                           optimizer=optimizer)

# Only the most recent checkpoint is kept on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)
# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')
    
def train_model(tf_dataset, EPOCHS, verbose_interval):
    """Train the global `model` for `EPOCHS` passes over `tf_dataset`.

    Running loss/accuracy are printed every `verbose_interval` batches and at the
    end of each epoch; a checkpoint is saved after every fifth epoch.
    """
    for epoch_no in range(1, EPOCHS + 1):
        epoch_started = time.time()

        # Metrics are per-epoch running averages, so start each epoch from scratch.
        train_loss.reset_states()
        train_accuracy.reset_states()

        for step, (inp, tar) in enumerate(tf_dataset):
            train_step(inp, tar)

            if step % verbose_interval != 0:
                continue
            print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch_no, step, train_loss.result(), train_accuracy.result()))

        if epoch_no % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print ('Saving checkpoint for epoch {} at {}'.format(epoch_no,
                                                                 ckpt_save_path))

        print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch_no,
                                                             train_loss.result(),
                                                             train_accuracy.result()))

        print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - epoch_started))
        
def evaluate(inp_sentence):
    """Run the trained model on a single tokenized game.

    Args:
        inp_sentence: sequence of token ids for one game, WITHOUT start/end ids
            and without padding; the start and end ids are added here.

    Returns:
        Tensor of shape (1, 1, 1): the model's sigmoid output at the last
        position of the sequence (the end token).
    """
    start_token = [tokenizer.vocab_size]
    end_token = [tokenizer.vocab_size + 1]

    # Coerce to a plain list of ints first: with a numpy array, `list + array`
    # is an element-wise (broadcast) addition rather than a concatenation, which
    # would silently shift every token id.
    token_ids = [int(tok) for tok in inp_sentence]

    inp_sentence = start_token + token_ids + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)  # add the batch axis

    enc_padding_mask = create_padding_mask(encoder_input)

    # predictions.shape == (batch_size, seq_len, 1)
    predictions = model(encoder_input,
                        False,
                        enc_padding_mask)

    # select the last position from the seq_len dimension
    predictions = predictions[: ,-1:, :]  # (batch_size, 1, 1)

    return predictions


In [None]:
# Train for 10 epochs, printing the running metrics every 10 batches.
train_model(tf_data, 10, 10)

In [None]:
# Persist the tokenizer next to the checkpoints so the same vocabulary can be reloaded later.
tokenizer.save_to_file('/kaggle/working/tokenizer_file')

In [None]:
# Switch the working directory to the dataset holding a previously saved checkpoint.
# NOTE: this changes process-wide state; every relative path below resolves against it
# until the working directory is changed again.
os.chdir("/kaggle/input/chess-win-predictor-model-and-tokenizer-checkpoint/")

In [None]:
# List the files in the current working directory.
os.listdir()

In [None]:
# Rebind the manager to the current working directory and restore from there.
# NOTE(review): `ckpt_manager` is a global also used by `train_model` for saving, so
# after this cell any further checkpoint saves target this directory — confirm it is writable.
ckpt_manager = tf.train.CheckpointManager(ckpt, os.getcwd(), max_to_keep=1)
# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

In [None]:
# Return to the Kaggle working directory so the files written below land there.
os.chdir("/kaggle/working/")

In [None]:
# Serialise the trained model to model_file.p in the current working directory.
# NOTE(review): whether a subclassed Keras model can be pickled depends on the
# TensorFlow version; `model.save_weights(...)` or the checkpoint manager above are
# the alternatives if this raises — confirm on the target environment.
pickle_model = {"model" : model}
# Dump the dict holding the model (the previous code passed the `pickle` module
# itself as the object to serialise) and close the file deterministically.
with open('model_file' + ".p", "wb") as model_file:
    pickle.dump(pickle_model, model_file)

In [None]:
# Rows of `data.moves` are already wrapped in start/end ids and right-padded with
# zeros, whereas evaluate() expects the bare token ids and adds start/end itself.
# Drop the padding (id 0) and the first/last wrapper ids, and pass a plain list so
# the concatenation inside evaluate() is a list concatenation.
test_row = data.iloc[0].moves
test_seq = [int(tok) for tok in test_row if tok != 0][1:-1]
evaluate(test_seq)