In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import os
import time
import pickle
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split

In [None]:
# Display the kernel's current working directory.
os.getcwd()

In [None]:
# Load the raw games table from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/chess/games.csv")

In [None]:
# Show the available column names before selecting the ones used below.
data.columns

In [None]:
#indexing out only the useful data
# .copy() makes this an independent frame, so assigning to its columns later
# does not trigger pandas' SettingWithCopyWarning.
data = data[["moves", "winner"]].copy()

In [None]:
#preprocessing the data
#white will be 1, black will be 0
# Only rows whose winner is exactly "white" or "black" are kept. Previously any
# other value was silently labelled 0, i.e. counted as a black win.
# NOTE(review): presumably the column also contains draws - verify against the data.
# The index is reset so positions and labels stay aligned after filtering.
data = data.loc[data["winner"].isin(["white", "black"])].reset_index(drop=True)
data["winner"] = (data["winner"] == "white").astype(int)

In [None]:
#visualizing the average length of FEN Strings
#code from https://www.kaggle.com/sanxuwen/shopee-sentiment-analysis-2nd-place-solution

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.distplot(data['moves'].str.len())
plt.show()

In [None]:
# From the length plot above, padding every string to length 400 looks safe.
# NOTE(review): that plot measures characters, while PADDED_LEN is passed to the
# tokenizer as max_length - confirm the two units are comparable.

# Hyperparameters used throughout the notebook
PADDED_LEN = 20 #400 - currently 20 rather than 400; presumably shortened for quick experiments (TODO confirm)
EPOCHS = 2  # number of epochs passed to model.fit
BATCH_SIZE = 10  # batch size of both the training and validation datasets
base_model_name ='roberta-base'  # checkpoint name given to from_pretrained

In [None]:
# Load the tokenizer for the pretrained checkpoint named in base_model_name.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the same checkpoint as a TensorFlow sequence-classification model.
base_model = TFAutoModelForSequenceClassification.from_pretrained(base_model_name)

# Freeze the model's first layer so its weights are not updated during training.
# NOTE(review): presumably layers[0] is the pretrained RoBERTa backbone - confirm.
base_model.layers[0].trainable = False

In [None]:
# Work on the first 200 games only: move strings as inputs, winner labels as targets.
X, y = data["moves"].iloc[:200], data["winner"].iloc[:200]

In [None]:
# Tokenize the move strings into fixed-length id sequences of PADDED_LEN tokens.
# The tokenizer is given a plain list of strings (not a pandas Series), and
# padding/truncation are requested explicitly instead of via the deprecated
# pad_to_max_length / return_attention_masks keywords.
encoded = tokenizer(
    list(X),
    padding="max_length",
    truncation=True,
    max_length=PADDED_LEN,
    return_attention_mask=False,
)
# Only the token ids are fed to the model; shape is (num_examples, PADDED_LEN).
X = np.array(encoded["input_ids"])

In [None]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Making a tf dataset to feed into the model
# Training pipeline: shuffle over the whole training split *before* repeating,
# so every epoch is a full, freshly shuffled pass and examples from different
# epochs are not mixed. The dataset repeats forever, so model.fit must be given
# steps_per_epoch.
train_data = (tf.data.Dataset
             .from_tensor_slices((X_train, y_train))
             .shuffle(len(X_train))
             .repeat()
             .batch(BATCH_SIZE))

# Validation pipeline: order does not affect the metrics, so no shuffling.
test_data = (tf.data.Dataset
            .from_tensor_slices((X_test, y_test))
            .batch(BATCH_SIZE))

In [None]:
#Building the model
def build_model(base_model, num_labels):
    """Wrap ``base_model`` in a small convolutional head with a sigmoid output.

    The first element of the base model's output (``num_labels`` values per
    example, as the reshape below requires) goes through dropout and three
    causal 1-D convolutions, is averaged over the length-1 sequence axis, is
    added back to the base output as a residual, and is finally mapped to a
    single sigmoid unit.

    Args:
        base_model: Callable model applied to an int32 input of shape
            ``(batch, PADDED_LEN)``; its result is indexed with ``[0]``.
        num_labels: Per-example size of the base model's output; also scales
            the convolution filter counts and kernel sizes.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer, binary
        cross-entropy loss and an accuracy metric.
    """
    layers = tf.keras.layers

    token_ids = layers.Input(shape=(PADDED_LEN,), dtype=tf.int32)

    # The model's results come wrapped in a tuple-like object; element 0 is the tensor we need.
    base_out = base_model(token_ids)[0]

    x = layers.Reshape((1, num_labels))(base_out)
    x = layers.Dropout(0.2)(x)

    # (filters, kernel_size) for each causal convolution, applied in order.
    conv_specs = (
        (num_labels * 8, 1),
        (num_labels * 4, num_labels),
        (num_labels, num_labels),
    )
    for filters, kernel_size in conv_specs:
        x = layers.Conv1D(filters, kernel_size, padding="causal", activation="relu")(x)

    pooled = layers.GlobalAveragePooling1D()(x)

    # Residual connection between the base output and the convolutional branch.
    residual = layers.Add()([base_out, pooled])

    prediction = layers.Dense(1, activation="sigmoid")(residual)

    model = tf.keras.Model(inputs=token_ids, outputs=prediction)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


In [None]:
# Build the full model; the head is sized from the base model's configured number of labels.
model = build_model(base_model, base_model.config.num_labels)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# train_data repeats forever, so the epoch length must be given explicitly.
# Each step consumes one batch of BATCH_SIZE examples, so one pass over the
# training split is (number of examples // BATCH_SIZE) steps - not one step per
# example, which would run BATCH_SIZE passes per "epoch".
model.fit(train_data,
         steps_per_epoch = max(1, X_train.shape[0] // BATCH_SIZE),
         validation_data=test_data,
         epochs = EPOCHS)

In [None]:
# Pick one encoded game at random. The range follows the actual size of X
# instead of a hard-coded 200. Note the draw is from all of X, so the example
# may come from the training split.
n = np.random.randint(len(X))
test_moves = X[n]

In [None]:
# Predict on the sampled game; the row is reshaped into a batch of one, shape (1, PADDED_LEN).
model.predict(test_moves.reshape((1,PADDED_LEN))) #Model thinks currently black is winning

In [None]:
# Positional lookup, matching how X[n] was indexed; a plain y[n] is label-based
# and only agrees when y has a default 0..N-1 index.
y.iloc[n] #model got it horribly wrong hahaha

In [None]:
#At the moment, reading the FEN string alone does not seem to be enough for the model to predict anything useful.
#That said, a sufficiently advanced model can most likely be built to predict the winner from the FEN string,
#and with a lot more training time the model's accuracy is expected to improve considerably.