# Chess Move Legality Classification Using The BERT Model

In this notebook I will attempt to fine-tune the BERT model for classification of legality of the chess moves. The model will be presented a chess-game reresentation in the form of the FEN (Forsyth–Edwards Notation) and a move in the form of a string like e2e3 and will be forced to classify the move as legal or illegal.

## Installs and Imports

In [1]:
!pip install transformers



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import pandas as pd
import numpy as np
from torch.optim import AdamW
import warnings

In [3]:
warnings.filterwarnings("ignore")

## Data Drive Mounting and Constants Declarations

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
EPOCHS = 40
BATCH_SIZE = 16
PADDING = 100
main_directory = "/content/drive/MyDrive/NLP-Chess_Data"
train_path = "NLP_Train.csv.gz"
val_path = "NLP_Val.csv.gz"
test_path = "NLP_Test.csv.gz"
model_path = "BERT_Classifier.pth"

## Functions

In [6]:
def preprocess(df, board_col = "prev_board", move_col = "move"):
    df["model_input"] = df[board_col] + " [SEP] " + df[move_col]
    return df

In [7]:
def transform_BERT_input(BERT_input, device):
    batch_size = BERT_input["input_ids"].shape[0]
    padding_size  = BERT_input["input_ids"].shape[-1]
    BERT_input["input_ids"] = BERT_input["input_ids"].view(batch_size, padding_size).to(device)
    BERT_input["token_type_ids"] = BERT_input["token_type_ids"].view(batch_size, padding_size).to(device)
    BERT_input["attention_mask"] = BERT_input["attention_mask"].view(batch_size, padding_size).to(device)
    return BERT_input

In [8]:
class Chess_Dataset(Dataset):
    def __init__(self, X, y, tokenizer, padding):
        self.tokenizer = tokenizer
        self.X = X.values.reshape(-1)
        self.y = y.astype(np.int8)
        self.padding = padding

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        input_text = self.X[idx]
        label = self.y[idx]
        encoded_input = tokenizer(input_text, padding='max_length', max_length=self.padding, return_tensors='pt')
        label_tensor = torch.tensor(label)
        return encoded_input, label_tensor

In [9]:
def train(epoch, model, optimizer, train_loader, criterion, num_examples, device):
    model.train()
    total_loss = 0
    total_accuracy = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, labels = batch
        input_ids = transform_BERT_input(input_ids, device)
        labels.to(device)
        input_ids.to(device)
        outputs = model(**input_ids)
        outputs = outputs.logits.to(device)
        loss = criterion(outputs.squeeze(-1), labels.float().to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        binarized_outputs = nn.Sigmoid()(outputs.detach().squeeze(-1))
        binarized_outputs = binarized_outputs.cpu().numpy()
        binarized_outputs[binarized_outputs>=0.5] = 1
        binarized_outputs[binarized_outputs<0.5] = 0
        binarized_outputs = binarized_outputs.astype(np.int8)
        labels_coped = labels.squeeze(-1).detach().numpy()
        total_accuracy += sum(binarized_outputs == labels_coped.astype(np.int8))
    print(f'Epoch: {epoch} Training loss: {round(total_loss/len(train_loader), 4)} Training accuracy: {round((total_accuracy/num_examples)*100, 4)}%')

In [10]:
def evaluate(epoch, model, test_loader, criterion, num_examples, device, loss_type = "Validation"):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        for batch in test_loader:
            optimizer.zero_grad()
            input_ids, labels = batch
            transform_BERT_input(input_ids, device)
            labels.to(device)
            input_ids.to(device)
            outputs = model(**input_ids)
            outputs = outputs.logits.to(device)
            loss = criterion(outputs.squeeze(-1), labels.float().to(device))
            total_loss += loss.item()
            binarized_outputs = nn.Sigmoid()(outputs.detach().squeeze(-1))
            binarized_outputs = binarized_outputs.cpu().numpy()
            binarized_outputs[binarized_outputs>=0.5] = 1
            binarized_outputs[binarized_outputs<0.5] = 0
            binarized_outputs = binarized_outputs.astype(np.int8)
            labels_coped = labels.squeeze(-1).detach().numpy()
            total_accuracy += sum(binarized_outputs == labels_coped.astype(np.int8))
    print(f'Epoch {epoch}: {loss_type} loss: {round(total_loss/len(test_loader), 4)} {loss_type} accuracy: {round((total_accuracy/num_examples)*100, 4)}%')

## Implementation

### Data Reading

In [11]:
train_data = pd.read_csv(f"{main_directory}/{train_path}", compression="gzip")
val_data = pd.read_csv(f"{main_directory}/{val_path}", compression="gzip")
test_data = pd.read_csv(f"{main_directory}/{test_path}", compression="gzip")

In [12]:
train_data = train_data.head(100000)
val_data = val_data.head(5000)
test_data = test_data.head(5000)

In [13]:
X_train = train_data.drop(columns = list(set(train_data.columns) - set(["prev_board", "move"])))
y_train = train_data["legal"].values.astype(bool)
X_val = val_data.drop(columns = list(set(train_data.columns) - set(["prev_board", "move"])))
y_val = val_data["legal"].values.astype(bool)
X_test = test_data.drop(columns = list(set(train_data.columns) - set(["prev_board", "move"])))
y_test = test_data["legal"].values.astype(bool)
train_data = None
val_data = None
test_data = None
X_train.head()

Unnamed: 0,move,prev_board
0,g7d8,4Q2k/p4Qrp/1p6/8/3R3p/8/PPP3PP/5RK1 b - - 0 37
1,f3f2,8/6p1/5p1p/1p1p1P2/1P1K1kP1/P4P1P/8/8 b - - 0 38
2,a6g1,4rb2/5kp1/pp1p1pp1/2pP4/P1P3R1/1PB3PP/5PK1/8 w...
3,c8b8,2r2k2/p4p2/2B2p2/P2Pp3/4PnR1/4KP1p/5P1P/8 b - ...
4,e6f5,6k1/6pp/p3p3/3bKR2/1p6/1P2R3/P7/6r1 b - - 0 33


In [14]:
X_train = preprocess(X_train)
X_val = preprocess(X_val)
X_test = preprocess(X_test)
X_train = X_train.drop(columns = ["move", "prev_board"])
X_val = X_val.drop(columns = ["move", "prev_board"])
X_test = X_test.drop(columns = ["move", "prev_board"])
X_train.head()

Unnamed: 0,model_input
0,4Q2k/p4Qrp/1p6/8/3R3p/8/PPP3PP/5RK1 b - - 0 37...
1,8/6p1/5p1p/1p1p1P2/1P1K1kP1/P4P1P/8/8 b - - 0 ...
2,4rb2/5kp1/pp1p1pp1/2pP4/P1P3R1/1PB3PP/5PK1/8 w...
3,2r2k2/p4p2/2B2p2/P2Pp3/4PnR1/4KP1p/5P1P/8 b - ...
4,6k1/6pp/p3p3/3bKR2/1p6/1P2R3/P7/6r1 b - - 0 33...


Making Sure the data split is mostly even

In [15]:
unique_values, counts = np.unique(y_train, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f"{value}: {count} occurrences")

False: 48850 occurrences
True: 51150 occurrences


In [16]:
unique_values, counts = np.unique(y_val, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f"{value}: {count} occurrences")

False: 2419 occurrences
True: 2581 occurrences


In [17]:
unique_values, counts = np.unique(y_test, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f"{value}: {count} occurrences")

False: 2463 occurrences
True: 2537 occurrences


### Model Creation

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_dataset = Chess_Dataset(X_train, y_train, tokenizer, PADDING)
val_dataset = Chess_Dataset(X_val, y_val, tokenizer, PADDING)
test_dataset = Chess_Dataset(X_test, y_test, tokenizer, PADDING)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle = False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle = False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle = False)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Available Device: {device}")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels = 1,
    output_attentions = False,
    output_hidden_states = False)
model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=1e-4)

Available Device: cuda


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training

In [21]:
num_training_examples = len(X_train)
num_val_examples = len(X_val)
num_test_examples = len(X_test)

In [22]:
for epoch in range(EPOCHS):
    train(epoch, model, optimizer, train_loader, criterion, num_training_examples, device)
    evaluate(epoch, model, val_loader, criterion, num_val_examples, device)

Epoch: 0 Training loss: 0.6988 Training accuracy: 50.162%
Epoch 0: Validation loss: 0.6931 Validation accuracy: 51.62%
Epoch: 1 Training loss: 0.6981 Training accuracy: 50.055%
Epoch 1: Validation loss: 0.693 Validation accuracy: 51.62%
Epoch: 2 Training loss: 0.6971 Training accuracy: 50.157%
Epoch 2: Validation loss: 0.6929 Validation accuracy: 51.62%
Epoch: 3 Training loss: 0.6966 Training accuracy: 50.072%
Epoch 3: Validation loss: 0.6927 Validation accuracy: 51.62%
Epoch: 4 Training loss: 0.6959 Training accuracy: 50.25%
Epoch 4: Validation loss: 0.6927 Validation accuracy: 51.62%
Epoch: 5 Training loss: 0.6956 Training accuracy: 50.107%
Epoch 5: Validation loss: 0.6927 Validation accuracy: 51.62%
Epoch: 6 Training loss: 0.6954 Training accuracy: 50.236%
Epoch 6: Validation loss: 0.6927 Validation accuracy: 51.62%
Epoch: 7 Training loss: 0.6952 Training accuracy: 50.073%
Epoch 7: Validation loss: 0.6926 Validation accuracy: 51.62%
Epoch: 8 Training loss: 0.6947 Training accuracy: 

In [23]:
torch.save(model, f"{main_directory}/{model_path}")

### Evaluation

In [24]:
model = torch.load(f"{main_directory}/{model_path}")

In [25]:
evaluate("Test", model, test_loader, criterion, num_test_examples, device, loss_type = "Test")

Epoch Test: Test loss: 0.6931 Test accuracy: 50.74%
