## Dataset

- Use the datasets in `./data/training` to train your model.
- Use `DownloadUpload.ipynb` to download a dataset from Hugging Face.
- Use `DataPreProcessing.ipynb` to create your own dataset.

In [None]:
def _tok_path(notation, size):
    """Return the tokenized training-file path for a notation folder and a size tag."""
    return f"./data/training/09_2023/{notation}/standard_rated_{size}.{notation}.tok"


# xLAN
ds_19k = _tok_path("xlan", "19k")
ds_71k = _tok_path("xlan", "71k")
ds_350k = _tok_path("xlan", "350k")

# xLAN+
ds_19k_plus = _tok_path("xlanplus", "19k")
ds_71k_plus = _tok_path("xlanplus", "71k")
ds_350k_plus = _tok_path("xlanplus", "350k")
ds_1M_plus = _tok_path("xlanplus", "1M")

# xLANcap
ds_19k_cap = _tok_path("xlancap", "19k")
ds_71k_cap = _tok_path("xlancap", "71k")
ds_350k_cap = _tok_path("xlancap", "350k")

# xLANchk
ds_19k_chk = _tok_path("xlanchk", "19k")
ds_71k_chk = _tok_path("xlanchk", "71k")
ds_350k_chk = _tok_path("xlanchk", "350k")

### Config

In [None]:
# Training file handed to the trainer as input_file; pick one of the ds_* paths defined above.
# NOTE(review): NOTATION and DATASET_SIZE below are set by hand, independently of this
# choice - keep all three in sync when switching datasets.
dataset = ds_71k_plus

## HYPERPARAMETERS
# Optimisation settings
EPOCHS = 4  # number of full passes over the dataset
BATCH_SIZE = 1  # pick the largest value that still fits on your GPU
LEARNING_RATE = 1e-4  # step size of the optimiser - how fast the model should learn
USE_FP16 = True  # mixed precision training (GPU only)

# Checkpointing, validation and logging
SAVE_STEPS = 2000  # interval between saved checkpoints
LOGGING_STEPS = 4  # interval between validations / publications to Weights & Biases
SKIP_VALIDATION = False  # True: skip validation and only save model checkpoints
WEIGHTS_AND_BIASES_ENABLED = True  # turn Weights & Biases logging on or off

# Data representation
NOTATION = "xLANplus"  # one of: xLAN, xLANplus, xLANcap, xLANchk
LEFT_PAD = False  # True: pad sequences on the left

## NAMING
VERSION_NUMBER = "V100"  # chronological version increment
MODEL_TYPE = "GPT2"  # either "GPT2" or "Mamba"
DATASET_SIZE = "71k"  # one of: 19k, 71k, 350k, 1M

# Name layout: <version>_<model type>_<dataset size>_<epochs>E_<notation>
model_name = "_".join(
    (VERSION_NUMBER, MODEL_TYPE, DATASET_SIZE, f"{EPOCHS}E", NOTATION)
)

## SAVING MODEL
output_dir = "./Leon-LLM-Models/" + model_name + "/"

print(model_name)

### Check if CUDA is available

In [None]:
import torch

# Ask PyTorch whether CUDA is available and show the answer (True or False).
cuda_available = torch.cuda.is_available()
print(cuda_available)

### Train

In [None]:
from src.train import ChessTrainer

# Collect every trainer option in one mapping, then build the trainer from it.
trainer_options = {
    # model and optimisation
    "model_type": MODEL_TYPE,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "epochs": EPOCHS,
    # data in, checkpoints out
    "input_file": dataset,
    "output_dir": output_dir,
    "save_steps": SAVE_STEPS,
    # logging and validation
    "logging_steps": LOGGING_STEPS,
    "skip_validation": SKIP_VALIDATION,
    "weight_and_biases": WEIGHTS_AND_BIASES_ENABLED,
    # precision, notation and padding
    "use_FP16": USE_FP16,
    "notation": NOTATION,
    "left_padding": LEFT_PAD,
}
trainer = ChessTrainer(**trainer_options)

# Run training through the trainer's train() method.
trainer.train()

# Validate the model

### Config

In [None]:
from transformers import AutoModelForCausalLM

# Model to validate: a Hugging Face model name or a local path.
modelname = "Leon-LLM/Leon-Chess-1M-BOS"
model = AutoModelForCausalLM.from_pretrained(modelname)

# Settings for the "average correct plies" metric.
number_of_sequences = 1_000  # sequences generated and averaged over
number_of_plies_to_generate = 125  # plies generated for each sequence
max_batch_size = 100  # upper bound on the batch size used for this metric

### Validate

In [None]:
from src.validation.validate_model import validate_model

# Run the validation once and keep the raw result before unpacking it.
validation_output = validate_model(
    model,
    number_of_sequences,
    number_of_plies_to_generate,
    max_batch_size,
)

# The result is unpacked into seven values: three summary metrics, the error
# frequencies, and three result collections.
(
    hard_position_accuracy,
    legal_piece_moves_accuracy,
    average_correct_plies,
    error_frequencies,
    hard_position_results,
    legal_piece_moves_results,
    sequence_results,
) = validation_output

### Show results

In [None]:
# One report line per metric; the trailing empty entry keeps the blank line
# that follows the report in the printed output.
report_lines = [
    f"hard_position_accuracy: {hard_position_accuracy}",
    f"Legal piece moves accuracy: {legal_piece_moves_accuracy}",
    f"Average correct plies: {average_correct_plies}",
    f"Error frequencies: {error_frequencies}",
    "",
]
print("\n".join(report_lines))