In [1]:
import os
from typing import List

import chess
import numpy as np
import pandas as pd
from tqdm import tqdm


# Seed NumPy's global RNG so the shuffle that builds the splits is repeatable.
np.random.seed(seed=42)

# Number of games kept from the source file.
N_GAMES = 10 ** 5
# Fractions of the games assigned to train / validation / test, in that order.
SPLIT_WEIGHTS = np.asarray((0.7, 0.2, 0.1))


In [2]:
# Load the raw games and keep the first N_GAMES rows. `head` selects by
# position, so it does not depend on the index being the default 0..n-1 range
# the way a label-based `.loc[:N_GAMES-1]` slice does.
df = pd.read_parquet("../data/chess_game_0001.parquet").head(N_GAMES)
df

Unnamed: 0,Moves,Termination,Result
0,"[d2d4, f7f5, g2g3, g7g6, f1g2, f8g7, g1f3, d7d...",FIVEFOLD_REPETITION,1/2-1/2
1,"[e2e4, e7e6, d2d4, d7d5, b1c3, f8b4, e4e5, c7c...",CHECKMATE,1-0
2,"[d2d4, g8f6, c2c4, e7e5, d4e5, f6g4, c1f4, b8c...",INSUFFICIENT_MATERIAL,1/2-1/2
3,"[c2c4, g8f6, b1c3, e7e5, g2g3, g7g6, f1g2, f8g...",CHECKMATE,1-0
4,"[d2d4, g8f6, c2c4, e7e6, b1c3, f8b4, d1c2, c7c...",CHECKMATE,1-0
...,...,...,...
99995,"[d2d4, g8f6, c2c4, e7e6, g1f3, f8b4, c1d2, a7a...",CHECKMATE,1-0
99996,"[d2d4, g8f6, g1f3, d7d5, c2c4, e7e6, b1c3, f8b...",CHECKMATE,1-0
99997,"[e2e3, e7e5, g1f3, b8c6, f1c4, d7d5, c4b3, g8f...",CHECKMATE,0-1
99998,"[e2e4, d7d5, e4d5, g8f6, b1c3, f6d5, f1c4, e7e...",CHECKMATE,0-1


In [4]:
def get_moved_piece(moves: List[str]) -> List[str]:
    """Label every move of a game with the piece that ends up on its target square.

    The game is replayed from the standard starting position. After each move
    is pushed, the piece standing on the move's destination square is looked
    up and its string form is stored in lower case.

    NOTE(review): because the lookup happens after the push, a promotion is
    recorded as the promoted piece rather than as a pawn -- confirm intended.

    Args:
        moves: The game's moves as UCI strings, in playing order.

    Returns:
        One lower-cased piece label per move, in the same order as ``moves``.
    """
    # A freshly constructed board already holds the starting position.
    position = chess.Board()
    labels = []
    for uci in moves:
        played = chess.Move.from_uci(uci)
        position.push(played)
        labels.append(str(position.piece_at(played.to_square)).lower())
    return labels


def get_captures(moves: List[str]) -> List[str]:
    """Label every move of a game with the piece it captures, if any.

    The game is replayed from the standard starting position. Each move is
    inspected *before* it is pushed, while the captured piece is still on the
    board.

    Args:
        moves: The game's moves as UCI strings, in playing order.

    Returns:
        One entry per move: the lower-cased label of the captured piece, or
        ``""`` when the move is not a capture.
    """
    # A freshly constructed board already holds the starting position.
    board = chess.Board()
    captures = []
    for uci in moves:
        move = chess.Move.from_uci(uci)
        if board.is_en_passant(move):
            # En passant: the captured pawn is not on the destination square,
            # so looking that square up would give None (stored as "none").
            captures.append("p")
        elif board.is_capture(move):
            captures.append(str(board.piece_at(move.to_square)).lower())
        else:
            captures.append("")
        board.push(move)
    return captures

def get_check(moves: List[str]) -> List[bool]:
    """Flag, for every move of a game, whether the resulting position is a check.

    The game is replayed from the standard starting position and
    ``board.is_check()`` is evaluated right after each move is pushed.

    Args:
        moves: The game's moves as UCI strings, in playing order.

    Returns:
        One boolean per move, in the same order as ``moves``.
    """
    # A freshly constructed board already holds the starting position.
    board = chess.Board()
    checks = []
    for uci in moves:
        board.push(chess.Move.from_uci(uci))
        checks.append(board.is_check())
    return checks


def get_next_move(moves: List[str]) -> np.ndarray:
    """Build the next-move target sequence for a game.

    The result is ``moves`` shifted left by one position, with the marker
    ``"<EOS>"`` appended so it has the same length as ``moves`` (for a
    non-empty game).

    Args:
        moves: The game's moves as UCI strings, in playing order.

    Returns:
        A NumPy array (as produced by ``np.append``) holding the move that
        follows each position, ending with ``"<EOS>"``.
    """
    return np.append(moves[1:], "<EOS>")

# Build the per-move annotation sequences for every game and attach them to
# `df` as new columns (one sequence per row).
pieces = []
captures = []
checks = []
next_moves = []
termination_seqs = []
result_seqs = []

# Pick the three raw columns by name instead of unpacking every column of
# `df.values`: once the derived columns below exist, a row has more than
# three fields and re-running this cell would fail on the unpacking.
raw_columns = ["Moves", "Termination", "Result"]
for moves, termination, result in tqdm(df[raw_columns].values):
    pieces.append(get_moved_piece(moves))
    captures.append(get_captures(moves))
    checks.append(get_check(moves))
    next_moves.append(get_next_move(moves))
    # Repeat the game-level labels once per move so they align with the
    # per-move sequences.
    n_moves = len(moves)
    termination_seqs.append(np.array([termination] * n_moves))
    result_seqs.append(np.array([result] * n_moves))

df["pieces"] = pieces
df["captures"] = captures
df["checks"] = checks
df["next_moves"] = next_moves
df["termination_seqs"] = termination_seqs
df["result_seqs"] = result_seqs

100%|██████████| 100000/100000 [04:06<00:00, 406.29it/s]


In [5]:
# Shuffle the row positions and cut them into train / validation / test parts.
# Size everything from the rows actually loaded: if the source file holds
# fewer than N_GAMES games, positions built from N_GAMES would be out of
# bounds for `df.iloc`.
n_games = len(df)
game_indices = np.arange(n_games)
np.random.shuffle(game_indices)

# Round before casting to int. Floating-point accumulation can leave a
# boundary a hair below the intended integer (0.7 + 0.2 == 0.8999999999999999),
# and plain truncation would then move that boundary down by one game.
split_points = np.round(SPLIT_WEIGHTS.cumsum()[:-1] * n_games).astype(int)
train_idx, val_idx, test_idx = np.split(game_indices, split_points)
train_idx

array([75721, 80184, 19864, ..., 71582, 21635, 91054])

In [6]:
# Create the output directory, including any missing parent directories;
# nothing happens if it already exists.
os.makedirs("../data/games_0001", exist_ok=True)

In [7]:
# Select the rows of each split by position, then write every split to its
# own parquet file in the output directory.
train_df, val_df, test_df = (
    df.iloc[positions] for positions in (train_idx, val_idx, test_idx)
)

for split_df, out_path in (
    (train_df, "../data/games_0001/train_100K.parquet"),
    (val_df, "../data/games_0001/val_100K.parquet"),
    (test_df, "../data/games_0001/test_100K.parquet"),
):
    split_df.to_parquet(out_path)