## Imports

In [20]:
import numpy as np 
import polars
import os
from tqdm import tqdm
from time import sleep
import asyncio
import chess
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from threading import active_count

## Downloading the dataset

In [21]:
!kaggle datasets download -d datasnaek/chess

chess.zip: Skipping, found more recently modified local copy (use --force to force download)


In [22]:
!unzip -qf chess.zip

## Reading in the dataset

In [23]:
games = polars.read_csv("games.csv", columns=["moves", "white_rating", "black_rating"])

In [24]:
print(games.head())

shape: (5, 3)
┌──────────────┬──────────────┬─────────────────────────────────────┐
│ white_rating ┆ black_rating ┆ moves                               │
│ ---          ┆ ---          ┆ ---                                 │
│ i64          ┆ i64          ┆ str                                 │
╞══════════════╪══════════════╪═════════════════════════════════════╡
│ 1500         ┆ 1191         ┆ d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1322         ┆ 1261         ┆ d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1496         ┆ 1500         ┆ e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1439         ┆ 1454         ┆ d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1523         ┆ 1469         ┆ e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na... │
└─────

## Seperate games into individual moves

In [25]:
moves2DArray = [game.split() for game in list(games.get_column("moves"))]
moves = [move for game in moves2DArray for move in game] # moves is a list where each 
lengthOfGames = [len(game) for game in moves2DArray]
gamesNoMoves = games.drop("moves")
dataFrames = [
    polars.DataFrame([gamesNoMoves.row(i)]*lengthOfGame, orient="row") 
    for i,lengthOfGame in enumerate(lengthOfGames)
]
multipleGamesNoMoves = polars.concat(dataFrames)

In [26]:
clean = multipleGamesNoMoves
clean = clean.with_column(polars.Series("moves", moves, dtype='str'))
clean.columns = ["white_rating", "black_rating", "moves"]

In [27]:
clean.write_csv("clean.csv")

In [28]:
clean = polars.read_csv("clean.csv")

In [29]:
clean.head()

white_rating,black_rating,moves
i64,i64,str
1500,1191,"""d4"""
1500,1191,"""d5"""
1500,1191,"""c4"""
1500,1191,"""c6"""
1500,1191,"""cxd5"""


## Generating FEN strings from previous moves played

In [30]:
class Chunk:

    def __init__(self, index : int, dataFrame : polars.DataFrame, idxOfGames : list[int]):
        self.index = index
        self.dataFrame = dataFrame
        self.idxOfGames = idxOfGames
    
    def __repr__(self):
        return str(self.dataFrame)

In [31]:
def chunkateData(df : polars.DataFrame, numOfGamesPerChunk : int, startingIndexOfGames : list[int]) -> list[Chunk]:
    startingIndexOfGames.insert(0, 0)
    startingIndexOfChunks = startingIndexOfGames[0::numOfGamesPerChunk]
    startingIndexOfChunks.append(df.height)
    chunks = []
    for i in range(len(startingIndexOfChunks)-1):
        chunks.append(
            Chunk(
                index=i,
                dataFrame=df[startingIndexOfChunks[i]:startingIndexOfChunks[i+1]],
                idxOfGames=[
                    startingIdx - startingIndexOfGames[i*numOfGamesPerChunk] 
                    for startingIdx in startingIndexOfGames[i*numOfGamesPerChunk:(i+1)*numOfGamesPerChunk]
                ],
            )
        )
    return chunks

In [32]:
def concatData(chunks : list[Chunk]) -> polars.DataFrame:

    chunks.sort(key=lambda x: x.index)

    return polars.concat([chunk.dataFrame for chunk in chunks])

In [33]:
def addFenToChunk(chunk: Chunk) -> None:
    """
    Adds a Fen column to a chunk.
    """
    fens = []
    idxOfGames = chunk.idxOfGames
    idxOfGames.append(chunk.dataFrame.height)
    for i in range(len(chunk.idxOfGames) - 1):
        board = chess.Board()
        moves = chunk.dataFrame[idxOfGames[i]:idxOfGames[i+1]]["moves"]
        for SANmove in moves:
            fens.append(board.fen())
            board.push_san(SANmove)
    
    chunk.dataFrame = chunk.dataFrame.with_column(polars.Series("fen", fens, dtype='str'))

In [35]:
chunks = chunkateData(
    df=clean,
    numOfGamesPerChunk=1000,
    startingIndexOfGames=list(np.cumsum(lengthOfGames)),
)

for chunk in tqdm(chunks):
    addFenToChunk(chunk)

cleanWithFen = concatData(chunks)

100%|██████████| 21/21 [08:22<00:00, 23.94s/it]


In [36]:
cleanWithFen.write_csv("cleanWithFen.csv")

In [37]:
cleanWithFen = polars.read_csv("cleanWithFen.csv")

In [38]:
cleanWithFen.head()

white_rating,black_rating,moves,fen
i64,i64,str,str
1500,1191,"""d4""","""rnbqkbnr/ppppp..."
1500,1191,"""d5""","""rnbqkbnr/ppppp..."
1500,1191,"""c4""","""rnbqkbnr/ppp1p..."
1500,1191,"""c6""","""rnbqkbnr/ppp1p..."
1500,1191,"""cxd5""","""rnbqkbnr/pp2pp..."


In [88]:
def pieceFromMove(move : str) -> str:
    if move[0] in {'R', 'N', 'B', 'Q', 'K', 'P'}:
        return move[0]
    else:
        return 'P'

In [117]:
def cleanFeatures(white_rating : int, black_rating : int, fen : str):
    piece_symbols = ['R', 'N', 'B', 'Q', 'K', 'P']
    piece_colors = [0, 1]
    piece_arrays = np.zeros(shape=(12,8,8))
    board = chess.Board(fen)
    for i, piece_symbol in enumerate(piece_symbols):
        for j, piece_color in enumerate(piece_colors):
            squares = board.pieces(piece_type=chess.Piece.from_symbol(piece_symbol).piece_type, color=piece_color)
            for square in squares:
                piece_arrays[i + j*6, chess.square_rank(square), chess.square_file(square)] = 1
    piece_arrays = piece_arrays.astype(np.int64)
    features = np.insert(np.insert(piece_arrays.flatten(),0,black_rating),0,white_rating)
    print(features.shape)
    return features

In [144]:
def cleanFeatures2(white_rating : int, black_rating : int, fen : str):
    piece_arrays = np.zeros(shape=(8,8))
    board = chess.Board(fen)
    piece_to_int = {
        'R' : 0,
        'N' : 1,
        'B' : 2,
        'Q' : 3,
        'K' : 4,
        'P' : 5,
        'r' : 6,
        'n' : 7,
        'b' : 8,
        'q' : 9,
        'k' : 10,
        'p' : 11,
    }
    for square in chess.SQUARES:
        if board.piece_at(square):
            piece_arrays[chess.square_rank(square), chess.square_file(square)] = piece_to_int[board.piece_at(square).symbol()]
    piece_arrays = piece_arrays.astype(np.int64)
    features = np.insert(np.insert(piece_arrays.flatten(),0,black_rating),0,white_rating)
    print(features.shape)
    return features

In [145]:
print(cleanFeatures2(cleanWithFen[0, 0], cleanWithFen[0, 1], cleanWithFen[0, 3]))

(66,)
[1500 1191    0    1    2    3    4    2    1    0    5    5    5    5
    5    5    5    5    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   11   11   11   11   11   11
   11   11    6    7    8    9   10    8    7    6]
