In [1]:
import numpy as np 
import polars # move to polars
import os
from tqdm import tqdm
from time import sleep
import asyncio
import chess

In [28]:
# Fetch the datasnaek/chess dataset with the Kaggle CLI (the recorded output shows the archive is chess.zip).
!kaggle datasets download -d datasnaek/chess

chess.zip: Skipping, found more recently modified local copy (use --force to force download)


In [29]:
!unzip -qf chess.zip

In [2]:
# Load only the columns used below: the move string of each game and both players' ratings.
games = polars.read_csv("games.csv", columns=["moves", "white_rating", "black_rating"])

In [3]:
async def seperateMoves1(game, moves):
    """Expand one game into one row per move.

    The function is declared ``async`` but contains no ``await``; callers get
    a coroutine object and must await it to obtain the result.

    Args:
        game: Per-game values to repeat on every row (the caller passes a row
            of the frame that has the moves column removed).
        moves: Indexable sequence holding the game's moves in playing order.

    Returns:
        A 3-tuple ``(previousMovesList, targets, multipleGamesNoMoves)``:
        entry ``j`` of the first list is a fresh list of the moves before
        move ``j``, entry ``j`` of the second list is move ``j`` itself, and
        the third item is a polars DataFrame built from ``game`` repeated
        once per move with ``orient="row"``.
    """
    moveCount = len(moves)
    # Row j pairs the prefix moves[0:j] with the move that follows it.
    prefixes = [list(moves[:j]) for j in range(moveCount)]
    nextMoves = [moves[j] for j in range(moveCount)]
    repeatedRows = polars.DataFrame([game] * moveCount, orient="row")
    return prefixes, nextMoves, repeatedRows

In [4]:
movesFrame = games.get_column("moves")
gamesNoMoves = games.drop("moves")
numGames = len(gamesNoMoves)
seperatedNoMoves = polars.DataFrame(columns=gamesNoMoves.columns)
seperatedGamesFutures = []
for i in tqdm(range(numGames)): # TODO executor instead of async
    seperatedGamesFutures.append(seperateMoves1(gamesNoMoves.row(i), movesFrame[i].split(" ")))

previousMovesListList = []
targetsLists = []
multipleGamesNoMovesList = []
for i in tqdm(range(numGames)):
    previousMovesList, targets, repeatedGame = (await asyncio.gather(seperatedGamesFutures[i]))[0]
    previousMovesListList.append(previousMovesList)
    targetsLists.append(targets)
    multipleGamesNoMovesList.append(repeatedGame)
previousMovesList = [str(previousMoves) for list in previousMovesListList for previousMoves in list]
print("previousMovesList has been concatenated")
targets = [target for list in targetsLists for target in list]
print("targets has been concatenated")
multipleGamesNoMoves = polars.concat(multipleGamesNoMovesList)
print("multipleGamesNoMoves has been concatenated")

100%|██████████| 20058/20058 [00:00<00:00, 25602.30it/s]
100%|██████████| 20058/20058 [00:15<00:00, 1310.69it/s]


previousMovesList has been concatenated
targets has been concatenated
multipleGamesNoMoves has been concatenated


In [5]:
# Attach the per-move target and the stringified move prefix to the repeated
# per-game rows, then give the four columns their final names.
clean = (
    multipleGamesNoMoves
    .with_column(polars.Series("target", targets, dtype='str'))
    .with_column(polars.Series("previousMoves", previousMovesList, dtype='str'))
)
# NOTE(review): "balck_rating" looks like a typo for "black_rating"; it is left
# unchanged because this name is written into clean.csv by a later cell.
clean.columns = ["white_rating", "balck_rating", "target", "previousMoves"]
clean.dtypes

[Int64, Int64, Utf8, Utf8]

In [7]:
# Checkpoint the per-move table to disk as clean.csv.
clean.write_csv("clean.csv")

In [8]:
# Reload the per-move table from the clean.csv checkpoint, replacing the in-memory frame.
clean = polars.read_csv("clean.csv")

In [9]:
# Preview the first rows of the per-move table.
clean.head()

white_rating,balck_rating,target,previousMoves
i64,i64,str,str
1500,1191,"""d4""","""[]"""
1500,1191,"""d5""","""['d4']"""
1500,1191,"""c4""","""['d4', 'd5']"""
1500,1191,"""c6""","""['d4', 'd5', '..."
1500,1191,"""cxd5""","""['d4', 'd5', '..."


In [19]:
# Recover the number of rows belonging to each game from the row order alone:
# every row whose previousMoves is '[]' marks the first move of a game.
previousMoves = clean.get_column("previousMoves")
lengthOfGames = []
length = 0  # rows seen since the most recent '[]' row
for moves in tqdm(previousMoves):
    if moves != '[]':
        length += 1
        continue
    # A '[]' row closes the count of the rows before it; the +1 accounts for
    # the '[]' row that opened that run.
    lengthOfGames.append(length + 1)
    length = 0
# The last game has no following '[]' row, so close its count explicitly.
lengthOfGames.append(length + 1)
# The first entry was appended at the first '[]' row, before any game had been
# counted (the data starts with a '[]' row in the preview), so discard it.
del lengthOfGames[0]

100%|██████████| 1212827/1212827 [01:56<00:00, 10413.51it/s]


In [20]:
# Replay every game from the initial position and record the FEN reached after
# each move, producing one FEN per row of the "target" column.
# NOTE(review): each FEN is taken after the row's target move has been pushed,
# so the "fen" value already reflects the move stored in "target". Confirm this
# is intended if fen is meant to be an input for predicting target.
startingIndexOfGame = 0
fens = []
# Materialise the column once instead of slicing the DataFrame for every game.
targetMoves = clean.get_column("target").to_list()
for length in tqdm(lengthOfGames): #TODO parallelize
    board = chess.Board()
    moves = targetMoves[startingIndexOfGame:startingIndexOfGame + length]
    for SANmove in moves:
        # push_san parses the SAN string in the context of the current
        # position and plays the resulting move on the board.
        board.push_san(SANmove)
        fens.append(board.fen())
    startingIndexOfGame += length

100%|██████████| 20058/20058 [08:26<00:00, 39.57it/s] 


In [21]:
# Attach the FEN strings (one was appended per target move in the replay loop) as a "fen" column.
clean = clean.with_column(polars.Series("fen", fens, dtype='str'))

In [23]:
# Save the table, now including the fen column, as cleanWithFen.csv.
clean.write_csv("cleanWithFen.csv")

In [24]:
# Read cleanWithFen.csv back into its own frame.
cleanWithFen = polars.read_csv("cleanWithFen.csv")

In [25]:
# Preview the first rows of the frame loaded from cleanWithFen.csv.
cleanWithFen.head()

white_rating,balck_rating,target,previousMoves,fen
i64,i64,str,str,str
1500,1191,"""d4""","""[]""","""rnbqkbnr/ppppp..."
1500,1191,"""d5""","""['d4']""","""rnbqkbnr/ppp1p..."
1500,1191,"""c4""","""['d4', 'd5']""","""rnbqkbnr/ppp1p..."
1500,1191,"""c6""","""['d4', 'd5', '...","""rnbqkbnr/pp2pp..."
1500,1191,"""cxd5""","""['d4', 'd5', '...","""rnbqkbnr/pp2pp..."
