In [1]:
import sys
sys.path.insert(1, 'Classes/')
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from multiprocessing import current_process

from ScoreGetter import ScoreGetter
from dataset_utils import checkIfEarlyMidEnd
from dataset_utils import encodeBoard
from dataset_utils import getColumns

MILLION = 1e6

In [2]:
from PGNParser import parseFromPGN

data = parseFromPGN('Datasets/lichess_db_standard_rated_2020-02.pgn', nb_fens=40 * MILLION)
df = pd.DataFrame(data, columns=['board'])
df = df.drop_duplicates(subset=['board'])
df.to_csv('Datasets/raw_dataset.csv', index=False)

print(df.shape)
df.tail()

Creating fens from pgn: 40000031it [1:36:56, 6877.26it/s]                                  


(34048083, 1)


Unnamed: 0,board
40606094,8/2R1Rpkp/8/2p1n3/8/P1p1N1PP/P4q1K/8 w - - 0 37
40606095,8/2R1Rpkp/8/2p1n3/8/P1p3PP/P4qNK/8 b - - 1 37
40606096,8/2R1Rpkp/8/2p5/8/P1p2nPP/P4qNK/8 w - - 2 38
40606097,8/2R1Rpkp/8/2p5/8/P1p2nPP/P4qN1/7K b - - 3 38
40606098,8/2R1Rpkp/8/2p5/8/P1p2nPP/P5N1/6qK w - - 4 39


In [3]:
df = pd.read_csv('Datasets/raw_dataset.csv')
boards = df['board'].values
print(df.shape)
df.tail()

(1000000, 1)


Unnamed: 0,board
999995,rn1qkbnr/pp2pppp/8/3p4/3P2b1/3B1N2/PPP2PPP/RNB...
999996,rn1qkbnr/pp3ppp/4p3/3p4/3P2b1/3B1N2/PPP2PPP/RN...
999997,rn1qkbnr/pp3ppp/4p3/3p4/3P2b1/3B1N2/PPPN1PPP/R...
999998,r2qkbnr/pp3ppp/2n1p3/3p4/3P2b1/3B1N2/PPPN1PPP/...
999999,r2qkbnr/pp3ppp/2n1p3/3p4/3P2b1/2PB1N2/PP1N1PPP...


#### We load an engine to get a score from the positions.

In [4]:
score_getter = ScoreGetter('/usr/local/bin/stockfish', 'eval', 'go depth 1')

#### We create our dataset by getting an equal number of start, middle and end game positions and encoding them.

In [5]:
batch_size = int(MILLION)
nb_sample = 34
offset = 0
engine = "Stockfish 13"

current = current_process()
pos = current._identity[0]-1 if len(current._identity) > 0 else 0
pbar = tqdm(total=batch_size*nb_sample, desc='Splitting and encoding', position=pos)


for i in range(nb_sample):
    print(i+1)
    data = []
    for j in range(i * batch_size, min(boards.shape[0], i * batch_size + batch_size)):
        board = boards[j]
        try:
            data.append(np.append(encodeBoard(board), score_getter.getScore(board)))
            pbar.update(1)
        except Exception as e: 
            if str(e) == '[Errno 32] Broken pipe':
                score_getter.restart()
            continue
            
    df = pd.DataFrame(data, columns=np.append(getColumns(), 'cp (' + engine + ')'))
    df.to_csv('Datasets/' + engine + '/dataset' + str(offset + i + 1) + '.csv', index=False)
    
pbar.close()


Splitting and encoding:   0%|          | 0/3000000 [00:00<?, ?it/s]

1


Splitting and encoding:   3%|▎         | 97072/3000000 [01:46<53:15, 908.36it/s]  

KeyboardInterrupt: 