In [1]:
import chess
import chess.pgn
import chess_functions
import numpy as np
import tensorflow as tf
import re
import random

In [2]:
import time
from multiprocessing import Pool
import jumulti

In [3]:
def gen_helper_indexes(games_per_cycle,processes):
    """Return slice boundaries that split a cycle of games across processes.

    The result is a list of ``processes + 1`` non-decreasing offsets that
    starts at 0; consecutive pairs ``(result[k], result[k + 1])`` delimit the
    chunk handed to process ``k``. Games are spread as evenly as possible:
    every process gets ``games_per_cycle // processes`` games and the first
    ``games_per_cycle % processes`` processes get one extra.

    Args:
        games_per_cycle: total number of games to distribute.
        processes: number of chunks to produce.

    Returns:
        list[int]: cumulative chunk boundaries, e.g. ``[0, 4, 7, 10]`` for
        10 games over 3 processes.
    """
    boundaries = [0]

    for worker in range(processes):
        # Base share for every worker; computed inside the loop so that a
        # zero-process request simply yields [0] without dividing by zero.
        share = games_per_cycle // processes
        if worker < games_per_cycle % processes:
            share += 1  # early workers absorb the remainder
        boundaries.append(boundaries[-1] + share)

    return boundaries


def pgn_to_npz(pgn_source_file,
             result_dir_file,
             total_files,
             positions_per_file,
             mode=0,
             log_file=None,
             game_offset=0,
             free_file_int=0,
             used_pgn=None,
             games_per_cycle=50,
             processes=1,
             with_boards=True,
             shuffle=True):
    """Convert games from a PGN stream into compressed ``.npz`` position files.

    Games are read in cycles of ``games_per_cycle``, split into ``processes``
    groups and handed to ``jumulti.helper_process_game`` through a
    ``multiprocessing.Pool``. The per-group results are concatenated until at
    least ``positions_per_file`` positions are buffered; the first
    ``positions_per_file`` of them are then written to one file and the rest
    is carried over to the next file.

    Each output file is ``str(result_dir_file) + str(free_file_int + r) +
    '.npz'`` for ``r`` in ``range(total_files)``. Arrays are saved
    positionally (so numpy names them ``arr_0``, ``arr_1``, ...) in this
    order: bool features, float features, ``btrain``, ``ytrain``, clipped
    time taken, and - only when ``with_boards`` is true - the boards.
    NOTE(review): the meaning of the six sequences returned by
    ``jumulti.helper_process_game`` is not visible here; confirm against
    that module.

    When the PGN runs out of games, "pgn_ended" is printed and the function
    returns. Games from the incomplete final cycle and any buffered positions
    that did not fill a file are not written.

    On platforms where ``multiprocessing`` spawns fresh interpreters, a script
    (as opposed to a notebook) must call this function from under an
    ``if __name__ == '__main__':`` guard.

    Args:
        pgn_source_file: path of the PGN file; opened only if ``used_pgn`` is
            None, and always recorded in the log.
        result_dir_file: output path prefix (directory plus file-name stem).
        total_files: maximum number of ``.npz`` files to write.
        positions_per_file: number of positions stored in each file.
        mode: ``2`` enables progress printing; other values are not consulted
            by this function.
        log_file: optional path; a description of every written file is
            appended to it.
        game_offset: number of games to read and discard before processing.
        free_file_int: number used for the first output file.
        used_pgn: optional already-open PGN handle to read from instead of
            opening ``pgn_source_file``; it is left open on return.
        games_per_cycle: number of games read per processing cycle.
        processes: number of pool workers (and of game groups per cycle).
        with_boards: whether to store the boards alongside the arrays.
        shuffle: whether to shuffle the buffered positions before writing.
    """
    g = 0

    # Only close the handle if this function opened it; a caller-supplied
    # handle stays under the caller's control.
    owns_pgn = used_pgn is None
    pgn = open(pgn_source_file) if owns_pgn else used_pgn

    try:
        for i in range(game_offset):
            chess.pgn.read_game(pgn)
            print(i)

        x_bool_train = []
        x_float_train = []
        ytrain = []
        btrain = []
        timetaken = []
        boards = []

        try:

            for r in range(total_files):

                while len(x_bool_train)<positions_per_file:

                    if mode==2:
                        print("On file number",r,
                              "and position",len(x_bool_train),
                              "and game",g)

                    game_list = []
                    for i in range(games_per_cycle):
                        game = chess.pgn.read_game(pgn)

                        # read_game returns the None object (not the string
                        # 'None') once the stream is exhausted.
                        if game is None:
                            raise StopIteration

                        game_list.append(game)

                    l = gen_helper_indexes(games_per_cycle,processes)

                    grouped_games = [game_list[l[i]:l[i+1]]
                                     for i in range(processes)]

                    if mode==2: print("Games grouped")
                    g+=games_per_cycle

                    # The pool is always shut down, even if a worker raises.
                    p = Pool(processes=processes)
                    try:
                        data = p.map(jumulti.helper_process_game, grouped_games)
                    finally:
                        p.close()
                        p.join()

                    # To debug use data = jumulti.helper_process_game(grouped_games[0])

                    print("Computation Done")

                    for i in range(processes):
                        x_bool_train    += (data[i])[0]
                        x_float_train   += (data[i])[1]
                        ytrain          += (data[i])[2]
                        btrain          += (data[i])[3]
                        timetaken       += (data[i])[4]
                        boards          += (data[i])[5]

                    del data, game_list

                    if mode==2: print("Assimilation Done. ",len(x_bool_train)," positions")

                if shuffle:
                    # Shuffle all parallel sequences with one permutation so
                    # that corresponding entries stay aligned.
                    c = list(zip(x_bool_train, x_float_train, ytrain, btrain, timetaken, boards))
                    random.shuffle(c)
                    x_bool_train, x_float_train, ytrain, btrain, timetaken, boards = zip(*c)
                    del c

                npx_bool_train=np.array(x_bool_train[0:positions_per_file])
                npx_float_train=np.array(x_float_train[0:positions_per_file])
                npytrain=np.array(ytrain[0:positions_per_file])
                npbtrain=np.array(btrain[0:positions_per_file])
                nptimetaken=np.array(timetaken[0:positions_per_file])

                npx_bool_train=tf.convert_to_tensor(npx_bool_train, dtype=tf.bool)
                npx_float_train=tf.convert_to_tensor(npx_float_train, dtype=tf.float16)
                npytrain=tf.convert_to_tensor(npytrain, dtype=tf.bool)
                npbtrain=tf.convert_to_tensor(npbtrain, dtype=tf.bool)
                nptimetaken=tf.convert_to_tensor(nptimetaken, dtype=tf.float16)
                nptimetaken=tf.clip_by_value(nptimetaken, 0, 100)

                out_path = str(result_dir_file)+str(free_file_int+r)+'.npz'

                if with_boards:
                    np.savez_compressed(out_path,
                                        npx_bool_train,
                                        npx_float_train,
                                        npbtrain,
                                        npytrain,
                                        nptimetaken,
                                        boards[0:positions_per_file])
                else:
                    np.savez_compressed(out_path,
                                        npx_bool_train,
                                        npx_float_train,
                                        npbtrain,
                                        npytrain,
                                        nptimetaken)

                if log_file is not None:
                    with open(log_file, "a") as file:

                        file.write("\n")
                        file.write("\nresult npz file: "+out_path)
                        file.write("\npgn_source_file: "+str(pgn_source_file))
                        file.write("\ntotal_files: "+str(total_files))
                        file.write("\ncurrent_file_number: "+str(1+r))
                        file.write("\npositions_per_file: "+str(positions_per_file))
                        file.write("\ngame_offset: "+str(game_offset))
                        file.write("\nfree_file_int: "+str(free_file_int))
                        file.write("\nused_pgn: "+str(used_pgn))
                        file.write("\ngames_per_cycle: "+str(games_per_cycle))
                        file.write("\nprocesses: "+str(processes))
                        file.write("\nwith_boards: "+str(with_boards))
                        file.write("\nshuffle: "+str(shuffle))
                        file.write("\n")

                del npx_bool_train, npx_float_train, npbtrain, npytrain, nptimetaken

                # Carry the surplus over to the next file; list() restores
                # mutability after the shuffle turned the buffers into tuples.
                x_bool_train = list(x_bool_train[positions_per_file:])
                x_float_train = list(x_float_train[positions_per_file:])
                ytrain = list(ytrain[positions_per_file:])
                btrain = list(btrain[positions_per_file:])
                timetaken = list(timetaken[positions_per_file:])
                boards = list(boards[positions_per_file:])

                if mode==2:
                    print(len(x_bool_train)," leftover")
                    print("games processed: ", g)

        except StopIteration:
            print("pgn_ended")

    finally:
        if owns_pgn:
            pgn.close()

In [None]:
# The generator distributes work over a multiprocessing pool. If it fails
# with an out-of-memory (OOM) error, lower `processes` (down to 1) or lower
# `games_per_cycle`.

source = 'D:\\Chess\\unbounded_1400to2600.pgn'

pgn_to_npz(
    pgn_source_file=source,  # source file
    result_dir_file='D:\\Chess\\dolphin_database\\D',  # output directory and part of file name
    total_files=10,  # number of files to generate
    positions_per_file=65536,  # positions per file: 32768 / 65536 / 131072
    mode=2,
    log_file="D:\\Chess\\dolphin_database\\description.txt",
    game_offset=0,
    free_file_int=0,
    games_per_cycle=150,
    processes=6,
    with_boards=False,
    shuffle=True,
)