In [1]:
import os
import json
import logging

from time import time
from pprint import pprint
#from collections import Counter
from multiprocessing import Pool

import chess
import chess.pgn

In [51]:
fname = 'pgns/Bogo4Bd2.pgn'
#fname = 'pgns/mock.pgn'
# Read only the first game of the file. The context manager closes the
# handle even when parsing raises, which the manual open()/close() did not.
with open(fname) as fp:
    game = chess.pgn.read_game(fp)

In [57]:
# Inspect the Date header of the sample game. As the output shows, unknown
# month/day parts are written as '??', so only the first dot-separated
# field can be parsed as a year.
game.headers['Date']

'1883.??.??'

In [55]:
#dir(game)

In [36]:
# Contains code for reading pgn files and saving some data about them.
class PGNReader:
    """Reads PGN files and records per-position move statistics.

    ``self.data`` maps the string returned by ``board().board_fen()`` for a
    position to a dict with two keys:

    * ``'resp'``: ``{uci_move: count}`` - how often each move was played
      from that position.
    * ``'loc'``: ``{filename: [(start, end), ...]}`` - ``tell()`` offsets
      taken before and after reading each game that reached the position.

    Args:
        max_depth: Stop recording a game once the position after a move has
            a full-move number greater than this.
        min_year: Games whose ``Date`` header year is below this (or cannot
            be parsed) are skipped.
        sample_every: Keep only every N-th game that passed the date filter.
            Must be >= 1.
        max_games: Maximum number of games recorded per file.
    """

    def __init__(self, max_depth=12, min_year=1990, sample_every=10,
                 max_games=1000):
        self.data = {}
        self.max_depth = max_depth
        self.min_year = min_year
        self.sample_every = sample_every
        self.max_games = max_games

    def save(self, filename='pgn_data.json'):
        """Write ``self.data`` to ``filename`` as JSON.

        Note that JSON has no tuple type, so location tuples are stored
        (and later loaded) as two-element lists.
        """
        with open(filename, 'w') as f:
            json.dump(self.data, f)

    def load_data(self, filename='pgn_data.json'):
        """Replace ``self.data`` with the JSON content of ``filename``."""
        # Must be opened for reading: mode 'w' would truncate the file and
        # make json.load() fail on a write-only handle.
        with open(filename, 'r') as f:
            self.data = json.load(f)

    def _record_game(self, ret, game, filename, floc, max_flocs):
        """Fold one game's moves (up to max_depth) into ``ret``.

        Returns the number of moves that were recorded.
        """
        moves = 0
        curr_pos = game
        while not curr_pos.is_end():
            moves += 1
            fen = curr_pos.board().board_fen()

            next_pos = curr_pos.next()
            uci = next_pos.move.uci()  # move as a string

            entry = ret.setdefault(fen, {'resp': {}, 'loc': {}})

            # Increment response counter
            entry['resp'][uci] = entry['resp'].get(uci, 0) + 1

            # Remember where in the file this game lives, at most once per
            # game and at most max_flocs games per position.
            flocs = entry['loc'].setdefault(filename, [])
            if len(flocs) < max_flocs and floc not in flocs:
                flocs.append(floc)

            if next_pos.board().fullmove_number > self.max_depth:
                break

            curr_pos = next_pos
        return moves

    def load_pgns_from_file(self, filename, max_flocs=1000):
        """Parse one PGN file and return its position statistics.

        This does not touch ``self.data``; the caller decides what to do
        with the returned dict (``load_from_dir`` merges the per-file
        results and the user then calls ``save``).

        Args:
            filename: Path of the PGN file to read.
            max_flocs: Maximum number of game locations stored per position.

        Returns:
            ``{fen: {'resp': {uci: count}, 'loc': {filename: [(start, end)]}}}``
        """
        ret = {}

        unicode_failures = 0
        errors = 0
        eligible_games = 0  # games that passed the date filter

        games_read = 0
        moves_read = 0

        with open(filename) as fp:
            while True:
                start_tell = fp.tell()

                try:
                    # Advance loop
                    game = chess.pgn.read_game(fp)
                except UnicodeDecodeError as ex:
                    unicode_failures += 1
                    if unicode_failures >= 10:
                        print(f'Quit due to {unicode_failures} decoding errors in {filename}: "{ex}"')
                        break
                    continue
                except Exception as ex:
                    errors += 1
                    if errors > 100:
                        print(f'Quit due to {errors} generic errors in {filename}:\n"{ex}"')
                        break
                    continue

                # Exit if the file was read completely
                if not game:
                    break

                # Check the date since older games are not trusted. Only the
                # year (first dot-separated field) is parsed; games with a
                # missing or non-numeric year are skipped.
                try:
                    year = int(game.headers['Date'].split('.')[0])
                except (KeyError, ValueError):
                    continue
                if year < self.min_year:
                    continue

                # Sub-sample: keep every sample_every-th eligible game.
                eligible_games += 1
                if eligible_games % self.sample_every:
                    continue
                if games_read >= self.max_games:
                    break

                end_tell = fp.tell()

                games_read += 1
                moves_read += self._record_game(
                    ret, game, filename, (start_tell, end_tell), max_flocs)

        print(f'Done with {filename}. games: {games_read} moves: {moves_read}')
        return ret

    def _consolidate(self, d):
        """Merge per-file results that were calculated in parallel.

        Args:
            d: List of dicts as returned by ``load_pgns_from_file``, one per
                file parsed.

        Returns:
            A new dict in the same format. The inputs are left unmodified.
        """
        ret = {}

        for file_dict in d:
            for fen, info in file_dict.items():
                merged = ret.get(fen)

                # First time we see this position: copy the entry so that
                # later merges never write into the caller's dicts/lists.
                if merged is None:
                    ret[fen] = {
                        'resp': dict(info['resp']),
                        'loc': {name: list(flocs)
                                for name, flocs in info['loc'].items()},
                    }
                    continue

                # Add response counts to the existing structure
                resp = merged['resp']
                for uci, count in info['resp'].items():
                    resp[uci] = resp.get(uci, 0) + count

                # Copy file locations. Location data keys off of file name,
                # so we can safely update the existing structure since each
                # file is only read once.
                merged['loc'].update(
                    (name, list(flocs)) for name, flocs in info['loc'].items())

        return ret

    def load_from_dir(self, dirname='.', cpus=1):
        """Parse every ``*.pgn`` directly inside ``dirname`` into ``self.data``.

        Sub-directories are not searched. Files are parsed in a pool of
        ``cpus`` worker processes and the results are merged with
        ``_consolidate``. Nothing is written to disk; call ``save``
        afterwards. If ``os.walk`` yields nothing for ``dirname``,
        ``self.data`` is left untouched.
        """
        # Only the first os.walk() entry (dirname itself) is of interest.
        top = next(os.walk(dirname), None)
        if top is None:
            return
        root, _dirs, files = top
        pgns = [os.path.join(root, f) for f in files if f.endswith('.pgn')]

        # Silence the PGN parser while the workers run. The level is lowered
        # BEFORE the pool is created so that forked workers inherit it, and
        # the logger's own level (not the effective one) is restored after.
        chess_log = logging.getLogger('chess.pgn')
        save_level = chess_log.level
        chess_log.setLevel(logging.CRITICAL)

        start_time = time()
        try:
            # The context manager tears the worker processes down on exit.
            with Pool(cpus) as pool:
                res = pool.map(self.load_pgns_from_file, pgns)
        finally:
            chess_log.setLevel(save_level)

        diff = time() - start_time
        print(f'Elapsed time: {int(diff)}s')

        self.data = self._consolidate(res)


In [37]:
tr = PGNReader()

#tr.load_from_dir('pgns', cpus=6)
#tr.save()

# load_from_dir() stores its result on the reader and returns nothing, so
# bind `res` to the consolidated data instead of to the call's return value.
tr.load_from_dir('benoni', cpus=6)
res = tr.data
tr.save('benoni_data.json')

Done with benoni/SemiBenoni.pgn. games: 399 moves: 9561
Done with benoni/CzechBenoni.pgn. games: 411 moves: 9800
Done with benoni/ModernBenoni6e4.pgn. games: 914 moves: 21905
Done with benoni/BenkoGambit.pgn. games: 1000 moves: 23959
Done with benoni/ModernBenoni6Nf3.pgn. games: 1000 moves: 23978
Elapsed time: 53s


In [48]:
#pprint(len(res))
#pprint(res[0])