# Move dictionary

In [1]:
class MoveDictionary:
    def __init__(self):
        all_moves = self.generate_all_moves()
        self.move_index_dict = {move: index for index, move in enumerate(all_moves)}
        self.index_move_dict = {index: move for index, move in enumerate(all_moves)}
        #return move_index_dict


    def get_all_legal_moves(self, fen):
        board = chess.Board(fen)
        legal_moves = list(board.legal_moves)  # Get an iterator of legal moves and convert to a list
        moves = [move.uci() for move in legal_moves]
        return [self.move_index_dict[move] for move in moves]

    def generate_all_squares(self):
        files = 'abcdefgh'
        ranks = '12345678'
        return [f + r for f in files for r in ranks]

    def is_within_board(self, file, rank):
        return 'a' <= file <= 'h' and '1' <= rank <= '8'

    def move_in_direction(self, start_square, file_step, rank_step, steps=8):
        moves = []
        start_file, start_rank = start_square[0], start_square[1]
        for step in range(1, steps + 1):
            new_file = chr(ord(start_file) + file_step * step)
            new_rank = chr(ord(start_rank) + rank_step * step)
            if self.is_within_board(new_file, new_rank):
                moves.append(new_file + new_rank)
            else:
                break
        return moves

    def generate_fairy_moves(self, start_square):
        directions = [
            (1, 0), (-1, 0), (0, 1), (0, -1),  # Rook-like moves
            (1, 1), (1, -1), (-1, 1), (-1, -1),  # Bishop-like moves
            (2, 1), (2, -1), (-2, 1), (-2, -1),  # Knight-like moves
            (1, 2), (1, -2), (-1, 2), (-1, -2)
        ]
        moves = []
        for file_step, rank_step in directions:
            if abs(file_step) == 2 or abs(rank_step) == 2:  # Knight-like moves
                moves.extend(self.move_in_direction(start_square, file_step, rank_step, steps=1))
            else:
                moves.extend(self.move_in_direction(start_square, file_step, rank_step))
        return moves

    def generate_promotion_moves(self, start_square, end_square):
        promotion_pieces = ['b', 'n', 'r', 'q']
        return [start_square + end_square + piece for piece in promotion_pieces]

    def generate_all_moves(self):
        all_squares = self.generate_all_squares()
        all_moves = []

        for start_square in all_squares:
            fairy_moves = self.generate_fairy_moves(start_square)
            for end_square in fairy_moves:
                all_moves.append(start_square + end_square)
                # Add promotion moves for pawns
                if start_square[1] == '7' and end_square[1] == '8' and abs(int(ord(start_square[0]))-int(ord(end_square[0]))) <= 1:  # White pawn promotion
                    all_moves.extend(self.generate_promotion_moves(start_square, end_square))
                if start_square[1] == '2' and end_square[1] == '1' and abs(int(ord(start_square[0]))-int(ord(end_square[0]))) <= 1:  # Black pawn promotion
                    all_moves.extend(self.generate_promotion_moves(start_square, end_square))
        return all_moves

In [2]:
move_dict_obj = MoveDictionary()
move_to_index = move_dict_obj.move_index_dict
index_to_move = move_dict_obj.index_move_dict

In [3]:
move_to_index["e2e4"]

1017

# Data collection

In [4]:
file_path = r"C:\Users\bluni\Downloads\lichess_db_standard_rated_2024-08.pgn\lichess_db_standard_rated_2024-08.pgn"

In [26]:
import chess.pgn
import time

def count_games_in_pgn(file_path, limit=5000):
    """
    Counts the number of games in a PGN file and prints the time elapsed every 10 seconds.

    Parameters:
    - file_path: Path to the PGN file.
    - limit: Maximum number of games to count.
    
    Returns:
    - The number of games in the file.
    """
    game_count = 0
    start_time = time.time()
    last_print_time = start_time

    with open(file_path) as pgn_file:
        while True:
            # Try to read the next game
            game = chess.pgn.read_game(pgn_file)
            
            if game is None:
                break  # Stop if no more games are present

            game_count += 1  # Increment the game counter

            # Get the current time
            current_time = time.time()

            # Check if 10 seconds have passed since the last time we printed the elapsed time
            if current_time - last_print_time >= 10:
                elapsed_time = current_time - start_time
                print(f"Processed {game_count} games. Time elapsed: {elapsed_time:.2f} seconds.")
                last_print_time = current_time  # Reset the last print time
            
            # Stop after the limit is reached
            if game_count >= limit:
                break
    
    return game_count

# Usage example:
file_path = r"C:\Users\bluni\Downloads\lichess_db_standard_rated_2024-08.pgn\lichess_db_standard_rated_2024-08.pgn"

start_time = time.time()
number_of_games = count_games_in_pgn(file_path, limit=5)
end_time = time.time()
total_elapsed_time = end_time - start_time

print(f"Number of games processed: {number_of_games}")
print(f"Total time taken: {total_elapsed_time:.2f} seconds")


Processed 8836 games. Time elapsed: 10.01 seconds.
Processed 17901 games. Time elapsed: 20.02 seconds.
Processed 26876 games. Time elapsed: 30.03 seconds.
Processed 35888 games. Time elapsed: 40.04 seconds.
Processed 44946 games. Time elapsed: 50.05 seconds.
Processed 53932 games. Time elapsed: 60.06 seconds.
Processed 62899 games. Time elapsed: 70.06 seconds.
Processed 71847 games. Time elapsed: 80.06 seconds.
Processed 80710 games. Time elapsed: 90.07 seconds.
Processed 89753 games. Time elapsed: 100.08 seconds.
Processed 98640 games. Time elapsed: 110.09 seconds.
Processed 107442 games. Time elapsed: 120.10 seconds.
Processed 116154 games. Time elapsed: 130.11 seconds.
Processed 124973 games. Time elapsed: 140.12 seconds.
Processed 133645 games. Time elapsed: 150.12 seconds.
Processed 142208 games. Time elapsed: 160.12 seconds.
Processed 150665 games. Time elapsed: 170.12 seconds.
Processed 159495 games. Time elapsed: 180.13 seconds.
Processed 168424 games. Time elapsed: 190.14 seco

In [5]:
import chess.pgn

In [6]:
def read_pgn_file(file_path, limit=None):
    """
    Reads games from a PGN file and prints metadata and moves.
    
    Parameters:
    - file_path: Path to the PGN file.
    - limit: The maximum number of games to read. If None, read all games.
    """
    game_count = 0  # Track how many games have been processed
    
    with open(file_path) as pgn_file:
        while True:
            # Read the next game from the PGN file
            game = chess.pgn.read_game(pgn_file)
            
            if game is None or (limit and game_count >= limit):
                break  # Stop if no more games or limit is reached
            
            # Print game metadata (headers)
            print(f"Event: {game.headers.get('Event', '?')}")
            print(f"White: {game.headers.get('White', '?')}")
            print(f"Black: {game.headers.get('Black', '?')}")
            print(f"Result: {game.headers.get('Result', '?')}")
            
            # Initialize the board from the starting position of the game
            board = game.board()

            # Print the game moves
            print("Moves: ")
            for move in game.mainline_moves():
                board.push(move)  # Apply the move to the board
                print(board.san(move), end=" ")  # Print move in standard algebraic notation (SAN)
            print("\n" + "="*50 + "\n")
            
            game_count += 1  # Increment the game counter


In [7]:
# Usage example:
read_pgn_file(file_path, limit=2)  # Replace limit=2 with the number of games you want to read

Event: Rated Bullet game
White: kingskreamer
Black: mysteryvabs
Result: 1-0
Moves: 


AssertionError: san() and lan() expect move to be legal or null, but got e2e4 in rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq - 0 1

In [17]:
import chess.pgn

def process_pgn_moves(file_path, limit=None):
    """
    Reads each game from the PGN file and iterates through each move.
    Saves the FEN, next move, and number of moves until the end of the game.
    
    Parameters:
    - file_path: Path to the PGN file.
    - limit: Maximum number of games to process. If None, process all games.
    """
    game_count = 0  # Track how many games have been processed
    
    with open(file_path) as pgn_file:
        while True:
            # Read the next game from the PGN file
            game = chess.pgn.read_game(pgn_file)
            
            if game is None or (limit and game_count >= limit):
                break  # Stop if no more games or the limit is reached

            # Initialize the board from the game's starting position
            board = game.board()

            # List to hold move information
            move_data = []

            # Get total number of moves in the game
            total_moves = len(list(game.mainline_moves()))
            
            # Iterate through the moves in the game
            move_number = 1
            for move in game.mainline_moves():
                # Save current board's FEN
                current_fen = board.fen()
                
                # Apply the move to the board
                board.push(move)

                # Get the next move in UCI format
                next_move = move.uci()

                # Calculate the number of moves left until the end of the game
                moves_left = total_moves - move_number

                # Store the move data
                move_data.append({
                    'current_fen': current_fen,
                    'next_move': next_move,
                    'moves_left': moves_left
                })
                
                # Increment the move number
                move_number += 1

            # Print the results for the current game
            print(f"Game {game_count + 1}:")
            for data in move_data:
                print(f"FEN: {data['current_fen']}, Next Move: {data['next_move']}, Moves Left: {data['moves_left']}")
            print("\n" + "="*50 + "\n")
            
            game_count += 1  # Increment the game counter

# Usage example:
process_pgn_moves(file_path, limit=2)  # Process the first 2 games



Game 1:
FEN: rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1, Next Move: e2e4, Moves Left: 112
FEN: rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq - 0 1, Next Move: c7c6, Moves Left: 111
FEN: rnbqkbnr/pp1ppppp/2p5/8/4P3/8/PPPP1PPP/RNBQKBNR w KQkq - 0 2, Next Move: c2c4, Moves Left: 110
FEN: rnbqkbnr/pp1ppppp/2p5/8/2P1P3/8/PP1P1PPP/RNBQKBNR b KQkq - 0 2, Next Move: d7d5, Moves Left: 109
FEN: rnbqkbnr/pp2pppp/2p5/3p4/2P1P3/8/PP1P1PPP/RNBQKBNR w KQkq - 0 3, Next Move: c4d5, Moves Left: 108
FEN: rnbqkbnr/pp2pppp/2p5/3P4/4P3/8/PP1P1PPP/RNBQKBNR b KQkq - 0 3, Next Move: c6d5, Moves Left: 107
FEN: rnbqkbnr/pp2pppp/8/3p4/4P3/8/PP1P1PPP/RNBQKBNR w KQkq - 0 4, Next Move: e4d5, Moves Left: 106
FEN: rnbqkbnr/pp2pppp/8/3P4/8/8/PP1P1PPP/RNBQKBNR b KQkq - 0 4, Next Move: d8d5, Moves Left: 105
FEN: rnb1kbnr/pp2pppp/8/3q4/8/8/PP1P1PPP/RNBQKBNR w KQkq - 0 5, Next Move: b1c3, Moves Left: 104
FEN: rnb1kbnr/pp2pppp/8/3q4/8/2N5/PP1P1PPP/R1BQKBNR b KQkq - 1 5, Next Move: d5d8, Moves Left: 103


In [14]:
def print_raw_pgn(file_path, limit=None):
    """
    Prints the raw content of a PGN file with an optional limit on the number of lines.
    
    Parameters:
    - file_path: Path to the PGN file.
    - limit: Maximum number of lines to print. If None, prints all lines.
    """
    with open(file_path, "r") as pgn_file:
        line_count = 0
        
        # Read and print each line in the PGN file
        for line in pgn_file:
            print(line.strip())  # Print the line without extra newline characters
            
            # Increment the line counter
            line_count += 1
            
            # Stop if the limit is reached
            if limit and line_count >= limit:
                break

# Usage example:
print_raw_pgn(file_path, limit=100)  # Prints the first 10 lines of the PGN file


[Event "Rated Bullet game"]
[Site "https://lichess.org/nQ1xYNSF"]
[Date "2024.08.01"]
[Round "-"]
[White "kingskreamer"]
[Black "mysteryvabs"]
[Result "1-0"]
[UTCDate "2024.08.01"]
[UTCTime "00:00:09"]
[WhiteElo "2148"]
[BlackElo "2155"]
[WhiteRatingDiff "+6"]
[BlackRatingDiff "-6"]
[ECO "B10"]
[Opening "Caro-Kann Defense: Accelerated Panov Attack"]
[TimeControl "60+0"]
[Termination "Time forfeit"]

1. e4 { [%clk 0:01:00] } 1... c6 { [%clk 0:01:00] } 2. c4 { [%clk 0:00:59] } 2... d5 { [%clk 0:01:00] } 3. cxd5 { [%clk 0:00:59] } 3... cxd5 { [%clk 0:01:00] } 4. exd5 { [%clk 0:00:58] } 4... Qxd5 { [%clk 0:00:59] } 5. Nc3 { [%clk 0:00:58] } 5... Qd8 { [%clk 0:00:59] } 6. Bc4 { [%clk 0:00:58] } 6... Nf6 { [%clk 0:00:59] } 7. Qb3 { [%clk 0:00:57] } 7... e6 { [%clk 0:00:58] } 8. Nf3 { [%clk 0:00:57] } 8... Nc6 { [%clk 0:00:57] } 9. Bb5 { [%clk 0:00:55] } 9... Bd7 { [%clk 0:00:57] } 10. O-O { [%clk 0:00:54] } 10... Rc8 { [%clk 0:00:56] } 11. Re1 { [%clk 0:00:52] } 11... a6 { [%clk 0:00:56] } 1

# Final resultant code

In [8]:
def fen_to_vector(fen, turn_multiplier):
    # Take only the part of the FEN before the first space (the board position)
    board_fen = fen.split(" ")[0]

    # If the turn multiplier is -1, reverse and swap case for Black's perspective
    if turn_multiplier == -1:
        board_fen = board_fen.swapcase()[::-1]

    position = ""
    piece_dict = {
        " ": "0,", "p": "1,", "n": "2,", "b": "3,", "r": "4,", "q": "5,", "k": "6,",
        "P": "7,", "N": "8,", "B": "9,", "R": "10,", "Q": "11,", "K": "12,"
    }

    # Loop through each row of the board in the FEN
    for row in board_fen.split("/"):
        for square in row:
            if square.isalpha():
                position += piece_dict[square]
            else:
                position += int(square) * "0,"

    # Remove the trailing comma
    return position[:-1]

In [5]:
def get_game_result(game):
    result = game.headers["Result"]
    if result == "1-0":
        return 1  # White wins
    elif result == "0-1":
        return -1  # Black wins
    elif result == "1/2-1/2":
        return 0  # Draw
    return None

def count_pieces(board):
    return sum(1 for square in chess.SQUARES if board.piece_at(square))

In [24]:
import chess

def process_single_game(game, game_count, game_result, move_to_index):
    """
    Processes a single game, extracting move-by-move data, including FEN, 
    next move, number of moves left, game result from the player's perspective,
    average Elo of the players, and termination status.
    
    Parameters:
    - game: chess.pgn.Game object representing a single chess game.
    - game_count: integer, the count of the game being processed.
    - game_result: result of the game, as provided by a separate function.
    - move_to_index: dictionary mapping moves to their corresponding indices.
    
    Returns:
    - List of dictionaries containing move-by-move data for the game.
    """
    # Initialize the board from the game's starting position
    board = game.board()

    # Get total number of moves in the game
    total_moves = len(list(game.mainline_moves()))

    # Get the termination status
    termination_status = game.headers.get("Termination", "Unknown")

    # Extract the Elo ratings and calculate the average Elo
    white_elo = int(game.headers.get("WhiteElo", 0))
    black_elo = int(game.headers.get("BlackElo", 0))
    average_elo = (white_elo + black_elo) / 2 if white_elo and black_elo else 0

    # List to hold move data for the game
    game_data = []

    # Cache FEN parts if you're using this in the `fen_to_vector`
    fen_parts_cache = {}

    # Iterate through the moves in the game
    move_number = 1
    for move in game.mainline_moves():
        # Save current board's FEN and cache the FEN manipulation if possible
        current_fen = board.fen()
        if current_fen not in fen_parts_cache:
            fen_parts = current_fen.split(" ")
            turn_multiplier = -1 if fen_parts[1] == "b" else 1
            fen_parts_cache[current_fen] = turn_multiplier
        else:
            turn_multiplier = fen_parts_cache[current_fen]
        
        # Apply the move to the board
        board.push(move)

        # Get the next move in UCI format
        next_move = move.uci()

        # Calculate the number of moves left until the end of the game
        moves_left = total_moves - move_number

        # Determine the result from the perspective of the current player
        result_from_perspective = game_result if board.turn == chess.WHITE else -game_result

        # Store the move data in the list
        game_data.append({
            'game_number': game_count,
            'current_fen': current_fen,
            'board_state': fen_to_vector(current_fen, turn_multiplier),
            'next_move': move_to_index[next_move],
            'moves_left': moves_left,
            'result_from_perspective': result_from_perspective,
            'termination_status': termination_status,
            'average_elo': average_elo  # Add average Elo here
        })

        # Increment the move number
        move_number += 1

    return game_data


In [32]:
import chess.pgn
import time
import pandas as pd

def process_chess_games(file_path, limit=5000):
    """
    Processes chess games in a PGN file, tracks each move's FEN, 
    the next move, moves left, game result from the player's perspective, 
    number of pieces on the board, and termination status.
    Prints the time elapsed every 10 seconds.
    """
    game_count = 0
    start_time = time.time()
    last_print_time = start_time
    cumulative_read_time = 0  # To track time for reading games
    cumulative_process_time = 0  # To track time for processing games

    # List to store data for all games
    all_game_data = []

    with open(file_path) as pgn_file:
        while True:
            # Measure time for reading a single game
            read_start_time = time.time()
            game = chess.pgn.read_game(pgn_file)
            read_end_time = time.time()

            # Update cumulative read time
            cumulative_read_time += (read_end_time - read_start_time)
            
            # Stop if no more games or the limit is reached
            if game is None or (limit and game_count >= limit):
                break

            # Get the game result (assumed to be handled externally)
            game_result = get_game_result(game)
            
            if game_result is None:
                game_result = 0

            # Measure time for processing a single game
            process_start_time = time.time()
            game_data = process_single_game(game, game_count + 1, game_result, move_to_index)
            all_game_data.extend(game_data)
            process_end_time = time.time()

            # Update cumulative processing time
            cumulative_process_time += (process_end_time - process_start_time)

            game_count += 1  # Increment the game counter

            # Get the current time
            current_time = time.time()

            # Check if 10 seconds have passed since the last time we printed the elapsed time
            if current_time - last_print_time >= 10:
                elapsed_time = current_time - start_time
                print(f"Processed {game_count} games. Time elapsed: {elapsed_time:.2f} seconds.")
                print(f"Cumulative read time for 'read_game': {cumulative_read_time:.2f} seconds.")
                print(f"Cumulative process time for 'process_single_game': {cumulative_process_time:.2f} seconds.")
                last_print_time = current_time  # Reset the last print time

    # Convert the list of dictionaries into a Pandas DataFrame
    game_df = pd.DataFrame(all_game_data)

    # Final cumulative time printout
    print(f"Total cumulative time for reading games: {cumulative_read_time:.2f} seconds.")
    print(f"Total cumulative time for processing games: {cumulative_process_time:.2f} seconds.")
    
    return game_df

# Usage example:
file_path = r"C:\Users\bluni\Downloads\lichess_db_standard_rated_2024-08.pgn\lichess_db_standard_rated_2024-08.pgn"

start_time = time.time()
game_df = process_chess_games(file_path, limit=50_000)
end_time = time.time()
total_elapsed_time = end_time - start_time

print(f"Number of games processed: {game_df['game_number'].nunique()}")
print(f"Total time taken: {total_elapsed_time:.2f} seconds")

# Display the DataFrame
print(game_df.head())


Processed 2622 games. Time elapsed: 10.00 seconds.
Cumulative read time for 'read_game': 3.08 seconds.
Cumulative process time for 'process_single_game': 6.92 seconds.
Processed 5309 games. Time elapsed: 20.02 seconds.
Cumulative read time for 'read_game': 6.27 seconds.
Cumulative process time for 'process_single_game': 13.73 seconds.
Processed 8008 games. Time elapsed: 30.02 seconds.
Cumulative read time for 'read_game': 9.70 seconds.
Cumulative process time for 'process_single_game': 20.30 seconds.
Processed 10723 games. Time elapsed: 40.03 seconds.
Cumulative read time for 'read_game': 12.96 seconds.
Cumulative process time for 'process_single_game': 27.05 seconds.
Processed 13381 games. Time elapsed: 50.03 seconds.
Cumulative read time for 'read_game': 16.10 seconds.
Cumulative process time for 'process_single_game': 33.91 seconds.
Processed 15986 games. Time elapsed: 60.03 seconds.
Cumulative read time for 'read_game': 19.61 seconds.
Cumulative process time for 'process_single_gam

In [33]:
# Display the DataFrame
game_df

Unnamed: 0,game_number,current_fen,board_state,next_move,moves_left,result_from_perspective,termination_status,average_elo
0,1,rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...,"4,2,3,5,6,3,2,4,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,...",1017,112,-1,Time forfeit,2151.5
1,1,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,"4,2,3,6,5,3,2,4,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,...",662,111,1,Time forfeit,2151.5
2,1,rnbqkbnr/pp1ppppp/2p5/8/4P3/8/PPPP1PPP/RNBQKBN...,"4,2,3,5,6,3,2,4,1,1,0,1,1,1,1,1,0,0,1,0,0,0,0,...",485,110,-1,Time forfeit,2151.5
3,1,rnbqkbnr/pp1ppppp/2p5/8/2P1P3/8/PP1P1PPP/RNBQK...,"4,2,3,6,5,3,2,4,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,...",931,109,1,Time forfeit,2151.5
4,1,rnbqkbnr/pp2pppp/2p5/3p4/2P1P3/8/PP1P1PPP/RNBQ...,"4,2,3,5,6,3,2,4,1,1,0,0,1,1,1,1,0,0,1,0,0,0,0,...",565,108,-1,Time forfeit,2151.5
...,...,...,...,...,...,...,...,...
3332346,50000,5Q2/8/4p1p1/6P1/3k4/3pn3/3K4/8 w - - 1 55,"0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1...",1510,4,-1,Time forfeit,1801.0
3332347,50000,8/8/4p1p1/6P1/1Q1k4/3pn3/3K4/8 b - - 2 55,"0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,8,7,0,0,...",1077,3,1,Time forfeit,1801.0
3332348,50000,8/8/4p1p1/6P1/1Qnk4/3p4/3K4/8 w - - 3 56,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,...",763,2,-1,Time forfeit,1801.0
3332349,50000,8/8/4p1p1/6P1/1Qnk4/3p4/8/4K3 b - - 4 56,"0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,...",794,1,1,Time forfeit,1801.0


In [34]:
game_df["termination_status"].unique()

array(['Time forfeit', 'Normal', 'Abandoned', 'Unterminated'],
      dtype=object)

In [35]:
sum(game_df["termination_status"] == 'Unterminated')

161

In [36]:
sum(game_df["termination_status"] == 'Normal')

2045718

In [41]:
sum(game_df["average_elo"]>2200)

299777

In [49]:
a = game_df["current_fen"][0]
a

'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1'

In [44]:
move_to_index["e2e4"]

1017

In [55]:
fen_to_vector(a, 1)

'4,2,3,5,6,3,2,4,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,7,7,7,7,7,7,7,10,8,9,11,12,9,8,10'