# Chess Dataset Processing Script
- This script processes multiple PGN files and combines them into a single file for easier processing.

In [1]:
import os
from chess.pgn import read_game
import chess
import src.utils.board_to_tensor as bt
from src.utils.data_preperation import chessDataLoader
import torch
import numpy as np

In [6]:
# Sample just the first game of the PGN file. A None result from
# read_game() is treated as "no game found", leaving `games` empty.
games = []
with open("dataset/chess_1.pgn") as first_pgn:
    game = chess.pgn.read_game(first_pgn)
    if game is not None:
        games.append(game)

## Counting number of games

In [3]:
# Specify the PGN file path
pgn_file_path = "./combined_games.pgn"

# Count the games in the file. Only the number of games is needed, so use
# chess.pgn.skip_game(), which advances past one game and reports whether a
# game was found, instead of building a Game object and a move list per game.
game_count = 0  # Number of games seen so far
with open(pgn_file_path, "r") as pgn_file:
    while chess.pgn.skip_game(pgn_file):
        game_count += 1

print(f"Processed {game_count} games.")


Processed 125550 games.


## Combine two or more PGN files 

In [1]:
# Folder containing the PGN files
folder_path = "dataset"

# File to store combined valid games
output_file_path = "combined_games.pgn"

# Counters: every game encountered vs. games actually written out
total_games = 0
valid_games = 0

# Resolve the output location once so it can be excluded from the inputs.
output_real_path = os.path.realpath(output_file_path)

with open(output_file_path, "w") as output_file:
    # os.listdir() returns entries in arbitrary order; sort them so the
    # combined file is identical from one run to the next.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pgn"):
            continue

        file_path = os.path.join(folder_path, filename)

        # Never read the file that is currently being written: if the output
        # ever lives inside folder_path it would be fed back into itself.
        if os.path.realpath(file_path) == output_real_path:
            continue

        # Open and read each PGN file
        with open(file_path, "r") as file:
            while True:
                try:
                    game = read_game(file)
                    if game is None:  # no further game in this file
                        break
                    # Write valid games to the output file
                    output_file.write(str(game) + "\n\n")
                    valid_games += 1
                except ValueError as e:
                    # Skip games with unsupported variants
                    # NOTE(review): python-chess may record parse problems in
                    # game.errors rather than raising -- confirm whether such
                    # games should also be excluded here.
                    print(f"Skipping a game due to error: {e}")
                # Counts written and skipped games alike; the end-of-file
                # `break` above exits before reaching this line.
                total_games += 1

# Print the results
print(f"Total games processed: {total_games}")
print(f"Valid games written: {valid_games}")


Total games processed: 125550
Valid games written: 125550


## PGN to Tensor conversion

In [1]:
import os
from chess.pgn import read_game
import chess
import src.utils.board_to_tensor as bt
from src.utils.data_preperation import chessDataLoader
import torch
import numpy as np

# Location of the Stockfish binary. A STOCKFISH_PATH environment variable
# takes precedence so the notebook is not tied to a single machine; the
# Homebrew install path remains the default.
stockfish_path = os.environ.get("STOCKFISH_PATH", "/opt/homebrew/bin/stockfish")

# Encode up to 40,000 games from the PGN file into the board tensors and the
# value / policy data returned by the project's TensorEncoder.
tensor_dataset, value_data, policy_data = bt.TensorEncoder(
    "./dataset/chess_1.pgn",
    limit_games=int(4e4),
    depth=8,
    stockfish_path=stockfish_path,
)

# Change the file names below before uncommenting and running the saves

# torch.save(tensor_dataset, "dataset/board_2_tensor.pt")
# torch.save(value_data, "dataset/value_2.pt")
# torch.save(policy_data, "dataset/policy_2.pt")


Read 63350....
Limiting the games to 40000....


Processing Games: 100%|████████████████| 40000/40000 [03:04<00:00, 216.61game/s]


Time taken: 0h 3m 52s


## Verify the Data Loader

In [3]:
import os
from chess.pgn import read_game
import src.utils.board_to_tensor as bt
from src.utils.data_preperation import chessDataLoader
import torch
import numpy as np

# NOTE(review): the board tensors are read from a "board_1" file while the
# value and policy data come from "*_2" files -- confirm these belong to the
# same dataset before trusting the shapes printed below.
tensor_file = "dataset/board_1_tensors.pt"
value_file = "dataset/value_2.pt"
policy_file = "dataset/policy_2.pt"
batch_size = 256
dataloader = chessDataLoader(tensor_file, value_file, policy_file, batch_size=batch_size)

# Inspect only the first batch: print each tensor's shape and check that the
# three tensors agree on their leading (batch) dimension.
for game, value, policy in dataloader:
    print(game.shape)
    print(value.shape)
    print(policy.shape)
    assert game.shape[0] == value.shape[0] == policy.shape[0], (
        "board, value and policy batches differ in their leading dimension"
    )
    break  # Just to show one batch

torch.Size([256, 10, 19, 8, 8])
torch.Size([256, 10])
torch.Size([256, 10, 128])
