In [1]:
import os
import gc
import random
import numpy as np
import chess
import chess.pgn as pgn
import h5py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm import tqdm
from model import ChessModel
from processing_parallel import collect_unique_moves_parallel, preprocess_and_save_to_hdf5_parallel
from dataset import HDF5Dataset
from torch.optim.lr_scheduler import OneCycleLR
import time
import pickle

In [2]:
# Locate the PGN inputs; sorting keeps the file order stable between runs.
data_dir = '../../games/data/pgn'
files = sorted(
    os.path.join(data_dir, pgn_name)
    for pgn_name in os.listdir(data_dir)
    if pgn_name.endswith('.pgn')
)

# Cap how many PGN files are handed to the later cells.
LIMIT_OF_FILES = min(len(files), 28)
files = files[:LIMIT_OF_FILES]

# Run parameters consumed by the cells below.
max_games = 500_000
positions_per_game = 10
batch_size = 128
num_epochs = 2

In [None]:
# Collect the move vocabulary from the PGN files. The helper returns the
# move -> int mapping together with the number of classes
# (see processing_parallel for the exact semantics).
move_to_int, num_classes = collect_unique_moves_parallel(files, max_games=max_games)

# Save move_to_int mapping for future use.
# The path must match the one the loading cell opens ("../mark5_move_to_int.pkl");
# writing to the current directory instead would make a fresh
# Restart & Run All load a stale or missing file.
with open('../mark5_move_to_int.pkl', 'wb') as f:
    pickle.dump(move_to_int, f)

In [3]:
# Restore the pickled move_to_int mapping from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files produced
# by this project.
with open("../mark5_move_to_int.pkl", "rb") as mapping_file:
    move_to_int = pickle.load(mapping_file)

# One output class per entry in the mapping.
num_classes = len(move_to_int)

Timing note: about 130 minutes for roughly 638k rows of HDF5 output (about 140 MB).

In [None]:
# Preprocess the PGN files and store the result as HDF5.
# NOTE(review): the output location is chosen inside processing_parallel;
# confirm it matches the path opened by the dataset cell.
preprocess_and_save_to_hdf5_parallel(
    files,
    move_to_int,
    max_games,
    positions_per_game=positions_per_game,
)

In [4]:
# Dataset backed by the preprocessed HDF5 file.
dataset = HDF5Dataset('../preprocessed_data.h5')

# Single-process loading (num_workers=0), reshuffled every epoch.
data_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

# Use the MPS backend when it is available, otherwise fall back to CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f'Using device: {device}')

model = ChessModel(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()

# OneCycleLR drives the learning rate itself (starting from max_lr / div_factor),
# so the lr passed to Adam here is effectively a placeholder.
optimizer = optim.Adam(model.parameters(), lr=0.001)
max_lr = 0.01  # You can adjust this based on experimentation

# steps_per_epoch must equal the number of optimizer steps actually taken per
# epoch. len(data_loader) is exactly that (it accounts for the partial last
# batch), whereas `len(dataset) // batch_size + 1` is one too many whenever the
# dataset size is an exact multiple of batch_size, leaving the cycle unfinished.
scheduler = OneCycleLR(
    optimizer,
    max_lr=max_lr,
    steps_per_epoch=len(data_loader),
    epochs=num_epochs,
)

Using device: mps


In [5]:
# Train for num_epochs passes over data_loader, stepping the LR scheduler
# once per batch and reporting the mean per-sample loss for each epoch.
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    running_loss = 0.0  # sum of per-sample losses seen so far this epoch
    samples_seen = 0    # number of samples behind running_loss
    pbar = tqdm(data_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for X_batch, y_batch in pbar:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the one-cycle schedule advances per batch, not per epoch

        # criterion returns the batch mean, so weight it by the batch size
        # to accumulate a per-sample sum.
        n_in_batch = X_batch.size(0)
        running_loss += loss.item() * n_in_batch
        samples_seen += n_in_batch

        # Divide by the samples actually processed. The previous
        # `(pbar.n + 1) * batch_size` denominator was unreliable: tqdm only
        # syncs `pbar.n` lazily while iterating, and the final batch may hold
        # fewer than batch_size samples.
        pbar.set_postfix({'Loss': running_loss / samples_seen})

    epoch_loss = running_loss / samples_seen
    epoch_time = time.time() - start_time
    minutes, seconds = divmod(int(epoch_time), 60)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}, Time: {minutes}m{seconds}s')

                                                    

OSError: Can't read data (addr overflow, addr = 1672, size = 3656, eoa = 2048)

In [None]:
# Persist the trained weights (state_dict only, not the whole model object).
# NOTE(review): the "10e" in the file name may not match the configured
# num_epochs; confirm before relying on the name.
checkpoint_path = "mark5-10e-500k.pth"
torch.save(model.state_dict(), checkpoint_path)

In [7]:
import h5py  # also imported in the first cell; kept so this diagnostic cell runs standalone

# Diagnostic: list the top-level keys of the preprocessed file and the shape
# of its 'X' and 'y' datasets (reads metadata only, no array contents).
hdf5_file_path = "../preprocessed_data.h5"
with h5py.File(hdf5_file_path, 'r') as h5:
    print("Keys:", list(h5.keys()))
    for dataset_name in ("X", "y"):
        print(f"{dataset_name} Shape:", h5[dataset_name].shape)

Keys: ['X', 'y']
X Shape: (5000000, 16, 8, 8)
y Shape: (5000000,)


In [8]:
import h5py  # also imported in the first cell; kept so this diagnostic cell runs standalone

# Diagnostic: try to read the first 10 entries of each dataset to check that
# the file contents are readable. Relies on hdf5_file_path from the cell above.
# Any failure is reported rather than raised.
try:
    with h5py.File(hdf5_file_path, 'r') as h5:
        for dataset_name in ('X', 'y'):
            h5[dataset_name][:10]  # Attempt to read a small portion
except Exception as err:
    print("Error accessing HDF5 file:", err)

Error accessing HDF5 file: Can't read data (addr overflow, addr = 1672, size = 3656, eoa = 2048)
