In [3]:
!pip install python-chess

Collecting python-chess
  Downloading python_chess-1.999-py3-none-any.whl.metadata (776 bytes)
Collecting chess<2,>=1 (from python-chess)
  Downloading chess-1.11.2.tar.gz (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading python_chess-1.999-py3-none-any.whl (1.4 kB)
Building wheels for collected packages: chess
  Building wheel for chess (setup.py) ... [?25l[?25hdone
  Created wheel for chess: filename=chess-1.11.2-py3-none-any.whl size=147776 sha256=f4edca37671a273c780a8f176dd28630c207b194b2dca28aeb4356c6a8b61cd6
  Stored in directory: /root/.cache/pip/wheels/fb/5d/5c/59a62d8a695285e59ec9c1f66add6f8a9ac4152499a2be0113
Successfully built chess
Installing collected packages: chess, python-chess
Successfully installed chess-1.11.2 python-chess-1.999


In [4]:
import chess
import chess.pgn
import pyarrow as pa
import pyarrow.parquet as pq
import io
from tqdm.notebook import tqdm
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# Set your Chess.com username
username = "alexulanch"

# Set date range (March 2025 to July 2021)
start_date = datetime(2025, 3, 1)
end_date = datetime(2021, 7, 1)

# Output PGN file (single merged file)
output_file = f"/content/data/chesscom_{username}_games_{end_date.year}-{start_date.year}.pgn"

# User-Agent header to prevent Chess.com from blocking requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Function to fetch PGN data
def fetch_pgn(year, month, max_retries=3):
    formatted_month = f"{month:02d}"  # Ensure two-digit month format
    url = f"https://api.chess.com/pub/player/{username}/games/{year}/{formatted_month}/pgn"

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200 and response.text.strip():
                print(f"✅ Nice! You played some games in {year}-{formatted_month}. Adding them to the archive! 📂♟️")
                return f"\n\n; Games from {year}-{formatted_month}\n" + response.text
            elif response.status_code == 200 and not response.text.strip():
                print(f"🤔 Hmm.. looks like you didn't play in {year}-{formatted_month}. Were you on vacation? 🏖️")
                return None  # No games for this month
            else:
                raise requests.exceptions.RequestException(f"Unexpected status: {response.status_code}")
        except requests.exceptions.RequestException as e:
            time.sleep(2)  # Wait before retrying
            if attempt == max_retries - 1:
                print(f"❌ Failed after {max_retries} attempts: {year}-{formatted_month} → {e}")
                return None

# Generate all year-month pairs in reverse order
date_list = []
current_date = start_date
while current_date >= end_date:
    date_list.append((current_date.year, current_date.month))
    current_date -= relativedelta(months=1)

# Fetch PGN files in parallel and merge them into one file
print(f"🚀 Fetching and merging Chess.com PGNs for {username} from {start_date.year}-{start_date.month} to {end_date.year}-{end_date.month}")

merged_pgns = []
with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust workers based on rate limit
    future_to_date = {executor.submit(fetch_pgn, year, month): (year, month) for year, month in date_list}

    for future in as_completed(future_to_date):
        pgn_data = future.result()
        if pgn_data:
            merged_pgns.append(pgn_data)

# Save all merged PGNs into a single file
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.write("\n".join(merged_pgns))

print(f"\n🎉 Done! All games are saved in '{output_file}'.")

🚀 Fetching and merging Chess.com PGNs for alexulanch from 2025-3 to 2021-7
✅ Nice! You played some games in 2025-03. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2025-02. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-11. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2025-01. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-12. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-10. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-09. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-06. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-07. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-08. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-05. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024-04. Adding them to the archive! 📂♟️
✅ Nice! You played some games in 2024

In [9]:
import os
import chess.pgn
import pyarrow as pa
import pyarrow.parquet as pq

def process_pgn_to_parquet(pgn_file_path, output_parquet_path, max_half_moves_per_game=None):
    """
    Process a PGN file, extract metadata and moves, convert all games to FENs, and write to a Parquet file.

    Args:
        pgn_file_path (str): Path to the PGN file
        output_parquet_path (str): Path for the output Parquet file
        max_half_moves_per_game (int, optional): Limit the number of half-moves per game

    Returns:
        dict: Summary of total games and positions processed
    """
    # Define schema including metadata fields
    schema = pa.schema([
        pa.field('game_id', pa.string()),
        pa.field('event', pa.string()),
        pa.field('site', pa.string()),
        pa.field('date', pa.string()),
        pa.field('round', pa.string()),
        pa.field('white_player', pa.string()),
        pa.field('black_player', pa.string()),
        pa.field('white_elo', pa.int32()),
        pa.field('black_elo', pa.int32()),
        pa.field('result', pa.string()),
        pa.field('time_control', pa.string()),
        pa.field('termination', pa.string()),
        pa.field('eco', pa.string()),
        pa.field('eco_url', pa.string()),
        pa.field('start_time', pa.string()),
        pa.field('end_time', pa.string()),
        pa.field('game_link', pa.string()),
        pa.field('move_number', pa.int32()),
        pa.field('half_move', pa.int32()),
        pa.field('fen', pa.string()),
        pa.field('move_san', pa.string()),
    ])

    # Storage for collected data
    data = {field.name: [] for field in schema}

    total_games = 0
    total_positions = 0

    # Read PGN file
    with open(pgn_file_path, 'r') as pgn_file:
        game_id = 0

        while True:
            game = chess.pgn.read_game(pgn_file)
            if game is None:
                break  # End of file

            game_id += 1
            total_games += 1

            # Extract metadata with safe defaults
            get_meta = lambda key, default="Unknown": game.headers.get(key, default)
            white_elo = int(get_meta("WhiteElo", "0")) if get_meta("WhiteElo", "0").isdigit() else None
            black_elo = int(get_meta("BlackElo", "0")) if get_meta("BlackElo", "0").isdigit() else None

            # Store metadata
            metadata = {
                "game_id": str(game_id),
                "event": get_meta("Event"),
                "site": get_meta("Site"),
                "date": get_meta("Date"),
                "round": get_meta("Round"),
                "white_player": get_meta("White"),
                "black_player": get_meta("Black"),
                "white_elo": white_elo,
                "black_elo": black_elo,
                "result": get_meta("Result"),
                "time_control": get_meta("TimeControl"),
                "termination": get_meta("Termination"),
                "eco": get_meta("ECO"),
                "eco_url": get_meta("ECOUrl"),
                "start_time": get_meta("StartTime"),
                "end_time": get_meta("EndTime"),
                "game_link": get_meta("Link"),
            }

            # Process moves
            board = game.board()
            half_move = 0

            # Store initial position
            for key, value in metadata.items():
                data[key].append(value)
            data["move_number"].append(0)
            data["half_move"].append(half_move)
            data["fen"].append(board.fen())
            data["move_san"].append("")  # No move yet
            total_positions += 1

            # Process each move
            mainline_moves = list(game.mainline_moves())

            if max_half_moves_per_game:
                mainline_moves = mainline_moves[:max_half_moves_per_game]

            for move in mainline_moves:
                san = board.san(move)
                board.push(move)
                half_move += 1
                move_number = (half_move + 1) // 2

                for key, value in metadata.items():
                    data[key].append(value)
                data["move_number"].append(move_number)
                data["half_move"].append(half_move)
                data["fen"].append(board.fen())
                data["move_san"].append(san)
                total_positions += 1

    # Convert collected data to PyArrow Table and write to Parquet
    table = pa.Table.from_pydict(data, schema=schema)
    pq.write_table(table, output_parquet_path)

    print(f"\n🎉 Processing complete! Total games: {total_games}, Total positions: {total_positions}")
    print(f"✅ Output written to: {output_parquet_path}")

    return {
        "total_games": total_games,
        "total_positions": total_positions,
        "output_file": output_parquet_path
    }

In [None]:
pgn_file_path = f"data/chesscom_{username}_games_{end_date.year}-{start_date.year}.pgn"
output_parquet_path = f"data/chesscom_{username}_games_{end_date.year}-{start_date.year}.parquet"  # Output file path

result = process_pgn_to_parquet(
    pgn_file_path=pgn_file_path,
    output_parquet_path=output_parquet_path,
)

# Check some statistics about the processing
print(f"Processed {result['total_games']} games")
print(f"Extracted {result['total_positions']} FEN positions")


🎉 Processing complete! Total games: 28847, Total positions: 2004808
✅ Output written to: data/chesscom_alexulanch_games_2021-2025.parquet
Processed 28847 games
Extracted 2004808 FEN positions


In [14]:
username = "alexulanch"
start_date = datetime(2025, 3, 1)
end_date = datetime(2021, 7, 1)
# Read Parquet file
output_parquet_path = f"/content/data/chesscom_{username}_games_{end_date.year}-{start_date.year}.parquet"
table = pq.read_table(output_parquet_path)
df = table.to_pandas()

# Display first few rows
print(df.columns)

Index(['game_id', 'event', 'site', 'date', 'round', 'white_player',
       'black_player', 'white_elo', 'black_elo', 'result', 'time_control',
       'termination', 'eco', 'eco_url', 'start_time', 'end_time', 'game_link',
       'move_number', 'half_move', 'fen', 'move_san'],
      dtype='object')


In [15]:
df_sorted = df.sort_values(by=['date', 'start_time', 'half_move'])

In [16]:
# Create a single-bit column indicating if you played as White (1) or Black (0)
df_sorted['played_as_white'] = (df_sorted['white_player'] == 'alexulanch').astype(int)

# Select only essential columns
essential_columns = [
    'date', 'time_control', 'white_elo', 'black_elo',
    'played_as_white', 'move_number', 'half_move', 'fen', 'move_san'
]

# Create a new reduced DataFrame
df_reduced = df_sorted[essential_columns]

# Display the cleaned DataFrame
import pandas as pd
print(df_reduced.head())  # Show first few rows

               date time_control  white_elo  black_elo  played_as_white  \
1956026  2021.07.10      1/86400       1200        584                1   
1956027  2021.07.10      1/86400       1200        584                1   
1956028  2021.07.10      1/86400       1200        584                1   
1956029  2021.07.10      1/86400       1200        584                1   
1956030  2021.07.10      1/86400       1200        584                1   

         move_number  half_move  \
1956026            0          0   
1956027            1          1   
1956028            1          2   
1956029            2          3   
1956030            2          4   

                                                       fen move_san  
1956026  rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...           
1956027  rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...       e4  
1956028  rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBN...       d5  
1956029  rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPPNPPP/RNBQKB1...

In [17]:
# Set the sliding window size
window_size = 3  # Number of previous positions to include

# Create new columns for past positions
for i in range(1, window_size):
    df_reduced[f'fen_t-{i}'] = df_reduced['fen'].shift(i)

# Drop rows with NaN due to shifting
df_transformed = df_reduced.dropna()

# Select features (X) and target (y)
X_cols = [f'fen_t-{i}' for i in range(1, window_size)] + ['fen', 'move_number', 'half_move',
                                                           'time_control', 'white_elo', 'black_elo',
                                                           'played_as_white']
y_col = 'move_san'

X = df_transformed[X_cols]
y = df_transformed[y_col]

# Display transformed data
print(X, "\n")
print(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced[f'fen_t-{i}'] = df_reduced['fen'].shift(i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced[f'fen_t-{i}'] = df_reduced['fen'].shift(i)


                                                   fen_t-1  \
1956028  rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...   
1956029  rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBN...   
1956030  rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPPNPPP/RNBQKB1...   
1956031  rnbqkbnr/ppp1pppp/8/8/4p3/8/PPPPNPPP/RNBQKB1R ...   
1956032  rnbqkbnr/ppp1pppp/8/8/4p3/5P2/PPPPN1PP/RNBQKB1...   
...                                                    ...   
94               8/8/5p1p/1pk4P/4P1P1/2K2P2/8/8 b - - 0 47   
95              8/8/5p1p/2k4P/1p2P1P1/2K2P2/8/8 w - - 0 48   
96              8/8/5p1p/2k4P/1p2P1P1/1K3P2/8/8 b - - 1 48   
97              8/8/5p1p/1k5P/1p2P1P1/1K3P2/8/8 w - - 2 49   
98                8/8/5p1p/1k5P/1p2PPP1/1K6/8/8 b - - 0 49   

                                                   fen_t-2  \
1956028  rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...   
1956029  rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...   
1956030  rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBN...   
1956031

In [18]:
import torch

# Define a custom vocabulary for FEN characters
fen_vocab = {
    'r': 1, 'n': 2, 'b': 3, 'q': 4, 'k': 5, 'p': 6,  # Black pieces
    'R': 7, 'N': 8, 'B': 9, 'Q': 10, 'K': 11, 'P': 12,  # White pieces
    '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20,  # Empty squares
    '/': 21, 'w': 22, 'b': 23, 'K': 24, 'Q': 25, 'k': 26, 'q': 27, '-': 28,  # Game metadata
    '0': 29, ' ': 30,  # Turn counter, space separator
    '+': 31, 'x': 32, '#': 33,  # Special move symbols
    '=': 34, 'O': 35  # Pawn promotion & castling (O-O, O-O-O)
}

# Function to convert FEN to a tensor of tokens
def fen_to_tokens(fen, max_length=80):
    tokens = [fen_vocab.get(char, 0) for char in fen]  # Convert FEN to token IDs
    tokens = tokens[:max_length]  # Truncate if too long
    tokens += [0] * (max_length - len(tokens))  # Pad if too short
    return torch.tensor(tokens, dtype=torch.long)

# Example FEN position
fen_example = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"

# Convert to tokenized format
fen_tokens = fen_to_tokens(fen_example)
print(fen_tokens)

tensor([ 1,  2, 23, 27, 26, 23,  2,  1, 21,  6,  6,  6,  6,  6,  6,  6,  6, 21,
        20, 21, 20, 21, 20, 21, 20, 21, 12, 12, 12, 12, 12, 12, 12, 12, 21,  7,
         8,  9, 25, 24,  9,  8,  7, 30, 22, 30, 24, 25, 26, 27, 30, 28, 30, 29,
        30, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0])


In [19]:
import torch.nn as nn

# Define embedding layer
embedding_dim = 256  # Can be adjusted based on model size
vocab_size = len(fen_vocab) + 1  # Account for padding (0)

embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Convert tokenized FEN to embeddings
embedded_fen = embedding_layer(fen_tokens.unsqueeze(0))  # Add batch dimension
print(embedded_fen.shape)  # Expected: (1, 80, 256)

torch.Size([1, 80, 256])


In [20]:
embedded_fen.shape

torch.Size([1, 80, 256])