In [1]:
# Download and build the packages this notebook relies on
# (PyCall is what gives access to Python).
import Pkg

# The first iterator is the outer loop: every package is added first and only
# then is every package built, each time in the order listed here.
for action in (Pkg.add, Pkg.build),
    pkg in ("PyCall", "Pandas", "DataStructures", "ProgressMeter")

    action(pkg)
end



[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.9/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.9/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.9/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.9/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.9/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.9/Manifest.toml`
[32m[1m    Building[22m[39m Conda ─→ `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/8c86e48c0db1564a1d49548d3515ced5d604c408/build.log`
[32m[1m    Building[22m[39m PyCall → `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/43d304ac6f0354755f1d60730ece8c499980f7ba/build.log`
[32m[1m    Building[22m[39m Conda ─→ `~/.julia/scratchspaces/44cfe9

# Imports

In [2]:
# import julia libraries
using PyCall
using Random
using ProgressMeter
using DataStructures
# using Pandas

# import python libraries
# np = pyimport("numpy")

# if "invalid redefinition of constant chess" error pops out
# try to clear cache and restart the session
@pyimport chess
cp = pyimport("chess.pgn")

PyObject <module 'chess.pgn' from '/home/ubuntu/.local/lib/python3.10/site-packages/chess/pgn.py'>

# Explaining the Chess Game Analysis Pipeline

The Chess Game Analysis Pipeline is designed to process and analyze chess games from a PGN (Portable Game Notation) file. This pipeline is particularly tailored for filtering and categorizing games based on specific criteria such as time control and player ELO ratings. Below is an overview of its components and functionalities.

## Overview

The pipeline is structured around a `Pipeline` struct in Julia, which is used to manage the processing of chess games. It includes various functions for counting games, reading specific games, generating random game indices, and more.

## Global Variables

- `filename`: The path to the PGN file containing the chess games.
- `seed`: A seed value for controlling randomness in the pipeline, ensuring reproducible results.

## The `Pipeline` Struct

The `Pipeline` struct is the core of the pipeline: it holds the fields needed to process the chess games, and the methods listed below are the functions that work with it or with individual games.

### Fields

- `filename`: Path to the PGN file.
- `export_path`: Folder that the exported PGN files are written to.
- `seed`: Seed for random number generation, ensuring consistency.
- `games_per_cell`: The target number of games for each category in the analysis matrix.
- `matrix`: A 3x3 matrix, each cell containing an array of integers representing game indices (currently commented out in the struct definition).
- `total_games`: Total number of games in the file.

### Methods

- `count_games_in_file`: Counts the total number of games in the PGN file.
- `read_nth_game`: Reads the nth game from the file.
- `randoming`: Generates a random number within the range of total games in the file.
- `write_file`: Writes game data to a file, either creating a new file or appending to an existing one.
- `check_criteria`: Checks whether a game meets the specified criteria based on time controls and player ELO ranges.

## Processing Procedure

1. **Initialization**: A `Pipeline` instance is created with the specified filename and seed.
2. **Counting Games**: The total number of games in the file is counted.
3. **Game Selection**: Games are randomly selected and checked against specified criteria.
4. **Criteria Checking**: Each game is evaluated to determine if it fits the desired time control and ELO range categories.
5. **Data Storage**: Valid games are stored in the matrix, categorized based on the analysis goals.


In [3]:
"""
    Pipeline(filename, export_path, seed, total_games, games_per_cell)

Immutable configuration for one sampling run over a PGN file.

# Fields
- `filename::String`: path of the large source file.
- `export_path::String`: folder the exported files are written to.
- `seed::Int`: seed for the randomness control.
- `total_games::Int`: total number of games in the file.
- `games_per_cell::Int`: number of games wanted in each cell of the table.

Constructing a `Pipeline` also calls `Random.seed!(seed)`, i.e. it reseeds Julia's
default random number generator.

# Throws
- `ArgumentError` if `total_games` or `games_per_cell` is smaller than 1.
"""
struct Pipeline
    filename::String
    export_path::String

    seed::Int
    total_games::Int

    games_per_cell::Int

    function Pipeline(
        filename::String,
        export_path::String,
        seed::Int,
        total_games::Int,
        games_per_cell::Int,
    )
        # Game indices are drawn from 1:total_games, so the range must not be empty.
        total_games >= 1 ||
            throw(ArgumentError("total_games must be at least 1, got $total_games"))
        # With a target below 1 every cell already counts as full, so no game could
        # ever be stored.
        games_per_cell >= 1 ||
            throw(ArgumentError("games_per_cell must be at least 1, got $games_per_cell"))

        Random.seed!(seed)
        return new(filename, export_path, seed, total_games, games_per_cell)
    end
end


## File and group of games operation

In [4]:
"""
    read_nth_game(pipeline::Pipeline, n::Int)

Return the `n`-th game (counting from 1) of `pipeline.filename` as parsed by
`chess.pgn.read_game`.

Returns `nothing` when `n < 1` or when the file ends before `n` games were found.
The file is opened for each call and is always closed again, including when the
Python parser raises.
"""
function read_nth_game(pipeline::Pipeline, n::Int)
    n >= 1 || return nothing

    # The do-block form closes the file even if an exception escapes.
    return open(pipeline.filename) do pgn
        # Only the n-th game is needed, so the games before it are skipped rather
        # than parsed into full game objects.
        for _ in 1:(n - 1)
            cp.skip_game(pgn) || return nothing  # file ended early
        end
        return cp.read_game(pgn)
    end
end

read_nth_game (generic function with 1 method)

In [5]:
"""
    write_file(filename::String, game)

Append `game` to `filename` through a `chess.pgn.FileExporter`.

The file is created when it does not exist yet, and so is its parent folder.
Returns whatever `game.accept(exporter)` returns.
"""
function write_file(filename::String, game)
    # Opening a file does not create missing folders, so create them up front.
    folder = dirname(filename)
    isempty(folder) || mkpath(folder)

    # Append mode creates the file when it is missing, so a separate existence
    # check (and a switch between "w" and "a") is unnecessary.
    return open(filename, "a") do new_pgn
        exporter = cp.FileExporter(new_pgn)
        game.accept(exporter)
    end
end


write_file (generic function with 1 method)

In [6]:
"""
    randoming(pipeline::Pipeline, n::Int)

Return the `n`-th draw from `1:pipeline.total_games` in the sequence that starts
right after reseeding the default random number generator with `pipeline.seed`.

Every call reseeds, so the result depends only on the pipeline and on `n`.
"""
function randoming(pipeline::Pipeline, n::Int)
    index_range = 1:pipeline.total_games

    Random.seed!(pipeline.seed)
    # Consume (and discard) the first n - 1 draws of the sequence.
    rand(index_range, n - 1)
    return rand(index_range)
end

randoming (generic function with 1 method)

## Single game operation

In [7]:
"""
    time_and_elo(headers)

Return `(time_control, WhiteElo, BlackElo)` taken from the `"TimeControl"`,
`"WhiteElo"` and `"BlackElo"` entries of `headers`.

`headers` can be anything that supports `get(headers, key, default)`.

A missing `"TimeControl"` entry yields `"?"`. A rating that is missing or is not a
plain integer (for example `"?"`) yields `0`, so the function always returns
`Int64` ratings and never throws on such input. `0` lies outside every rating band
used by `check_criteria`, so such a game is treated as not matching any ELO group.
"""
function time_and_elo(headers)
    time_control = get(headers, "TimeControl", "?")
    WhiteElo = _parse_elo(get(headers, "WhiteElo", nothing))
    BlackElo = _parse_elo(get(headers, "BlackElo", nothing))

    return time_control, WhiteElo, BlackElo
end

# Turn a raw header value into an Int64 rating; 0 stands for "no usable rating".
_parse_elo(raw::AbstractString) = something(tryparse(Int64, raw), 0)
_parse_elo(raw) = 0

time_and_elo (generic function with 1 method)

In [8]:
"""
    check_criteria(game)

Classify `game` by its time control and by the ELO band both players fall into.

Returns `(time_group, elo_group)`. A `0` in either position means the game fits no
group of that kind.

Time groups:
- `1` - Blitz: `180+0` seconds (3 minutes)
- `2` - Rapid: `600+0` seconds (10 minutes)
- `3` - Classical: `1800+0` seconds (30 minutes)

ELO groups (both players must lie strictly between the two bounds):
- `1` - Beginners: 1250-1350
- `2` - Intermediate: 1750-1850
- `3` - Expert: above 2150
"""
function check_criteria(game)
    # criteria (hard-coded for now, meant to become parameters later)
    valid_time = ["180+0", "600+0", "1800+0"]
    elo_ranges = [(1250, 1350), (1750, 1850), (2150, Inf)]

    # get data from the game header
    time_control, player1_elo, player2_elo = time_and_elo(game.headers)

    # Both bounds of a band are exclusive.
    inside(elo, band) = (band[1] < elo) && (elo < band[2])

    # When several entries match, the last one wins.
    time_hit = findlast(label -> cmp(label, time_control) == 0, valid_time)
    elo_hit = findlast(
        band -> inside(player1_elo, band) && inside(player2_elo, band), elo_ranges
    )

    time_group = time_hit === nothing ? 0 : time_hit
    elo_group = elo_hit === nothing ? 0 : elo_hit

    return time_group, elo_group
end

check_criteria (generic function with 1 method)

In [9]:
"""
    determine_tree_index(criteria_result)

Map a `(time_group, elo_group)` pair to a single index, three ELO groups per time
group: `(1, 1)` gives 1, `(1, 3)` gives 3, `(2, 1)` gives 4, and so on.

Returns `0` (an invalid index) when either group is `0`.
"""
function determine_tree_index(criteria_result)
    time_group, elo_group = criteria_result

    # A zero group means the game matched no category.
    (time_group == 0 || elo_group == 0) && return 0

    return 3 * (time_group - 1) + elo_group
end

determine_tree_index (generic function with 1 method)

In [10]:
"""
    get_file_name_components(criteria_result)

Translate a `(time_group, elo_group)` pair into the two name parts
`(time_control, elo_range)`.

Time groups 1, 2, 3 map to `"blitz"`, `"rapid"`, `"classical"`; ELO groups 1, 2, 3
map to `"beginner"`, `"intermediate"`, `"expert"`. Any other group value gives
`nothing` in its position.
"""
function get_file_name_components(criteria_result)
    time_group, elo_group = criteria_result

    # The label at position k belongs to group k; `nothing` when no position matches.
    function label_for(group, labels)
        for (position, label) in enumerate(labels)
            if group == position
                return label
            end
        end
        return nothing
    end

    time_control = label_for(time_group, ("blitz", "rapid", "classical"))
    elo_range = label_for(elo_group, ("beginner", "intermediate", "expert"))

    return time_control, elo_range
end


get_file_name_components (generic function with 1 method)

In [72]:
"""
    display_progress(trees, descriptions, tree_index, pipeline)

Return how full `trees[tree_index]` is, as a percentage of
`pipeline.games_per_cell`, rounded to a whole number.

Nothing is printed, and `descriptions` is not used.
"""
function display_progress(trees, descriptions, tree_index, pipeline)
    stored = length(trees[tree_index])
    fraction = stored / pipeline.games_per_cell
    return round(fraction * 100)
end


display_progress (generic function with 2 methods)

In [81]:
"""
    main(pipeline::Pipeline)

Draw random game indices, classify each drawn game and append every accepted game
to the export file of its (time control, ELO) cell.

The run ends as soon as every one of the 9 cells holds `pipeline.games_per_cell`
games. It also ends, with a warning, once every index in `1:pipeline.total_games`
has been examined without filling all cells, so the loop cannot run forever.

An index that was drawn before is skipped without reading the file again, so no
game is exported twice. Indices for which no game can be read are skipped as well.
Returns `nothing`.
"""
function main(pipeline::Pipeline)
    # One sorted tree of stored game indices per (time group, ELO group) cell.
    trees = [AVLTree{Int}() for _ in 1:9]

    # Indices that were drawn already. A game's classification never changes and a
    # full cell never becomes free again, so examining an index a second time can
    # never store a new game.
    seen = Set{Int}()

    n = 0
    while length(seen) < pipeline.total_games
        n += 1

        # Generate a random game index
        game_index = randoming(pipeline, n)
        game_index in seen && continue
        push!(seen, game_index)

        # `nothing` means the file holds fewer games than this index.
        game = read_nth_game(pipeline, game_index)
        game === nothing && continue

        # An index of 0 marks a game that matches no time group or no ELO group.
        criteria_result = check_criteria(game)
        tree_index = determine_tree_index(criteria_result)
        tree_index == 0 && continue

        # Skip games whose cell has already reached its limit.
        length(trees[tree_index]) >= pipeline.games_per_cell && continue

        insert!(trees[tree_index], game_index)

        # Write the game to the file that belongs to its cell.
        time_control, elo_range = get_file_name_components(criteria_result)
        filename = "$(time_control)_$(elo_range)_$(pipeline.games_per_cell).pgn"
        write_file(joinpath(pipeline.export_path, filename), game)

        # Done as soon as every cell is full.
        if all(t -> length(t) >= pipeline.games_per_cell, trees)
            return nothing
        end
    end

    @warn "Examined all $(pipeline.total_games) game indices without filling every cell"
    return nothing
end


main (generic function with 1 method)

# Define variables

In [79]:
# file name of the original decompressed file
filename = "./data/lichess_db_standard_rated_2023-09.pgn"

# the seed for the randomness control in this project
seed = 2023

# export path (folder that receives the exported PGN files)
export_path = "./data/exported/"

# pipeline struct
# Arguments: (filename, export_path, seed, total_games, games_per_cell).
# The commented-out call passes 93218629 as total_games; the active call passes 1000.
# pipeline = Pipeline(filename, export_path, seed, 93218629, 1) # (filename, export_path, seed, total_games, games_per_cell)
pipeline = Pipeline(filename, export_path, seed, 1000, 1) # (filename, export_path, seed, total_games, games_per_cell)

Pipeline("./data/lichess_db_standard_rated_2023-09.pgn", "./data/exported/", 2023, 1000, 1)

Planned sampling procedure:

1. From the game indices 1–93218629, randomly choose 100,000, sort them in ascending order, go all the way through that list, and then stop.
2. Check whether any cells are full.
3. If a cell is full, close it so that it is not filled any further in the next round.
4. Repeat the process until all cells are closed (cells may be left over-filled).

After all cells are closed:
<!-- then randomly select a certain number of the games in each cell? -->
leave the cells over-filled.

Use random draws without replacement.

In [80]:
# Run the sampling loop on the `pipeline` defined earlier in this notebook.
main(pipeline)

[K26 th game processing, game index 113, criteria result = (1, 2)Progress for blitz_intermediate: 100.0%
[K39 th game processing, game index 151, criteria result = (2, 1)Progress for rapid_beginner: 100.0%
[K127 th game processing, game index 121, criteria result = (1, 3)Progress for blitz_expert: 100.0%
[K154 th game processing, game index 930, criteria result = (1, 1)Progress for blitz_beginner: 100.0%
[K314 th game processing, game index 618, criteria result = (2, 2)Progress for rapid_intermediate: 100.0%
[K1233 th game processing, game index 910, criteria result = (1, 2)

LoadError: InterruptException:

# Test code

In [None]:
# tenth_game = read_nth_game(pipeline, 10)
# get(tenth_game.headers, "TimeControl")
# print(tenth_game.headers)


In [None]:
# test_filename = "test.pgn"

# write_file(joinpath(export_path, test_filename), tenth_game)

In [None]:
# # file name of the original decompressed file

# test = "./data/exported/test.pgn"

# # the seed for the randomness control in this project
# seed = 2023

# # pipeline struct
# test_pipeline = Pipeline(test, seed, 33, 300)


In [None]:
# for i in 1:10
#     game = read_nth_game(pipeline, i)
#     println(time_and_elo(game.headers))
#     println(check_criteria(game))
# end


In [None]:
# tenth_game = read_nth_game(pipeline, 10)
# time_control = get(tenth_game.headers, "TimeControl")
# WhiteElo = get(tenth_game.headers, "WhiteElo")
# BlackElo = get(tenth_game.headers, "BlackElo")
# # println(time_control)
# # println(cmp(time_control, "180+0"))
# println(WhiteElo)
# println(typeof(parse(Int, BlackElo)))