# Import APIs

In [1]:
# import julia libraries
using PyCall
using Random
using ProgressMeter
using CSV
using DataFrames


@pyimport chess
cp = pyimport("chess.pgn")

PyObject <module 'chess.pgn' from '/home/ubuntu/.local/lib/python3.10/site-packages/chess/pgn.py'>

# Define Struct

In [2]:
# define the pipeline struct
struct Pipeline
    filename::String        # Filename of the total large file
    export_path::String     # The folder of the exporting files
    seed::Int               # Seed for the randomness control
    total_games::Int        # Total number of games in the file
    games_per_cell::Int     # Number of games in each cell in the table
    round_draw_numbers::Int # Number of random indices to draw each round
    cells::Matrix{Int}      # 2D matrix to track the number of games in each cell

    function Pipeline(filename::String, export_path::String, seed::Int, total_games::Int, games_per_cell::Int, round_draw_numbers::Int)
        new(filename, export_path, seed, total_games, games_per_cell, round_draw_numbers, fill(0, 3, 3))
    end
end


# Define methods/functions

## Check if one game satisfies the criteria

In [3]:
# helper method of check_criteria()
function time_and_elo(headers)
    time_control = get(headers, "TimeControl")
    WhiteElo = parse(Int64, get(headers, "WhiteElo"))
    BlackElo = parse(Int64, get(headers, "BlackElo"))

    return time_control, WhiteElo, BlackElo
end

time_and_elo (generic function with 1 method)

In [4]:
function check_criteria(game)
    # criteria (will set as parameter in future)
    valid_time = ["180+0", "600+0", "1800+0"]
    elo_ranges = [(1250, 1350), (1750, 1850), (2150, Inf)]

    # time groups:                                                                                               
    # 0 - Invalid time group
    # 1 - Blitz: 180+0 seconds (3 minutes)
    # 2 - Rapid: 600+0 seconds (10 minutes)
    # 3 - Classical: 1800+0 seconds (30 minutes)

    time_group = 0

    # valid elo groups:
    # 0 - Invalid time group
    # 1 - Beginners: 1250-1350
    # 2 - Intermediate: 1750-1850
    # 3 - Expert: Above 2150

    elo_group = 0

    # get data from the game header
    time_control, player1_elo, player2_elo = time_and_elo(game.headers)

    for time in 1:length(valid_time)
        if (cmp(valid_time[time], time_control) == 0)
            time_group = time
        end
    end

    for elo in 1:length(elo_ranges)
        if ((elo_ranges[elo][1] < player1_elo) && (player1_elo < elo_ranges[elo][2]) && (elo_ranges[elo][1] < player2_elo) && (player2_elo < elo_ranges[elo][2]))
            elo_group = elo
        end
    end

    return time_group, elo_group
end

check_criteria (generic function with 1 method)

## Random process

In [5]:
function random_process(seed::Int64, range::Int64, n::Int)::Vector{Int32}
    """
    @param seed::Int64 seed of the randoming method
    @param range::Int64 the generated value is from 1 this upper bound
    @param n::Int n numbers of generated number is going to be returned
    @return Vector{Int32} the return list

    """
    Random.seed!(seed)
    drawn = Set{Int32}()
    
    while length(drawn) < n
        # Draw numbers up to the remaining amount needed
        new_draws = rand(1:range, (n - length(drawn)))
        # Add them to the set (automatically filters out duplicates)
        union!(drawn, new_draws)
    end
    
    # Correctly convert the set to a vector
    return collect(Int32, drawn)
end


random_process (generic function with 1 method)

In [6]:
# function random_arr(pipeline::Pipeline, start_nth::Int, n::Int)::Vector{Int32}
#     # 1-indexed
#     Random.seed!(pipeline.seed)
#     rand(1:pipeline.total_games, start_nth - 1)
#     return rand(1:pipeline.total_games, n)
# end

## Game File I/O

In [7]:
# write game in file
function write_game_file(filename::String, game)
    
    # Check if the file exists
    file_mode = isfile(filename) ? "a" : "w"

    # Open the file with appropriate mode
    open(filename, file_mode) do new_pgn
        # Create an exporter instance
        exporter = cp.FileExporter(new_pgn)
        
        # Use the game.accept method to write the game to the file
        game.accept(exporter)
    end

end


write_game_file (generic function with 1 method)

In [8]:
function game_cell_filename(criteria_result)::String
    time_group, elo_group = criteria_result

    # Mapping time groups to their string representations
    time_control = if time_group == 1
        "blitz"
    elseif time_group == 2
        "rapid"
    elseif time_group == 3
        "classical"
    end

    # Mapping ELO groups to their string representations
    elo_range = if elo_group == 1
        "beginner"
    elseif elo_group == 2
        "intermediate"
    elseif elo_group == 3
        "expert"
    end

    filename = "$(time_control)_$(elo_range)"

    return filename
end


game_cell_filename (generic function with 1 method)

## Log file I/O

In [9]:
function write_log(filename::String, message::String; overwrite::Bool=false)
    file_mode = ""
    if overwrite
        file_mode = "w"
    else
        # Check if the file exists
        file_mode = isfile(filename) ? "a" : "w"
    end

    # Open the file with appropriate mode
    open(filename, file_mode) do file
        # Append the string to the file
        write(file, "$message\n")
    end
end


write_log (generic function with 1 method)

In [10]:
# Function to save the current position in a file
function save_game_position(index::Int64, pos::Int64, saved_filename::String)
    save_msg = "$index:$pos"
    write_log(saved_filename, save_msg, overwrite=true)
end

# # Function to restore the position in a file
# function restore_position(file::IOStream, pos::Int64)
#     seek(file, pos)
# end


function log_game_message(game_pos::Int64, path::String, criteria_result::Tuple)::String
    # if the game satisfies one of the traits in the matrix
    # log the game with file pointer position in the game log
    # save game log file separatly

    filename = game_cell_filename(criteria_result)
    write_log("$path/$(filename)_index_list", string(game_pos))

    return "Written $filename game with the position $game_pos"
end

log_game_message (generic function with 1 method)

In [11]:
function matrix_csv(filename::String, criteria_result::Tuple)
    if isfile(filename)
        # Read the existing CSV file
        df = CSV.read(filename, DataFrame)
    else
        # Create a new 3x3 DataFrame with zeros
        df = DataFrame(Int64.(zeros(3, 3)), :auto)
        
        # Rename columns and rows as per your specification
        rename!(df, [:blitz, :rapid, :classical])
        df[!, :Level] = ["beginner", "intermediate", "expert"]
        df = df[:, [:Level, :blitz, :rapid, :classical]]
    end

    # Increment the specified cell by 1
    # Adjust cell_location to account for the added "Level" column
    adjusted_cell_location = (criteria_result[1], criteria_result[2] + 1)
    df[adjusted_cell_location...] += 1

    # Write the DataFrame back to the CSV file
    CSV.write(filename, df)
end

matrix_csv (generic function with 1 method)

In [12]:
function adjustment(filename::String)
    if isfile(filename)
        # Read the existing CSV file
        df = CSV.read(filename, DataFrame)

        # Iterate through each cell and decrement by 1
        for row in 1:size(df, 1)
            for col in 1:size(df, 2)
                # Skip the "Level" column
                if col != 1
                    df[row, col] -= 1
                end
            end
        end
    
        # Write the DataFrame back to the CSV file
        CSV.write(filename, df)
    else
        println("File not found: $filename")
    end
end


adjustment (generic function with 1 method)

# Actual Workflow

## $1^{st}$ workflow - process and log the eligible games in file
1. process every game in the file
2. if the game satisfies one of the cell trait, log the file pointer's position and increment corresponded cell in table
3. if not, continue processing until finished

In [13]:
function pre_check(pipeline::Pipeline, index::Int64=1, starting_position::Int64=0)::Pipeline
    # log files path and name
    log_save_path = "./log"
    pos_save = "running_position"
    game_cell_stats = "game_statistics.csv"

    # Start process file
    open(pipeline.filename, "r") do pgn

        # Check sizes
        seekend(pgn)
        fileSize = position(pgn)
        seekstart(pgn)
        
        # Initialize progress bar
        file_scaning_progress = Progress(fileSize; dt=1.0, desc="Scanning Progress...")

        if starting_position != 0
            # restore process in case interrupted
            print("starting from index $starting_position")
            seek(pgn, starting_position)
        end

        while !eof(pgn)
            cur_pos = position(pgn)
            game = cp.read_game(pgn)

            # Check the game's criteria to see if the game satisfies the condition
            criteria_result = check_criteria(game)

            # If the game satisfies the condition
            if all(x -> x != 0, criteria_result)
                filename = game_cell_filename(criteria_result)
                # log game file and upate the csv matrix
                log_game_message(cur_pos, log_save_path, criteria_result)
                matrix_csv("$log_save_path/$game_cell_stats", criteria_result)
            end

            index += 1
            save_game_position(index, cur_pos, "$log_save_path/$pos_save")

        end
    end

    if pipeline.total_games != index
        pipeline.total_games == index
    end
    
    return pipeline

end

pre_check (generic function with 3 methods)

## $2^{nd}$ workflow - randomly select data from the cell, read and write games

In [14]:
# log the randomized array

# random_process(seed::Int64, range::Int64, n::Int)::Vector{Int32}
# log the randomized array

function log_list(pipeline::Pipeline, log_path::String, game_stats_filename::String)
    # Read the CSV file into a DataFrame
    df = CSV.read("$log_path/$game_stats_filename", DataFrame)
    random_index = 0

    # Iterate through each cell in the DataFrame
    for level in 1:3
        for time in 2:4
            cell_value = df[level, time]

            # Determine the number of games to select
            num_games_to_select = 3000

            section_name = game_cell_filename((level, (time - 1)))
            # Read the corresponding index list file
            index_list_filename = "$log_path/$(section_name)_index_list"
            index_list = CSV.read(index_list_filename, DataFrame)

            # Make sure the range does not exceed the number of rows in the index list
            actual_range = size(index_list, 1)

            # in case the outlier situation
            if num_games_to_select > actual_range
                num_games_to_select = actual_range
            end

            # Generate random indices within the actual range
            indices = random_process((pipeline.seed + random_index), actual_range, num_games_to_select)

            # Select the games based on indices
            selected_games = index_list[indices, :]

            # Write the selected games to a new file
            drawn_games_filename = "$log_path/drawn_game_$(section_name)"
            CSV.write(drawn_games_filename, selected_games, append=true)

            random_index += 1
        end
    end
end


log_list (generic function with 1 method)

## $3^{rd}$ workflow - read and write games to export folder

In [15]:
function main(pipeline::Pipeline, log_path::String)

    open(pipeline.filename, "r") do pgn
        for level in 1:3
            for time in 1:3

                # read file
                section_name = game_cell_filename((level, time))
                drawn_list = "$log_path/drawn_game_$(section_name)"
                
                open(drawn_list, "r") do game_ppt
                    for game_ptr_str in eachline(game_ppt)
                        game_ptr = parse(Int, game_ptr_str)

                        seek(pgn, game_ptr)
                        game = cp.read_game(pgn)

                        println(game.headers)
                        # write file
                        write_game_file("$(pipeline.export_path)/$(section_name).pgn", game)
                    end
                end
            end
        end
    end
    
    println("Total number of game in file: $(pipeline.total_games)");
    println("Work finished! I know, finally! (but it is just the first pipeline...)");
end

main (generic function with 1 method)

# Define variables

In [16]:
# File name of the original decompressed file
filename = "./data/lichess_db_standard_rated_2023-09.pgn"

# The seed for the randomness control in this project
seed = 2023

# Export path
export_path = "./data/exported"

# Total games
total_games = 93218629

# Games per cell
games_per_cell = 3000

# Number of random indices to draw each round
round_draw_numbers = 100000

# log path of all non-game files
log_path = "./log"

# filename of the csv table
game_stats_filename = "game_statistics.csv"

# Pipeline struct
# Initialize with filename, export_path, seed, total_games, and games_per_cell, round_draw_numbers
pipeline = Pipeline(filename, export_path, seed, total_games, games_per_cell, round_draw_numbers)

Pipeline("./data/lichess_db_standard_rated_2023-09.pgn", "./data/exported", 2023, 93218629, 3000, 100000, [0 0 0; 0 0 0; 0 0 0])

# Run workflow

In [17]:
# run pre_check
# @time pre_check(pipeline)

# if interrupted and attempt to restore
# index =  # enter the running position file first value
# position =  # enter the running position file second value
# pre_check(pipeline, index, position)

In [18]:
# decrement the reuslt by 1
# adjustment("$log_path/$game_stats_filename")

In [19]:
# @time log_list(pipeline, log_path, game_stats_filename)

In [20]:
# @time main(pipeline, log_path)

PyObject Headers(Event='Rated Blitz game', Site='https://lichess.org/Cb1HrWMh', Date='2023.09.30', Round='-', White='Taras_K_1848', Black='samy0101', Result='0-1', UTCDate='2023.09.30', UTCTime='13:09:22', WhiteElo='1280', BlackElo='1289', WhiteRatingDiff='-6', BlackRatingDiff='+5', ECO='C00', Opening='French Defense: Normal Variation', TimeControl='180+0', Termination='Normal')
PyObject Headers(Event='Rated Blitz game', Site='https://lichess.org/IXMu4NLw', Date='2023.09.13', Round='-', White='Bulletjie14', Black='juanchonelo', Result='0-1', UTCDate='2023.09.13', UTCTime='16:15:21', WhiteElo='1260', BlackElo='1308', WhiteRatingDiff='-5', BlackRatingDiff='+5', ECO='C23', Opening="Bishop's Opening: Boi Variation", TimeControl='180+0', Termination='Normal')
PyObject Headers(Event='Rated Blitz game', Site='https://lichess.org/gZWpf5It', Date='2023.09.23', Round='-', White='guitarraelectrica123', Black='AjaySaxena', Result='0-1', UTCDate='2023.09.23', UTCTime='05:55:23', WhiteElo='1294', Bl

Excessive output truncated after 524598 bytes.

PyObject Headers(Event='Rated Blitz game', Site='https://lichess.org/z24nEfk9', Date='2023.09.07', Round='-', White='gines41', Black='Ali_Ardabili', Result='0-1', UTCDate='2023.09.07', UTCTime='20:07:12', WhiteElo='1324', BlackElo='1326', WhiteRatingDiff='-6', BlackRatingDiff='+5', ECO='B10', Opening='Caro-Kann Defense: Hillbilly Attack', TimeControl='180+0', Termination='Normal')
PyObject Headers(Event='Rated Blitz game', Site='https://lichess.org/16sY7pdx', Date='2023.09.07', Round='-', White='rachit504', Black='Bryan_dang', Result='1-0', UTCDate='2023.09.07', UTCTime='04:16:34', WhiteElo='1281', BlackElo='1316', WhiteRatingDiff='+6', BlackRatingDiff='-9', ECO='D00', Opening="Queen's Pawn Game: Mason Variation", TimeControl='180+0', Termination='Normal')
PyObject Headers(Event='Rated Blitz game', Site='https://lichess.org/ZOchFGw7', Date='2023.09.29', Round='-', White='luispahissa', Black='Kazare', Result='1-0', UTCDate='2023.09.29', UTCTime='06:48:55', WhiteElo='1310', BlackElo='134