In [3]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json

In [4]:
target = "/Users/jonah/Desktop/Projects/Chess-Insights/lichess_db_standard_rated_2020-01.parquet"
date = target.split("_")[-1].split(".")[0]
OUT = "/Users/jonah/Desktop/Projects/Chess-Insights/" f"lichess_{date}_analytics" + ".json"
data = {
    "openings": {},
    "time_controls": {},
    "terminations": {}
}
file = pq.ParquetFile(target)

In [5]:
def elo_to_category(elo):
    if elo < 800:
        return "beginner"
    if elo < 1600:
        return "intermediate"
    return "advanced"

In [None]:
# Prepare the data dictionary
data = {
    "openings": {},
    "ecos": {},
    "time_controls": {},
    "terminations": {}
}

i = 1

# Process the parquet file in batches
for batch in file.iter_batches():
    # Convert batch to pandas DataFrame
    df = batch.to_pandas()

    # Vectorize the elo_to_category function using pandas apply
    df['category'] = df['white_elo'].apply(elo_to_category)

    # Group by 'opening', 'time_control', and 'termination', and count the games by category
    openings_group = df.groupby(['opening', 'category']).size().reset_index(name='games')
    time_controls_group = df.groupby(['time_control', 'category']).size().reset_index(name='games')
    terminations_group = df.groupby(['termination', 'category']).size().reset_index(name='games')
    eco_group = df.groupby(['eco', 'category']).size().reset_index(name='games')

    # Update the data dictionary for openings
    for _, row in openings_group.iterrows():
        opening = row['opening']
        category = row['category']
        games = row['games']
        if opening not in data['openings']:
            data['openings'][opening] = {"beginner": {"games": 0}, "intermediate": {"games": 0}, "advanced": {"games": 0}}
        data['openings'][opening][category]["games"] += games

    # Update the data dictionary for time controls
    for _, row in time_controls_group.iterrows():
        time_control = row['time_control']
        category = row['category']
        games = row['games']
        if time_control not in data['time_controls']:
            data['time_controls'][time_control] = {"beginner": {"games": 0}, "intermediate": {"games": 0}, "advanced": {"games": 0}}
        data['time_controls'][time_control][category]["games"] += games

    # Update the data dictionary for terminations
    for _, row in terminations_group.iterrows():
        termination = row['termination']
        category = row['category']
        games = row['games']
        if termination not in data['terminations']:
            data['terminations'][termination] = {"beginner": {"games": 0}, "intermediate": {"games": 0}, "advanced": {"games": 0}}
        data['terminations'][termination][category]["games"] += games

    for _, row in eco_group.iterrows():
        eco = row['eco']
        category = row['category']
        games = row['games']
        if eco not in data['ecos']:
            data['ecos'][eco] = {"beginner": {"games": 0}, "intermediate": {"games": 0}, "advanced": {"games": 0}}
        data['ecos'][eco][category]["games"] += games
    
    if i % 5 == 0:
        print(f"Processed batch {i}")
    i += 1

In [2]:
import re

def count_moves(pgn_string):
    # Remove comments and metadata
    pgn_string = re.sub(r'\{[^}]*\}|\[.*?\]', '', pgn_string)
    
    # Find move notations (e.g., "1. e4 e5 2. Nf3 Nc6")
    moves = re.findall(r'\b\d+\.\s+[a-zA-Z0-9+#=\s]+', pgn_string)
    
    # Split moves and count them
    move_count = sum(len(re.findall(r'\b[a-h][1-8][a-h][1-8]|[a-h][1-8]|[O\-]{3,5}|[KQNRB]?[a-h]?[1-8]?x?[a-h][1-8]|[KQNRB][a-h1-8]\+?', move)) for move in moves)
    
    return move_count

In [13]:
# Prepare the data dictionary
data = {
    "beginner": {"count": 0, "n": 0},
    "intermediate": {"count": 0, "n": 0},
    "advanced": {"count": 0, "n": 0}
}

i = 1

# Process the parquet file in batches
for batch in file.iter_batches():
    # Convert batch to pandas DataFrame
    df = batch.to_pandas()

    # Vectorize the elo_to_category function using pandas apply
    df['category'] = df['white_elo'].apply(elo_to_category)

    # Group by 'opening', 'time_control', and 'termination', and count the games by category
    moves_group = df.groupby(['moves', 'category']).size().reset_index(name='games')

    # Update the data dictionary for openings
    for _, row in moves_group.iterrows():
        moves = row['moves']
        category = row['category']
        games = row['games']
        n_moves = count_moves(moves)
        data[category]["count"] += n_moves
        data[category]["n"] += 1

    if i % 5 == 0:
        print(f"Processed batch {i}")
    i += 1


Processed batch 5
Processed batch 10
Processed batch 15
Processed batch 20
Processed batch 25


KeyboardInterrupt: 

In [16]:
data

{'beginner': 23.024770642201833,
 'intermediate': 30.41630170729192,
 'advanced': 35.57078990408999}

In [15]:
data = {k: v["count"] / v["n"] for k, v in data.items()}





















In [17]:
with open("./moves.json", "w") as f:
    json.dump(data, f, indent=4)