In [1]:
import pandas as pd

In [2]:
# Define the function to extract the necessary fields from a single game block
def extract_data_from_game_lines(game_lines):
    """Collect the tracked PGN tag values and the movetext of one game.

    Parameters
    ----------
    game_lines : iterable of str
        Lines belonging to a single game (tag pairs and movetext).

    Returns
    -------
    dict
        Maps a column name to the text between the first pair of double
        quotes of the matching tag line, plus ``'Moves'`` holding the whole
        line that starts with ``1.``. Tags that are not listed below are
        ignored; if a tag or movetext line occurs more than once, the last
        occurrence wins.
    """
    # (line prefix, output column) for every tag that is kept.
    # The trailing ' "' in each prefix keeps e.g. '[White "' from also
    # matching '[WhiteElo "'.
    tag_columns = (
        ('[White "', 'white_name'),
        ('[Black "', 'black_name'),
        ('[UTCDate "', 'Date'),
        ('[WhiteElo "', 'WhiteElo'),
        ('[BlackElo "', 'BlackElo'),
        ('[ECO "', 'ECO'),
        ('[Opening "', 'Opening'),
        ('[TimeControl "', 'TimeControl'),
        ('[Termination "', 'Termination'),
    )

    record = {}
    for text in game_lines:
        for prefix, column in tag_columns:
            if text.startswith(prefix):
                # The tag value sits between the first two double quotes.
                record[column] = text.split('"')[1]
                break
        else:
            # Not a tracked tag: keep the line only if it is the movetext.
            if text.startswith('1.'):
                record['Moves'] = text

    return record

# Stream read the PGN file and parse each game
def read_pgn_to_dataframe(file_path):
    """Stream a PGN file and build a DataFrame with one row per game.

    The file is read line by line, so the raw text is never held in memory
    as a whole. Blank lines are ignored. A line starting with ``[`` is
    treated as a tag pair; any other non-blank line is treated as movetext.

    A game is closed when a tag line follows movetext, when a new
    ``[Event "`` tag shows up while tags are already buffered, or at end of
    file. Splitting on these boundaries rather than on "a line that starts
    with ``1.``" has three consequences:

    * a game whose movetext is only a result token (e.g. ``0-1``) or is
      indented is no longer left in the buffer and merged into the next
      game;
    * movetext wrapped over several lines is joined with single spaces and
      handed on as one line, so it is not cut at the first line break;
    * a trailing game is still emitted when the file ends.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded PGN file.

    Returns
    -------
    pandas.DataFrame
        One row per game, with the columns produced by
        ``extract_data_from_game_lines``. A game without a movetext line
        starting with ``1.`` gets no ``Moves`` entry, which pandas shows as
        a missing value.
    """
    games = []
    tag_lines = []       # tag-pair lines of the game currently being read
    movetext_lines = []  # movetext lines of the game currently being read

    def flush_game():
        # Pass the movetext on as ONE line so wrapped movetext stays whole.
        game_lines = list(tag_lines)
        if movetext_lines:
            game_lines.append(' '.join(movetext_lines))
        game_data = extract_data_from_game_lines(game_lines)
        # Skip buffers that held nothing recognisable (e.g. stray text
        # before the first game) instead of emitting an all-empty row.
        if game_data:
            games.append(game_data)
        tag_lines.clear()
        movetext_lines.clear()

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:  # separator between tags and movetext, or between games
                continue
            if line.startswith('['):
                # A tag after movetext, or a fresh Event tag while another
                # game's tags are still buffered, starts a new game.
                starts_new_game = bool(movetext_lines) or (
                    bool(tag_lines) and line.startswith('[Event "')
                )
                if starts_new_game:
                    flush_game()
                tag_lines.append(line)
            else:
                movetext_lines.append(line)

    # Emit whatever is still buffered when the file ends.
    if tag_lines or movetext_lines:
        flush_game()

    # Convert the games data to a DataFrame
    return pd.DataFrame(games)


In [3]:
# Example usage:
# Parse the PGN file at `file_path` into `df`, which the following cells inspect.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory — confirm the notebook is launched from the right place.
file_path = './Projects/Chess-Insights/lichess_db_standard_rated_2015-08.pgn'
df = read_pgn_to_dataframe(file_path)

In [4]:
# Preview the first rows to sanity-check the extracted columns.
df.head()

Unnamed: 0,white_name,black_name,Date,WhiteElo,BlackElo,ECO,Opening,TimeControl,Termination,Moves
0,paul2chess3,Andique,2015.07.31,1509,1623,C62,Ruy Lopez: Steinitz Defense,300+0,Normal,1. e4 e5 2. Nf3 Nc6 3. Bb5 d6 4. d4 exd4 5. Nx...
1,Shehewho,gentux,2015.07.31,1857,1963,B00,Owen Defense,180+0,Normal,1. e4 b6 2. d4 Bb7 3. Nc3 e6 4. Nf3 Bb4 5. Bd3...
2,Gaardon,ChessDragon24,2015.07.31,892,1345,B20,Sicilian Defense,300+0,Normal,1. e4 c5 2. d3 d6 3. Nd2 Nc6 4. Ngf3 Nf6 5. g3...
3,Romuald47,HACKERSHUNTER86,2015.07.31,1501,1474,A40,Queen's Pawn,300+0,Abandoned,1. d4 1-0
4,shouganai,MrBangBang,2015.07.31,1953,1872,A02,Bird Opening,300+0,Normal,1. f4 c5 2. Nf3 Nc6 3. e3 e6 4. Bb5 Nf6 5. Bxc...


In [5]:
# Summarise every column. All values were extracted as strings (WhiteElo and
# BlackElo included), so describe() reports count/unique/top/freq rather than
# numeric statistics such as mean or quartiles.
df.describe()

Unnamed: 0,white_name,black_name,Date,WhiteElo,BlackElo,ECO,Opening,TimeControl,Termination,Moves
count,2599852,2599852,2599852,2599852,2599852,2599852,2599852,2599852,2599852,2599852
unique,46834,47032,32,1977,1976,491,2810,703,4,2568288
top,leko29,leko29,2015.08.01,1500,1500,A00,Van't Kruijs Opening,60+0,Normal,1. e4 1-0
freq,4294,4247,95978,18836,20628,180672,54716,630206,1723846,6816


In [12]:
# List the distinct values found in the Termination column.
df["Termination"].unique()

array(['Normal', 'Abandoned', 'Time forfeit', 'Rules infraction'],
      dtype=object)