# Change file from PGN to JSON

## Keep only games with normal termination and an average Elo of at least 2000

In [1]:
import json
import re

def clean_moves(moves):
    """
    Strip move numbers and annotation marks from a list of PGN move tokens.

    Tokens that are only a move number (e.g. '1.', '12...') are dropped.
    A move number glued to its move without whitespace (e.g. '1.e4',
    '3...Nf6') is stripped and the move itself is kept, so no move is lost.
    Annotation marks ('!', '?', '?!', ...) are removed from every kept token.
    Any other token, including a trailing result such as '1-0' or '1/2-1/2',
    is passed through unchanged.

    Parameters:
    - moves: A list of move tokens (strings) that may contain move numbers or annotations.

    Returns:
    - A new list with only the cleaned tokens, in their original order.
    """
    cleaned_moves = []
    for token in moves:
        number_prefix = re.match(r'^\d+\.+', token)
        if number_prefix:
            # Keep whatever follows the move number; a bare number leaves nothing.
            token = token[number_prefix.end():]
            if not token:
                continue
        # Remove exclamation/question marks attached to the move
        cleaned_moves.append(re.sub(r'[!?]+', '', token))
    return cleaned_moves

In [2]:
import json
import re
import time


def _parse_elo(raw_value):
    """Return raw_value as an int, or 0 when it is not a valid integer (e.g. '?')."""
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        return 0


def _build_game_record(game_headers, moves, min_avg_elo):
    """Return the JSON record for one game, or None when the game is filtered out."""
    white_elo = _parse_elo(game_headers.get("WhiteElo", 0))
    black_elo = _parse_elo(game_headers.get("BlackElo", 0))
    average_elo = (white_elo + black_elo) / 2
    termination_status = game_headers.get("Termination", "").lower()

    # Keep only games that ended normally and meet the Elo threshold
    if average_elo < min_avg_elo or termination_status != "normal":
        return None

    return {
        "event": game_headers.get("Event", ""),
        "result": game_headers.get("Result", ""),
        "average_elo": average_elo,  # Save average Elo
        "moves": clean_moves(moves),
    }


def pgn_to_json_custom(pgn_file_path, output_json_path, max_games=1_000_000, min_avg_elo=2000):
    """
    Convert a PGN file into a JSON list of filtered games.

    The PGN is read line by line. Header lines ('[Key "Value"]') are collected
    into a dict, movetext lines are tokenised after removing '{...}' comments,
    and a blank line following movetext ends the game. A game is kept when the
    average of WhiteElo and BlackElo is at least min_avg_elo and its
    Termination header is "normal" (case-insensitive). A missing or
    non-numeric Elo counts as 0. Progress is printed about once per minute and
    a summary is printed at the end.

    Parameters:
    - pgn_file_path: Path of the PGN file to read (decoded as UTF-8, BOM tolerated).
    - output_json_path: Path of the JSON file to write; it is overwritten.
    - max_games: Stop once this many games have been kept.
    - min_avg_elo: Minimum average Elo for a game to be kept.

    Returns:
    - None. The kept games are written to output_json_path as a JSON list of
      dicts with the keys "event", "result", "average_elo" and "moves".
    """
    games = []
    game_count = 0
    discarded_count = 0  # Count discarded games
    game_headers = {}
    moves = []

    start_time = time.time()  # Start the timer
    last_report_time = start_time  # Track time for periodic updates

    header_pattern = re.compile(r'\[(\w+)\s+"(.+)"\]')

    with open(pgn_file_path, 'r', encoding='utf-8-sig') as pgn_file, \
            open(output_json_path, 'w', encoding='utf-8') as json_file:
        for line in pgn_file:
            line = line.strip()

            # Header line, e.g. [Event "GameName"]
            if line.startswith("["):
                match = header_pattern.match(line)
                if match:
                    key, value = match.groups()
                    game_headers[key] = value
                continue

            # Movetext line: drop '{...}' comments and collect the tokens
            if line:
                moves.extend(re.sub(r"\{[^}]*\}", "", line).split())
                continue

            # Blank line: it only ends a game when movetext has been seen
            # (the blank line between headers and movetext is skipped here).
            if not moves:
                continue

            record = _build_game_record(game_headers, moves, min_avg_elo)
            if record is not None:
                games.append(record)
                game_count += 1
            else:
                discarded_count += 1

            # Reset for the next game
            game_headers = {}
            moves = []

            if game_count >= max_games:
                break

            # Report progress roughly once per minute
            if time.time() - last_report_time >= 60:
                current_time = time.time() - start_time
                print(f"Processed {game_count} games in {current_time // 60} minutes.")
                print(f"Discarded {discarded_count} games so far.")
                print()
                last_report_time = time.time()

        # A file that does not end with a blank line leaves its last game
        # pending; flush it so it is not silently lost. After a max_games
        # break `moves` is always empty, so the limit is never exceeded here.
        if moves:
            record = _build_game_record(game_headers, moves, min_avg_elo)
            if record is not None:
                games.append(record)
                game_count += 1
            else:
                discarded_count += 1

        # Write the complete list of games to JSON
        json.dump(games, json_file, indent=4)

    # Final time check
    total_time = time.time() - start_time
    print(f"Conversion complete. {game_count} games processed.")
    print(f"Discarded {discarded_count} games in total.")
    print(f"Total time taken: {total_time:.2f} seconds.")


In [17]:
# Example usage: convert one Lichess monthly PGN dump into output4.json.
# The input path is machine-specific; the output path is relative to the
# notebook's working directory.
pgn_file = r"C:\Users\bluni\Chess-Transformer\large_datasets\lichess_db_standard_rated_2024-08.pgn"
output_json = r"output4.json"


# Time the whole conversion from the outside (the function also prints its
# own total, so the duration is reported twice).
start_time = time.time()
pgn_to_json_custom(pgn_file, output_json, max_games=5_000_000)
end_time = time.time()


time_taken = end_time - start_time
print(f"Total time taken: {time_taken:.2f} seconds.")

Processed 187743 games in 1.0 minutes.
Discarded 1390933 games so far.

Processed 377391 games in 2.0 minutes.
Discarded 2727062 games so far.

Processed 562463 games in 3.0 minutes.
Discarded 4111051 games so far.

Processed 760580 games in 4.0 minutes.
Discarded 5424889 games so far.

Processed 981534 games in 5.0 minutes.
Discarded 6633780 games so far.

Processed 1200318 games in 6.0 minutes.
Discarded 7822678 games so far.

Processed 1388860 games in 7.0 minutes.
Discarded 9221165 games so far.

Processed 1574770 games in 8.0 minutes.
Discarded 10526741 games so far.

Processed 1766535 games in 9.0 minutes.
Discarded 11930706 games so far.

Processed 1944885 games in 10.0 minutes.
Discarded 13230984 games so far.

Processed 2132002 games in 11.0 minutes.
Discarded 14616522 games so far.

Processed 2312478 games in 12.0 minutes.
Discarded 15951745 games so far.

Processed 2479080 games in 13.0 minutes.
Discarded 17185772 games so far.

Processed 2654498 games in 14.0 minutes.
Disca

In [1]:
import json
import pandas as pd

def json_to_dataframe(json_file_path):
    """
    Read a JSON file and return its contents as a Pandas DataFrame.

    Parameters:
    - json_file_path: Path to the JSON file to read.

    Returns:
    - A Pandas DataFrame built from the parsed JSON value.
    """
    # Parse the whole file first so it is closed before the frame is built
    with open(json_file_path, 'r') as handle:
        records = json.load(handle)

    return pd.DataFrame(records)

# Example usage: load the converted games (written by the PGN conversion
# cell) into the notebook-level DataFrame `df` used by the following cells.
json_file = r"output4.json"
df = json_to_dataframe(json_file)

# Display the DataFrame (the notebook shows a truncated head/tail preview)
df

Unnamed: 0,event,result,average_elo,moves
0,Rated Bullet game,0-1,2507.5,"[e4, c5, Nf3, e6, c3, d5, d4, dxe4, Ne5, cxd4,..."
1,Rated Bullet game,0-1,2031.5,"[g3, e6, Bg2, d5, d3, c5, Nf3, Nc6, O-O, h5, R..."
2,Rated Blitz tournament https://lichess.org/tou...,1-0,2021.0,"[e4, e6, d4, c5, Nf3, Nf6, Nc3, Nc6, Be2, Be7,..."
3,Rated Bullet game,1-0,3015.5,"[e4, c5, Nf3, e6, c3, Nf6, e5, Nd5, d4, cxd4, ..."
4,Rated Bullet game,1-0,2132.0,"[d4, d5, Nf3, f6, Bg5, fxg5, Nxg5, Nf6, e4, dx..."
...,...,...,...,...
4999995,Rated Bullet game,1-0,2257.0,"[e4, e6, d3, c5, Nd2, Nf6, Ngf3, Nc6, g3, Be7,..."
4999996,Rated Blitz game,1-0,2049.5,"[Nc3, Nf6, g4, g5, e4, d6, Be2, Bg7, d4, h6, h..."
4999997,Rated Blitz game,1/2-1/2,2111.0,"[e4, d5, exd5, Qxd5, d4, Nc6, Nf3, Bg4, Be2, O..."
4999998,Rated Blitz game,1-0,2041.5,"[e4, e5, Nf3, Nc6, Bb5, f5, Nc3, Nf6, d3, Bb4,..."


In [2]:
# A drawn game carries the PGN result string '1/2-1/2'; count those rows
draws_count = len(df.loc[df['result'] == '1/2-1/2'])

# Report how many games were drawn
print(f"Number of draws: {draws_count}")


Number of draws: 308209


In [41]:
import pandas as pd

# Flag every game whose move list still contains an annotation mark:
# a move is flagged when it shares at least one character with {'!', '?'}.
contains_marks = df["moves"].apply(
    lambda moves: any(not {'!', '?'}.isdisjoint(move) for move in moves)
)

# Show the flagged games; an empty frame means the moves were cleaned properly
marked_moves_df = df[contains_marks]
print("Rows with exclamation marks or question marks in moves:")
print(marked_moves_df)


Rows with exclamation marks or question marks in moves:
Empty DataFrame
Columns: [event, result, average_elo, moves]
Index: []


# Data Collection and Processing

In [1]:
class MoveDictionary:
    """
    Fixed vocabulary of UCI move strings with lookups in both directions.

    The vocabulary holds every (start, end) square pair reachable by a rook-,
    bishop- or knight-like displacement on an empty board, plus the four
    promotion variants of each one-rank step from rank 7 to 8 or rank 2 to 1
    that stays on the same or an adjacent file.

    Attributes:
    - move_index_dict: maps a UCI string to its integer index.
    - index_move_dict: maps an integer index back to its UCI string.
    """

    def __init__(self):
        vocabulary = self.generate_all_moves()
        self.move_index_dict = {}
        self.index_move_dict = {}
        for position, uci in enumerate(vocabulary):
            self.move_index_dict[uci] = position
            self.index_move_dict[position] = uci

    def get_all_legal_moves(self, fen):
        """Return the vocabulary indices of all legal moves in the position given by fen."""
        position = chess.Board(fen)
        return [self.move_index_dict[candidate.uci()] for candidate in position.legal_moves]

    def generate_all_squares(self):
        """Return the 64 square names ordered by file, then by rank ('a1', 'a2', ..., 'h8')."""
        squares = []
        for file_letter in 'abcdefgh':
            for rank_digit in '12345678':
                squares.append(file_letter + rank_digit)
        return squares

    def is_within_board(self, file, rank):
        """Return True when file lies in 'a'..'h' and rank lies in '1'..'8'."""
        if not ('a' <= file <= 'h'):
            return False
        return '1' <= rank <= '8'

    def move_in_direction(self, start_square, file_step, rank_step, steps=8):
        """
        Walk from start_square in steps of (file_step, rank_step).

        Returns the squares reached, nearest first, stopping after `steps`
        squares or as soon as the walk leaves the board.
        """
        origin_file = ord(start_square[0])
        origin_rank = ord(start_square[1])
        reached = []
        distance = 1
        while distance <= steps:
            target_file = chr(origin_file + file_step * distance)
            target_rank = chr(origin_rank + rank_step * distance)
            if not self.is_within_board(target_file, target_rank):
                break
            reached.append(target_file + target_rank)
            distance += 1
        return reached

    def generate_fairy_moves(self, start_square):
        """
        Return every square reachable from start_square by a rook-, bishop-
        or knight-like displacement on an empty board.

        Sliding directions come first (straight lines, then diagonals),
        followed by the eight single knight jumps.
        """
        sliding_directions = [
            (1, 0), (-1, 0), (0, 1), (0, -1),    # along files and ranks
            (1, 1), (1, -1), (-1, 1), (-1, -1),  # along diagonals
        ]
        knight_jumps = [
            (2, 1), (2, -1), (-2, 1), (-2, -1),
            (1, 2), (1, -2), (-1, 2), (-1, -2),
        ]
        destinations = []
        for file_delta, rank_delta in sliding_directions:
            destinations += self.move_in_direction(start_square, file_delta, rank_delta)
        for file_delta, rank_delta in knight_jumps:
            # A knight jumps exactly once in its direction
            destinations += self.move_in_direction(start_square, file_delta, rank_delta, steps=1)
        return destinations

    def generate_promotion_moves(self, start_square, end_square):
        """Return the four promotion variants of the move, suffixed 'b', 'n', 'r', 'q'."""
        base_move = start_square + end_square
        return [base_move + piece for piece in 'bnrq']

    def generate_all_moves(self):
        """
        Build the full move vocabulary as an ordered list of UCI strings.

        Each base move is immediately followed by its promotion variants when
        it steps from rank 7 to 8 or rank 2 to 1 on the same or an adjacent file.
        """
        promotion_rank_steps = {('7', '8'), ('2', '1')}  # white and black promotions
        vocabulary = []
        for start_square in self.generate_all_squares():
            for end_square in self.generate_fairy_moves(start_square):
                vocabulary.append(start_square + end_square)
                reaches_last_rank = (start_square[1], end_square[1]) in promotion_rank_steps
                file_shift = abs(ord(start_square[0]) - ord(end_square[0]))
                if reaches_last_rank and file_shift <= 1:
                    vocabulary.extend(self.generate_promotion_moves(start_square, end_square))
        return vocabulary

In [2]:
def flip_uci(uci_move_string):
    """
    Mirror a UCI move vertically so it reads from Black's perspective.

    Both rank digits are replaced by 9 - rank; the files are unchanged. Any
    characters after the fourth (the promotion piece, e.g. the 'q' in
    'e2e1q') are kept, so a flipped promotion stays a promotion.

    Parameters:
    - uci_move_string: A UCI move such as 'e7e5' or 'e2e1q'.

    Returns:
    - The mirrored UCI move string.
    """
    promotion_suffix = uci_move_string[4:]
    return (
        f"{uci_move_string[0]}{9 - int(uci_move_string[1])}"
        f"{uci_move_string[2]}{9 - int(uci_move_string[3])}"
        f"{promotion_suffix}"
    )


In [3]:
import json

import json

def fen_to_vector(fen):
    """
    Encode a FEN string as two JSON-encoded integer lists plus the side to move.

    Board list: a leading special token 0 followed by one code per square.
    Empty squares are 1, black pieces p/n/b/r/q/k are 2..7 and white pieces
    P/N/B/R/Q/K are 8..13. Ranks are emitted in FEN order for White to move
    and in reversed rank order for Black to move; the squares inside a rank
    are never reordered.

    Special list (13 entries): four castling flags in the order K, Q, k, q,
    then an "en passant available" flag followed by a one-hot of its file
    (a..h). With no en passant square all nine of those entries are 0.

    NOTE(review): piece colours and castling flags are not swapped for Black;
    only the rank order changes. Confirm this is the intended encoding.

    Parameters:
    - fen: A FEN string with at least its first four space-separated fields.

    Returns:
    - (json_position, json_special_tokens, turn), where turn is the FEN
      side-to-move field ("w" or "b").
    """
    fields = fen.split(" ")
    placement, turn = fields[0], fields[1]
    castling_rights, en_passant = fields[2], fields[3]

    piece_codes = {
        "p": 2, "n": 3, "b": 4, "r": 5, "q": 6, "k": 7,
        "P": 8, "N": 9, "B": 10, "R": 11, "Q": 12, "K": 13,
    }

    ranks = placement.split("/")
    if turn == "b":
        ranks = ranks[::-1]

    position = [0]  # Special token
    for rank_text in ranks:
        for symbol in rank_text:
            if symbol.isdigit():
                # A digit stands for that many consecutive empty squares
                position += [1] * int(symbol)
            else:
                # Unknown symbols fall back to the empty-square code
                position.append(piece_codes.get(symbol, 1))

    # Castling rights: one flag per right
    special_tokens = [int(flag in castling_rights) for flag in "KQkq"]

    # En passant: availability flag plus one-hot file
    if en_passant == "-":
        special_tokens += [0] * 9
    else:
        file_index = ord(en_passant[0]) - ord("a")
        special_tokens += [1] + [0] * file_index + [1] + [0] * (7 - file_index)

    return json.dumps(position), json.dumps(special_tokens), turn

In [4]:
def process_single_game(game, game_count, move_to_index):
    """
    Turn one decisive game into training rows seen from the winner's side.

    The game is replayed from the initial position. For every ply on which the
    eventual winner is to move, one row is produced with the encoded board,
    the index of the move played and the indices of all legal moves. When
    Black is the winner, moves are mirrored with flip_uci before being
    indexed. Games whose result is neither "1-0" nor "0-1" yield no rows.

    Parameters:
    - game: Dict with "moves" (SAN tokens whose last entry is the result
      token), "result" and "average_elo".
    - game_count: Number stored in each row as 'game_number'.
    - move_to_index: Mapping from UCI move string to integer index.

    Returns:
    - A list of row dicts (possibly empty). 'moves_left' is
      len(game["moves"]) minus the 1-based ply number, so it still counts the
      trailing result token.
    """
    result = game["result"]  # The game result, e.g., "1-0", "0-1", or "1/2-1/2"

    # Determine who won; anything else (draws, unfinished) is not stored.
    # Decided before any board is built so non-decisive games cost nothing.
    if result == "1-0":  # White wins
        winning_player = "w"
    elif result == "0-1":  # Black wins
        winning_player = "b"
    else:
        return []

    board = chess.Board()
    total_moves = len(game["moves"])
    game_data = []

    # The last token is the result, not a move
    for move_number, move in enumerate(game["moves"][:-1], start=1):
        try:
            # Convert the SAN token to a move object in the current position
            uci_move = board.parse_san(move)
            moves_left = total_moves - move_number
            turn = "w" if board.turn == chess.WHITE else "b"

            # Only the winner's plies are stored, so the board is encoded
            # only for those plies instead of for every ply.
            if winning_player == turn:
                board_state, special_tokens, _ = fen_to_vector(board.fen())

                legal_moves = [m.uci() for m in board.legal_moves]
                num_legal_moves = len(legal_moves)
                played_move = uci_move.uci()

                # Mirror the moves when Black is to move so they match the
                # rank-reversed board encoding
                if turn == "b":
                    played_move = flip_uci(played_move)
                    legal_moves = [flip_uci(m) for m in legal_moves]

                next_move = move_to_index[played_move]
                legal_moves_converted = [move_to_index[m] for m in legal_moves]

                game_data.append({
                    'game_number': int(game_count),  # Store the game number
                    'next_move': int(next_move),  # Store the next move index
                    'legal_moves': legal_moves_converted,  # Store legal moves as indices
                    'num_legal_moves': int(num_legal_moves),  # Store the number of legal moves
                    'board_state': board_state,  # Store the board state as a vector
                    'special_tokens': special_tokens,  # Store special tokens (castling, en passant)
                    'moves_left': int(moves_left),  # Moves remaining in the game
                    'result': 1,  # 1 for winner's perspective
                    'average_elo': int(game["average_elo"]),  # Average Elo of players
                    'turn': turn  # Store whose turn it is ("w" for White, "b" for Black)
                })

            # Apply the move to the board to update the board state
            board.push(uci_move)

        except Exception as e:
            # Best effort: report the token and carry on with the next one.
            # NOTE(review): the board is not advanced for a failed token, so
            # if it was a real move the following tokens are parsed against a
            # stale position — confirm skipping is the intended behaviour.
            print(f"Error processing move {move} in game {game_count}: {e}")
            continue

    return game_data


In [7]:
import concurrent.futures

In [5]:
import os
import json
import pandas as pd
import chess
import time

def save_checkpoint(game_count, checkpoint_file="checkpoint.txt"):
    """Overwrite checkpoint_file with the text form of game_count."""
    with open(checkpoint_file, "w") as handle:
        # No trailing newline: the file holds just the number
        print(game_count, end="", file=handle)

def load_checkpoint(checkpoint_file="checkpoint.txt"):
    """Return the integer stored in checkpoint_file, or 0 when the file does not exist."""
    if not os.path.exists(checkpoint_file):
        return 0

    with open(checkpoint_file, "r") as handle:
        stored_text = handle.read()
    return int(stored_text)

def extract_fen_and_moves_with_stats(json_file_path, db_path, max_games=None, chunk_size=20000, checkpoint_file="checkpoint.txt"):
    """
    Convert the games in a JSON file into rows and store them in the database.

    Processing resumes after the game count stored in checkpoint_file. Rows
    are accumulated in memory and, each time the running game count reaches a
    multiple of chunk_size, inserted with batch_insert_data, after which the
    checkpoint is updated. Rows still pending after the loop are inserted the
    same way. Progress is printed every 1000 games.

    Parameters:
    - json_file_path: JSON file holding the list of games.
    - db_path: SQLite database path passed on to batch_insert_data.
    - max_games: Stop once the 1-based game number exceeds this value; None
      means no limit.
    - chunk_size: Number of games between inserts; also passed to
      batch_insert_data as its batch_size.
    - checkpoint_file: File used to persist the number of processed games.
    """
    move_to_index = MoveDictionary().move_index_dict

    with open(json_file_path, 'r') as source:
        games = json.load(source)

    def flush_rows(rows, games_done, message):
        # Insert the pending rows, announce it, then record the progress
        frame = pd.DataFrame(rows)
        batch_insert_data(db_path, frame, batch_size=chunk_size)
        print(message)
        save_checkpoint(games_done, checkpoint_file)

    start_time = time.time()
    game_counter = load_checkpoint(checkpoint_file)
    pending_rows = []

    # Continue with the first game after the checkpoint; game_count is 1-based
    for game_count, game in enumerate(games[game_counter:], start=game_counter + 1):
        if max_games is not None and game_count > max_games:
            break

        pending_rows.extend(process_single_game(game, game_count, move_to_index))
        game_counter += 1

        if game_counter % chunk_size == 0:
            flush_rows(pending_rows, game_counter, f"Inserted {game_counter} games into the database.")
            pending_rows = []  # Start a new batch

        if game_count % 1000 == 0:
            elapsed_time = time.time() - start_time
            print(f"Processed {game_count} games. Time elapsed: {elapsed_time:.4f} seconds.")

    # Store whatever is left after the last full chunk
    if pending_rows:
        flush_rows(pending_rows, game_counter, f"Inserted remaining {game_counter} games into the database.")

    total_time = time.time() - start_time
    print(f"Finished processing {game_counter} games. Total time taken: {total_time:.4f} seconds.")


# Save the data

In [6]:
import sqlite3

def create_database(db_path):
    """
    Create the chess_analysis table in the SQLite database at db_path.

    The database file is created if it does not exist and an existing table
    is left untouched (CREATE TABLE IF NOT EXISTS). The connection is closed
    before returning; sqlite3's connection context manager only commits or
    rolls back and does not close, so the close is done explicitly here.

    Parameters:
    - db_path: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # SQL statement to create a new table with columns matching your data
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS chess_analysis (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                game_number INTEGER,
                board_state TEXT,  -- Store board state as text (JSON or string representation)
                next_move INTEGER,  -- Store the next move index as an integer
                legal_moves TEXT,  -- Store legal moves as text (comma-separated or JSON)
                num_legal_moves INTEGER,  -- Store the number of legal moves as an integer
                moves_left INTEGER,  -- Store the number of moves left in the game as an integer
                result INTEGER,  -- Store the result (1 for win, 0 for draw) as an integer
                average_elo INTEGER,  -- Store average ELO as an integer
                special_tokens TEXT,  -- Store special tokens (castling, en passant) as text (JSON)
                turn TEXT  -- Store whose turn it is ("w" for White, "b" for Black)
            );
        ''')

        conn.commit()
        print("Database created and table initialized for chess analysis.")
    finally:
        # Always release the file handle, even if table creation failed
        conn.close()



In [7]:
import sqlite3
import json

def batch_insert_data(db_path, df, batch_size):
    """
    Insert the rows of df into the chess_analysis table in batches.

    Rows are sent with executemany in groups of batch_size and each group is
    committed separately. A 'legal_moves' value that is a list is stored as
    its JSON encoding and its length becomes num_legal_moves; any other value
    is stored unchanged with num_legal_moves 0. A 'num_legal_moves' column in
    df is ignored. The connection is closed before returning, also on error.

    Parameters:
    - db_path: Path to the SQLite database file.
    - df: DataFrame with the columns game_number, board_state, next_move,
      legal_moves, moves_left, result, average_elo, special_tokens and turn.
      An empty DataFrame inserts nothing.
    - batch_size: Maximum number of rows per executemany call.
    """
    insert_sql = '''
        INSERT INTO chess_analysis (
            game_number, board_state, next_move,
            legal_moves, num_legal_moves, moves_left, result,
            average_elo, special_tokens, turn
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    if df.empty:
        row_values = ()
    else:
        # Iterate column-wise: tolist() yields plain Python values that
        # sqlite3 can bind, without building a Series for every row.
        row_values = zip(
            df['game_number'].tolist(),
            df['board_state'].tolist(),
            df['next_move'].tolist(),
            df['legal_moves'].tolist(),
            df['moves_left'].tolist(),
            df['result'].tolist(),
            df['average_elo'].tolist(),
            df['special_tokens'].tolist(),
            df['turn'].tolist(),
        )

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        batch = []
        for (game_number, board_state, next_move, legal_moves, moves_left,
             result, average_elo, special_tokens, turn) in row_values:
            if isinstance(legal_moves, list):
                legal_moves_str = json.dumps(legal_moves)
                num_legal_moves = len(legal_moves)
            else:
                legal_moves_str = legal_moves
                num_legal_moves = 0

            batch.append((
                game_number,
                board_state,
                next_move,
                legal_moves_str,
                num_legal_moves,
                moves_left,
                result,
                average_elo,
                special_tokens,
                turn,
            ))

            # Send and commit a full batch, then start a new one
            if len(batch) >= batch_size:
                cursor.executemany(insert_sql, batch)
                conn.commit()
                batch = []

        # Insert any remaining rows
        if batch:
            cursor.executemany(insert_sql, batch)
            conn.commit()
    finally:
        # sqlite3's connection context manager does not close; do it here
        conn.close()

    print("Batch insertion completed.")


In [8]:
def recreate_table(db_path):
    """
    Drop the chess_analysis table from the SQLite database at db_path.

    Despite its name this function only drops the table; call
    create_database afterwards to create it again. Dropping a table that does
    not exist is not an error. The connection is closed before returning.

    Parameters:
    - db_path: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Drop the existing table
        cursor.execute('DROP TABLE IF EXISTS chess_analysis')
        conn.commit()
        print("Dropped existing chess_analysis table.")
    finally:
        # sqlite3's connection context manager does not close; do it here
        conn.close()

# Final Run

In [9]:

# Define the database file path
db_path = 'Final_chess_database_lichess_150million_04.db'

# Start from an empty table: drop any existing chess_analysis table, then
# create it again.
# NOTE(review): the checkpoint file is not reset here, so extraction resumes
# after the game count stored in checkpoint.txt even though the table was just
# emptied — confirm that this is intended for this run.
recreate_table(db_path)
create_database(db_path)
# Process and insert data into the database
json_file = r"output4.json"
max_games = 4_500_000  # Games numbered above this value are not processed
extract_fen_and_moves_with_stats(json_file, db_path, max_games=max_games, chunk_size=50_000)


Dropped existing chess_analysis table.
Database created and table initialized for chess analysis.
Processed 3501000 games. Time elapsed: 5.5101 seconds.
Processed 3502000 games. Time elapsed: 10.5220 seconds.
Processed 3503000 games. Time elapsed: 15.1227 seconds.
Processed 3504000 games. Time elapsed: 19.7981 seconds.
Processed 3505000 games. Time elapsed: 24.4689 seconds.
Processed 3506000 games. Time elapsed: 29.2570 seconds.
Processed 3507000 games. Time elapsed: 33.8882 seconds.
Processed 3508000 games. Time elapsed: 38.5859 seconds.
Processed 3509000 games. Time elapsed: 52.6872 seconds.
Processed 3510000 games. Time elapsed: 57.5261 seconds.
Processed 3511000 games. Time elapsed: 62.1731 seconds.
Processed 3512000 games. Time elapsed: 66.9562 seconds.
Processed 3513000 games. Time elapsed: 71.4100 seconds.
Processed 3514000 games. Time elapsed: 76.0717 seconds.
Processed 3515000 games. Time elapsed: 80.7931 seconds.
Processed 3516000 games. Time elapsed: 85.3948 seconds.
Process

Processed 3641000 games. Time elapsed: 855.3288 seconds.
Processed 3642000 games. Time elapsed: 860.1070 seconds.
Processed 3643000 games. Time elapsed: 864.9281 seconds.
Processed 3644000 games. Time elapsed: 869.6400 seconds.
Processed 3645000 games. Time elapsed: 874.2559 seconds.
Processed 3646000 games. Time elapsed: 878.9929 seconds.
Processed 3647000 games. Time elapsed: 883.7501 seconds.
Processed 3648000 games. Time elapsed: 898.7880 seconds.
Processed 3649000 games. Time elapsed: 903.5693 seconds.
Batch insertion completed.
Inserted 3650000 games into the database.
Processed 3650000 games. Time elapsed: 964.3870 seconds.
Processed 3651000 games. Time elapsed: 969.1003 seconds.
Processed 3652000 games. Time elapsed: 973.8569 seconds.
Processed 3653000 games. Time elapsed: 978.5820 seconds.
Processed 3654000 games. Time elapsed: 983.3232 seconds.
Processed 3655000 games. Time elapsed: 987.9869 seconds.
Processed 3656000 games. Time elapsed: 992.6070 seconds.
Processed 3657000 g

Processed 3779000 games. Time elapsed: 1742.6672 seconds.
Processed 3780000 games. Time elapsed: 1747.5893 seconds.
Processed 3781000 games. Time elapsed: 1752.2400 seconds.
Processed 3782000 games. Time elapsed: 1756.9810 seconds.
Processed 3783000 games. Time elapsed: 1761.8022 seconds.
Processed 3784000 games. Time elapsed: 1766.5149 seconds.
Processed 3785000 games. Time elapsed: 1771.2219 seconds.
Processed 3786000 games. Time elapsed: 1776.0312 seconds.
Processed 3787000 games. Time elapsed: 1780.7956 seconds.
Processed 3788000 games. Time elapsed: 1785.5390 seconds.
Processed 3789000 games. Time elapsed: 1790.2888 seconds.
Processed 3790000 games. Time elapsed: 1794.9907 seconds.
Processed 3791000 games. Time elapsed: 1799.9030 seconds.
Processed 3792000 games. Time elapsed: 1804.5041 seconds.
Processed 3793000 games. Time elapsed: 1809.3269 seconds.
Processed 3794000 games. Time elapsed: 1814.2242 seconds.
Processed 3795000 games. Time elapsed: 1818.9763 seconds.
Processed 3796

Processed 3917000 games. Time elapsed: 2662.6195 seconds.
Processed 3918000 games. Time elapsed: 2667.5219 seconds.
Processed 3919000 games. Time elapsed: 2672.2698 seconds.
Processed 3920000 games. Time elapsed: 2677.0608 seconds.
Processed 3921000 games. Time elapsed: 2681.7551 seconds.
Processed 3922000 games. Time elapsed: 2686.5481 seconds.
Processed 3923000 games. Time elapsed: 2691.4152 seconds.
Processed 3924000 games. Time elapsed: 2695.8865 seconds.
Processed 3925000 games. Time elapsed: 2700.4859 seconds.
Processed 3926000 games. Time elapsed: 2705.2119 seconds.
Processed 3927000 games. Time elapsed: 2709.9751 seconds.
Processed 3928000 games. Time elapsed: 2714.7457 seconds.
Processed 3929000 games. Time elapsed: 2719.4381 seconds.
Processed 3930000 games. Time elapsed: 2724.0959 seconds.
Processed 3931000 games. Time elapsed: 2728.9132 seconds.
Processed 3932000 games. Time elapsed: 2733.6009 seconds.
Processed 3933000 games. Time elapsed: 2738.2980 seconds.
Processed 3934

Processed 4055000 games. Time elapsed: 3580.4652 seconds.
Processed 4056000 games. Time elapsed: 3585.2572 seconds.
Processed 4057000 games. Time elapsed: 3589.9933 seconds.
Processed 4058000 games. Time elapsed: 3594.7401 seconds.
Processed 4059000 games. Time elapsed: 3599.4640 seconds.
Processed 4060000 games. Time elapsed: 3604.1811 seconds.
Processed 4061000 games. Time elapsed: 3608.8748 seconds.
Processed 4062000 games. Time elapsed: 3613.4524 seconds.
Processed 4063000 games. Time elapsed: 3618.0163 seconds.
Processed 4064000 games. Time elapsed: 3622.7115 seconds.
Processed 4065000 games. Time elapsed: 3627.3600 seconds.
Processed 4066000 games. Time elapsed: 3632.1565 seconds.
Processed 4067000 games. Time elapsed: 3636.9588 seconds.
Processed 4068000 games. Time elapsed: 3641.6796 seconds.
Processed 4069000 games. Time elapsed: 3646.3554 seconds.
Processed 4070000 games. Time elapsed: 3651.1008 seconds.
Processed 4071000 games. Time elapsed: 3655.8333 seconds.
Processed 4072

Processed 4194000 games. Time elapsed: 4417.9518 seconds.
Processed 4195000 games. Time elapsed: 4422.6127 seconds.
Processed 4196000 games. Time elapsed: 4427.1671 seconds.
Processed 4197000 games. Time elapsed: 4431.8099 seconds.
Processed 4198000 games. Time elapsed: 4436.5240 seconds.
Processed 4199000 games. Time elapsed: 4441.1463 seconds.
Batch insertion completed.
Inserted 4200000 games into the database.
Processed 4200000 games. Time elapsed: 4499.5269 seconds.
Processed 4201000 games. Time elapsed: 4504.1613 seconds.
Processed 4202000 games. Time elapsed: 4508.9985 seconds.
Processed 4203000 games. Time elapsed: 4548.1607 seconds.
Processed 4204000 games. Time elapsed: 4552.8705 seconds.
Processed 4205000 games. Time elapsed: 4557.5201 seconds.
Processed 4206000 games. Time elapsed: 4562.1746 seconds.
Processed 4207000 games. Time elapsed: 4566.7707 seconds.
Processed 4208000 games. Time elapsed: 4571.5293 seconds.
Processed 4209000 games. Time elapsed: 4576.0794 seconds.
Pro

Processed 4332000 games. Time elapsed: 5338.4824 seconds.
Processed 4333000 games. Time elapsed: 5343.3414 seconds.
Processed 4334000 games. Time elapsed: 5348.0346 seconds.
Processed 4335000 games. Time elapsed: 5352.8505 seconds.
Processed 4336000 games. Time elapsed: 5357.8831 seconds.
Processed 4337000 games. Time elapsed: 5362.6225 seconds.
Processed 4338000 games. Time elapsed: 5367.3181 seconds.
Processed 4339000 games. Time elapsed: 5372.0082 seconds.
Processed 4340000 games. Time elapsed: 5376.5217 seconds.
Processed 4341000 games. Time elapsed: 5381.3983 seconds.
Processed 4342000 games. Time elapsed: 5386.0910 seconds.
Processed 4343000 games. Time elapsed: 5390.8975 seconds.
Processed 4344000 games. Time elapsed: 5395.4842 seconds.
Processed 4345000 games. Time elapsed: 5400.3716 seconds.
Processed 4346000 games. Time elapsed: 5405.1398 seconds.
Processed 4347000 games. Time elapsed: 5409.9183 seconds.
Processed 4348000 games. Time elapsed: 5414.6921 seconds.
Processed 4349

Processed 4470000 games. Time elapsed: 6257.0702 seconds.
Processed 4471000 games. Time elapsed: 6261.6606 seconds.
Processed 4472000 games. Time elapsed: 6266.3316 seconds.
Processed 4473000 games. Time elapsed: 6270.8954 seconds.
Processed 4474000 games. Time elapsed: 6275.4854 seconds.
Processed 4475000 games. Time elapsed: 6279.9806 seconds.
Processed 4476000 games. Time elapsed: 6284.6775 seconds.
Processed 4477000 games. Time elapsed: 6289.3004 seconds.
Processed 4478000 games. Time elapsed: 6294.0737 seconds.
Processed 4479000 games. Time elapsed: 6298.5994 seconds.
Processed 4480000 games. Time elapsed: 6303.1414 seconds.
Processed 4481000 games. Time elapsed: 6307.8106 seconds.
Processed 4482000 games. Time elapsed: 6312.4647 seconds.
Processed 4483000 games. Time elapsed: 6317.0385 seconds.
Processed 4484000 games. Time elapsed: 6321.6645 seconds.
Processed 4485000 games. Time elapsed: 6326.2583 seconds.
Processed 4486000 games. Time elapsed: 6330.8928 seconds.
Processed 4487

In [8]:
# Display the per-move DataFrame.
# NOTE(review): game_fen_moves_df is not assigned in any cell shown here —
# presumably left over from an earlier version of the notebook; verify.
game_fen_moves_df

Unnamed: 0,game_number,next_move,legal_moves,num_legal_moves,board_state,special_tokens,moves_left,result,average_elo,turn
0,1,485,"[1538, 1539, 232, 233, 1788, 1547, 1284, 1016,...",20,"[0, 11, 9, 10, 12, 13, 10, 9, 11, 8, 8, 8, 8, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",47,1,2507,b
1,1,1016,"[1538, 1539, 734, 735, 736, 232, 233, 1788, 15...",22,"[0, 11, 9, 10, 12, 13, 10, 1, 11, 8, 8, 8, 8, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",45,1,2507,b
2,1,749,"[1537, 1538, 1539, 1268, 1269, 991, 730, 734, ...",29,"[0, 11, 9, 10, 12, 13, 10, 1, 11, 8, 8, 1, 8, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",43,1,2507,b
3,1,829,"[1537, 1538, 1539, 1268, 1269, 991, 1001, 730,...",34,"[0, 11, 9, 10, 12, 13, 10, 1, 11, 8, 8, 1, 1, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",41,1,2507,b
4,1,565,"[1537, 1538, 1539, 1268, 1269, 991, 730, 723, ...",34,"[0, 11, 9, 10, 12, 13, 10, 1, 11, 8, 8, 1, 1, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",39,1,2507,b
...,...,...,...,...,...,...,...,...,...,...
155380,5000,485,"[1538, 1539, 1537, 1272, 1271, 1270, 1269, 126...",36,"[0, 5, 3, 1, 6, 7, 4, 3, 5, 2, 2, 2, 1, 1, 2, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9,1,2043,w
155381,5000,1539,"[1538, 1539, 1537, 1269, 1268, 991, 1001, 733,...",36,"[0, 5, 3, 1, 6, 7, 4, 3, 5, 2, 2, 1, 1, 1, 2, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7,1,2043,w
155382,5000,565,"[1347, 1343, 1346, 1348, 1758, 1269, 1268, 991...",32,"[0, 5, 3, 1, 6, 7, 4, 3, 5, 2, 2, 1, 1, 1, 2, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",5,1,2043,w
155383,5000,232,"[1347, 1343, 1346, 1348, 1758, 1272, 1271, 127...",34,"[0, 5, 3, 1, 1, 7, 4, 3, 5, 2, 2, 1, 1, 1, 2, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",3,1,2043,w


# Check the data

In [5]:
import pandas as pd
import sqlite3
import json

# Your existing function to fetch data from the database
def fetch_data_from_database_to_df(db_path, limit=10):
    """
    Fetches data from the chess_analysis table in the SQLite database and loads it into a DataFrame.

    The row limit is passed as a bound SQL parameter instead of being
    formatted into the query text, and the connection is closed before
    returning (sqlite3's connection context manager does not close it).

    Parameters:
    - db_path: Path to the SQLite database.
    - limit: The number of rows to fetch (default is 10). It is converted
      with int().

    Returns:
    - A Pandas DataFrame containing the rows fetched from the database.
    """
    conn = sqlite3.connect(db_path)
    try:
        query = "SELECT * FROM chess_analysis LIMIT ?;"
        return pd.read_sql_query(query, conn, params=(int(limit),))
    finally:
        conn.close()

# Example usage
db_path = 'Final_chess_database_lichess_150million_04.db'

# Fetch the data from the database. The limit is an upper bound on the number
# of rows; every fetched row is loaded into memory as one DataFrame.
df_retrieved = fetch_data_from_database_to_df(db_path, limit=70_000_000)
df_retrieved

Unnamed: 0,id,game_number,board_state,next_move,legal_moves,num_legal_moves,moves_left,result,average_elo,special_tokens,turn
0,1,3500001,"[0, 11, 9, 10, 12, 13, 10, 9, 11, 8, 8, 8, 1, ...",1539,"[1538, 1539, 232, 233, 1788, 1547, 1284, 1016,...",20,67,1,2225,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",b
1,2,3500001,"[0, 11, 9, 1, 12, 13, 10, 9, 11, 8, 8, 8, 1, 8...",1016,"[1758, 232, 233, 1348, 1343, 1345, 1347, 1349,...",21,65,1,2225,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",b
2,3,3500001,"[0, 11, 9, 1, 12, 13, 10, 9, 11, 8, 8, 8, 1, 8...",731,"[1758, 1268, 1269, 1270, 1271, 1272, 730, 731,...",24,63,1,2225,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",b
3,4,3500001,"[0, 11, 1, 1, 12, 13, 10, 9, 11, 8, 8, 8, 9, 8...",1268,"[1758, 1268, 1269, 1270, 1271, 1272, 987, 991,...",35,61,1,2225,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",b
4,5,3500001,"[0, 11, 1, 1, 12, 13, 10, 9, 11, 8, 8, 8, 9, 1...",985,"[1758, 1759, 984, 987, 232, 233, 1030, 1039, 1...",36,59,1,2225,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",b
...,...,...,...,...,...,...,...,...,...,...,...
31187188,31187189,4500000,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1014,"[864, 850, 867, 1021, 1020, 1019, 1018, 1017, ...",17,9,1,2262,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",w
31187189,31187190,4500000,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",860,"[864, 850, 867, 860, 242, 241, 239, 238, 237, ...",14,7,1,2262,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",w
31187190,31187191,4500000,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",819,"[829, 822, 815, 819, 833, 826, 242, 241, 239, ...",16,5,1,2262,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",w
31187191,31187192,4500000,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, ...",240,"[565, 551, 569, 562, 246, 245, 244, 243, 242, ...",18,3,1,2262,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",w


In [13]:
# List the column names of the retrieved DataFrame
df_retrieved.columns

Index(['id', 'game_number', 'board_state', 'next_move', 'legal_moves',
       'num_legal_moves', 'moves_left', 'result', 'average_elo',
       'special_tokens', 'turn'],
      dtype='object')

In [11]:
# Count duplicate rows. The auto-increment 'id' column is unique for every
# row, so it is left out of the comparison; with it included no two rows can
# ever compare equal and the count is always 0.
content_columns = [column for column in df_retrieved.columns if column != 'id']
duplicate_count = df_retrieved.duplicated(subset=content_columns).sum()

# Print the count of duplicate rows
print(f"Number of duplicate rows: {duplicate_count}")


Number of duplicate rows: 0
