Webscraping

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0'
}

players = {
    "Magnus Carlsen": "https://www.chess.com/games/search?fromSearchShort=1&p1=Magnus%20Carlsen&playerId=822231&page=",
    "Hikaru Nakamura": "https://www.chess.com/games/search?fromSearchShort=1&p1=Hikaru%20Nakamura&playerId=291573&page=",
    "Garry Kasparov": "https://www.chess.com/games/search?fromSearchShort=1&p1=Garry%20Kasparov&playerId=21779&page=",
    "Viswanathan Anand": "https://www.chess.com/games/search?fromSearchShort=1&p1=Viswanathan%20Anand&playerId=284082&page=",
    "Fabiano Caruana": "https://www.chess.com/games/search?fromSearchShort=1&p1=Fabiano%20Caruana&playerId=42382&page=",
    "Alexander Morozevich": "https://www.chess.com/games/search?fromSearchShort=1&p1=Alexander%20Morozevich&playerId=32913&page=",
    "Alan Pichot": "https://www.chess.com/games/search?fromSearchShort=1&p1=Alan%20Pichot&playerId=813915&page=2"
}

pgn_file = "../data/all_games.pgn"  # Path to save the combined PGN file

for player, player_url in players.items():
    game_ids = []

    for page in tqdm(range(40)):   # this attempts to grab the first 25 pages
        URL = f'{player_url}{page}'
        response = requests.get(URL, headers=headers)
        if response.status_code != 200:
            print(f'Error retrieving games for {player} on page {page}')
            break
        soup = BeautifulSoup(response.text)
        page_game_ids = [x.get('href') for x in soup.find_all('a', {'class': 'master-games-clickable-link master-games-td-user'})]
        page_game_ids = [x.split('/')[-1] for x in page_game_ids]
        game_ids.extend(page_game_ids)
        time.sleep(15)

    # Join game ids
    game_ids = ','.join(game_ids)

    DOWNLOAD_URL = f'https://www.chess.com/games/downloadPgn?game_ids={game_ids}'
    download_response = requests.get(DOWNLOAD_URL)

    with open(pgn_file, 'ab') as file:
        file.write(download_response.content)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

Formatting/Processing

In [1]:
import chess.pgn
import pandas as pd

def extract_moves(game):
    moves = []
    node = game
    move_number = 1
    board = chess.Board()  # Create a new board object
    while not node.is_end():
        node = node.variations[0]
        move = node.move
        color = "Black" if node.board().turn == chess.WHITE else "White"  # Reverse the color assignment
        move_from = chess.square_name(move.from_square)
        move_to = chess.square_name(move.to_square)

        move_combined = move_from + move_to  # Combine move squares into one string

        moves.append((color, move_number, move_combined, get_piece_from_move(board, move)))
        move_number += 1
        board.push(move)  # Push the move onto the board

    return moves

def get_piece_from_move(board, move):
    piece = board.piece_at(move.from_square)
    if piece is None:
        return "Pawn"
    elif piece.piece_type == chess.KNIGHT:
        return "Knight"
    elif piece.piece_type == chess.BISHOP:
        return "Bishop"
    elif piece.piece_type == chess.ROOK:
        return "Rook"
    elif piece.piece_type == chess.QUEEN:
        return "Queen"
    elif piece.piece_type == chess.KING:
        return "King"
    else:
        return "Pawn"

# Initialize empty lists for each column of the DataFrame
game_id_list = []
white_result_list = []
black_result_list = []
color_list = []
move_number_list = []
move_list = []
piece_list = []

pgn_file = "../data/all_games.pgn"  # Path to the combined PGN file

# Open the PGN file
with open(pgn_file) as f:
    game_number = 1
    while True:
        game = chess.pgn.read_game(f)
        if game is None:
            break

        # Extract information about the game
        headers = game.headers
        game_id = game_number  # Use the game_number as the game ID
        game_number += 1
        result = headers["Result"]
        white_result = 0
        black_result = 0

        if result == "1-0":
            white_result = 1
            black_result = 0
        elif result == "0-1":
            white_result = 0
            black_result = 1

        # Extract the moves
        moves = extract_moves(game)

        # Append the information for the game to the lists
        for color, move_number, move_combined, piece in moves:
            game_id_list.append(game_id)
            white_result_list.append(white_result)
            black_result_list.append(black_result)
            color_list.append(color)
            move_number_list.append(move_number)
            move_list.append(move_combined)
            piece_list.append(piece)

# Create a DataFrame from the extracted data
data = {
    "Game ID": game_id_list,
    "White Result": white_result_list,
    "Black Result": black_result_list,
    "Color": color_list,
    "Move Number": move_number_list,
    "Move": move_list,
    "Piece": piece_list
}
df = pd.DataFrame(data)
df

Unnamed: 0,Game ID,White Result,Black Result,Color,Move Number,Move,Piece
0,1,1,0,White,1,a2a4,Pawn
1,1,1,0,Black,2,g8f6,Knight
2,1,1,0,White,3,d2d4,Pawn
3,1,1,0,Black,4,d7d5,Pawn
4,1,1,0,White,5,g1f3,Knight
...,...,...,...,...,...,...,...
552160,6445,0,1,Black,66,f4f3,Pawn
552161,6445,0,1,White,67,a5d5,Rook
552162,6445,0,1,Black,68,f6e6,King
552163,6445,0,1,White,69,d5d2,Rook


One Hot Encoding

In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Encode categorical variables
encoder = OneHotEncoder(sparse=False)
color_encoded = encoder.fit_transform(df[["Color"]])

# Normalize numerical variables
scaler = StandardScaler()
game_id_scaled = scaler.fit_transform(df[["Game ID"]])
move_number_scaled = scaler.fit_transform(df[["Move Number"]])

# Encode text variables
move_encoder = CountVectorizer(analyzer=lambda x: x.split(), binary=True)
move_encoded = move_encoder.fit_transform(df["Move"])
piece_encoder = LabelEncoder()
piece_encoded = piece_encoder.fit_transform(df["Piece"])

In [17]:
# Concatenate encoded and normalized columns
X = np.concatenate((game_id_scaled, move_number_scaled, color_encoded, move_encoded.toarray(), piece_encoded.reshape(-1, 1)), axis=1)

# Get the target variable
y = df[["White Result"]].values  # Assuming you want to predict the "White Result" column

# Print the shape of input features and target variable
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (552165, 1798)
y shape: (552165, 1)


Keras Model

In [18]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the target variable to categorical format
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Create a sequential model
model = Sequential()

# Add input layer
model.add(Dense(64, activation='relu', input_shape=(X.shape[1],)))

# Add one or more hidden layers
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

# Add output layer
model.add(Dense(2, activation='softmax'))  # Adjusted to 2 output units for binary classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 64)                115136    
                                                                 
 dense_17 (Dense)            (None, 64)                4160      
                                                                 
 dense_18 (Dense)            (None, 32)                2080      
                                                                 
 dense_19 (Dense)            (None, 2)                 66        
                                                                 
Total params: 121,442
Trainable params: 121,442
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.6776
Test accuracy: 0.5950


In [7]:
import chess.pgn
import pandas as pd

def extract_moves(game):
    moves = []
    node = game
    move_number = 1
    board = chess.Board()  # Create a new board object
    while not node.is_end():
        node = node.variations[0]
        move = node.move
        color = "Black" if node.board().turn == chess.WHITE else "White"  # Reverse the color assignment
        move_from = chess.square_name(move.from_square)
        move_to = chess.square_name(move.to_square)

        move_combined = move_from + move_to  # Combine move squares into one string

        moves.append((color, move_number, move_combined, get_piece_from_move(board, move)))
        move_number += 1
        board.push(move)  # Push the move onto the board

    return moves

def get_piece_from_move(board, move):
    piece = board.piece_at(move.from_square)
    if piece is None:
        return "Pawn"
    elif piece.piece_type == chess.KNIGHT:
        return "Knight"
    elif piece.piece_type == chess.BISHOP:
        return "Bishop"
    elif piece.piece_type == chess.ROOK:
        return "Rook"
    elif piece.piece_type == chess.QUEEN:
        return "Queen"
    elif piece.piece_type == chess.KING:
        return "King"
    else:
        return "Pawn"

# Initialize empty lists for each column of the DataFrame
game_id_list = []
white_result_list = []
black_result_list = []
color_list = []
move_number_list = []
move_list = []
piece_list = []

# Open the PGN files
game_number = 1
for file_num in range(1, 26):
    file_path = f"../data/Magnus_Carlsen{file_num}.pgn"
    with open(file_path) as f:
        while True:
            game = chess.pgn.read_game(f)
            if game is None:
                break

            # Extract information about the game
            headers = game.headers
            game_id = game_number  # Use the game_number as the game ID
            game_number += 1
            result = headers["Result"]
            white_result = 0
            black_result = 0

            if result == "1-0":
                white_result = 1
                black_result = 0
            elif result == "0-1":
                white_result = 0
                black_result = 1

            # Extract the moves
            moves = extract_moves(game)

            # Append the information for the game to the lists
            for color, move_number, move_combined, piece in moves:
                game_id_list.append(game_id)
                white_result_list.append(white_result)
                black_result_list.append(black_result)
                color_list.append(color)
                move_number_list.append(move_number)
                move_list.append(move_combined)
                piece_list.append(piece)

# Create a DataFrame from the extracted data
data = {
    "Game ID": game_id_list,
    "White Result": white_result_list,
    "Black Result": black_result_list,
    "Color": color_list,
    "Move Number": move_number_list,
    "Move": move_list,
    "Piece": piece_list
}
df = pd.DataFrame(data)
small_df


Unnamed: 0,Game ID,White Result,Black Result,Color,Move Number,Move,Piece
0,1,1,0,White,1,d2d4,Pawn
1,1,1,0,Black,2,g8f6,Knight
2,1,1,0,White,3,c2c4,Pawn
3,1,1,0,Black,4,e7e6,Pawn
4,1,1,0,White,5,g1f3,Knight
...,...,...,...,...,...,...,...
58126,728,0,0,Black,42,g8h8,King
58127,728,0,0,White,43,h6f7,Knight
58128,728,0,0,Black,44,h8g8,King
58129,728,0,0,White,45,f7h6,Knight


Stockfish

In [None]:
from stockfish import Stockfish

stockfish = Stockfish(path="..\stockfish\stockfish-windows-2022-x86-64-avx2.exe")

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Get the move columns from the dataset
move_columns = df.loc[:, 'White Move 1':'Black Move 135']  # Adjust column names accordingly

# Create a list to store the move sequences
move_sequences = []

# Iterate over each row and collect the moves into a sequence
for _, row in move_columns.iterrows():
    move_sequence = row.dropna().tolist()
    move_sequences.append(move_sequence)

# Use MultiLabelBinarizer for one-hot encoding
mlb = MultiLabelBinarizer()
encoded_moves = mlb.fit_transform(move_sequences)

# Create a DataFrame with the encoded moves
encoded_moves_df = pd.DataFrame(encoded_moves, columns=mlb.classes_)
encoded_moves_df

In [None]:
import pandas as pd

# Get the move columns from the dataset
move_columns = df.loc[:, 'White Move 1':'Black Move 135']  # Adjust column names accordingly

# Create a set to store unique moves
unique_moves = set()

# Iterate over each row and collect unique moves
for _, row in move_columns.iterrows():
    unique_moves.update(row.dropna().tolist())

# Create a dictionary to map each unique move to a numerical value
move_to_index = {move: index+1 for index, move in enumerate(unique_moves)}

# Encode the moves with sequential numerical values
encoded_moves = move_columns.applymap(lambda x: move_to_index.get(x))

# Create a DataFrame with the encoded moves
encoded_moves_df = encoded_moves.fillna(0).astype(int)
encoded_moves_df

In [None]:
# Concatenate the encoded move dataframes with the original dataframe
combined_df = pd.concat([df[['White Result', 'Black Result']],
                        encoded_moves_df, encoded_moves_df], axis=1)

In [None]:
combined_df

In [None]:
# train/test split

from sklearn.model_selection import train_test_split

X = combined_df.drop(['White Result', 'Black Result'], axis=1)  # Input features
y = combined_df[['White Result', 'Black Result']]  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Define the architecture of the neural network

from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test))