# Web Scraping

Import the libraries.

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep

Open the Browser.

In [None]:
# Start a Microsoft Edge browser session driven by Selenium.
# NOTE(review): no driver.quit() is visible in this notebook - close the
# browser once the scraping is done.
driver = webdriver.Edge()

Navigate to the event page.

In [None]:
# Open the games list of the 2023 Bullet Chess Championship main event
driver.get("https://chess.com/events/2023-bullet-chess-championship-main-event/games")

Before going any further, scroll down the page manually so that more matches are loaded.
Then, get every element that has the `round-games-list-item-games` class.

In [None]:
# Collect the elements with class "round-games-list-item-games" that are in the
# page at this moment; presumably matches not yet loaded by scrolling are missed.
game_items = driver.find_elements(By.CLASS_NAME, "round-games-list-item-games")

Now, extract all the links to the games.

In [None]:
# Number of games that must be available before scraping can start
required_games = 32

# Collect the target of every 'a' element that actually has an href attribute.
# get_attribute returns None for anchors without one, and a None entry would
# break the later `... in link` tests and driver.get(link) calls.
links = []
for game_item in game_items:
    for anchor in game_item.find_elements(by=By.TAG_NAME, value="a"):
        href = anchor.get_attribute("href")
        if href:
            links.append(href)

# Check if it loaded enough matches; keep only the first `required_games`
if len(links) >= required_games:
    links = links[:required_games]
    print('Success')
else:
    print('Load more matches')

## Extract information

In [None]:
# Variables for summary Dataframe (two entries per game: White first, then Black)
player = []
color_generic = []
clock_value = []
clock_result = []
accuracy_score = []
game = []

# Variables for in-game specifics Dataframe (one entry per move)
move = []
skill = []
turn = []
color = []
match = []
n = 0

# Player names in (White, Black) order, keyed by the fragment of the game URL
# that identifies the pairing
players_by_slug = {
    'Nakamura_Hikaru-Carlsen_Magnus': ('Hikaru Nakamura', 'Magnus Carlsen'),
    'Carlsen_Magnus-Nakamura_Hikaru': ('Magnus Carlsen', 'Hikaru Nakamura'),
}


for link in links:
    # Load the page and wait for it to render
    driver.get(link)
    sleep(5)

    # Save the players. An unrecognized pairing gets two placeholders so that
    # every summary list still grows by exactly two entries per game; otherwise
    # building the summary DataFrame would fail on mismatched lengths.
    for slug, names in players_by_slug.items():
        if slug in link:
            player.extend(names)
            break
    else:
        print('Unrecognized pairing in link: ' + link)
        player.extend([None, None])
    # ... and the piece colors
    color_generic.extend(['W', 'B'])

    # Saving clock-values and results. Each element list is queried once per
    # page; element 1 is stored first so that it lines up with the 'W' entry.
    clock_values = driver.find_elements(By.CLASS_NAME, 'clock-value')
    clock_results = driver.find_elements(By.CLASS_NAME, 'clock-result')
    for i in (1, 0):
        clock_value.append(clock_values[i].text)
        clock_result.append(clock_results[i].text)

    # Saving the accuracy (element 0 first, then element 1)
    accuracy_values = driver.find_elements(By.CLASS_NAME, 'accuracy-score-value')
    for i in range(2):
        accuracy_score.append(accuracy_values[i].text)

    # Then save the game number, once per player
    n += 1
    game.extend([n, n])

    # Load the match-analysis tab and wait for the analysis to appear
    driver.find_element(By.CSS_SELECTOR, "button[data-cy='game-sidebar-tab-analysis']").click()
    sleep(10)

    # Takes moves in the game analysis
    moves = []
    for m in driver.find_elements(By.CLASS_NAME, 'move-node'):
        moves.append(m.find_element(By.CLASS_NAME, 'move-text').text)
        # Check if the move is 'bad/good' or 'normal'
        if 'customColor' in m.get_attribute('class'):
            # Keep the 7 characters starting at '#' in the inline style
            style = m.get_attribute('style')
            index = style.find('#')
            skill.append(style[index:index + 7])
        else:
            skill.append('=')

    # Storing all moves
    move.extend(moves)

    # Saving the turn number, the color and the number of the game:
    # even positions are White's moves, odd positions are Black's
    for i in range(len(moves)):
        turn.append(i // 2 + 1)
        color.append('W' if i % 2 == 0 else 'B')
        match.append(n)

## Summary
DataFrame containing game statistics per player.

In [None]:
# Build the per-player summary table from the scraped lists; pandas requires
# all of them to have the same length.
df_summary = pd.DataFrame(dict(
    player=player,
    color_generic=color_generic,
    clock_value=clock_value,
    clock_result=clock_result,
    accuracy_score=accuracy_score,
    game=game,
))

Sometimes the computer/browser is too slow to load, resulting in some `accuracy_score` values being incorrect. Below is an example of how to replace the values one by one.

In [None]:
# Replacement values checked by hand, in row order of the wrongly scraped
# entries. They are an example for one scraping run - adapt them to yours.
corrected_accuracy = [83.0, 76.1, 93.4, 94.1, 90.8, 91.5]

# The scraped values are text; convert them so the comparison with 100 can match
df_summary['accuracy_score'] = pd.to_numeric(df_summary['accuracy_score'])

# Rows whose accuracy is exactly 100 are treated as not correctly loaded
indexes = df_summary[df_summary.accuracy_score == 100].index

# Assign through a single .loc call: the chained form
# df_summary['accuracy_score'].iloc[...] = value may write to a temporary copy.
for row_label, accuracy in zip(indexes, corrected_accuracy):
    df_summary.loc[row_label, 'accuracy_score'] = accuracy

Convert `clock_value` to seconds.

In [None]:
def _clock_to_seconds(clock_text):
    """Convert a clock string such as '0:12.3' or '1:05' to seconds.

    Each ':' separates a unit that is 60 times larger than the one on its
    right, so '1:05' is 65 seconds (simply deleting the ':' would give 105).
    Missing or empty values give NaN; unparseable text raises ValueError.
    """
    if not isinstance(clock_text, str) or not clock_text.strip():
        return float('nan')
    seconds = 0.0
    for part in clock_text.strip().split(':'):
        seconds = seconds * 60 + float(part)
    return seconds


df_summary['clock_value_numeric'] = df_summary['clock_value'].map(_clock_to_seconds)

Export to CSV.

In [None]:
# Write the summary table to data/summary.csv; index=False leaves the row
# index out of the file. The 'data' directory is expected to exist already.
df_summary.to_csv('data/summary.csv', index=False)

## Games
A DataFrame of all moves made during the matches.

In [None]:
# Build the move-by-move table from the scraped lists; pandas requires all of
# them to have the same length.
df_games = pd.DataFrame(dict(
    move=move,
    skill=skill,
    turn=turn,
    color=color,
    match=match,
))

Trim all extra spaces from `skill`.

In [None]:
# Drop leading and trailing whitespace from every skill value
df_games['skill'] = df_games['skill'].str.strip()

Create two additional columns for the capturing pieces and captured pieces. Possible values are:
- _R_, _N_, _B_, _Q_, _K_ for the main pieces
- _p_ for pawns
- _0_ if no piece has been taken

To do this, we split the values in the move column that have an `x` (which represents that a piece has been taken): the first part represents the capturing piece, the second the captured piece.

In [None]:
# Capturing side: the text before the first 'x', or '0' when the move has no 'x'
capturing_side = []
for notation in df_games['move']:
    if 'x' in notation:
        capturing_side.append(notation.split('x')[0])
    else:
        capturing_side.append('0')
df_games['piece_that_take'] = capturing_side

# Captured side: the text after the first 'x' (missing for moves without 'x')
split_moves = df_games['move'].str.split('x', n=1, expand=True)
df_games['piece_taken'] = split_moves[1]

Remove the check (`+`) signs from `piece_taken` and replace missing values with `"0"`.

In [None]:
# '[+-]' is a character class, so regex=True must be passed explicitly: since
# pandas 2.0 str.replace treats the pattern as literal text by default.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection, which may operate on a temporary copy.
df_games['piece_taken'] = (
    df_games['piece_taken']
    .str.replace(r'[+-]', '', regex=True)
    .fillna('0')
)

Note that `piece_taken` does not store a piece, but a position. To find the piece, we need to:
- identify the last piece that moved to that position during that match;
- if no piece has moved to that position, but something has been taken, it means that a piece has been taken from the initial position. An `initial_piece` function is therefore created that returns the piece from the initial position.

In [None]:
def initial_piece(position):
    """Return the piece that starts the game on `position`.

    `position` is a square such as 'e2'. The result is 'p' for the squares
    on ranks 2 and 7, one of 'R', 'N', 'B', 'Q', 'K' for the squares on
    ranks 1 and 8, and the integer 0 for anything else.
    """
    starting_squares = {}
    # Files a-h hold R, N, B, Q, K, B, N, R on both back ranks
    for file_letter, back_rank_piece in zip('abcdefgh', 'RNBQKBNR'):
        for rank in '18':
            starting_squares[file_letter + rank] = back_rank_piece
        for rank in '27':
            starting_squares[file_letter + rank] = 'p'
    return starting_squares.get(position, 0)

In [None]:
# Work on a plain list copy of the column; it is written back at the end.
piece_taken = list(df_games['piece_taken'])

for i in range(len(piece_taken)):
    # '0' marks a move without capture; anything else is the capture target text
    if piece_taken[i] != '0':
        # Moves of all rows before row i (earlier matches included)
        # NOTE(review): if row 0 were a capture this list would be empty and
        # passed_moves[j] below would raise IndexError.
        passed_moves = list(df_games.head(i)['move'])
        j = len(passed_moves) - 1
        # Stays '' unless the piece has to come from the initial position
        piece = ''

        match_n = df_games['match'].iloc[i]
        # Walk backwards until a move whose text contains the target is found.
        # This is a substring test on the raw move text.
        while piece_taken[i] not in passed_moves[j]:
            j = j - 1
            # Ran out of moves, or stepped into a different match: nothing
            # moved to that square, so use the piece that starts there.
            # The match is only compared after stepping back, so row i - 1
            # itself is not checked against match_n.
            if (j < 0) or (match_n != df_games['match'].iloc[j]):
                piece = initial_piece(piece_taken[i])
                break
        if piece != '':
            # initial_piece returns the integer 0 for squares that are not
            # starting squares; that value is stored as-is.
            piece_taken[i] = piece
        else:
            # What is left of the earlier move once the target text is removed,
            # e.g. 'Nf3' with target 'f3' leaves 'N'
            piece_taken[i] = passed_moves[j].replace(piece_taken[i], '')
        # Nothing left means the earlier move was only the target text, which
        # is recorded as a pawn
        if piece_taken[i] == '':
            piece_taken[i] = 'p'

df_games['piece_taken'] = piece_taken

Clean up the values of the two columns just created.

In [None]:
# All patterns below are character classes, so regex=True must be passed
# explicitly: since pandas 2.0 str.replace treats the pattern as literal text
# by default and would leave the values unchanged.

# Capturing piece: drop rank digits, turn file letters into 'p' (pawn) and
# keep only the first character
capturing = df_games['piece_that_take'].str.replace(r'[1-9]', '', regex=True)
capturing = capturing.str.replace(r'[a-h]', 'p', regex=True)
df_games['piece_that_take'] = capturing.str[0]

# Captured piece: drop rank digits, '+', '-' and 'x', then turn file letters
# into 'p' (pawn)
captured = df_games['piece_taken'].str.replace(r'[1-9+x-]', '', regex=True)
captured = captured.str.replace(r'[a-h]', 'p', regex=True)
# Missing values (non-text entries become missing in the .str calls above) and
# values left empty by the clean-up both count as pawns. The result is assigned
# back in one step rather than through fillna(inplace=True) or a chained .iloc
# assignment, which may write to a temporary copy.
captured = captured.fillna('p')
captured = captured.mask(captured == '', 'p')
df_games['piece_taken'] = captured.str[0]

A check column is created that contains:
- `0` if there has been no check
- `1` if there has been a check

In [None]:
# 1 when the move text contains '+', 0 otherwise
df_games['check'] = [int('+' in notation) for notation in df_games['move']]

Merge the DataFrames to get `player`.

In [None]:
# Columns to keep after the join: everything already in df_games plus 'player'
kept_columns = [*df_games.columns, 'player']

# Left join on (match number, color) so that every move keeps its row
joined = df_games.merge(
    df_summary,
    how='left',
    left_on=['match', 'color'],
    right_on=['game', 'color_generic'],
)
df_games = joined[kept_columns]

Export to CSV.

In [None]:
# Write the moves table to data/games.csv; index=False leaves the row index
# out of the file. The 'data' directory is expected to exist already.
df_games.to_csv('data/games.csv', index=False)

## Annotation
A DataFrame that maps each annotation symbol to its meaning and color, used to decode the values of the `skill` column.

In [None]:
# Parallel lists: entry i of each list describes the same annotation
symbol = ['??', '?', 'X', '!', '!!']
meaning = ['Blunder', 'Mistake', 'Missed win', 'Good move', 'Brilliant move']
# Named hex_codes rather than hex so that the builtin hex() is not shadowed;
# the DataFrame column is still called 'hex'
hex_codes = ['#b33430', '#e6912c', '#ee6b55', '#5c8bb0', '#1baca6']

df_annotation = pd.DataFrame({
    'symbol': symbol,
    'meaning': meaning,
    'hex': hex_codes,
})

# Display the table as the cell output
df_annotation

Export to CSV.

In [None]:
# Write the annotation table to data/annotations.csv. Unlike the other exports,
# index=False is not passed here, so the row index is written as an unnamed
# first column. NOTE(review): confirm this difference is intended.
df_annotation.to_csv('data/annotations.csv')