# Reading the Raw Data / Essential Imports

In [13]:
import bz2
import numpy as np
import pandas as pd
import math
import re

In [3]:
with bz2.open("../data/lichess_db_standard_rated_2014-08.pgn.bz2", "rb") as f:
    data = f.read()

In [4]:
data = str(data) # Convert binary data into string for easier functionality
raw_games = data.split('[Event') # Split the data into chess games using the '[Event' string
print("Game at 0th index: %s" % raw_games[0])
del raw_games[0] # The first index isn't a game
del data # Remove binary string to save memory

Game at 0th index: b'


# Raw data EDA

<p style="font-size:15px; font-family:verdana; line-height: 1.7em; padding-top: 15px"> In this dataset, there are three types of games: "normal" games (consistent with PGN), games with analysis (i.e., there is 'eval' attached which an engine's evaluation of the position), and empty games (no moves were played at all). Below, I give an example of each of these types of games. </p>

In [5]:
analysis_games = 0
empty_games = 0

print("Sample normal game: %s\n" % raw_games[0])
for game in raw_games:
    if game.find('eval') != -1:
        if analysis_games == 0:
            print("Sample game with analysis attached: %s\n" % game)
        analysis_games += 1

for game in raw_games:
    if game.find('1.') == -1:
        if empty_games == 0:
            print("Sample empty game: %s" % game)
        empty_games += 1

Sample normal game:  "Rated Classical game"]\n[Site "https://lichess.org/Gg06eUOY"]\n[White "rima65"]\n[Black "aywee"]\n[Result "0-1"]\n[UTCDate "2014.07.31"]\n[UTCTime "22:00:00"]\n[WhiteElo "1407"]\n[BlackElo "1669"]\n[WhiteRatingDiff "-5"]\n[BlackRatingDiff "+5"]\n[ECO "B06"]\n[Opening "Robatsch (Modern) Defense"]\n[TimeControl "420+8"]\n[Termination "Time forfeit"]\n\n1. e4 g6 2. d4 Bg7 3. c3 d6 4. Qf3 Nf6 5. h3 O-O 6. Bg5 Nbd7 7. Bc4 a6 8. h4 b5 9. Bd5 Nxd5 10. exd5 Bb7 11. h5 Nf6 12. h6 Bh8 13. Bxf6 Bxf6 14. Nd2 Qd7 15. Qh3 Qxh3 16. Rxh3 Bxd5 17. g4 Be6 18. Rg3 Bh4 19. Rg2 f5 20. Ngf3 Bf6 21. g5 Bh8 22. Nh4 c5 23. Rh2 cxd4 24. cxd4 Bxd4 25. Nxg6 hxg6 26. h7+ Kh8 27. Nf3 Bxb2 28. Rb1 Bc3+ 29. Kf1 Rf7 30. Nh4 Rg7 31. Nf3 Rxh7 32. Rxh7+ Kxh7 33. Rc1 b4 34. Ke2 Bxa2 35. Nd2 Kg7 36. Nf3 e5 37. Nh4 a5 38. Rd1 Bc4+ 39. Ke3 d5 40. Nf3 a4 41. Nd2 Bxd2+ 0-1\n\n

Sample game with analysis attached:  "Rated Bullet game"]\n[Site "https://lichess.org/s3CHmrgH"]\n[White "JekyllHyde"]\n[Black "M

In [6]:
normal_games = len(raw_games)-analysis_games-empty_games

print("Number of total games: %d" % len(raw_games))
print("Number of normal games: %d" % normal_games)
print("Number of games with eval attached: %d" % analysis_games)
print("Number of empty games: %d" % empty_games)

Number of total games: 1013294
Number of normal games: 843230
Number of games with eval attached: 169854
Number of empty games: 210


# Data Cleaning

The `EVAL` variable denotes if we parse games with eval or not. Depending on this variable we only keep normal games or games with eval as the parsing is different in each case.

In [7]:
EVAL = True

In [8]:
all_games = []
for i in range(len(raw_games)):
    if EVAL:
        if raw_games[i].find('eval') != -1:
            all_games.append(raw_games[i])
    elif raw_games[i].find('eval') == -1 and raw_games[i].find('1.') != -1:
        all_games.append(raw_games[i])


del raw_games # remove old uncleaned version to save memory

In [9]:
len(all_games)

169854

In [10]:
game = all_games[1]
game

' "Rated Classical game"]\\n[Site "https://lichess.org/CZl9BDW6"]\\n[White "arbutus"]\\n[Black "Guendabiaani"]\\n[Result "1-0"]\\n[UTCDate "2014.07.31"]\\n[UTCTime "22:00:03"]\\n[WhiteElo "1417"]\\n[BlackElo "1500"]\\n[WhiteRatingDiff "+9"]\\n[BlackRatingDiff "-222"]\\n[ECO "D00"]\\n[Opening "Queen\\\'s Pawn Game #2"]\\n[TimeControl "900+8"]\\n[Termination "Normal"]\\n\\n1. d4 { [%eval 0.24] } 1... d5 { [%eval 0.24] } 2. e3 { [%eval 0.19] } 2... Nc6 { [%eval 0.31] } 3. c3 { [%eval -0.1] } 3... Nf6 { [%eval 0.1] } 4. h3 { [%eval -0.34] } 4... Bd7 { [%eval 0.0] } 5. Qf3? { [%eval -1.02] } 5... e6 { [%eval -0.62] } 6. b4 { [%eval -0.96] } 6... Bd6 { [%eval -1.06] } 7. Bd3 { [%eval -0.96] } 7... O-O { [%eval -0.96] } 8. Ne2? { [%eval -3.23] } 8... Re8? { [%eval -0.96] } 9. Ng3? { [%eval -2.01] } 9... e5 { [%eval -2.07] } 10. Nh5?! { [%eval -3.0] } 10... Nxh5?! { [%eval -2.31] } 11. Qxh5 { [%eval -2.18] } 11... e4?! { [%eval -1.21] } 12. Be2 { [%eval -1.51] } 12... Be6 { [%eval -1.15] } 13.

# Converting data into dataframe

In this section, I convert the raw data I have into a pandas dataframe, so that later it can be transformed into a readable CSV file.

In [11]:
def extract_pgn(game):
    if EVAL:
        game = re.sub("[\{].*?[\}] ", "", re.sub('[0-9]+\.\.\.', "", game))
    index = game.find("1. ") + 2
    while True:
        if game[index:index+2] == '0-' or game[index:index+2] == '1-' or game[index:index+2] == '1/':
            # Game termination
            break
        index += 1
    return game[game.find("1."):index-1]
    
def extract_eval(game):
#     return [float(re.findall("\d+\.\d+",s)[0]) for s in re.findall(r'\{.*?\}', game)]
    return [s[9:-3] for s in re.findall(r'\{.*?\}', game)]

def extract_info(starting_str, ending_str='"'):
    index = game.find(starting_str) + len(starting_str) + 1
    s = ""
    while True:
        if game[index] == ending_str:
            break
        s += game[index]
        index += 1
    return s

In [14]:
%%time
PGN_list = []
eval_list = []
mode_list = []
termination_list = []
site_list = []
UTCDate = []
UTCTime = []
eco = []
opening = []
timeControl = []
wElo, bElo = [], []
wPseudo, bPseudo = [],[]
wWins, bWins, draws = [], [], []
wRatingDiff, bRatingDiff = [], []

for i, game in enumerate(all_games):
    # PGN
    PGN_list.append(extract_pgn(game))
    try:
        eval_list.append(extract_eval(game))
    except:
        print(i)
    # Result
    index = game.find('Result')+8
    result = game[index:index+2]
    white = 0
    black = 0
    draw = 0
    if result == "1-":
        white = 1
    elif result == "0-":
        black = 1 
    elif result == "1/":
        draw = 1
    wWins.append(white)
    bWins.append(black)
    draws.append(draw)
    
    # Mode
    mode_list.append(extract_info("d"," "))
    # Elo
    wElo.append(int(extract_info('WhiteElo ')))
    bElo.append(int(extract_info('BlackElo ')))
    
    termination_list.append(extract_info("Termination "))
    site_list.append(extract_info("Site "))
    wPseudo.append(extract_info("White "))
    bPseudo.append(extract_info("Black "))  
    UTCDate.append(extract_info("UTCDate "))
    UTCTime.append(extract_info("UTCTime "))
    eco.append(extract_info("ECO "))
    opening.append(extract_info("Opening "))
    timeControl.append(extract_info("TimeControl "))
    wRatingDiff.append(extract_info("WhiteRatingDiff "))
    bRatingDiff.append(extract_info("BlackRatingDiff "))

CPU times: user 1min 30s, sys: 1.92 s, total: 1min 32s
Wall time: 1min 33s


In [15]:
chess_df = pd.DataFrame({})
chess_df['PGN'] = PGN_list
chess_df['Eval'] = eval_list
chess_df['Mode'] = mode_list
chess_df['Site'] = site_list
chess_df['UTCDate'] = UTCDate
chess_df['UTCTime'] = UTCTime
chess_df['ECO'] = eco
chess_df['Opening'] = opening
chess_df['TimeControl'] = timeControl
chess_df['Termination Type'] = termination_list
chess_df['wElo'] = wElo
chess_df['bElo'] = bElo
chess_df['wWin'] = wWins
chess_df['bWin'] = bWins
chess_df['Draw'] = draws
chess_df['wPseudo'] = wPseudo
chess_df['bPseudo'] = bPseudo
chess_df['wRatingDiff'] = wRatingDiff
chess_df['bRatingDiff'] = bRatingDiff

In [16]:
chess_df.head(2)

Unnamed: 0,PGN,Eval,Mode,Site,UTCDate,UTCTime,ECO,Opening,TimeControl,Termination Type,wElo,bElo,wWin,bWin,Draw,wPseudo,bPseudo,wRatingDiff,bRatingDiff
0,1. e4 d5 2. exd5 Nf6 3. Nc3 Nxd5 4. Nf3 Nc...,"[0.2, 0.47, 0.45, 0.58, 0.38, 0.32, 0.27, 0.52...",Bullet,https://lichess.org/s3CHmrgH,2014.07.31,22:01:10,B01,Scandinavian Defense: Modern Variation #2,60+0,Normal,1627,1662,1,0,0,JekyllHyde,Maconouchi,52,-13
1,1. d4 d5 2. e3 Nc6 3. c3 Nf6 4. h3 Bd7 5. ...,"[0.24, 0.24, 0.19, 0.31, -0.1, 0.1, -0.34, 0.0...",Classical,https://lichess.org/CZl9BDW6,2014.07.31,22:00:03,D00,Queen\'s Pawn Game #2,900+8,Normal,1417,1500,1,0,0,arbutus,Guendabiaani,9,-222


In [17]:
assert len(all_games) == len(chess_df)

In [18]:
EXPORT = False

if EXPORT:
    chess_df.to_csv("lichess_complete_with_eval.csv", index=True, header=True)