In [1]:
import re
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

## Extracting data

In [81]:
%%time
# Load only the first 500000 rows and the six columns used below to keep
# memory usage and load time manageable.
df = pd.read_csv('chess_games.csv', nrows=500000,
                usecols=['Event', 'Result', 'UTCDate', 'Opening', 'Termination', 'AN']) # unused options tried earlier: chunksize=100000, index_col=''
print(f"{len(df)} rows was extracted")

500000 rows was extracted
CPU times: total: 5 s
Wall time: 5.32 s


In [82]:
df.head(3)

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN
0,Classical,1-0,2016.06.30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1,Blitz,0-1,2016.06.30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2,Blitz tournament,1-0,2016.06.30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....


## Transform data

### Filter dataframe

#### Get rid of ambiguous match results and matches that were abandoned or had rules infractions

In [83]:
# Keep games that have a definite result and did not end by abandonment
# or by a rules infraction.
has_result = df['Result'].ne('*')
bad_termination = df['Termination'].isin(['Abandoned', 'Rules infraction'])
filt = has_result & ~bad_termination
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

1057 rows was removed
498943 rows left


#### Remove short matches (AN string of 50 characters or fewer, i.e. roughly fewer than 10 half-moves)

In [84]:
# A game is kept only if its AN string is longer than this many characters.
MIN_AN_LENGTH = 50

# .str.len() is the vectorized equivalent of .apply(len). A missing AN yields
# NaN, fails the comparison and is dropped here instead of raising TypeError.
filt = df["AN"].str.len() > MIN_AN_LENGTH
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

5689 rows was removed
493254 rows left


#### Get rid of unstructured string values in "AN" column

In [85]:
# Drop games whose AN text carries engine annotations such as "[%eval ...".
# The marker is searched as a literal substring (regex=False); the previous
# '\[%eval' was a non-raw string with an invalid escape sequence.
filt = ~df['AN'].str.contains('[%eval', regex=False)
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

63866 rows was removed
429388 rows left


#### Remove unpopular openings (fewer than 1500 matches)

In [86]:
# value_counts = df.stack().value_counts() # Entire DataFrame 
# to_remove = value_counts[value_counts <= 1500].index
# df.replace(to_remove, np.nan, inplace=True)

In [87]:
# Count games per opening and drop every opening played fewer than 1500 times.
# The rows are filtered directly with isin(); the previous version wrote a
# sentinel value through chained indexing (df['Opening'].loc[...] = ...),
# which triggers SettingWithCopyWarning and does not modify df under
# pandas Copy-on-Write.
vc = df['Opening'].value_counts()
vals_to_remove = vc[vc < 1500].index.values
filt = ~df['Opening'].isin(vals_to_remove)
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

255831 rows was removed
173557 rows left


### Expand and rearrange dataframe

#### Reset index

In [88]:
# Renumber the remaining rows 0..n-1 and discard the old index (drop=True).
# The row index is later passed as the match id when the moves are extracted.
df = df.reset_index(drop=True)

#### Create column "Tournament"

In [89]:
# Boolean flag: True when the raw event label contains the substring 'tournament'.
# Must run before the "Event" column is collapsed to plain time-control names.
df['Tournament'] = df['Event'].str.contains('tournament')

#### Rearrange column "Event"

In [90]:
# Collapse the raw event labels into three time-control categories.
# Labels are stripped first, so the result no longer depends on the exact
# leading/trailing whitespace found in the CSV. The target names map to
# themselves, so re-running this cell leaves the column unchanged instead of
# turning every value into NaN.
EVENT_CATEGORIES = {
    'Classical': 'Classical',
    'Classical tournament': 'Classical',
    'Correspondence': 'Classical',
    'Blitz': 'Blitz',
    'Blitz tournament': 'Blitz',
    'Bullet': 'Bullet',
    'Bullet tournament': 'Bullet',
}
df['Event'] = df['Event'].str.strip().map(EVENT_CATEGORIES)

#### Change columns datatype

In [91]:
# Low-cardinality text columns are stored as categoricals.
df['Event'] = df['Event'].astype('category')
df['Result'] = df['Result'].astype('category')
df['Termination'] = df['Termination'].astype('category')
# Dates look like '2016.06.30'; an explicit format avoids format inference
# and makes the parse deterministic.
df['UTCDate'] = pd.to_datetime(df['UTCDate'], format='%Y.%m.%d')

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173557 entries, 0 to 173556
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Event        173557 non-null  category      
 1   Result       173557 non-null  category      
 2   UTCDate      173557 non-null  datetime64[ns]
 3   Opening      173557 non-null  object        
 4   Termination  173557 non-null  category      
 5   AN           173557 non-null  object        
 6   Tournament   173557 non-null  bool          
dtypes: bool(1), category(3), datetime64[ns](1), object(2)
memory usage: 4.6+ MB


#### Create moves_total from AN column

In [93]:
df.iloc[6]['AN']

'1. e4 d5 2. exd5 Qxd5 3. Nc3 Qa5 4. Nf3 c6 5. Be2 Bg4 6. O-O e6 7. Ne4 Nd7 8. d4 Ngf6 9. Neg5 h6 10. Nh3 O-O-O 11. Nf4 Bd6 12. Nd3 Qc7 13. c4 h5 14. h3 Bxf3 15. Bxf3 Ng4 16. c5 Bxc5 17. Nxc5 Qh2# 0-1'

In [94]:
# Prototype of the per-game parsing on a single example game:
# remove the move-number markers "1. ", "2. ", ... in order, then the result mark.
# NOTE(review): delete_result_from_an is defined in a later cell of this notebook;
# on a fresh kernel that cell has to be executed before this one.
s = df.iloc[6]['AN']
sep_num = 1
while True:
    marker = f'{sep_num}. '
    if marker not in s:
        s = delete_result_from_an(s)
        break
    # Plain-text replacement of the first occurrence only; no regex needed
    # (the previous pattern used an invalid '\.' escape in a non-raw string).
    s = s.replace(marker, '', 1)
    sep_num += 1
s

'e4 d5 exd5 Qxd5 Nc3 Qa5 Nf3 c6 Be2 Bg4 O-O e6 Ne4 Nd7 d4 Ngf6 Neg5 h6 Nh3 O-O-O Nf4 Bd6 Nd3 Qc7 c4 h5 h3 Bxf3 Bxf3 Ng4 c5 Bxc5 Nxc5 Qh2#'

In [96]:
def delete_result_from_an(move: str) -> str:
    '''Delete result mark in the last move.

    Removes every occurrence of " 1-0", " 0-1", " 1/2-1/2" and " *"
    (each including its leading space) from the given string.

    :param move: move text, possibly followed by a result mark.
    :return: the text without result marks.
    '''
    # The marks are fixed literals, so str.replace is sufficient; this also
    # avoids the invalid '\*' escape the regex version needed in a non-raw string.
    for result_mark in (' 1-0', ' 0-1', ' 1/2-1/2', ' *'):
        move = move.replace(result_mark, '')
    return move

In [97]:
def populate_moves_total2(s: str, ind: int, sink: 'list[tuple] | None' = None) -> None:
    '''
    Split a whitespace-separated string of moves into one tuple per move.
    Tuple format: (match_id, move_num, player, move)
    Match_id is foreign key of the main dataframe (index column).
    Move_num counts every single move of either player, starting at 1.
    Player is categorical column with values 'white' or 'black'.

    :param s: moves separated by whitespace, without move numbers or result mark.
    :param ind: match id stored in every generated tuple.
    :param sink: list the tuples are appended to; when omitted, the
        module-level list ``moves_total_test`` is used.
    '''
    if sink is None:
        sink = moves_total_test  # notebook-level accumulator
    # split() without an argument ignores repeated/leading/trailing whitespace,
    # so an empty string or a double space never produces an empty "move".
    for i, move in enumerate(s.split(), start=1):
        player = 'black' if i % 2 == 0 else 'white'  # odd moves are white's
        sink.append((ind, i, player, move))

In [98]:
def create_moves_total2(row) -> None:
    '''
    Strip move numbers and the result mark from row["AN"] and record its moves.

    The markers "1. ", "2. ", ... are removed one after another (first
    occurrence each) until the next number is not found; the remaining text
    is cleaned with delete_result_from_an and passed to populate_moves_total2
    together with row.name as the match id.

    :param row: dataframe row (as passed by df.apply(..., axis=1)) with an "AN" entry.
    '''
    s = row["AN"]
    sep_num = 1
    while True:
        marker = f'{sep_num}. '
        if marker not in s:
            s = delete_result_from_an(s)
            break
        # Literal replacement of the first occurrence; replaces the regex call
        # that relied on an invalid '\.' escape and a positional count argument.
        s = s.replace(marker, '', 1)
        sep_num += 1

    populate_moves_total2(s, row.name)

In [99]:
%%time
%%capture
# Accumulator filled as a side effect of create_moves_total2 (one tuple per move).
# It is reset here so that re-running the cell does not duplicate entries.
moves_total_test: list[tuple] = [] # template for data frame
# apply() is used only to visit every row; its return value is not used.
df.apply(create_moves_total2, axis = 1)

CPU times: total: 33.7 s
Wall time: 35.4 s


In [100]:
len(moves_total_test)

11971290

### OLD (earlier, slower implementation superseded by `create_moves_total2`; kept for comparison only)

In [35]:
def populate_moves_total(row: list[str], ind: int) -> None:
    '''
    Turn each numbered move string into a tuple and store it in moves_total.
    Tuple layout: (match_id, move_num, <remaining space-separated tokens...>)
    A tuple with only three elements is padded with None as fourth element
    (e.g. a last move without a reply).
    Match_id is the index value of the game in the main dataframe.
    '''
    for numbered_move in row:
        move_num, *plies = numbered_move.split(' ')
        record = (ind, int(move_num), *plies)

        # in case of last move ('0', '38', 'Rfd1', None)
        if len(record) == 3:
            record += (None, )

        moves_total.append(record)

In [36]:
def create_moves_total(row) -> None:
    '''Separate each move.

    Splits row["AN"] at the move-number markers " 2.", " 3.", ... into one
    string per numbered move ("<num> <tokens...>"), removes the result mark
    from the last one and passes the list to populate_moves_total together
    with row.name.
    '''
    match_moves = []
    match_moves.append(row["AN"])
    sep_num = 2
    while True:  # parse as many moves as possible
        # The same token (with leading space) is used for the membership test
        # and for the split, so e.g. "12." can never be mistaken for " 2.".
        separator = ' ' + str(sep_num) + '.'
        if separator in match_moves[-1]:  # if the next move is found
            match_moves = match_moves[0:-1] + match_moves[-1].split(separator) # concatenate all previous moves and remainder
            match_moves[-1] = str(sep_num) + match_moves[-1]
            sep_num+=1
        else:  # if the next move is not found
            match_moves[-1] = delete_result_from_an(match_moves[-1])
            # Only the leading "1." marker becomes "1". The previous pattern '1.'
            # had an unescaped dot and was not anchored, so it could rewrite any
            # "1" followed by an arbitrary character inside the first move.
            match_moves[0] = re.sub(r'^1\.', '1', match_moves[0])
            break

    populate_moves_total(match_moves, row.name)

In [38]:
%%time
%%capture
# Accumulator filled as a side effect of create_moves_total (old implementation).
moves_total: list[tuple] = [] # template for data frame
# apply() is used only to visit every row; its return value is not used.
df.apply(create_moves_total, axis = 1)
# 429388 rows = 1min 1s ± 1.74 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

CPU times: total: 1min 23s
Wall time: 1min 43s


#### Convert moves_total to df_moves

In [101]:
%%time
# Build the moves table in one step from the collected tuples
# (match_id, move_num, player, move).
df_moves = pd.DataFrame(moves_total_test, columns=['match_id', 'move_num', 'player', 'move'])

CPU times: total: 7.31 s
Wall time: 7.32 s


In [102]:
df_moves.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11971290 entries, 0 to 11971289
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   match_id  int64 
 1   move_num  int64 
 2   player    object
 3   move      object
dtypes: int64(2), object(2)
memory usage: 365.3+ MB


In [103]:
# 'player' only takes the values 'white' and 'black'; store it as a categorical.
df_moves['player'] = df_moves['player'].astype('category')
df_moves.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11971290 entries, 0 to 11971289
Data columns (total 4 columns):
 #   Column    Dtype   
---  ------    -----   
 0   match_id  int64   
 1   move_num  int64   
 2   player    category
 3   move      object  
dtypes: category(1), int64(2), object(1)
memory usage: 285.4+ MB


In [104]:
df_moves[1000:1005]

Unnamed: 0,match_id,move_num,player,move
1000,13,11,white,Qg4
1001,13,12,black,g6
1002,13,13,white,Nf3
1003,13,14,black,h5
1004,13,15,white,Qf4


#### OLD (requires `moves_total` from the old implementation; overwrites `df_moves` with the old schema)

In [49]:
%%time
# NOTE(review): old variant. It needs `moves_total`, which exists only after the
# old create_moves_total cells have been run (otherwise NameError), and it
# replaces df_moves with the white_move/black_move layout.
df_moves = pd.DataFrame(moves_total, columns=['match_id', 'move_num', 'white_move', 'black_move'])

NameError: name 'moves_total' is not defined

In [43]:
df_moves.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14822951 entries, 0 to 14822950
Data columns (total 4 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   match_id    int64 
 1   move_num    int64 
 2   white_move  object
 3   black_move  object
dtypes: int64(2), object(2)
memory usage: 452.4+ MB


#### Drop AN column

In [238]:
# The raw move text now lives in df_moves, so the AN column is removed from df.
df = df.drop(columns='AN')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429388 entries, 0 to 429387
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Event        429388 non-null  category      
 1   Result       429388 non-null  category      
 2   UTCDate      429388 non-null  datetime64[ns]
 3   Opening      429388 non-null  object        
 4   Termination  429388 non-null  category      
 5   Tournament   429388 non-null  bool          
dtypes: bool(1), category(3), datetime64[ns](1), object(1)
memory usage: 8.2+ MB


## Draft

In [44]:
df_moves

Unnamed: 0,match_id,move_num,white_move,black_move
0,0,1,d4,d5
1,0,2,c4,c6
2,0,3,e3,a6
3,0,4,Nf3,e5
4,0,5,cxd5,e4
...,...,...,...,...
14822946,429387,43,bxc5,Kg4
14822947,429387,44,Be5,Kf5
14822948,429387,45,Bg3,Kxe6
14822949,429387,46,Kc3,Kd5


In [105]:
df_moves['move_num'].max()

295

### Make sure that all moves have correct [Algebraic notation](https://en.wikipedia.org/wiki/Algebraic_notation_(chess))

#### Format
piece_name, disambiguating (optional), capture (optional), square_name, pawn_promotion (optional), check or checkmate (optional) \
kingside or queenside castling, check or checkmate (optional)

In [106]:
# A move is accepted if it is either a regular piece/pawn move or a castling move;
# everything else (including missing values) is listed as invalid.
piece_move_ok = df_moves['move'].str.match(r'(^|[KQRBN])[a-h]?[1-8]?([a-h]?x)?[a-h][1-8](=[QRBN])?[\+#]?$').eq(True)
castling_ok = df_moves['move'].str.match(r'^(O-O|O-O-O)[\+#]?$').eq(True)
filt = ~piece_move_ok & ~castling_ok
df_moves[filt]

Unnamed: 0,match_id,move_num,player,move


#### OLD

In [132]:
# Old variant: checks the 'white_move' column of the old df_moves layout.
# Rows matching neither the piece/pawn pattern nor the castling pattern are shown.
filt = ~(df_moves['white_move'].str.match(r'(^|[KQRBN])[a-h]?[1-8]?([a-h]?x)?[a-h][1-8](=[QRBN])?[\+#]?$') == True) \
     & ~(df_moves['white_move'].str.match(r'^(O-O|O-O-O)[\+#]?$')== True)
# Earlier castling check, replaced by the regex above:
# & ~(df_moves['white_move'].isin(['O-O', 'O-O+', 'O-O-O', 'O-O-O+']))
df_moves[filt]

Unnamed: 0,match_id,move_num,white_move,black_move


In [123]:
# Old variant: checks the 'black_move' column of the old df_moves layout.
# Missing black moves (no reply to the last white move) are not reported.
filt = ~(df_moves['black_move'].str.match(r'(^|[KQRBN])[a-h]?[1-8]?([a-h]?x)?[a-h][1-8](=[QRBN])?[\+#]?$') == True) \
     & ~(df_moves['black_move'].str.match(r'^(O-O|O-O-O)[\+#]?$')== True) & ~df_moves['black_move'].isnull()
# Earlier castling check, replaced by the regex above:
# & ~(df_moves['white_move'].isin(['O-O', 'O-O+', 'O-O-O', 'O-O-O+']))
df_moves[filt]

Unnamed: 0,match_id,move_num,white_move,black_move


In [135]:
# Same move pattern as in the validation above, with named groups,
# tried on one sample move to inspect the captured parts.
an_move_pattern = re.compile(r'(?P<piece_name>^|[KQRBN])[a-h]?[1-8]?(?P<capture>[a-h]?x)?[a-h][1-8](?P<promotion>=[QRBN])?(?P<position>[\+#])?$')
m = an_move_pattern.match('Qh4xe1+')
m.groupdict()

{'piece_name': 'Q', 'capture': 'x', 'promotion': None, 'position': '+'}

In [None]:
# %%time
# for i, row in df.iterrows():
#     df.at[i, 'Event'] = df.at[i, 'Event']

In [None]:
# %%time
# for i in df.index:
#     df['Event'].iloc[i] = df['Event'].iloc[i]