In [67]:
import re
import pandas as pd

pd.set_option('display.max_columns', None)

In [68]:
%%time
df = pd.read_csv('chess_games.csv', nrows=10,
                usecols=['Event', 'Result', 'UTCDate', 'Opening', 'Termination', 'AN']) # chunksize=100000, index_col=''

CPU times: total: 31.2 ms
Wall time: 14 ms


In [41]:
df.head(3)

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN
0,Classical,1-0,2016.06.30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1,Blitz,0-1,2016.06.30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2,Blitz tournament,1-0,2016.06.30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....


In [42]:
filt = (df['Result'] != '*') & (df['Termination'] != 'Abandoned') & (df['Termination'] != 'Rules infraction')
df = df[filt]

In [43]:
df['UTCDate'] = pd.to_datetime(df['UTCDate'])

In [44]:
df['Tournament'] = df['Event'].str.contains('tournament')

In [45]:
df['Event'] = df['Event'].map({' Classical ': 'Classical',
                               ' Blitz ': 'Blitz',
                               ' Blitz tournament ': 'Blitz',
                               ' Correspondence ': 'Classical',
                               ' Classical tournament ': 'Classical',
                               ' Bullet tournament ': 'Bullet',
                               ' Bullet ': 'Bullet',
                               'Blitz tournament ': 'Blitz',
                               'Bullet ': 'Bullet',
                               'Classical ': 'Classical',
                               'Blitz ': 'Blitz'
                               })

In [46]:
df.head()

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN,Tournament
0,Classical,1-0,2016-06-30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...,False
1,Blitz,0-1,2016-06-30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...,False
2,Blitz,1-0,2016-06-30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....,True
3,Classical,1-0,2016-06-30,Van't Kruijs Opening,Normal,1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...,False
4,Blitz,0-1,2016-06-30,"Sicilian Defense: Najdorf, Lipnitsky Attack",Time forfeit,1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...,True


In [47]:
filt = df["AN"].apply(len) > 50
df = df[filt]

In [48]:
# value_counts = df.stack().value_counts() # Entire DataFrame 
# to_remove = value_counts[value_counts <= 1500].index
# df.replace(to_remove, np.nan, inplace=True)

In [49]:
vc = df['Opening'].value_counts()
vals_to_remove = vc[vc <= 1500].index.values
df['Opening'].loc[df['Opening'].isin(vals_to_remove)] = '111'
filt = df['Opening'] != '111'
df = df[filt]

In [50]:
df = df.reset_index(drop=True)

In [69]:
def get_raw_list(row: str) -> list:
    '''Parse a string to list of moves.
       List elements are strings.'''
    raw_list = []
    raw_list.append(row)
    sep_num = 2
    while True:
        separator = str(sep_num)+'.'
        if separator in raw_list[-1]:
            raw_list = raw_list[0:-1] + raw_list[-1].split(' '+separator) # concatenate all previous moves and remainder
            raw_list[-1] = str(sep_num) + raw_list[-1]
            sep_num+=1
        else:
            raw_list[-1] = re.sub(' 1-0', '', raw_list[-1])
            raw_list[-1] = re.sub(' 0-1', '', raw_list[-1])
            raw_list[-1] = re.sub(' 1/2-1/2', '', raw_list[-1])
            raw_list[0] = re.sub('1.', '1', raw_list[0])
            break
    return raw_list

In [52]:
# def get_tuples_list(raw_list: list, fk_id) -> list:
#     '''Parse each element of the raw_list to tuple.'''
#     tuples_list = []
#     for elem in raw_list:
#         tuple_elem = tuple(list(fk_id) + elem.split(' '))
#         tuples_list.append(tuple_elem)
#     return tuples_list

In [53]:
# def get_moves_as_list(row: str) -> list:
#     raw_list = get_raw_list(row)
#     #tuples_list = get_tuples_list(raw_list, '_') 
#     return raw_list

In [70]:
%%time
df["AN"] = df["AN"].apply(get_raw_list)

CPU times: total: 15.6 ms
Wall time: 8 ms


In [71]:
def create_moves_table(df):
    df_moves_total = pd.DataFrame()
    
    for row in range(len(df)):
        df_moves = pd.DataFrame()
        
        for move in range(len(df['AN'][row])):
            tuple_move = tuple([row] + df['AN'][row][move].split(' ')) # ('0','1','e4','b6') or ('0', '38', 'Rfd1', None)
            
            if len(tuple_move) == 3:
                tuple_move = tuple_move + (None, ) # last move
            elif len(tuple_move) < 3 or len(tuple_move) > 4:
                print(f'Unexpected number of elements after split: {len(tuple_move)} elements in {tuple_move} in {row} row')
            
            df_temp = pd.DataFrame([tuple_move], columns=['match_id', 'move_num', 'white_move', 'black_move']) # parse each list element
            df_moves = pd.concat([df_moves, df_temp]) # add to df for this row (match) # add %time for this row
        
        df_moves_total = pd.concat([df_moves_total, df_moves]) # add %time for this row
    
    print('Done')
    return df_moves_total

### Get second data frame with moves per match

In [72]:
%%time
df_moves_total = create_moves_table(df)

Done
CPU times: total: 484 ms
Wall time: 523 ms


In [73]:
df_moves_total.head(3)

Unnamed: 0,match_id,move_num,white_move,black_move
0,0,1,d4,d5
0,0,2,c4,c6
0,0,3,e3,a6


In [74]:
df_moves_total.tail(3)

Unnamed: 0,match_id,move_num,white_move,black_move
0,9,41,Kg3,Qxd3+
0,9,42,Kh2,Qd1
0,9,43,Kg3,


In [21]:
# for row in range(df.shape[0]):
#     cell = df.loc[row, 'AN']
#     if type(cell) == str:
#         cell = get_moves_as_list(row)
#     else:
#         print(f'In row {row} value is a {type(cell)}')

In [22]:
# v = pd.DataFrame([('0','38','Rfd1'),
#                   ('0','2','d4','Bb7')
#                  ],
#                   columns=['match_id', 'move_num', 'white_move', 'black_move'])
# v1 = pd.DataFrame([('1','1','e4','b6'),
#                   ('1','2','d4','Bb7')
#                  ],
#                   columns=['match_id', 'move_num', 'white_move', 'black_move'])
# v = pd.concat([v, v1])
# v

In [62]:
for x in range(len(df['AN'][33])):
    print(df['AN'][33][x])
    print('===')

1. e4 { [%eval 0.27] } 1... e5 { [%eval 0.27] }
===
 f4?! { [%eval -0.26] }
===
.. Nc6?! { [%eval 0.34] } 3. fxe5?? { [%eval -4.91] } 3... Nxe5?? { [%eval 0.59] } 4. Nf3?! { [%eval 0.03] } 4... d6?! { [%eval 0.76] } 5. Nxe5? { [%eval -1.06] } 5... dxe5?! { [%eval -0.32] } 6. Qf3 { [%eval -0.23] } 6... Bd6 { [%eval 0.0] } 7. Bc4 { [%eval -0.15] } 7... Qe7 { [%eval 0.24] } 8. Nc3 { [%eval 0.27] } 8... Nf6 { [%eval 0.38] } 9. Nd5?! { [%eval -0.26] } 9... Nxd5 { [%eval -0.32] } 10. Bxd5 { [%eval -0.28] } 10... c6 { [%eval -0.25] } 11. Bc4 { [%eval -0.5] } 11... Be6 { [%eval -0.12] } 12. Bb3 { [%eval -0.4] } 12... Qd7 { [%eval -0.03] } 13. O-O { [%eval -0.34] } 13... Bxb3 { [%eval -0.27] } 14. axb3 { [%eval -0.23] } 14... f6 { [%eval -0.12] } 15. Qh5+ { [%eval -0.25] } 15... Qf7 { [%eval -0.27] } 16. Qg4 { [%eval -0.29] } 16... h5 { [%eval -0.24] } 17. Qf5 { [%eval -0.24] } 17... Be7 { [%eval 0.07] } 18. Ra4?! { [%eval -0.54] } 18... g6?! { [%eval 0.21] } 19. Qf3 { [%eval -0.21] } 19... Qe6