In [5]:
import re
import pandas as pd

pd.set_option('display.max_columns', None)

### Extracting data

In [6]:
%%time
df = pd.read_csv('chess_games.csv', nrows=100000,
                usecols=['Event', 'Result', 'UTCDate', 'Opening', 'Termination', 'AN']) # chunksize=100000, index_col=''

CPU times: total: 1.05 s
Wall time: 1.06 s


In [118]:
df.head(3)

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN
0,Classical,1-0,2016.06.30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1,Blitz,0-1,2016.06.30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2,Blitz tournament,1-0,2016.06.30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....


### Transform data

#### Get rid of ambiguous match results and matches that was abandoned or have rules infractions

In [119]:
filt = (df['Result'] != '*') & (df['Termination'] != 'Abandoned') & (df['Termination'] != 'Rules infraction')
df = df[filt]

#### Change column datatype

In [120]:
df['UTCDate'] = pd.to_datetime(df['UTCDate'])

#### Create column "Tournament"

In [121]:
df['Tournament'] = df['Event'].str.contains('tournament')

#### Rearrange column "Event"

In [122]:
df['Event'] = df['Event'].map({' Classical ': 'Classical',
                               ' Blitz ': 'Blitz',
                               ' Blitz tournament ': 'Blitz',
                               ' Correspondence ': 'Classical',
                               ' Classical tournament ': 'Classical',
                               ' Bullet tournament ': 'Bullet',
                               ' Bullet ': 'Bullet',
                               'Blitz tournament ': 'Blitz',
                               'Bullet ': 'Bullet',
                               'Classical ': 'Classical',
                               'Blitz ': 'Blitz'
                               })

In [123]:
df.head()

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN,Tournament
0,Classical,1-0,2016-06-30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...,False
1,Blitz,0-1,2016-06-30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...,False
2,Blitz,1-0,2016-06-30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....,True
3,Classical,1-0,2016-06-30,Van't Kruijs Opening,Normal,1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...,False
4,Blitz,0-1,2016-06-30,"Sicilian Defense: Najdorf, Lipnitsky Attack",Time forfeit,1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...,True


#### Remove short matches (~less than 10 moves)

In [124]:
filt = df["AN"].apply(len) > 50
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]

1161 rows was removed


#### Get rid of unstructured string values in "AN" column

In [125]:
filt = ~df['AN'].str.contains('\[%eval')
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]

12795 rows was removed


In [73]:
# if '[%eval' in df['AN'][998]:
#     print('ERR')

In [126]:
# for x in range(len(df['AN'][33])):
#     print(df['AN'][33][x])
#     print('===')
#     if '[%eval' in df['AN'][33]:
#         print('Err')

#### Remove unpopular openings (less than 1500 matches)

In [127]:
# value_counts = df.stack().value_counts() # Entire DataFrame 
# to_remove = value_counts[value_counts <= 1500].index
# df.replace(to_remove, np.nan, inplace=True)

In [128]:
vc = df['Opening'].value_counts()
vals_to_remove = vc[vc < 1500].index.values
df['Opening'].loc[df['Opening'].isin(vals_to_remove)] = '111'
filt = df['Opening'] != '111'
df = df[filt]

#### Reset index

In [129]:
df = df.reset_index(drop=True)

#### Parse column "AN". Transform the string of moves to list

In [130]:
def get_raw_list(row: str) -> list:
    '''Parse a string to list of moves.
       List elements are strings.'''
    raw_list = []
    raw_list.append(row)
    sep_num = 2
    while True:
        separator = str(sep_num)+'.'
        if separator in raw_list[-1]:
            raw_list = raw_list[0:-1] + raw_list[-1].split(' '+separator) # concatenate all previous moves and remainder
            raw_list[-1] = str(sep_num) + raw_list[-1]
            sep_num+=1
        else:
            raw_list[-1] = re.sub(' 1-0', '', raw_list[-1])
            raw_list[-1] = re.sub(' 0-1', '', raw_list[-1])
            raw_list[-1] = re.sub(' 1/2-1/2', '', raw_list[-1])
            raw_list[0] = re.sub('1.', '1', raw_list[0])
            break
    return raw_list

In [131]:
# def get_tuples_list(raw_list: list, fk_id) -> list:
#     '''Parse each element of the raw_list to tuple.'''
#     tuples_list = []
#     for elem in raw_list:
#         tuple_elem = tuple(list(fk_id) + elem.split(' '))
#         tuples_list.append(tuple_elem)
#     return tuples_list

In [132]:
# def get_moves_as_list(row: str) -> list:
#     raw_list = get_raw_list(row)
#     #tuples_list = get_tuples_list(raw_list, '_') 
#     return raw_list

In [133]:
%%time
df["AN"] = df["AN"].apply(get_raw_list)

CPU times: total: 328 ms
Wall time: 320 ms


#### Get second data frame with moves per match

In [134]:
def create_moves_table(df):
    df_moves_total = pd.DataFrame()
    
    for row in range(len(df)):
        df_moves = pd.DataFrame()
        
        for move in range(len(df['AN'][row])):
            tuple_move = tuple([row] + df['AN'][row][move].split(' ')) # ('0','1','e4','b6') or ('0', '38', 'Rfd1', None)
            
            if len(tuple_move) == 3:
                tuple_move = tuple_move + (None, ) # last move
            elif len(tuple_move) < 3 or len(tuple_move) > 4:
                print(f'Unexpected number of elements after split: {len(tuple_move)} elements in {tuple_move} in {row} row')
            
            df_temp = pd.DataFrame([tuple_move], columns=['match_id', 'move_num', 'white_move', 'black_move']) # parse each list element
            df_moves = pd.concat([df_moves, df_temp]) # add to df for this row (match) # add %time for this row
        
        df_moves_total = pd.concat([df_moves_total, df_moves]) # add %time for this row
    
    print('Done')
    return df_moves_total

In [135]:
%%time
df_moves_total = create_moves_table(df)

Done
CPU times: total: 2min 6s
Wall time: 2min 11s


In [136]:
df_moves_total.head(3)

Unnamed: 0,match_id,move_num,white_move,black_move
0,0,1,e4,d5
0,0,2,exd5,Qxd5
0,0,3,Nf3,Bg4


In [137]:
df_moves_total.tail(3)

Unnamed: 0,match_id,move_num,white_move,black_move
0,3391,40,Kxh5,Nxf4+
0,3391,41,Bxf4,Rxf4
0,3391,42,d5,Rh4#


In [141]:
length = 0
for row in range(len(df)):
    length = length + len(df['AN'][row])
length

116130

In [143]:
len(df_moves_total)

116130

In [21]:
# for row in range(df.shape[0]):
#     cell = df.loc[row, 'AN']
#     if type(cell) == str:
#         cell = get_moves_as_list(row)
#     else:
#         print(f'In row {row} value is a {type(cell)}')

In [22]:
# v = pd.DataFrame([('0','38','Rfd1'),
#                   ('0','2','d4','Bb7')
#                  ],
#                   columns=['match_id', 'move_num', 'white_move', 'black_move'])
# v1 = pd.DataFrame([('1','1','e4','b6'),
#                   ('1','2','d4','Bb7')
#                  ],
#                   columns=['match_id', 'move_num', 'white_move', 'black_move'])
# v = pd.concat([v, v1])
# v