In [1]:
import re
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

## Extracting data

In [7]:
%%time
df = pd.read_csv('chess_games.csv', nrows=1000000,
                usecols=['Event', 'Result', 'UTCDate', 'Opening', 'Termination', 'AN']) # chunksize=100000, index_col=''
print(f"{len(df)} rows was extracted")

1000000 rows was extracted
CPU times: total: 11 s
Wall time: 14.5 s


In [8]:
df.head(3)

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN
0,Classical,1-0,2016.06.30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1,Blitz,0-1,2016.06.30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2,Blitz tournament,1-0,2016.06.30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....


## Transform data

### Filter dataframe

#### Get rid of ambiguous match results and matches that was abandoned or have rules infractions

In [9]:
filt = (df['Result'] != '*') & (df['Termination'] != 'Abandoned') & (df['Termination'] != 'Rules infraction')
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

2389 rows was removed
997611 rows left


#### Remove short matches (~less than 10 moves)

In [10]:
filt = df["AN"].apply(len) > 50
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

11218 rows was removed
986393 rows left


#### Get rid of unstructured string values in "AN" column

In [11]:
filt = ~df['AN'].str.contains('\[%eval')
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

127159 rows was removed
859234 rows left


#### Remove unpopular openings (less than 1500 matches)

In [12]:
# value_counts = df.stack().value_counts() # Entire DataFrame 
# to_remove = value_counts[value_counts <= 1500].index
# df.replace(to_remove, np.nan, inplace=True)

In [13]:
vc = df['Opening'].value_counts()
vals_to_remove = vc[vc < 1500].index.values
df['Opening'].loc[df['Opening'].isin(vals_to_remove)] = '111'
filt = df['Opening'] != '111'
print(f"{len(df)-len(df[filt])} rows was removed")
df = df[filt]
print(f"{len(df)} rows left")

353753 rows was removed
505481 rows left


### Expand and rearrange dataframe

#### Reset index

In [14]:
df = df.reset_index(drop=True)

#### Create column "Tournament"

In [15]:
df['Tournament'] = df['Event'].str.contains('tournament')

#### Rearrange column "Event"

In [16]:
df['Event'] = df['Event'].map({' Classical ': 'Classical',
                               ' Blitz ': 'Blitz',
                               ' Blitz tournament ': 'Blitz',
                               ' Correspondence ': 'Classical',
                               ' Classical tournament ': 'Classical',
                               ' Bullet tournament ': 'Bullet',
                               ' Bullet ': 'Bullet',
                               'Blitz tournament ': 'Blitz',
                               'Bullet ': 'Bullet',
                               'Classical ': 'Classical',
                               'Blitz ': 'Blitz'
                               })

#### Change columns datatype

In [27]:
df['Event'] = df['Event'].astype('category')
df['Result'] = df['Result'].astype('category')
df['Termination'] = df['Termination'].astype('category')
df['UTCDate'] = pd.to_datetime(df['UTCDate'])

In [29]:
df.head()

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN,Tournament
0,Classical,1-0,2016-06-30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...,False
1,Blitz,1-0,2016-06-30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....,True
2,Classical,1-0,2016-06-30,Van't Kruijs Opening,Normal,1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...,False
3,Blitz,0-1,2016-06-30,Vienna Game,Normal,1. e4 e5 2. Nc3 d6 3. Nf3 h6 4. Bc4 c6 5. b3 Q...,True
4,Blitz,1-0,2016-06-30,Queen's Pawn Game: London System,Normal,1. d4 d5 2. Nf3 Nf6 3. Bf4 c6 4. e3 Bg4 5. Be2...,True


#### Parse column "AN". Transform the string of moves to list

In [30]:
def get_raw_list(row: str) -> list:
    '''Parse a string to list of moves.
       List elements are strings.'''
    raw_list = []
    raw_list.append(row)
    sep_num = 2
    while True:  # parse as many moves as possible
        separator = str(sep_num)+'.'
        if separator in raw_list[-1]:  # if the next move is found
            raw_list = raw_list[0:-1] + raw_list[-1].split(' '+separator) # concatenate all previous moves and remainder
            raw_list[-1] = str(sep_num) + raw_list[-1]
            sep_num+=1
        else:  # if the next move is not found
            raw_list[-1] = re.sub(' 1-0',     '', raw_list[-1])
            raw_list[-1] = re.sub(' 0-1',     '', raw_list[-1])
            raw_list[-1] = re.sub(' 1/2-1/2', '', raw_list[-1])
            raw_list[-1] = re.sub(' \*',      '', raw_list[-1])
            raw_list[0] = re.sub('1.', '1', raw_list[0])
            break
    return raw_list

In [31]:
%%time
df["AN"] = df["AN"].apply(get_raw_list)

CPU times: total: 47.1 s
Wall time: 48.2 s


#### NEW: apply

In [32]:
def populate_list_moves_total(row: list):
    for move in row:
        tuple_move = tuple([0] + move.split(' '))

        if len(tuple_move) == 3:
            tuple_move = tuple_move + (None, ) # ('0', '38', 'Rfd1', None)
        elif len(tuple_move) < 3 or len(tuple_move) > 4:
            print(f'Unexpected number of elements after split: {len(tuple_move)} elements in {tuple_move} in {row} row')

        list_moves_total.append(tuple_move)

In [33]:
%%time
%%capture
list_moves_total: list[tuple] = []
df["AN"].apply(populate_list_moves_total)

CPU times: total: 28.6 s
Wall time: 51.8 s


In [34]:
len(list_moves_total)

17541335

#### list_moves_total -> df_moves_total

In [35]:
%%time
%%capture
df_moves_total = pd.DataFrame(list_moves_total, columns=['match_id', 'move_num', 'white_move', 'black_move'])

CPU times: total: 29.3 s
Wall time: 2min 9s


In [40]:
df_moves_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17541335 entries, 0 to 17541334
Data columns (total 4 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   match_id    int64 
 1   move_num    object
 2   white_move  object
 3   black_move  object
dtypes: int64(1), object(3)
memory usage: 535.3+ MB


#### OLD: Get second data frame with moves per match

In [48]:
def create_moves_table(df):
    df_moves_total = pd.DataFrame()
    
    for row in range(len(df)):
        df_moves = pd.DataFrame()
        
        for move in range(len(df['AN'][row])):
            tuple_move = tuple([row] + df['AN'][row][move].split(' ')) # ('0','1','e4','b6')
            
            if len(tuple_move) == 3:
                tuple_move = tuple_move + (None, ) # ('0', '38', 'Rfd1', None)
            elif len(tuple_move) < 3 or len(tuple_move) > 4:
                print(f'Unexpected number of elements after split: {len(tuple_move)} elements in {tuple_move} in {row} row')
            
            df_temp = pd.DataFrame([tuple_move], columns=['match_id', 'move_num', 'white_move', 'black_move']) # parse each list element
            df_moves = pd.concat([df_moves, df_temp]) # add to df for this row (match)
        
        df_moves_total = pd.concat([df_moves_total, df_moves])
    
    print('Done')
    return df_moves_total

In [49]:
%%time
df_moves_total = create_moves_table(df)

Done
CPU times: total: 531 ms
Wall time: 518 ms


#### NEW: bare

In [20]:
%%time
list_moves_total: list[tuple] = [] # pd.DataFrame()

for i, row in enumerate(df['AN'].values):
    #df_moves: list[tuple] = [] # pd.DataFrame()
    
    for move in row:
        tuple_move = tuple([i] + move.split(' '))

        if len(tuple_move) == 3:
            tuple_move = tuple_move + (None, ) # ('0', '38', 'Rfd1', None)
        elif len(tuple_move) < 3 or len(tuple_move) > 4:
            print(f'Unexpected number of elements after split: {len(tuple_move)} elements in {tuple_move} in {row} row')

        #df_temp = pd.DataFrame([tuple_move], columns=['match_id', 'move_num', 'white_move', 'black_move'])
        #df_moves.append(tuple_move)
        #df_moves = pd.concat([df_moves, df_temp])
    
    #df_moves_total = pd.concat([df_moves_total, df_moves])
        list_moves_total.append(tuple_move)

CPU times: total: 0 ns
Wall time: 1 ms


#### Draft

In [22]:
# %%time
# for i, row in df.iterrows():
#     df.at[i, 'Event'] = df.at[i, 'Event']

In [23]:
# %%time
# for i in df.index:
#     df['Event'].iloc[i] = df['Event'].iloc[i]

In [24]:
df['AN'].values[0][0]

'1 d4 d5'

In [None]:
# %%time
# df_moves_total = create_moves_table_no_fk_1(df['AN'].values)

In [71]:
# dfm = pd.DataFrame([df_moves_total], columns=['match_id', 'move_num', 'white_move', 'black_move'])#.head(3)
# dfm

In [209]:
df_moves_total.tail(3)

Unnamed: 0,match_id,move_num,white_move,black_move
0,0,13,Rxd2,Be6
0,0,14,b3,Rfd8
0,0,15,Ba3,Rxd2


In [210]:
length = 0
for row in range(len(df)):
    length = length + len(df['AN'][row])
length

30793

In [6]:
len(df_moves_total)

453

In [22]:
# v = pd.DataFrame([('0','38','Rfd1'),
#                   ('0','2','d4','Bb7')
#                  ],
#                   columns=['match_id', 'move_num', 'white_move', 'black_move'])
# v1 = pd.DataFrame([('1','1','e4','b6'),
#                   ('1','2','d4','Bb7')
#                  ],
#                   columns=['match_id', 'move_num', 'white_move', 'black_move'])
# v = pd.concat([v, v1])
# v