In [115]:
import re
import pandas as pd

pd.set_option('display.max_columns', None)

In [151]:
%%time
df = pd.read_csv('chess_games.csv', nrows=1000000,
                usecols=['Event', 'Result', 'UTCDate', 'Opening', 'Termination', 'AN']) # chunksize=100000, index_col=''

CPU times: total: 10.2 s
Wall time: 12 s


In [152]:
df.head(3)

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN
0,Classical,1-0,2016.06.30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1,Blitz,0-1,2016.06.30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2,Blitz tournament,1-0,2016.06.30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....


In [153]:
filt = (df['Result'] != '*') & (df['Termination'] != 'Abandoned') & (df['Termination'] != 'Rules infraction')
df = df[filt]

In [154]:
df['UTCDate'] = pd.to_datetime(df['UTCDate'])

In [155]:
df['Tournament'] = df['Event'].str.contains('tournament')

In [156]:
df['Event'].value_counts()

Event
 Blitz                    374663
 Classical                241289
 Bullet                   196192
 Bullet tournament         79983
 Blitz tournament          74760
 Classical tournament      26900
 Correspondence             3816
Classical                      4
Blitz                          2
Blitz tournament               1
Bullet                         1
Name: count, dtype: int64

In [157]:
df['Event'].unique()

array([' Classical ', ' Blitz ', ' Blitz tournament ', ' Correspondence ',
       ' Classical tournament ', ' Bullet tournament ', ' Bullet ',
       'Blitz tournament ', 'Bullet ', 'Classical ', 'Blitz '],
      dtype=object)

In [158]:
df['Event'] = df['Event'].map({' Classical ': 'Classical',
                               ' Blitz ': 'Blitz',
                               ' Blitz tournament ': 'Blitz',
                               ' Correspondence ': 'Classical',
                               ' Classical tournament ': 'Classical',
                               ' Bullet tournament ': 'Bullet',
                               ' Bullet ': 'Bullet',
                               'Blitz tournament ': 'Blitz',
                               'Bullet ': 'Bullet',
                               'Classical ': 'Classical',
                               'Blitz ': 'Blitz'
                               })

In [159]:
df['Event'].value_counts()

Event
Blitz        449426
Bullet       276176
Classical    272009
Name: count, dtype: int64

In [160]:
df.head()

Unnamed: 0,Event,Result,UTCDate,Opening,Termination,AN,Tournament
0,Classical,1-0,2016-06-30,Slav Defense,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...,False
1,Blitz,0-1,2016-06-30,King's Pawn Opening: 2.b3,Normal,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...,False
2,Blitz,1-0,2016-06-30,Scandinavian Defense: Mieses-Kotroc Variation,Time forfeit,1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....,True
3,Classical,1-0,2016-06-30,Van't Kruijs Opening,Normal,1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...,False
4,Blitz,0-1,2016-06-30,"Sicilian Defense: Najdorf, Lipnitsky Attack",Time forfeit,1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...,True


In [161]:
df['Opening'].value_counts()

Opening
Van't Kruijs Opening                                              21159
Scandinavian Defense: Mieses-Kotroc Variation                     18487
Modern Defense                                                    16940
Horwitz Defense                                                   15408
Sicilian Defense                                                  14136
                                                                  ...  
Queen's Indian, Averbakh Variation                                    1
Russian Game: Modern Attack, Murrey Variation                         1
Four Knights Game: Spanish Variation, Symmetrical Variation #2        1
Ruy Lopez: Classical Defense: Zaitsev Variation                       1
Vienna Game: Mieses Variation, Erben Gambit                           1
Name: count, Length: 2707, dtype: int64

In [162]:
df_open = df['Opening'].value_counts().rename_axis('unique_values').reset_index(name='counts')

In [163]:
df_open[df_open['counts']>1500]
#df_open.head(100)

Unnamed: 0,unique_values,counts
0,Van't Kruijs Opening,21159
1,Scandinavian Defense: Mieses-Kotroc Variation,18487
2,Modern Defense,16940
3,Horwitz Defense,15408
4,Sicilian Defense,14136
...,...,...
146,Slav Defense: Three Knights Variation,1557
147,Ruy Lopez: Classical Variation,1544
148,"Nimzowitsch Defense: Scandinavian Variation, A...",1526
149,Petrov's Defense,1519


In [164]:
# value_counts = df.stack().value_counts() # Entire DataFrame 
# to_remove = value_counts[value_counts <= 1500].index
# df.replace(to_remove, np.nan, inplace=True)

In [165]:
vc = df['Opening'].value_counts()
vals_to_remove = vc[vc <= 1500].index.values
df['Opening'].loc[df['Opening'].isin(vals_to_remove)] = '111'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Opening'].loc[df['Opening'].isin(vals_to_remove)] = '111'


In [166]:
filt = df['Opening'] != '111'
df = df[filt]

In [167]:
df = df.reset_index(drop=True)

In [168]:
def get_moves_as_list(row) -> list:
    ori_list = []
    # ori_list.append(df.loc[row, 'AN'])
    ori_list.append(row)
    sep_num = 2
    while True:
        separator = str(sep_num)+'.'
        if separator in ori_list[-1]:
            ori_list = ori_list[0:-1] + ori_list[-1].split(' '+separator) # concatenate all previous moves and remainder
            ori_list[-1] = separator + ori_list[-1]
            sep_num+=1
        else:
            ori_list[-1] = re.sub(' 1-0', '', ori_list[-1])
            ori_list[-1] = re.sub(' 0-1', '', ori_list[-1])
            ori_list[-1] = re.sub(' 1/2-1/2', '', ori_list[-1])
            break
    return ori_list

In [169]:
%%time
df["AN"] = df["AN"].apply(get_moves_as_list)
# df.loc[0, 'AN']

CPU times: total: 1min 3s
Wall time: 2min 33s


In [None]:
# for row in range(df.shape[0]):
#     cell = df.loc[row, 'AN']
#     if type(cell) == str:
#         cell = get_moves_as_list(row)
#     else:
#         print(f'In row {row} value is a {type(cell)}')

In [None]:
df["AN"].str.len().hist()

In [None]:
filt = df["AN"].str.len() == 6119
df.loc[52412, 'AN']

In [None]:
df.head()

In [None]:
filt = df["AN"].apply(len) < 50
df[filt] # Time forfeit but an is short

In [170]:
df["AN"]

0         [1. d4 d5, 2. c4 c6, 3. e3 a6, 4. Nf3 e5, 5. c...
1         [1. e4 d5, 2. exd5 Qxd5, 3. Nf3 Bg4, 4. Be2 Nf...
2         [1. e3 Nf6, 2. Bc4 d6, 3. e4 e6, 4. Nf3 Nxe4, ...
3         [1. e4 e5, 2. Nc3 d6, 3. Nf3 h6, 4. Bc4 c6, 5....
4         [1. d4 d5, 2. Nf3 Nf6, 3. Bf4 c6, 4. e3 Bg4, 5...
                                ...                        
618862    [1. c4 e5, 2. Nc3 Bc5, 3. Nf3 d6, 4. d3 Nc6, 5...
618863    [1. e4 e5, 2. Nf3 d6, 3. Bc4 h6, 4. c3 Ne7, 5....
618864    [1. d4 d5, 2. Nf3 Nc6, 3. Ng5 e5, 4. h4 e4, 5....
618865    [1. e4 { [%eval 0.18] } 1... e6 { [%eval 0.21]...
618866    [1. e4 e6, 2. f4 d5, 3. f5 c5, 4. fxe6 fxe6, 5...
Name: AN, Length: 618867, dtype: object