# Imports

In [1]:
import os
import pandas as pd
import datetime
import re

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

## load sample raw data


In [50]:
import pickle
# Relative location of the pickled sample of raw games.
path_to_data = os.path.join("..", "data")
sample_fname = "sample_raw.pkl"

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open(os.path.join(path_to_data, sample_fname), "rb") as file:
    raw_data = pickle.load(file)

# Preview only the first record rather than dumping the whole list.
raw_data[:1]

[{'url': 'https://www.chess.com/game/daily/437377841',
  'pgn': '[Event "Let\'s Play"]\n[Site "Chess.com"]\n[Date "2022.10.12"]\n[Round "-"]\n[White "sylvainau"]\n[Black "JammyNinja"]\n[Result "0-1"]\n[CurrentPosition "2r1k2r/pp3ppp/q7/4pb2/1b1P4/1P2B3/1P3PPP/K2R2NR w k - 7 21"]\n[Timezone "UTC"]\n[ECO "B10"]\n[ECOUrl "https://www.chess.com/openings/Caro-Kann-Defense-Hillbilly-Attack-2...d5"]\n[UTCDate "2022.10.12"]\n[UTCTime "08:17:06"]\n[WhiteElo "1298"]\n[BlackElo "1461"]\n[TimeControl "1/604800"]\n[Termination "JammyNinja won by checkmate"]\n[StartTime "08:17:06"]\n[EndDate "2023.01.07"]\n[EndTime "14:35:53"]\n[Link "https://www.chess.com/game/daily/437377841"]\n\n1. e4 {[%clk 0:56:50.2]} 1... c6 {[%clk 0:27:49.5]} 2. Bc4 {[%clk 0:07:35.1]} 2... d5 {[%clk 3:13:05.6]} 3. exd5 {[%clk 0:55:52.1]} 3... cxd5 {[%clk 9:10:15.2]} 4. Bb3 {[%clk 0:51:51.9]} 4... Nf6 {[%clk 1:04:14.9]} 5. d4 {[%clk 1:03:41.6]} 5... Nc6 {[%clk 6:14:14.9]} 6. Qe2 {[%clk 0:53:48.2]} 6... Nxd4 {[%clk 11:09:15.8]}

## create cleaning pipeline

Steps:
- convert the raw list of game dicts into a pandas DataFrame (one row per game)
- convert the `date` column to pandas datetimes (`datetime64`)

In [None]:
# .strftime('%d-%m-%Y')

In [138]:
# Take the first raw game as a fixture for trying out the extractor defined below.
test_game = raw_data[0]
def extract_game_data(game, USERNAME = "JammyNinja"):
    """
    Flatten one game record from the chess.com API JSON into a plain dict.

    Try out the raw form at:
        https://api.chess.com/pub/player/jammyninja/games/2023/01

    Parameters
    ----------
    game : dict
        A single game record. Keys read: "pgn", "time_control", "time_class",
        "rules", plus "white" and "black", each a dict holding "username",
        "rating" and "result".
    USERNAME : str
        Username whose point of view the "user_*" / "opp_*" fields take.
        Compared case-insensitively against both players.

    Returns
    -------
    dict
        Keys, in order: result, time_control, time_class, rules, date
        (a 'dd/mm/YYYY' string), user_colour, opp_username, user_rating,
        opp_rating, user_result, opp_result.

    Raises
    ------
    ValueError
        If the PGN has no parsable [Date "YYYY.MM.DD"] tag, or if USERNAME is
        neither the white nor the black player.
    KeyError
        If an expected key is missing from `game`.
    """
    game_dict = {}

    game_pgn = game["pgn"]

    # Prefer the [Result "..."] header: the text after the last "}" is only
    # the result when the movetext carries clock comments.
    result_match = re.search(r'\[Result\s+"([^"]*)"\]', game_pgn)
    if result_match:
        game_dict["result"] = result_match.group(1)
    else:
        game_dict["result"] = game_pgn.split("}")[-1].strip()

    game_dict['time_control'] = game["time_control"]
    game_dict['time_class'] = game["time_class"]
    game_dict['rules'] = game["rules"] #to exclude any chess960 or other that may have been played

    # Extract the date from the PGN 'Date' tag and parse it with an explicit
    # format instead of relying on pandas' format inference.
    date_match = re.search(r'\[Date\s+"(\d{4})[./-](\d{2})[./-](\d{2})"\]', game_pgn)
    if date_match is None:
        raise ValueError(
            f"no parsable [Date] tag in PGN of game {game.get('url', '<unknown>')}"
        )
    year, month, day = date_match.groups()
    parsed_date = pd.to_datetime(f"{year}-{month}-{day}", format="%Y-%m-%d")
    game_dict['date'] = parsed_date.strftime('%d/%m/%Y')

    # Work out which side the user played; never silently default to black.
    wanted = USERNAME.lower()
    if game["white"]["username"].lower() == wanted:
        user_colour, opp_colour = "white", "black"
    elif game["black"]["username"].lower() == wanted:
        user_colour, opp_colour = "black", "white"
    else:
        raise ValueError(
            f"user {USERNAME!r} is neither white nor black in game "
            f"{game.get('url', '<unknown>')}"
        )

    game_dict["user_colour"] = user_colour
    game_dict['opp_username'] = game[opp_colour]["username"]

    game_dict['user_rating'] = game[user_colour]["rating"]
    game_dict['opp_rating'] = game[opp_colour]["rating"]

    game_dict['user_result'] = game[user_colour]["result"]
    game_dict['opp_result'] = game[opp_colour]["result"]

    return game_dict

# Rebind test_game from the raw record to its extracted dict, then display it.
test_game = extract_game_data(test_game)
test_game

{'result': '0-1',
 'time_control': '1/604800',
 'time_class': 'daily',
 'rules': 'chess',
 'date': '12/10/2022',
 'user_colour': 'black',
 'opp_username': 'sylvainau',
 'user_rating': 1461,
 'opp_rating': 1298,
 'user_result': 'win',
 'opp_result': 'checkmated'}

In [148]:
def type_conversion(df, dt_cols=("date",), date_format='%d/%m/%Y'):
    """
    Return a copy of `df` with its date entries parsed into pandas datetimes.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Data holding 'dd/mm/YYYY'-style date strings under each label in
        `dt_cols`. It is left untouched; the conversion happens on a copy.
    dt_cols : iterable of str
        Labels to convert. Defaults to the single "date" column.
    date_format : str
        strftime-style format the date strings are written in.

    Returns
    -------
    Same type as `df`
        A copy with every label in `dt_cols` converted via pd.to_datetime.

    Raises
    ------
    KeyError
        If a label in `dt_cols` is missing from `df`.
    ValueError
        If a value does not match `date_format`.
    """
    # Copy first so the caller's object is not mutated, which keeps
    # re-running a notebook cell (or a pipeline step) idempotent.
    converted = df.copy()
    for col in dt_cols:
        converted[col] = pd.to_datetime(converted[col], format=date_format)
    return converted
# Try the conversion on the single extracted game, wrapped in a Series.
type_conversion(pd.Series(test_game))

result                          0-1
time_control               1/604800
time_class                    daily
rules                         chess
date            2022-10-12 00:00:00
user_colour                   black
opp_username              sylvainau
user_rating                    1461
opp_rating                     1298
user_result                     win
opp_result               checkmated
dtype: object

In [149]:
def games_to_pandas(raw_games_list):
    """
    Build a DataFrame with one row per game.

    Each element of `raw_games_list` is passed through extract_game_data
    (with its default username) and the resulting records are collected
    into a single pd.DataFrame.
    """
    records = list(map(extract_game_data, raw_games_list))
    return pd.DataFrame(records)
# Preview the frame built from (up to) the first 15 raw games.
games_to_pandas(raw_data[:15])

Unnamed: 0,result,time_control,time_class,rules,date,user_colour,opp_username,user_rating,opp_rating,user_result,opp_result
0,0-1,1/604800,daily,chess,12/10/2022,black,sylvainau,1461,1298,win,checkmated
1,0-1,1/604800,daily,chess,26/11/2022,white,Timkee,1429,1234,timeout,win
2,0-1,1/604800,daily,chess,06/10/2022,black,ELNAHAS4,1449,1427,win,timeout
3,1-0,300,blitz,chess,30/01/2023,black,ivangard57,1030,1022,timeout,win
4,1/2-1/2,1/604800,daily,chess,30/01/2023,black,ajbouchie,1449,1584,agreed,agreed
5,0-1,300,blitz,chess,15/03/2023,black,axdasilva129,1057,999,win,resigned
6,1-0,1/432000,daily,chess,30/01/2023,black,lonelywolf457,1422,1269,timeout,win
7,0-1,1/432000,daily,chess,30/01/2023,white,lonelywolf457,1398,1287,timeout,win
8,1-0,1/604800,daily,chess,26/11/2022,black,Sotttacqua,1388,1556,resigned,win
9,1-0,300,blitz,chess,25/03/2023,black,Bhashyam2K23,1036,1158,timeout,win


In [150]:
# Cleaning stages, applied in order; each plain function is wrapped in a
# FunctionTransformer so it can be used as a Pipeline step.
cleaning_steps = [
    ("raw list to df", FunctionTransformer(games_to_pandas)),
    ("dtypes", FunctionTransformer(type_conversion)),
]
clean_pipe = Pipeline(steps=cleaning_steps)

# Run the pipeline on a small slice of the raw games and display the result.
clean_data = clean_pipe.fit_transform(raw_data[:15])
clean_data

Unnamed: 0,result,time_control,time_class,rules,date,user_colour,opp_username,user_rating,opp_rating,user_result,opp_result
0,0-1,1/604800,daily,chess,2022-10-12,black,sylvainau,1461,1298,win,checkmated
1,0-1,1/604800,daily,chess,2022-11-26,white,Timkee,1429,1234,timeout,win
2,0-1,1/604800,daily,chess,2022-10-06,black,ELNAHAS4,1449,1427,win,timeout
3,1-0,300,blitz,chess,2023-01-30,black,ivangard57,1030,1022,timeout,win
4,1/2-1/2,1/604800,daily,chess,2023-01-30,black,ajbouchie,1449,1584,agreed,agreed
5,0-1,300,blitz,chess,2023-03-15,black,axdasilva129,1057,999,win,resigned
6,1-0,1/432000,daily,chess,2023-01-30,black,lonelywolf457,1422,1269,timeout,win
7,0-1,1/432000,daily,chess,2023-01-30,white,lonelywolf457,1398,1287,timeout,win
8,1-0,1/604800,daily,chess,2022-11-26,black,Sotttacqua,1388,1556,resigned,win
9,1-0,300,blitz,chess,2023-03-25,black,Bhashyam2K23,1036,1158,timeout,win


In [151]:
# Inspect the column dtypes produced by the cleaning pipeline.
clean_data.dtypes

result                  object
time_control            object
time_class              object
rules                   object
date            datetime64[ns]
user_colour             object
opp_username            object
user_rating              int64
opp_rating               int64
user_result             object
opp_result              object
dtype: object

### create preproc pipeline

add new columns