# Imports

In [3]:
import os
import pandas as pd

## load sample raw data


In [34]:
import pickle
path_to_data = os.path.join("..", "data")
sample_fname = "sample_raw.pkl"

with open(os.path.join(path_to_data, sample_fname), "rb") as file:
    raw_data = pickle.load(file)

raw_data

[{'url': 'https://www.chess.com/game/daily/437377841',
  'pgn': '[Event "Let\'s Play"]\n[Site "Chess.com"]\n[Date "2022.10.12"]\n[Round "-"]\n[White "sylvainau"]\n[Black "JammyNinja"]\n[Result "0-1"]\n[CurrentPosition "2r1k2r/pp3ppp/q7/4pb2/1b1P4/1P2B3/1P3PPP/K2R2NR w k - 7 21"]\n[Timezone "UTC"]\n[ECO "B10"]\n[ECOUrl "https://www.chess.com/openings/Caro-Kann-Defense-Hillbilly-Attack-2...d5"]\n[UTCDate "2022.10.12"]\n[UTCTime "08:17:06"]\n[WhiteElo "1298"]\n[BlackElo "1461"]\n[TimeControl "1/604800"]\n[Termination "JammyNinja won by checkmate"]\n[StartTime "08:17:06"]\n[EndDate "2023.01.07"]\n[EndTime "14:35:53"]\n[Link "https://www.chess.com/game/daily/437377841"]\n\n1. e4 {[%clk 0:56:50.2]} 1... c6 {[%clk 0:27:49.5]} 2. Bc4 {[%clk 0:07:35.1]} 2... d5 {[%clk 3:13:05.6]} 3. exd5 {[%clk 0:55:52.1]} 3... cxd5 {[%clk 9:10:15.2]} 4. Bb3 {[%clk 0:51:51.9]} 4... Nf6 {[%clk 1:04:14.9]} 5. d4 {[%clk 1:03:41.6]} 5... Nc6 {[%clk 6:14:14.9]} 6. Qe2 {[%clk 0:53:48.2]} 6... Nxd4 {[%clk 11:09:15.8]}

## create cleaning pipeline

steps:
- convert games from list/dict to pandas Series
- convert dates to pd.datetimes

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [52]:
USERNAME = "JammyNinja"
def game_to_pandas(game):
    """ 
        converts a game as received from chess.com API JSON to pandas Series

        try out the raw form at: 
            https://api.chess.com/pub/player/jammyninja/games/2023/01
    """
    display(game)
    game_dict = {}
    
    game_pgn = game["pgn"]
    game_dict["result"] = game_pgn.split("}")[-1].strip()
    game_dict['time_control'] = game["time_control"]
    game_dict['time_class'] = game["time_class"]
    game_dict['rules'] = game["rules"] #to exclude any chess960 or other that may have been played

    game_dict['start_time'] = game["start_time"]

    user_colour = "white" if game["white"]["username"] == USERNAME else "black"
    opp_colour = "black" if user_colour == "white" else "white" #used to save time below
    
    game_dict['opp_username'] = game[opp_colour]["username"]
    
    game_dict['user_rating'] = game[user_colour]["rating"]
    game_dict['user_result'] = game[user_colour]["result"]

    game_dict['opp_rating'] = game[opp_colour]["rating"]
    game_dict['opp_result'] = game[opp_colour]["result"]

    game_dict["user_colour"] = user_colour
    game_dict["user_rating"] = user_rating
    
    return game_dict
    


game_to_pandas(raw_data[0])

{'url': 'https://www.chess.com/game/daily/437377841',
 'pgn': '[Event "Let\'s Play"]\n[Site "Chess.com"]\n[Date "2022.10.12"]\n[Round "-"]\n[White "sylvainau"]\n[Black "JammyNinja"]\n[Result "0-1"]\n[CurrentPosition "2r1k2r/pp3ppp/q7/4pb2/1b1P4/1P2B3/1P3PPP/K2R2NR w k - 7 21"]\n[Timezone "UTC"]\n[ECO "B10"]\n[ECOUrl "https://www.chess.com/openings/Caro-Kann-Defense-Hillbilly-Attack-2...d5"]\n[UTCDate "2022.10.12"]\n[UTCTime "08:17:06"]\n[WhiteElo "1298"]\n[BlackElo "1461"]\n[TimeControl "1/604800"]\n[Termination "JammyNinja won by checkmate"]\n[StartTime "08:17:06"]\n[EndDate "2023.01.07"]\n[EndTime "14:35:53"]\n[Link "https://www.chess.com/game/daily/437377841"]\n\n1. e4 {[%clk 0:56:50.2]} 1... c6 {[%clk 0:27:49.5]} 2. Bc4 {[%clk 0:07:35.1]} 2... d5 {[%clk 3:13:05.6]} 3. exd5 {[%clk 0:55:52.1]} 3... cxd5 {[%clk 9:10:15.2]} 4. Bb3 {[%clk 0:51:51.9]} 4... Nf6 {[%clk 1:04:14.9]} 5. d4 {[%clk 1:03:41.6]} 5... Nc6 {[%clk 6:14:14.9]} 6. Qe2 {[%clk 0:53:48.2]} 6... Nxd4 {[%clk 11:09:15.8]} 7

{'result': '0-1',
 'time_control': '1/604800',
 'time_class': 'daily',
 'rules': 'chess',
 'start_time': 1665562626}

In [10]:
def date_conversion(df):
    dt_cols = ["date", "start_time", "end_time"]
    for col in dt_cols:
        df[col] = pd.to_datetime(df[col])
    return df

date_conversion(raw_data).dtypes

Unnamed: 0                     int64
date                  datetime64[ns]
start_time            datetime64[ns]
end_time              datetime64[ns]
time_class                    object
time_control                  object
rated                           bool
rules                         object
url                           object
moves                         object
final_position_fen            object
opening_code                  object
opening_name                  object
white_username                object
black_username                object
result                        object
white_rating                   int64
black_rating                   int64
white_result                  object
black_result                  object
dtype: object

In [12]:
date_converter = FunctionTransformer(date_conversion)

clean_pipe = Pipeline([
    ("date dtype", date_converter)
])

In [20]:
clean_pipe.fit_transform(raw_data).dtypes

Unnamed: 0                     int64
date                  datetime64[ns]
start_time            datetime64[ns]
end_time              datetime64[ns]
time_class                    object
time_control                  object
rated                           bool
rules                         object
url                           object
moves                         object
final_position_fen            object
opening_code                  object
opening_name                  object
white_username                object
black_username                object
result                        object
white_rating                   int64
black_rating                   int64
white_result                  object
black_result                  object
dtype: object

In [None]:
test_data = [0,1,2,3,4,5]

test_pipe = Pipeline([
    ("doubler", FunctionTransformer(lambda x : x*2))
])

In [33]:
test_pipe.fit_transform(test_data)

[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5]

### create preproc pipeline

add new columns