# Imports

In [None]:
import os
import pandas as pd
import datetime
import re

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

## load sample raw data


In [None]:
import pickle

# The sample lives in ../data relative to this notebook; path_to_data is
# reused further down when the pipeline is saved.
path_to_data = os.path.join("..", "data")
sample_fname = "sample_raw.pkl"
sample_path = os.path.join(path_to_data, sample_fname)

# NOTE: pickle.load executes whatever the file tells it to -- only use it on
# files you created yourself.
with open(sample_path, "rb") as file:
    raw_data = pickle.load(file)

# Peek at the first element to see the raw structure.
raw_data[:1]

## create cleaning pipeline

steps:
- convert the raw games (a list of dicts) into a pandas DataFrame, one row per game
- convert the date column to pandas datetimes

In [None]:
# .strftime('%d-%m-%Y')

In [None]:
# Take the first raw game as a fixture for trying out the extractor below.
test_game = raw_data[0]
def extract_game_data(game, USERNAME = "JammyNinja"):
    """
    Flatten one game, as received from the chess.com API JSON, into a dict.

    Try out the raw form at:
        https://api.chess.com/pub/player/jammyninja/games/2023/01

    Parameters
    ----------
    game : dict
        A single raw game. The keys read here are "pgn", "time_control",
        "time_class", "rules", plus "white" and "black", each of which must
        hold "username", "rating" and "result".
    USERNAME : str
        The player whose point of view the "user_*" fields describe. It is
        compared case-insensitively with the white player's username, since
        the same player can be spelled "JammyNinja" or "jammyninja" (as in
        the URL above).

    Returns
    -------
    dict
        Keys, in insertion order: result, time_control, time_class, rules,
        date (a 'dd/mm/YYYY' string), user_colour, opp_username, user_rating,
        opp_rating, user_result, opp_result.

    Raises
    ------
    KeyError
        If any of the keys listed above is missing from ``game``.
    AttributeError
        If the PGN text contains no ``[Date "...."]`` tag (the regex search
        returns None).

    Notes
    -----
    If USERNAME does not match the white player, the user is assumed to be
    black; the black username is not checked.
    """
    game_dict = {}

    game_pgn = game["pgn"]
    # Whatever follows the last "}" in the PGN text -- presumably the result
    # token (e.g. "1-0") that comes after the final move annotation.
    game_dict["result"] = game_pgn.split("}")[-1].strip()
    game_dict['time_control'] = game["time_control"]
    game_dict['time_class'] = game["time_class"]
    game_dict['rules'] = game["rules"] #to exclude any chess960 or other that may have been played

    # extract date from pgn 'date' field and store it as a dd/mm/YYYY string
    date_pattern = r'\[Date\s+"(\d{4}.\d{2}.\d{2})"\]'
    date_str = re.search(date_pattern, game_pgn).group(1)
    game_dict['date'] = pd.to_datetime(date_str).strftime('%d/%m/%Y')

    # Case-insensitive match: an exact-case comparison would silently label
    # every game as black when the caller passes a differently-cased name.
    user_is_white = game["white"]["username"].casefold() == USERNAME.casefold()
    user_colour = "white" if user_is_white else "black"
    opp_colour = "black" if user_is_white else "white"

    game_dict["user_colour"] = user_colour
    game_dict['opp_username'] = game[opp_colour]["username"]

    game_dict['user_rating'] = game[user_colour]["rating"]
    game_dict['opp_rating'] = game[opp_colour]["rating"]

    game_dict['user_result'] = game[user_colour]["result"]
    game_dict['opp_result'] = game[opp_colour]["result"]

    return game_dict

# Replace the raw fixture with its flattened dict form and display it.
test_game = extract_game_data(test_game)
test_game

In [None]:
def type_conversion(df):
    """
    Parse the date column(s) of ``df`` into pandas datetimes.

    Values are read as day/month/year ('%d/%m/%Y'). The object passed in is
    modified in place and the same object is returned.
    """
    date_format = '%d/%m/%Y'
    for name in ("date",):
        parsed = pd.to_datetime(df[name], format=date_format)
        df[name] = parsed
    return df
# Quick check on a single game: wrap the flattened dict in a Series and convert its date.
type_conversion(pd.Series(test_game))

In [None]:
def games_to_pandas(raw_games_list):
    """
    Build a DataFrame with one row per raw game.

    Each element of ``raw_games_list`` is flattened with extract_game_data
    (using its default USERNAME) before the frame is assembled.
    """
    rows = [extract_game_data(raw_game) for raw_game in raw_games_list]
    frame = pd.DataFrame(rows)
    return frame
# Try the conversion on the first 15 raw games.
games_to_pandas(raw_data[:15])

In [None]:
# Wrap each plain function in a FunctionTransformer so it can act as a
# pipeline step: first list-of-games -> DataFrame, then the dtype fix-ups.
to_frame_step = ("raw list to df", FunctionTransformer(games_to_pandas))
dtypes_step = ("dtypes", FunctionTransformer(type_conversion))
clean_pipe = Pipeline(steps=[to_frame_step, dtypes_step])

# Run the pipeline on the first 15 raw games and show the result.
clean_data = clean_pipe.fit_transform(raw_data[:15])
clean_data

In [None]:
# Inspect the column dtypes after the pipeline has run.
clean_data.dtypes

In [None]:
clean_pipe_fname = "clean_pipeline.pkl"
clean_pipe_path = os.path.join(path_to_data, "pipes", clean_pipe_fname)

# open(..., "wb") does not create missing parent directories, so make sure
# the "pipes" folder exists before writing.
os.makedirs(os.path.dirname(clean_pipe_path), exist_ok=True)

# NOTE: pickle stores plain functions by reference (module + name), so the
# functions wrapped by the pipeline steps must be importable under the same
# names wherever this file is loaded.
with open(clean_pipe_path, "wb") as file:
    pickle.dump(clean_pipe, file)

In [None]:
# Write through a context manager so the handle is flushed and closed
# right away instead of being left open until garbage collection.
with open(clean_pipe_path, "wb") as pipe_file:
    pickle.dump(clean_pipe, pipe_file)

In [None]:
# Load the pipeline back from disk to check the round trip; the context
# manager closes the file handle as soon as the load finishes.
with open(clean_pipe_path, "rb") as pipe_file:
    test_clean_pipe = pickle.load(pipe_file)
test_clean_pipe

In [None]:
# Run the reloaded pipeline over the full raw sample (not just the first 15 games).
test_clean_pipe.transform(raw_data)

### try importing from local scripts

The notebook code has been shipped out into `public/back/api/src/preprocessing.py`; the cells below import it from there.

In [None]:
! ls ../../public/back/api/src

In [None]:
import os
import sys

# Make the shipped source folder importable from this notebook.
module_path = os.path.abspath(os.path.join('..','..','public','back','api'))
src_path = os.path.join(module_path, "src")

# Guard on the path that is actually appended. Checking module_path here
# would skip adding src_path whenever module_path is already on sys.path,
# and the import of preprocessing would then fail.
if src_path not in sys.path:
    sys.path.append(src_path)

# display is provided by the notebook environment.
display(sys.path)

In [171]:
from preprocessing import clean_raw_data

In [173]:
# Call the shipped cleaning entry point with its defaults; what it loads and
# returns is defined in preprocessing.py, not in this notebook.
clean_df = clean_raw_data()

In [175]:
# Mean of the opp_rating column of the cleaned data.
clean_df.opp_rating.mean()

np.float64(1064.8798185941043)

### create preproc pipeline

add new columns