In [1]:
import math
from datetime import datetime, timedelta

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow

In [2]:
def get_or_create_experiment(experiment_name):
    """
    Return the ID of the MLflow experiment called `experiment_name`.

    The experiment is looked up by name first. When that lookup yields nothing,
    a new experiment is created under the same name and the ID returned by the
    creation call is handed back instead.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if not existing:
        # Nothing registered under this name yet: create it and return the new ID.
        return mlflow.create_experiment(experiment_name)
    return existing.experiment_id

In [3]:
# Resolve (or create on first run) the MLflow experiment that the training cell
# later activates through mlflow.set_experiment(experiment_id=...).
experiment_id = get_or_create_experiment("XGBoost_Duration_3moves")

In [4]:
from typing import List, Tuple, Dict, Optional

def get_train_split(dp):
    """Split `dp` into train/test parts with `winner` as the target column.

    Every column except `winner` is used as a feature. The return value is
    whatever `train_test_split` yields for (features, target) with
    `random_state=42`.
    """
    target = dp.get("winner")
    features = dp.drop(columns="winner")
    return train_test_split(features, target, random_state=42)


def load_raw_dataset(file_path: str) -> pd.DataFrame:
    """Read the CSV file at `file_path` and return its contents as a DataFrame."""
    raw_frame = pd.read_csv(file_path)
    return raw_frame


def filter_to_rated(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy containing only the rows whose `rated` value equals True.

    When the frame has no `rated` column, an unfiltered copy is returned.
    """
    if 'rated' in df.columns:
        rated_mask = df['rated'].eq(True)
        return df.loc[rated_mask].copy()
    return df.copy()


def remove_duplicate_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy in which each game `id` appears once (first occurrence wins).

    When the frame has no `id` column, an unchanged copy is returned.
    """
    if 'id' in df.columns:
        unique_by_id = df.drop_duplicates(subset=['id'], keep='first')
        return unique_by_id.copy()
    return df.copy()


def remove_duplicate_games(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with one row per (created_at, white_id, black_id) combination.

    The first row of each combination is kept. If any of the three key columns
    is absent, an unchanged copy is returned.
    """
    key_columns = ['created_at', 'white_id', 'black_id']
    if any(column not in df.columns for column in key_columns):
        return df.copy()
    unique_games = df.drop_duplicates(subset=key_columns, keep='first')
    return unique_games.copy()


def add_game_duration_seconds(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with a `time` column replacing the two raw timestamp columns.

    `time` is `last_move_at - created_at` in seconds, with both columns parsed
    as millisecond epoch values. The two source columns are dropped afterwards.
    If either column is absent, an unchanged copy is returned.
    """
    result = df.copy()
    if not {'created_at', 'last_move_at'}.issubset(result.columns):
        return result

    ended = pd.to_datetime(result['last_move_at'], unit='ms')
    started = pd.to_datetime(result['created_at'], unit='ms')
    elapsed = ended - started
    result['time'] = elapsed.dt.total_seconds()
    return result.drop(columns=['last_move_at', 'created_at'])


def split_by_duration_variants(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the two dataset variants used downstream.

    Returns a `(dataset_duration, dataset_noduration)` tuple:
    - dataset_duration: only the rows whose `time` is neither 0 nor 10000.0
    - dataset_noduration: all rows, with the `time` column removed

    If there is no `time` column, both elements are plain copies of `df`.
    """
    base = df.copy()
    if 'time' not in base.columns:
        return base.copy(), base.copy()

    usable_duration = base['time'].ne(0) & base['time'].ne(10000.0)
    with_duration = base.loc[usable_duration].copy()
    without_duration = base.drop(columns=['time']).copy()
    return with_duration, without_duration


def drop_columns(df: pd.DataFrame, columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Return a copy of `df` without the listed columns.

    Names that are not present in the frame are skipped silently. Passing
    None or an empty list simply returns a copy.
    """
    if columns:
        trimmed = df.drop(columns=columns, errors='ignore')
        return trimmed.copy()
    return df.copy()


def first_k(s: str, fk: int) -> Optional[str]:
    """Return the first `fk` whitespace-separated tokens of `s`, joined by single spaces.

    Parameters:
    - s: Move string to truncate. Anything that is not a `str` yields None.
    - fk: Number of leading tokens to keep; callers in this file pass values >= 1.

    Returns:
    - The leading tokens joined with a single space. When `s` holds fewer than
      `fk` tokens, all of them are returned.
    - None when `s` is not a string or contains no tokens.

    The result is always rebuilt from the tokens, so stray leading, trailing or
    repeated whitespace never leaks into the output. This keeps the same move
    prefix mapped to the same string regardless of how long the game was.
    """
    if not isinstance(s, str):
        return None
    parts = s.split()
    if not parts:
        return None
    # Slicing past the end is harmless, so short inputs take the same
    # normalising path as long ones instead of being returned verbatim.
    return ' '.join(parts[:fk])


def keep_first_n_moves(
        df: pd.DataFrame,
        n: int,
        only_n: bool = True,
        new_column: Optional[str] = None,
        add_all_prefix: str = 'moves_'
) -> pd.DataFrame:
    """Derive truncated move sequences from the `moves` column.

    Works on a copy of `df`; a frame without a `moves` column is returned as an
    unchanged copy.

    - only_n=True: store the first `n` tokens of each `moves` value in
      `new_column`, or back into `moves` itself when `new_column` is None.
    - only_n=False: add one column per k in 1..max(1, n), named
      f"{add_all_prefix}{k}" and holding the first k tokens. This function
      leaves the `moves` column itself in place in this mode.

    Truncation is delegated to `first_k`.
    """
    result = df.copy()
    if 'moves' not in result.columns:
        return result

    if not only_n:
        # Cumulative mode: one prefix column per length, shortest first.
        longest = max(1, n)
        for k in range(1, longest + 1):
            prefix_column = f"{add_all_prefix}{k}"
            result[prefix_column] = result['moves'].apply(lambda san: first_k(san, k))
        return result

    target = new_column if new_column is not None else 'moves'
    result[target] = result['moves'].apply(lambda san: first_k(san, n))
    return result


def clean_chess_data(
        file_path: str,
        columns_to_drop: Optional[List[str]] = None,
        moves_n: Optional[int] = None,
        moves_only_n: bool = True,
        moves_new_column: Optional[str] = None,
        moves_add_all_prefix: str = 'moves_'
) -> Dict[str, pd.DataFrame]:
    """High-level pipeline that reproduces the notebook cleaning steps.

    Steps:
    1) load -> 2) filter rated -> 3) remove duplicate ids ->
    4) remove duplicate games by (created_at, white_id, black_id) ->
    5) add `time` (seconds) and drop raw timestamps ->
    6) create two variants: (duration != 0 AND != 10000.0) and (no `time`) ->
    7) optionally derive first-n moves (single or cumulative) ->
    8) drop requested columns in both variants.

    Parameters:
    - file_path: Path of the CSV file to load.
    - columns_to_drop: Columns removed in step 8. None selects the default
      list (`id`, `white_id`, `black_id`, `opening_name`, `moves`); an explicit
      list, including an empty one, is applied as given.
    - moves_n: Number of leading moves to derive. Step 7 is skipped when this
      is None or not positive.
    - moves_only_n, moves_new_column, moves_add_all_prefix: Forwarded to
      `keep_first_n_moves` as `only_n`, `new_column` and `add_all_prefix`.

    Returns a dict with keys: 'duration', 'noduration'.

    When step 7 runs in single-column mode, the column that receives the
    truncated sequence (`moves` unless `moves_new_column` names another one) is
    excluded from the *default* drop list. Otherwise the derived result would
    be discarded immediately and `moves_n` would have no effect.
    """
    df = load_raw_dataset(file_path)
    for step in (
            filter_to_rated,
            remove_duplicate_ids,
            remove_duplicate_games,
            add_game_duration_seconds,
    ):
        df = step(df)

    dataset_duration, dataset_noduration = split_by_duration_variants(df)
    variants = {
        'duration': dataset_duration,
        'noduration': dataset_noduration,
    }

    # Optionally derive first-n moves on both variants before dropping columns
    derive_moves = moves_n is not None and moves_n > 0
    if derive_moves:
        variants = {
            name: keep_first_n_moves(
                frame,
                n=moves_n,
                only_n=moves_only_n,
                new_column=moves_new_column,
                add_all_prefix=moves_add_all_prefix,
            )
            for name, frame in variants.items()
        }

    if columns_to_drop is not None:
        cols = columns_to_drop
    else:
        # Default columns to drop based on the notebook
        cols = ['id', 'white_id', 'black_id', 'opening_name', 'moves']
        if derive_moves and moves_only_n:
            # Single-column mode wrote the truncated sequence into this column;
            # keep it so the derived feature survives the default drop.
            derived_column = 'moves' if moves_new_column is None else moves_new_column
            cols = [name for name in cols if name != derived_column]

    return {name: drop_columns(frame, cols) for name, frame in variants.items()}

In [5]:
# Load the dataset, adding cumulative opening-prefix columns moves_1..moves_3
df_3moves = clean_chess_data(
    "../res/games.csv",
    moves_n=3,
    moves_only_n=False
)

df_duration, df_noduration = df_3moves["duration"], df_3moves["noduration"]

# clean_chess_data keeps only rows with rated == True, so the column carries no
# information for the model and is removed from the training frame.
df_duration = df_duration.drop(columns=["rated"])

# Only the last expression of a cell is displayed, so the former mid-cell
# `df_duration.head()` produced nothing and has been removed.
df_duration.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8049 entries, 9291 to 20057
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   turns           8049 non-null   int64  
 1   victory_status  8049 non-null   object 
 2   winner          8049 non-null   object 
 3   increment_code  8049 non-null   object 
 4   white_rating    8049 non-null   int64  
 5   black_rating    8049 non-null   int64  
 6   opening_eco     8049 non-null   object 
 7   opening_ply     8049 non-null   int64  
 8   time            8049 non-null   float64
 9   moves_1         8049 non-null   object 
 10  moves_2         8049 non-null   object 
 11  moves_3         8049 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 817.5+ KB


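We need to encode the `object` columns as numeric features before training. (The listing below still includes `rated`, which has since been dropped from `df_duration`.)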
```
 0   rated           8049 non-null   bool   
 1   turns           8049 non-null   int64  
 2   victory_status  8049 non-null   object 
 3   winner          8049 non-null   object 
 4   increment_code  8049 non-null   object 
 5   white_rating    8049 non-null   int64  
 6   black_rating    8049 non-null   int64  
 7   opening_eco     8049 non-null   object 
 8   opening_ply     8049 non-null   int64  
 9   time            8049 non-null   float64
 10  moves_1         8049 non-null   object 
 11  moves_2         8049 non-null   object 
 12  moves_3         8049 non-null   object 
 ```


In [6]:
# Train the model
mlflow.set_experiment(experiment_id=experiment_id)

from sklearn import set_config
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Make transformers return DataFrames so the encoded features keep column names.
# `transform_output` was introduced in scikit-learn 1.2, so this cell already
# requires that release or newer.
set_config(transform_output="pandas")

# Ensure target exists
assert "winner" in df_duration.columns, f"winner not in: {df_duration.columns.tolist()}"

# Define features/target
X = df_duration.drop(columns=["winner"])
y = df_duration["winner"].copy()

# Split (70% train / 30% validation, fixed seed for repeatability)
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=42)

# Encode y: the encoder is fitted on the training labels only and then reused
# for the validation labels.
le = LabelEncoder()
train_y_enc = le.fit_transform(train_y)
valid_y_enc = le.transform(valid_y)

# Build preprocessing strictly from X (no 'winner' here)
cat_cols = X.select_dtypes(['object']).columns
num_cols = X.select_dtypes(exclude=['object']).columns

# Dense output is needed for pandas transform output. The previous fallback to
# the pre-1.2 `sparse=False` spelling could never run, because `set_config`
# above already fails on those versions, so `sparse_output` is used directly.
# Categories unseen during fit are encoded as all-zero rows.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('cat', ohe, cat_cols),
    ('num', 'passthrough', num_cols),
])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(objective='multi:softprob', num_class=len(le.classes_))),
])

# Fit
model.fit(train_x, train_y_enc)

# Optional: encoded features (pandas DataFrame), produced by every pipeline
# step except the final classifier.
X_train_enc = model[:-1].transform(train_x)
X_valid_enc = model[:-1].transform(valid_x)

# Predict (decoded back to original labels)
y_pred_enc = model.predict(valid_x)
y_pred = le.inverse_transform(y_pred_enc)

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy: share of validation games whose winner label was predicted exactly
validation_accuracy = accuracy_score(valid_y, y_pred)
print(validation_accuracy)

# Confusion Matrix: rows are the true labels, columns the predicted labels
validation_confusion = confusion_matrix(valid_y, y_pred)
print(validation_confusion)

0.7888198757763976
[[ 801    1  264]
 [   2  100    0]
 [ 242    1 1004]]
