### Catboost Class

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from catboost import CatBoostClassifier, Pool

class CatBoostMatchPredictor:
    """
    A class for training, evaluating, and saving a CatBoost model
    to predict match winners.
    """
    def __init__(self,
                 input_cols: list,
                 target_col: str,
                 cat_feats: list = None,
                 parameters: dict = None,
                 test_size: float = 0.2,
                 random_state: int = 42):
        self.feature_cols = input_cols
        self.target_col = target_col
        self.cat_features = cat_feats
        self.test_size = test_size
        self.random_state = random_state
    
        
        # Default CatBoost parameters
        if parameters is None:
            parameters = {
                'iterations': 1000,
                'learning_rate': 0.05,
                'depth': 6,
                'eval_metric': 'AUC',
                'random_seed': random_state,
                'verbose': 100,
                'early_stopping_rounds': 50
            }
        self.params = parameters
        self.model = None
        self.metrics = None
        self.report = None

    def fit(self, df: pd.DataFrame):
        """
        Train the CatBoost model on the provided DataFrame.
        """
        X = df[self.feature_cols].copy()  # Create a copy to avoid modifying the original DataFrame
        y = df[self.target_col].copy()
        
        if self.cat_features:
            for col in self.cat_features:
                X[col] = X[col].astype(str)  # enforce string type
        
        
        x_train, x_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y
        )
        
        
        train_pool = Pool(data=x_train, label=y_train, cat_features=self.cat_features)
        test_pool = Pool(data=x_test, label=y_test, cat_features=self.cat_features)

        # Initialize and train the model
        self.model = CatBoostClassifier(**self.params)
        self.model.fit(train_pool, eval_set=test_pool)

        # Predict and evaluate
        y_pred = self.model.predict(x_test)
        y_prob = self.model.predict_proba(x_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)
        report = classification_report(y_test, y_pred)

        self.metrics = {'accuracy': accuracy, 'auc': auc}
        self.report = report

        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test AUC: {auc:.4f}")
        print("Classification Report:\n", report)
        return self

    def evaluate(self, df: pd.DataFrame, verbose: bool = True) -> dict:
        """
        Evaluate model performance on the given dataset.
        
        Args:
            df: DataFrame containing both features and target
            verbose: Whether to print metrics
            
        Returns:
            Dictionary containing accuracy, AUC, and classification report
        """
        if self.model is None:
            raise ValueError("Model not trained yet. Call fit() first.")

        X = df[self.feature_cols].copy()
        y = df[self.target_col].copy()

        # Apply same preprocessing as in fit()
        if self.cat_features:
            for col in self.cat_features:
                X[col] = X[col].fillna('missing').astype(str)

        # Create evaluation pool
        eval_pool = Pool(
            data=X,
            label=y,
            cat_features=self.cat_features
        )

        # Generate predictions
        y_pred = self.model.predict(eval_pool)
        y_prob = self.model.predict_proba(eval_pool)[:, 1]

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(y, y_pred),
            'auc': roc_auc_score(y, y_prob),
            'report': classification_report(y, y_pred)
        }

        if verbose:
            print(f"Evaluation Accuracy: {metrics['accuracy']:.4f}")
            print(f"Evaluation AUC: {metrics['auc']:.4f}")
            print("Classification Report:\n", metrics['report'])

        return metrics
    
    def save_model(self, filepath: str):
        """
        Save the trained model to a file.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet. Call fit() before saving.")
        self.model.save_model(filepath)
        print(f"Model saved to {filepath}")


### Train model

In [21]:
import pandas as pd
import numpy as np

long_df = pd.read_csv('long_df.csv')


feature_cols = ['rank_diff','age_diff','ft_diff','elo_diff_pre','surface','winner_hand','loser_hand']

cat_features = ['surface','winner_hand','loser_hand']

#Train model
params = {
    'task_type': 'GPU',  # Enable GPU acceleration
    'devices': '0:1',  # Use first GPU (optional, specify which GPU(s) to use)
    'bootstrap_type': 'Poisson',
    'depth': 7, 
    'learning_rate': np.float64(0.13206014408119843), 
    'l2_leaf_reg': np.float64(1.961928922715895), 
    'bagging_temperature': np.float64(0.5692672525377181), 
    'subsample': np.float64(0.7335742829177031), 
    'min_data_in_leaf': 90, 
    'random_strength': np.float64(1.2000000000000002), 
    'iterations': 1500, 
    'eval_metric': 'AUC', 
    'early_stopping_rounds': 50, 
    'random_seed': 42
 }


model = CatBoostMatchPredictor(
    input_cols=feature_cols,
    target_col='label',
    cat_feats=cat_features,
    parameters=params
)

model.fit(long_df)


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8904256	best: 0.8904256 (0)	total: 37.9ms	remaining: 56.9s
1:	total: 72.1ms	remaining: 54s
2:	total: 106ms	remaining: 52.8s
3:	total: 140ms	remaining: 52.5s
4:	total: 186ms	remaining: 55.5s
5:	test: 0.8968807	best: 0.8968807 (5)	total: 269ms	remaining: 1m 6s
6:	total: 311ms	remaining: 1m 6s
7:	total: 347ms	remaining: 1m 4s
8:	total: 380ms	remaining: 1m 2s
9:	total: 414ms	remaining: 1m 1s
10:	test: 0.8998763	best: 0.8998763 (10)	total: 450ms	remaining: 1m
11:	total: 490ms	remaining: 1m
12:	total: 525ms	remaining: 1m
13:	total: 559ms	remaining: 59.3s
14:	total: 592ms	remaining: 58.6s
15:	test: 0.9006478	best: 0.9006478 (15)	total: 627ms	remaining: 58.1s
16:	total: 662ms	remaining: 57.8s
17:	total: 695ms	remaining: 57.3s
18:	total: 731ms	remaining: 57s
19:	total: 768ms	remaining: 56.8s
20:	test: 0.9014336	best: 0.9014336 (20)	total: 800ms	remaining: 56.3s
21:	total: 833ms	remaining: 56s
22:	total: 866ms	remaining: 55.6s
23:	total: 906ms	remaining: 55.7s
24:	total: 941ms	remaini

<__main__.CatBoostMatchPredictor at 0x2b404e791e0>

### Evaluate Model

In [22]:
# Evaluate Model

# After training (predictor.fit(long_df)), evaluate on full dataset
full_dataset_metrics = model.evaluate(long_df)

# Get metrics programmatically
print(f"Overall AUC: {full_dataset_metrics['auc']:.3f}")

Evaluation Accuracy: 0.8184
Evaluation AUC: 0.9113
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82    193337
           1       0.82      0.82      0.82    193337

    accuracy                           0.82    386674
   macro avg       0.82      0.82      0.82    386674
weighted avg       0.82      0.82      0.82    386674

Overall AUC: 0.911


In [None]:
# Hyperparameter tuning

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool

long_df = pd.read_csv("long_df.csv")
# Prepare data (reuse your existing preprocessing)

X = long_df[feature_cols].copy()
y = long_df['label'].copy()

# Handle categorical NaNs (same as in your fit method)
for col in cat_features:
    X[col] = X[col].fillna('missing').astype(str)

# Split into train/validation (keep test set untouched)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y
)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# Define search space
space = {
    'depth': hp.quniform('depth', 4, 8, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1), np.log(50)),
    'bootstrap_type': hp.choice('bootstrap_type', ['Bayesian', 'Bernoulli']),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.01, 1.0),
    'subsample': hp.pchoice('subsample_choice', [
        (0.5, None),  # 50% chance to ignore subsample
        (0.5, hp.uniform('subsample', 0.5, 1.0))
    ]),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 10, 100, 5),
    'random_strength': hp.quniform('random_strength', 1, 3, 0.1),
    'iterations': hp.quniform('iterations', 800, 2000, 100),
}

def objective(params):
    # Convert parameters
    params = {
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'l2_leaf_reg': params['l2_leaf_reg'],
        'bootstrap_type': params['bootstrap_type'],
        'bagging_temperature': (params['bagging_temperature'] 
                               if params['bootstrap_type'] == 'Bayesian' 
                               else None),
        'subsample': (params['subsample'] 
                     if params['bootstrap_type'] == 'Bernoulli' 
                     else None),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'random_strength': params['random_strength'],
        'iterations': int(params['iterations']),
        'eval_metric': 'AUC',
        'early_stopping_rounds': 50,
        'random_seed': 42,
        'verbose': False,
        'task_type': 'GPU'  # Remove if not using GPU
    }
    
    # Clean up None values
    params = {k: v for k, v in params.items() if v is not None}
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=val_pool)
    
    val_pred = model.predict_proba(val_pool)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    
    return {'loss': -auc, 'status': STATUS_OK, 'model': model}
# Run optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Increase for better results
    trials=trials,
    rstate=np.random.default_rng(42)
)

# Get best parameters
best_params = {
    'depth': int(best['depth']),
    'learning_rate': best['learning_rate'],
    'l2_leaf_reg': best['l2_leaf_reg'],
    'bagging_temperature': best['bagging_temperature'],
    'subsample': best['subsample'],
    'min_data_in_leaf': int(best['min_data_in_leaf']),
    'random_strength': best['random_strength'],
    'iterations': int(best['iterations']),
    'eval_metric': 'AUC',
    'early_stopping_rounds': 50,
    'random_seed': 42
}

print("Best parameters:")
print(best_params)

# Retrain with best params on full training data
final_model = CatBoostClassifier(**best_params)
final_model.fit(Pool(X, y, cat_features=cat_features))

# Evaluate on test set (make sure you have held-out test data)
# test_metrics = predictor.evaluate(test_df)

In [23]:
model.save_model("trained_model_test.cbm")

Model saved to trained_model_test.cbm


### USAGE

In [30]:
latest_elo.head()

Unnamed: 0,player_id,elo
0,100001,1513.484392
1,100002,1530.643493
2,100003,1592.90596
3,100004,1507.72878
4,100005,1831.593713


In [37]:
import pandas as pd

def predict_winner_probability(
    configs : dict,
    atp_players: pd.DataFrame,
    player_elos: pd.DataFrame,
    cat_model
) -> pd.DataFrame:
    """
    Predicts the probability of `player1` winning against `player2` using the CatBoost model.
    
    Args:
        :param cat_model:
        :param latest_elos:
        :param atp_players:
        :param configs:
    Returns:
        Probability of player1 winning (between 0 and 1)

    """
   # Function to get player ID from first and last name
    def get_player_id(first_name, last_name, df):
        player_row = df[(df['name_first'] == first_name) & (df['name_last'] == last_name)]
        
        if player_row.empty:
            raise ValueError(f"Player '{first_name} {last_name}' not found.")
        return player_row.iloc[0]
    
    try:        
        # Retrieve player IDs
        p1 = get_player_id(configs['fname_p1'].strip(), configs['lname_p1'].strip(), atp_players)
        p2 = get_player_id(configs['fname_p2'].strip(), configs['lname_p2'].strip(), atp_players)
    except ValueError as e:
        print(e)
        return pd.DataFrame()
    
    # Calculate age diff
    today = pd.Timestamp('today')
    age_p1 = today.year - pd.to_datetime(p1['dob']).year
    age_p2 = today.year - pd.to_datetime(p2['dob']).year
    age_diff = age_p1 - age_p2
    
     # Height difference
    ht_diff = p1['height'] - p2['height']

    # Get latest Elo ratings 
    def get_player_elo(player_id, elo_df=player_elos):
        return elo_df[elo_df['player_id'] == player_id]['elo'].values[0]

    p1_elo = get_player_elo(p1['player_id'], player_elos)
    p2_elo = get_player_elo(p2['player_id'], player_elos)
    elo_diff = p1_elo - p2_elo

    # Create feature vector (assumes player1 is hypothetical winner)
    features = pd.DataFrame([{
        'rank_diff': configs['rank_diff'],
        'age_diff': age_diff,
        'ft_diff': ht_diff,
        'elo_diff_pre': elo_diff,
        'surface': configs['surface'].lower(),
        'winner_hand': p1['hand'].lower(),
        'loser_hand': p2['hand'].lower()

    }])
    
     # Create a Pool with categorical features
    prediction_pool = Pool(
        data=features,
        cat_features=['surface', 'winner_hand', 'loser_hand']
    )
    
    prob = cat_model.predict_proba(prediction_pool)
    return prob

# Replace 'your_file.csv' with the path to your CSV file
player_info = pd.read_csv('data\\tennis_atp\\atp_players.csv')
latest_elo = pd.read_csv('latest_player_elos.csv')


configs={

    "fname_p1" : "Frances ",
    "lname_p1" : "Tiafoe",
    "fname_p2" : "Bu",
    "lname_p2" : "Yunchaokete ",

    "rank_diff" : 85-12,
    "surface": "clay"
}



# Predict probability
prob = predict_winner_probability(
    configs,
    atp_players=player_info,
    player_elos=latest_elo,
    cat_model= model.model
)

print(f"Probability of {configs['fname_p1']} {configs['lname_p1']} winning: {prob[0][1]}")
print(f"Probability of {configs['fname_p2']} {configs['lname_p2']} winning: {prob[0][0]}")


Probability of Frances  Tiafoe winning: 0.9997344324203963
Probability of Bu Yunchaokete  winning: 0.00026556757960372046


In [12]:
long_df.columns

Index(['match_num', 'rank_diff', 'age_diff', 'ft_diff', 'elo_diff_pre',
       'surface', 'winner_hand', 'loser_hand', 'elo_diff_post', 'winner_id',
       'loser_id', 'elo_diff_pre.1', 'elo_diff_post.1', 'player_role',
       'label'],
      dtype='object')