### CatBoost Class

In [6]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report


class CatBoostMatchPredictor:
    """
    Train, evaluate, and save a CatBoost classifier that predicts match winners.

    The same feature preprocessing is applied in ``fit()`` and ``evaluate()``:
    every categorical column has missing values replaced by
    ``MISSING_CATEGORY`` and is then cast to ``str``.

    Attributes set by ``fit()``:
        model: the trained ``CatBoostClassifier`` (``None`` until fitted).
        metrics: ``{'accuracy': ..., 'auc': ...}`` on the held-out split.
        report: text classification report on the held-out split.
    """

    # Category assigned to missing categorical values in both fit() and
    # evaluate(), so that training and scoring see the same encoding.
    MISSING_CATEGORY = 'missing'

    def __init__(self,
                 feature_cols: list,
                 target_col: str,
                 cat_features: list = None,
                 params: dict = None,
                 test_size: float = 0.2,
                 random_state: int = 42):
        """
        Args:
            feature_cols: Names of the columns used as model inputs.
            target_col: Name of the label column.
            cat_features: Subset of ``feature_cols`` treated as categorical.
            params: Keyword arguments for ``CatBoostClassifier``; a default
                configuration is used when ``None``.
            test_size: Fraction of rows held out by ``fit()``.
            random_state: Seed for the train/test split (and for the default
                CatBoost parameters).
        """
        self.feature_cols = feature_cols
        self.target_col = target_col
        self.cat_features = cat_features
        self.test_size = test_size
        self.random_state = random_state

        # Default CatBoost parameters
        if params is None:
            params = {
                'iterations': 1000,
                'learning_rate': 0.05,
                'depth': 6,
                'eval_metric': 'AUC',
                'random_seed': random_state,
                'verbose': 100,
                'early_stopping_rounds': 50
            }
        self.params = params
        self.model = None
        self.metrics = None
        self.report = None

    def _prepare_xy(self, df: pd.DataFrame):
        """
        Return ``(X, y)`` copies of the feature and target columns of ``df``.

        Categorical columns get missing values replaced by
        ``MISSING_CATEGORY`` and are cast to ``str``. ``df`` is not modified.
        """
        X = df[self.feature_cols].copy()
        y = df[self.target_col].copy()

        for col in self.cat_features or []:
            # Go through object dtype so the placeholder can be inserted
            # regardless of the column's original dtype.
            values = X[col].astype(object)
            X[col] = values.where(values.notna(), self.MISSING_CATEGORY).astype(str)

        return X, y

    def _score(self, pool, y_true) -> dict:
        """Compute accuracy, AUC and the classification report for ``pool``."""
        y_pred = self.model.predict(pool)
        # Column 1 holds the probability of the second class.
        y_prob = self.model.predict_proba(pool)[:, 1]
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'auc': roc_auc_score(y_true, y_prob),
            'report': classification_report(y_true, y_pred)
        }

    def fit(self, df: pd.DataFrame):
        """
        Train the CatBoost model on the provided DataFrame.

        The data is split (stratified on the target) into train and held-out
        parts. The held-out part is passed to CatBoost as ``eval_set`` and is
        also used for the printed metrics, so those metrics come from the
        same rows that drive any early stopping configured in ``params``.

        Args:
            df: DataFrame containing ``feature_cols`` and ``target_col``.

        Returns:
            ``self``, to allow chaining.
        """
        X, y = self._prepare_xy(df)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y
        )

        train_pool = Pool(data=X_train, label=y_train, cat_features=self.cat_features)
        test_pool = Pool(data=X_test, label=y_test, cat_features=self.cat_features)

        # Only publish the model once training has succeeded, so a failed fit
        # never leaves an untrained model behind for evaluate()/save_model().
        model = CatBoostClassifier(**self.params)
        model.fit(train_pool, eval_set=test_pool)
        self.model = model

        scores = self._score(test_pool, y_test)
        self.metrics = {'accuracy': scores['accuracy'], 'auc': scores['auc']}
        self.report = scores['report']

        print(f"Test Accuracy: {scores['accuracy']:.4f}")
        print(f"Test AUC: {scores['auc']:.4f}")
        print("Classification Report:\n", scores['report'])
        return self

    def evaluate(self, df: pd.DataFrame, verbose: bool = True) -> dict:
        """
        Evaluate model performance on the given dataset.

        Args:
            df: DataFrame containing both features and target
            verbose: Whether to print metrics

        Returns:
            Dictionary with keys 'accuracy', 'auc' and 'report'

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model not trained yet. Call fit() first.")

        # Same preprocessing as in fit()
        X, y = self._prepare_xy(df)
        eval_pool = Pool(data=X, label=y, cat_features=self.cat_features)

        metrics = self._score(eval_pool, y)

        if verbose:
            print(f"Evaluation Accuracy: {metrics['accuracy']:.4f}")
            print(f"Evaluation AUC: {metrics['auc']:.4f}")
            print("Classification Report:\n", metrics['report'])

        return metrics

    def save_model(self, filepath: str):
        """
        Save the trained model to a file.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet. Call fit() before saving.")
        self.model.save_model(filepath)
        print(f"Model saved to {filepath}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
import numpy as np

# Columns fed to the model, and the subset CatBoost treats as categorical.
feature_cols = ['age_diff', 'ft_diff', 'elo_diff_pre', 'surface', 'winner_hand', 'loser_hand']
cat_features = ['surface', 'winner_hand', 'loser_hand']

# Identifier columns kept for reference; they are not used as features here.
extra_cols = ["winner_id", "loser_id"]

# CatBoost settings for this training run.
params = dict(
    depth=7,
    learning_rate=np.float64(0.13206014408119843),
    l2_leaf_reg=np.float64(1.961928922715895),
    bagging_temperature=np.float64(0.5692672525377181),
    subsample=np.float64(0.7335742829177031),
    min_data_in_leaf=90,
    random_strength=np.float64(1.2000000000000002),
    iterations=1500,
    eval_metric='AUC',
    early_stopping_rounds=50,
    random_seed=42,
)

long_df = pd.read_csv('long_df.csv')

# Build the predictor and train it; fit() performs its own train/test split.
predictor = CatBoostMatchPredictor(
    feature_cols=feature_cols,
    target_col='label',
    cat_features=cat_features,
    params=params,
)

predictor.fit(long_df)


0:	test: 0.7319889	best: 0.7319889 (0)	total: 155ms	remaining: 3m 52s
1:	test: 0.7366022	best: 0.7366022 (1)	total: 302ms	remaining: 3m 46s
2:	test: 0.7394661	best: 0.7394661 (2)	total: 461ms	remaining: 3m 50s
3:	test: 0.7401917	best: 0.7401917 (3)	total: 617ms	remaining: 3m 50s
4:	test: 0.7408289	best: 0.7408289 (4)	total: 765ms	remaining: 3m 48s
5:	test: 0.7410310	best: 0.7410310 (5)	total: 924ms	remaining: 3m 50s
6:	test: 0.7419648	best: 0.7419648 (6)	total: 1.09s	remaining: 3m 52s
7:	test: 0.7424372	best: 0.7424372 (7)	total: 1.25s	remaining: 3m 53s
8:	test: 0.7427428	best: 0.7427428 (8)	total: 1.4s	remaining: 3m 51s
9:	test: 0.7430960	best: 0.7430960 (9)	total: 1.56s	remaining: 3m 52s
10:	test: 0.7433358	best: 0.7433358 (10)	total: 1.7s	remaining: 3m 50s
11:	test: 0.7433980	best: 0.7433980 (11)	total: 1.85s	remaining: 3m 49s
12:	test: 0.7437079	best: 0.7437079 (12)	total: 2s	remaining: 3m 48s
13:	test: 0.7445074	best: 0.7445074 (13)	total: 2.15s	remaining: 3m 48s
14:	test: 0.74458

In [None]:
# Hyperparameter tuning

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool

# Column configuration shared with the training cell.
feature_cols = ['age_diff', 'ft_diff', 'elo_diff_pre', 'surface', 'winner_hand', 'loser_hand']
cat_features = ['surface', 'winner_hand', 'loser_hand']
extra_cols = ["winner_id", "loser_id"]

# Hyperopt search space. The 'subsample' entry is either None (ignored) or a
# uniform draw, each with probability 0.5.
subsample_options = [
    (0.5, None),
    (0.5, hp.uniform('subsample', 0.5, 1.0)),
]
space = dict(
    depth=hp.quniform('depth', 4, 8, 1),
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    l2_leaf_reg=hp.loguniform('l2_leaf_reg', np.log(1), np.log(50)),
    bootstrap_type=hp.choice('bootstrap_type', ['Bayesian', 'Bernoulli']),
    bagging_temperature=hp.uniform('bagging_temperature', 0.01, 1.0),
    subsample=hp.pchoice('subsample_choice', subsample_options),
    min_data_in_leaf=hp.quniform('min_data_in_leaf', 10, 100, 5),
    random_strength=hp.quniform('random_strength', 1, 3, 0.1),
    iterations=hp.quniform('iterations', 800, 2000, 100),
)

long_df = pd.read_csv("long_df.csv")

# Features with categorical NaNs replaced by 'missing' and cast to str.
X = long_df[feature_cols].assign(
    **{name: long_df[name].fillna('missing').astype(str) for name in cat_features}
)
y = long_df['label'].copy()

# Stratified 80/20 train/validation split. Note: no separate test set is
# carved out in this cell; the validation part drives the tuning objective.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

def objective(params):
    """
    Hyperopt objective: train one CatBoost model and score it on validation.

    Args:
        params: One sample drawn from the hyperopt search space.

    Returns:
        Dict with 'loss' (negative validation AUC, since hyperopt minimizes),
        'status' and the fitted 'model'.
    """
    sampler = params['bootstrap_type']

    settings = {
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'l2_leaf_reg': params['l2_leaf_reg'],
        'bootstrap_type': sampler,
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'random_strength': params['random_strength'],
        'iterations': int(params['iterations']),
        'eval_metric': 'AUC',
        'early_stopping_rounds': 50,
        'random_seed': 42,
        'verbose': False,
        'task_type': 'GPU'  # Remove if not using GPU
    }

    # Each sampling option is only forwarded for the bootstrap type it is
    # paired with in this search.
    if sampler == 'Bayesian':
        settings['bagging_temperature'] = params['bagging_temperature']
    if sampler == 'Bernoulli':
        settings['subsample'] = params['subsample']

    # Drop anything left unset (e.g. the None branch of 'subsample').
    for unset_key in [key for key, value in settings.items() if value is None]:
        del settings[unset_key]

    model = CatBoostClassifier(**settings)
    model.fit(train_pool, eval_set=val_pool)

    positive_scores = model.predict_proba(val_pool)[:, 1]
    auc = roc_auc_score(y_val, positive_scores)

    return {'loss': -auc, 'status': STATUS_OK, 'model': model}
# Run optimization
from hyperopt import space_eval

trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Increase for better results
    trials=trials,
    rstate=np.random.default_rng(42)
)

# fmin() returns raw label values: hp.choice / hp.pchoice entries come back as
# option indices, and labels that live inside an unselected branch (such as
# 'subsample') are absent. space_eval() maps the result back onto the search
# space so every key holds the value that was actually sampled.
best_config = space_eval(space, best)
best_bootstrap = best_config['bootstrap_type']

# Get best parameters (mirrors the configuration built in objective())
best_params = {
    'depth': int(best_config['depth']),
    'learning_rate': float(best_config['learning_rate']),
    'l2_leaf_reg': float(best_config['l2_leaf_reg']),
    'bootstrap_type': best_bootstrap,
    'min_data_in_leaf': int(best_config['min_data_in_leaf']),
    'random_strength': float(best_config['random_strength']),
    'iterations': int(best_config['iterations']),
    'eval_metric': 'AUC',
    'early_stopping_rounds': 50,
    'random_seed': 42
}

# Only forward the sampling option that belongs to the chosen bootstrap type,
# exactly as objective() did during the search.
if best_bootstrap == 'Bayesian':
    best_params['bagging_temperature'] = float(best_config['bagging_temperature'])
elif best_config['subsample'] is not None:
    best_params['subsample'] = float(best_config['subsample'])

print("Best parameters:")
print(best_params)

# Retrain with best params on the full dataset (train + validation rows).
# NOTE(review): no eval_set is passed here, so 'early_stopping_rounds' is not
# expected to stop training early - confirm this is the intended behaviour.
# NOTE(review): objective() trained with task_type='GPU'; this final fit uses
# the CatBoost default device - confirm whether the two should match.
# TODO: evaluate on a held-out test set once one exists, e.g. with
# predictor.evaluate(test_df).
final_model = CatBoostClassifier(**best_params)
final_model.fit(Pool(X, y, cat_features=cat_features))

712:	total: 1m 59s	remaining: 2m 11s
713:	total: 1m 59s	remaining: 2m 11s
714:	total: 1m 59s	remaining: 2m 11s
715:	total: 2m	remaining: 2m 11s
716:	total: 2m	remaining: 2m 11s
717:	total: 2m	remaining: 2m 11s
718:	total: 2m	remaining: 2m 10s
719:	total: 2m	remaining: 2m 10s
720:	total: 2m	remaining: 2m 10s
721:	total: 2m 1s	remaining: 2m 10s
722:	total: 2m 1s	remaining: 2m 10s
723:	total: 2m 1s	remaining: 2m 10s
724:	total: 2m 1s	remaining: 2m 10s
725:	total: 2m 1s	remaining: 2m 9s
726:	total: 2m 1s	remaining: 2m 9s
727:	total: 2m 2s	remaining: 2m 9s
728:	total: 2m 2s	remaining: 2m 9s
729:	total: 2m 2s	remaining: 2m 9s
730:	total: 2m 2s	remaining: 2m 9s
731:	total: 2m 2s	remaining: 2m 8s
732:	total: 2m 2s	remaining: 2m 8s
733:	total: 2m 3s	remaining: 2m 8s
734:	total: 2m 3s	remaining: 2m 8s
735:	total: 2m 3s	remaining: 2m 8s
736:	total: 2m 3s	remaining: 2m 7s
737:	total: 2m 3s	remaining: 2m 7s
738:	total: 2m 3s	remaining: 2m 7s
739:	total: 2m 4s	remaining: 2m 7s
740:	total: 2m 4s	rema

<catboost.core.CatBoostClassifier at 0x214fe1a19f0>

In [None]:
# Persist the model held by `predictor` to disk. save_model() raises
# ValueError if fit() has not been run yet.
predictor.save_model("trained_model.cbm")

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# After training (predictor.fit(long_df)), evaluate on full dataset.
# NOTE(review): long_df is the same frame fit() split for training, so these
# metrics include rows the model was trained on and will look optimistic.
full_dataset_metrics = predictor.evaluate(long_df)

# Evaluate on specific tournament (example).
# NOTE(review): no filter is applied yet - wimbledon_data is the whole
# long_df, so this repeats the evaluation above. Replace with a real subset.
wimbledon_data = long_df
wimbledon_metrics = predictor.evaluate(wimbledon_data)

# Get metrics programmatically (evaluate() returns a dict with
# 'accuracy', 'auc' and 'report' keys).
print(f"Overall AUC: {full_dataset_metrics['auc']:.3f}")

In [None]:
import pandas as pd

def _find_player(first_name: str, last_name: str, players: pd.DataFrame) -> pd.Series:
    """Return the first row of ``players`` whose first and last name match exactly."""
    hits = players[(players['name_first'] == first_name) & (players['name_last'] == last_name)]
    if hits.empty:
        raise ValueError(f"Player '{first_name} {last_name}' not found.")
    return hits.iloc[0]


def _latest_elo(player_id, matches: pd.DataFrame, default: float = 1500) -> float:
    """Return the player's post-match Elo from the last row they appear in."""
    played = matches[(matches['winner_id'] == player_id) | (matches['loser_id'] == player_id)]
    if played.empty:
        return default  # Initial rating for players without match history
    # Assumes rows are in chronological order, so the last row is the most
    # recent match - TODO confirm against how atp_matches is built.
    last_match = played.iloc[-1]
    elo_column = 'winner_elo_post' if last_match['winner_id'] == player_id else 'loser_elo_post'
    return last_match[elo_column]


def _hand_category(value, missing: str = 'missing') -> str:
    """Lower-case a player's hand value, mapping missing values to ``missing``."""
    return missing if pd.isna(value) else str(value).lower()


def predict_winner_probability(
    fname_p1: str,
    lname_p1: str,
    fname_p2: str,
    lname_p2: str,
    surface: str,
    atp_players: pd.DataFrame,
    atp_matches: pd.DataFrame,
    model
) -> "np.ndarray":
    """
    Predict the outcome probabilities for player 1 facing player 2.

    Player 1 is placed in the "winner" slot of the feature vector and
    player 2 in the "loser" slot.

    Args:
        fname_p1: First name of player 1 (e.g., "Rafael")
        lname_p1: Last name of player 1 (e.g., "Nadal")
        fname_p2: First name of player 2 (e.g., "Novak")
        lname_p2: Last name of player 2 (e.g., "Djokovic")
        surface: Match surface; it is lower-cased before prediction
        atp_players: DataFrame with columns
            ['player_id', 'name_first', 'name_last', 'dob', 'hand', 'height']
        atp_matches: DataFrame with columns
            ['winner_id', 'loser_id', 'winner_elo_post', 'loser_elo_post']
        model: Trained CatBoost model exposing ``predict_proba``

    Returns:
        The array returned by ``model.predict_proba`` for the single feature
        row; callers read ``prob[0][1]`` as player 1's win probability.

    Raises:
        ValueError: If either player cannot be found in ``atp_players``, or
            if the model expects a feature this function cannot build.
    """
    # A failed lookup propagates as ValueError: there is no meaningful
    # probability to return for an unknown player.
    p1 = _find_player(fname_p1.strip(), lname_p1.strip(), atp_players)
    p2 = _find_player(fname_p2.strip(), lname_p2.strip(), atp_players)

    # Age difference in whole calendar years.
    # NOTE(review): assumes 'dob' is in a form pd.to_datetime parses as a
    # date; integer YYYYMMDD values would need an explicit format - confirm.
    today = pd.Timestamp('today')
    age_p1 = today.year - pd.to_datetime(p1['dob']).year
    age_p2 = today.year - pd.to_datetime(p2['dob']).year
    age_diff = age_p1 - age_p2

    # Height difference
    ht_diff = p1['height'] - p2['height']

    # Elo difference from each player's most recent recorded match
    elo_p1 = _latest_elo(p1['player_id'], atp_matches)
    elo_p2 = _latest_elo(p2['player_id'], atp_matches)
    elo_diff = elo_p1 - elo_p2

    # Every feature this function knows how to build (player 1 is the
    # hypothetical winner). Zero entries are placeholders.
    # NOTE(review): categorical values are lower-cased here - confirm the
    # training data uses lower-case categories too.
    candidate_features = {
        'rank_diff': 0,  # Placeholder (requires ranking data)
        'age_diff': age_diff,
        'ft_diff': ht_diff,
        'server_advantage': 0,  # Placeholder
        'bp_effectiveness': 0,  # Placeholder
        'total_points_played': 0,  # Placeholder
        'match_efficiency': 0,  # Placeholder
        'elo_diff_pre': elo_diff,
        'surface': surface.lower(),
        'winner_hand': _hand_category(p1['hand']),
        'loser_hand': _hand_category(p2['hand'])
    }

    # Feed the model exactly the columns it was trained on, in training
    # order; extra or reordered columns do not line up with its features.
    expected = getattr(model, 'feature_names_', None)
    if expected:
        unknown = [name for name in expected if name not in candidate_features]
        if unknown:
            raise ValueError(f"Model expects features that cannot be built: {unknown}")
        columns = list(expected)
    else:
        columns = list(candidate_features)

    features = pd.DataFrame([candidate_features], columns=columns)

    # Create a Pool with categorical features
    categorical = [name for name in ('surface', 'winner_hand', 'loser_hand') if name in columns]
    prediction_pool = Pool(data=features, cat_features=categorical)

    return model.predict_proba(prediction_pool)

# Player metadata table. predict_winner_probability() reads the columns
# player_id, name_first, name_last, dob, hand and height from it.
# Forward slashes keep the path usable on Windows and POSIX alike.
player_info = pd.read_csv('data/tennis_atp/atp_players.csv')

# Players to compare (names must match atp_players exactly).
fname_p1 = "Jannik"
lname_p1 = "Sinner"
fname_p2 = "Alexander"
lname_p2 = "Zverev"

# Predict probability
# NOTE(review): `df` is not defined in this cell; it must be a match table
# with winner_id, loser_id, winner_elo_post and loser_elo_post columns
# created earlier in the session - confirm which frame is intended.
prob = predict_winner_probability(
    fname_p1=fname_p1,
    lname_p1=lname_p1,
    fname_p2=fname_p2,
    lname_p2=lname_p2,
    surface="hard",
    atp_players=player_info,
    atp_matches=df,
    model=predictor.model
)

# Column 1 is read as "player 1 wins" and column 0 as "player 2 wins".
# NOTE(review): this presumes label 1 marks the row's "winner" slot - verify
# against how the 'label' column of the training data was built.
print(f"Probability of {fname_p1} {lname_p1} winning: {prob[0][1]}")
print(f"Probability of {fname_p2} {lname_p2} winning: {prob[0][0]}")


### Evaluate Model