In [13]:
import pandas as pd
df = pd.read_csv('data/atp_matches_combined.csv')
df

  df = pd.read_csv('data/atp_matches_combined.csv')


Unnamed: 0.1,Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,...,,,,,,,,,,
1,1,1968-2029,Dublin,Grass,32.0,A,19680708,271,126914,,...,,,,,,,,,,
2,2,1968-2029,Dublin,Grass,32.0,A,19680708,272,209523,,...,,,,,,,,,,
3,3,1968-2029,Dublin,Grass,32.0,A,19680708,273,100084,,...,,,,,,,,,,
4,4,1968-2029,Dublin,Grass,32.0,A,19680708,274,100132,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193332,193332,2024-M-DC-2024-WG2-PO-URU-MDA-01,Davis Cup WG2 PO: URU vs MDA,Clay,4.0,D,20240203,5,212051,,...,30.0,17.0,7.0,6.0,8.0,14.0,1109.0,8.0,740.0,34.0
193333,193333,2024-M-DC-2024-WG2-PO-VIE-RSA-01,Davis Cup WG2 PO: VIE vs RSA,Hard,4.0,D,20240202,1,122533,,...,41.0,25.0,6.0,9.0,1.0,4.0,554.0,67.0,748.0,32.0
193334,193334,2024-M-DC-2024-WG2-PO-VIE-RSA-01,Davis Cup WG2 PO: VIE vs RSA,Hard,4.0,D,20240202,2,144748,,...,51.0,25.0,7.0,11.0,5.0,12.0,416.0,109.0,,
193335,193335,2024-M-DC-2024-WG2-PO-VIE-RSA-01,Davis Cup WG2 PO: VIE vs RSA,Hard,4.0,D,20240202,4,122533,,...,51.0,32.0,17.0,14.0,5.0,9.0,554.0,67.0,416.0,109.0


### Augmented Data

- rank diff: loser_rank - winner_rank
- age diff: loser_age - winner_age
- ft_diff: loser_ht - winner ht
- server_adavantage: (w_1stWon + w_2ndWon) - (l_1stWon + l_2ndWon)
- bp_effectiveness: (w_bpSaved / w_bpFaced) -> Break-point mental strenght
- total_points_played: w_svpt + l_svpt
- match_efficiency: minutes/total_points_played

-------------

- Elo rating
- Past results
- Recent form
- Tournament history
- Surface preference (win% on each surface)



Notes:
All numerical data will be normalized
Categorical ft will be encoded

In [14]:
import pandas as pd
import numpy as np


def clean_tennis_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Basic data cleanup for tennis match dataframe.
    - Converts date to datetime
    - Fills or drops key missing values
    - Casts data to appropriate types
    - Removes problematic infinite values
    """

    # Fill missing numerical values with median (or 0, depending on the feature)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isna().sum() > 0:
            df[col] = df[col].fillna(df[col].median())

    # Drop rows with missing essential identifiers
    df.dropna(subset=['winner_id', 'loser_id', 'winner_name', 'loser_name'], inplace=True)

    # Standardize categorical strings (strip and lowercase)
    for col in ['surface', 'tourney_level', 'winner_hand', 'loser_hand']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.lower()

    return df

# Clean dataset
df = clean_tennis_data(df)

# Rank difference
df['rank_diff'] = df['loser_rank'] - df['winner_rank']

# Age difference
df['age_diff'] = df['loser_age'] - df['winner_age']

# Height difference
df['ft_diff'] = df['loser_ht'] - df['winner_ht']

# Server advantage
df['server_advantage'] = (df['w_1stWon'] + df['w_2ndWon']) - (df['l_1stWon'] + df['l_2ndWon'])

# Break point effectiveness (mental strength)
df['bp_effectiveness'] = df['w_bpSaved'] / df['w_bpFaced']
df['bp_effectiveness'] = df['bp_effectiveness'].replace([np.inf, -np.inf], np.nan).fillna(0)

# Total points played
df['total_points_played'] = df['w_svpt'] + df['l_svpt']

# Match efficiency: minutes per point
df['match_efficiency'] = df['minutes'] / df['total_points_played']
df['match_efficiency'] = df['match_efficiency'].replace([np.inf, -np.inf], np.nan).fillna(0)


# Display new feature columns
print(df[['rank_diff', 'age_diff', 'ft_diff', 'server_advantage', 'bp_effectiveness',
          'total_points_played', 'match_efficiency']].head())


   rank_diff  age_diff  ft_diff  server_advantage  bp_effectiveness  \
0       25.0      -1.3      0.0               5.0              0.75   
1       25.0       0.1      0.0               5.0              0.75   
2       25.0       0.1      0.0               5.0              0.75   
3       25.0       1.1      7.0               5.0              0.75   
4       25.0       4.1      0.0               5.0              0.75   

   total_points_played  match_efficiency  
0                149.0          0.651007  
1                149.0          0.651007  
2                149.0          0.651007  
3                149.0          0.651007  
4                149.0          0.651007  


### Calculate Elo for each player

In [16]:
import pandas as pd

def add_pre_match_elo(df: pd.DataFrame,
                      k: float = 32,
                      initial_rating: float = 1500,
                      date_col: str = 'tourney_date',
                      winner_col: str = 'winner_id',
                      loser_col: str = 'loser_id') -> pd.DataFrame:
    """
    Calculates and writes pre-match Elo ratings for both winner and loser.

    Parameters:
    - df: DataFrame containing at least date_col, winner_col, loser_col
    - k: Elo K-factor
    - initial_rating: starting Elo for unseen players
    - date_col: name of the match-date column
    - winner_col: name of the winner-ID column
    - loser_col: name of the loser-ID column

    Returns:
    - df with two new columns: 'winner_elo_pre', 'loser_elo_pre'
    """
    # Ensure chronological order
    df = df.sort_values(by=date_col).reset_index(drop=True)
    
    # Elo storage
    elo = {}
    
    # Lists to collect pre-match ratings
    winner_pre = []
    loser_pre = []
    
    for _, row in df.iterrows():
        w = row[winner_col]
        l = row[loser_col]
        
        # Pull current ratings or start at initial
        r_w = elo.get(w, initial_rating)
        r_l = elo.get(l, initial_rating)
        
        # Record pre-match
        winner_pre.append(r_w)
        loser_pre.append(r_l)
        
        # Compute expected scores
        e_w = 1 / (1 + 10 ** ((r_l - r_w) / 400))
        e_l = 1 - e_w
        
        # Update ratings post-match
        elo[w] = r_w + k * (1 - e_w)
        elo[l] = r_l + k * (0 - e_l)
    
    # Assign back to DataFrame
    df['winner_elo_pre'] = winner_pre
    df['loser_elo_pre']  = loser_pre
    
    # (Optional) Elo difference before match
    df['elo_diff_pre'] = df['winner_elo_pre'] - df['loser_elo_pre']
    
    return df


df = add_pre_match_elo(df)

### Implement Catboost

In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

class CatBoostMatchPredictor:
    """
    A class for training, evaluating, and saving a CatBoost model
    to predict match winners.
    """
    def __init__(self,
                 feature_cols: list,
                 target_col: str = 'winner',
                 cat_features: list = None,
                 params: dict = None,
                 test_size: float = 0.2,
                 random_state: int = 42):
        self.feature_cols = feature_cols
        self.target_col = target_col
        self.cat_features = cat_features
        self.test_size = test_size
        self.random_state = random_state
        
        # Default CatBoost parameters
        if params is None:
            params = {
                'iterations': 1000,
                'learning_rate': 0.05,
                'depth': 6,
                'eval_metric': 'AUC',
                'random_seed': random_state,
                'verbose': 100,
                'early_stopping_rounds': 50
            }
        self.params = params
        self.model = None
        self.metrics = None
        self.report = None

    def fit(self, df: pd.DataFrame):
        """
        Train the CatBoost model on the provided DataFrame.
        """
        X = df[self.feature_cols]
        y = df[self.target_col]

        # Split into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y
        )

        # Create CatBoost Pools
        train_pool = Pool(data=X_train, label=y_train, cat_features=self.cat_features)
        test_pool = Pool(data=X_test, label=y_test, cat_features=self.cat_features)

        # Initialize and train the model
        self.model = CatBoostClassifier(**self.params)
        self.model.fit(train_pool, eval_set=test_pool)

        # Predict and evaluate
        y_pred = self.model.predict(X_test)
        y_prob = self.model.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)
        report = classification_report(y_test, y_pred)

        self.metrics = {'accuracy': accuracy, 'auc': auc}
        self.report = report

        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test AUC: {auc:.4f}")
        print("Classification Report:\n", report)
        return self

    def save_model(self, filepath: str):
        """
        Save the trained model to a file.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet. Call fit() before saving.")
        self.model.save_model(filepath)
        print(f"Model saved to {filepath}")


In [20]:
def make_long_format(df):
    # Select the features you engineered, plus IDs
    feats = [
      'rank_diff','age_diff','ft_diff','server_advantage',
      'bp_effectiveness','total_points_played','match_efficiency',
      'elo_diff_pre','surface','round','winner_hand','loser_hand'
    ]

    # Winner rows (label=1)
    win = df[['match_num'] + feats].copy()
    win['player_role'] = 'winner'
    win['label'] = 1

    # Loser rows (label=0): for features that reference winner/loser, you may need to swap or recompute.
    # For simplicity, if your features are already _differences_ (winner minus loser),
    # you can just flip the sign for the long loser row:
    lose = win.copy()
    lose['player_role'] = 'loser'
    lose['label'] = 0

    # If you want per‐player features, you'd need to reconstruct their individual stats here.
    # But for a quick toy example treating diff‐features:
    for c in ['rank_diff','age_diff','ft_diff','server_advantage','elo_diff_pre']:
        lose[c] = -lose[c]

    # Combine
    long_df = pd.concat([win, lose], ignore_index=True)
    return long_df



In [None]:


long_df = make_long_format(df)

# Example usage on the long‐form data:
feature_cols = [
    'rank_diff','age_diff','ft_diff','server_advantage',
    'bp_effectiveness','total_points_played','match_efficiency',
    'elo_diff_pre','surface','round','winner_hand','loser_hand'
]
cat_features = ['surface','round','winner_hand','loser_hand']


# Create and train the predictor
predictor = CatBoostMatchPredictor(
    feature_cols=feature_cols,
    target_col='winner_flag',
    cat_features=cat_features
)

# Fit to the DataFrame
predictor.fit(df)

0:	test: 0.9733643	best: 0.9733643 (0)	total: 296ms	remaining: 4m 55s
100:	test: 0.9816621	best: 0.9816621 (100)	total: 17.9s	remaining: 2m 38s
200:	test: 0.9822112	best: 0.9822112 (200)	total: 36.5s	remaining: 2m 25s
300:	test: 0.9824237	best: 0.9824247 (299)	total: 55s	remaining: 2m 7s
400:	test: 0.9825544	best: 0.9825544 (400)	total: 1m 12s	remaining: 1m 48s
500:	test: 0.9826450	best: 0.9826456 (496)	total: 1m 32s	remaining: 1m 31s
600:	test: 0.9826789	best: 0.9826792 (599)	total: 1m 48s	remaining: 1m 11s
700:	test: 0.9827026	best: 0.9827045 (688)	total: 2m 3s	remaining: 52.5s
800:	test: 0.9827118	best: 0.9827145 (785)	total: 2m 19s	remaining: 34.5s
900:	test: 0.9827826	best: 0.9827826 (900)	total: 2m 34s	remaining: 17s
999:	test: 0.9827984	best: 0.9827984 (999)	total: 2m 49s	remaining: 0us

bestTest = 0.9827983879
bestIteration = 999

Test Accuracy: 0.9204
Test AUC: 0.9828
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.

### EXAMPLE:
Save the model for future use

In [None]:
# --- 1) Load your existing model ---
model = CatBoostClassifier()
model.load_model('tennis_match_predictor.cbm')

# --- 2) Prepare your new data ---
# Suppose df_new is a DataFrame of newly arrived matches already
# preprocessed and with the same feature_cols & target_col
feature_cols = [
    'rank_diff', 'age_diff', 'ft_diff', 'server_advantage',
    'bp_effectiveness', 'total_points_played', 'match_efficiency',
    'elo_diff_pre', 'surface', 'round', 'winner_hand', 'loser_hand'
]
cat_features = ['surface', 'round', 'winner_hand', 'loser_hand']
X_new = df_new[feature_cols]
y_new = df_new['winner_flag']

new_pool = Pool(data=X_new, label=y_new, cat_features=cat_features)

# --- 3) Continue training ---
# Here we grow 200 more trees, with a smaller learning rate
model.fit(
    new_pool,
    init_model='tennis_match_predictor.cbm',  # keeps old trees
    iterations=200,
    learning_rate=0.02,
    verbose=50
)

# --- 4) Save the updated model ---
model.save_model('tennis_match_predictor_updated.cbm')
print("Retrained model saved as tennis_match_predictor_updated.cbm")