In [11]:
import pandas as pd
df = pd.read_csv('data/atp_matches_combined.csv')
df.columns

  df = pd.read_csv('data/atp_matches_combined.csv')


Index(['Unnamed: 0', 'tourney_id', 'tourney_name', 'surface', 'draw_size',
       'tourney_level', 'tourney_date', 'match_num', 'winner_id',
       'winner_seed', 'winner_entry', 'winner_name', 'winner_hand',
       'winner_ht', 'winner_ioc', 'winner_age', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'score', 'best_of', 'round', 'minutes', 'w_ace', 'w_df',
       'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved',
       'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
       'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank',
       'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

### Augmented Data

- rank diff: loser_rank - winner_rank
- age diff: loser_age - winner_age
- ft_diff: loser_ht - winner ht
- server_adavantage: (w_1stWon + w_2ndWon) - (l_1stWon + l_2ndWon)
- bp_effectiveness: (w_bpSaved / w_bpFaced) -> Break-point mental strenght
- total_points_played: w_svpt + l_svpt
- match_efficiency: minutes/total_points_played

-------------

- Elo rating
- Past results
- Recent form
- Tournament history
- Surface preference (win% on each surface)



Notes:
All numerical data will be normalized
Categorical ft will be encoded

In [12]:
# Augment data
import pandas as pd
import numpy as np


def clean_tennis_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Basic data cleanup for tennis match dataframe.
    - Converts date to datetime
    - Fills or drops key missing values
    - Casts data to appropriate types
    - Removes problematic infinite values
    """

    # Fill missing numerical values with median (or 0, depending on the feature)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isna().sum() > 0:
            df[col] = df[col].fillna(df[col].median())

    # Drop rows with missing essential identifiers
    df.dropna(subset=['winner_id', 'loser_id', 'winner_name', 'loser_name'], inplace=True)

    # Standardize categorical strings (strip and lowercase)
    for col in ['surface', 'tourney_level', 'winner_hand', 'loser_hand']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.lower()

    return df

# Clean dataset
df = clean_tennis_data(df)

# Rank difference
df['rank_diff'] = df['loser_rank'] - df['winner_rank']

# Age difference
df['age_diff'] = df['loser_age'] - df['winner_age']

# Height difference
df['ft_diff'] = df['loser_ht'] - df['winner_ht']







In [13]:
# Calculate Elo
import pandas as pd

def add_pre_match_elo(df: pd.DataFrame,
                      k: float = 32,
                      initial_rating: float = 1500,
                      date_col: str = 'tourney_date',
                      winner_col: str = 'winner_id',
                      loser_col: str = 'loser_id') -> pd.DataFrame:
    """
    Calculates and writes pre-match Elo ratings for both winner and loser.

    Parameters:
    - df: DataFrame containing at least date_col, winner_col, loser_col
    - k: Elo K-factor
    - initial_rating: starting Elo for unseen players
    - date_col: name of the match-date column
    - winner_col: name of the winner-ID column
    - loser_col: name of the loser-ID column

    Returns:
    - df with two new columns: 'winner_elo_pre', 'loser_elo_pre'
    """
    # Ensure chronological order
    df = df.sort_values(by=date_col).reset_index(drop=True)
    
    # Elo storage
    elo = {}
    
    # New: Track post-match Elo
    winner_post = []
    loser_post = []
    
    # Lists to collect pre-match ratings
    winner_pre = []
    loser_pre = []
    
    for _, row in df.iterrows():
        w = row[winner_col]
        l = row[loser_col]
        
        # Pull current ratings or start at initial
        r_w = elo.get(w, initial_rating)
        r_l = elo.get(l, initial_rating)
        
        # Record pre-match
        winner_pre.append(r_w)
        loser_pre.append(r_l)
        
        # Compute expected scores
        e_w = 1 / (1 + 10 ** ((r_l - r_w) / 400))
        e_l = 1 - e_w
        
        # Update ratings post-match
        elo[w] = r_w + k * (1 - e_w)
        elo[l] = r_l + k * (0 - e_l)
        
        # Record post-match ratings
        winner_post.append(elo[w])
        loser_post.append(elo[l])
    
    # Assign back to DataFrame
    df['winner_elo_pre'] = winner_pre
    df['loser_elo_pre']  = loser_pre


    # (Optional) Elo difference before match
    df['elo_diff_pre'] = df['winner_elo_pre'] - df['loser_elo_pre']


    return df


df = add_pre_match_elo(df)

In [14]:
# Convert to long format
# Convert to only rows for winners -> winners and losers to not have class imbalance


extra_cols = ["winner_id","loser_id","tourney_date"]

feature_cols = ['rank_diff','age_diff','ft_diff','elo_diff_pre','surface','winner_hand','loser_hand']

cat_features = ['surface','winner_hand','loser_hand']


def make_long_format(data, feats, extras):
    # Winner rows (label=1)
    win = data[['match_num'] + feats + extras].copy()
    win['player_role'] = 'winner'
    win['label'] = 1

    # Loser rows (label=0): for features that reference winner/loser, you may need to swap or recompute.
    # For simplicity, if your features are already _differences_ (winner minus loser),
    # you can just flip the sign for the long loser row:
    lose = win.copy()
    lose['player_role'] = 'loser'
    lose['label'] = 0

    # If you want per‐player features, you'd need to reconstruct their individual stats here.
    # But for a quick toy example treating diff‐features:
    for c in ['age_diff','ft_diff','elo_diff_pre']:
        lose[c] = -lose[c]

    # Combine
    long_df = pd.concat([win, lose], ignore_index=True)
    return long_df


long_df = make_long_format(df,feature_cols,extra_cols)



In [15]:
long_df.to_csv('long_df.csv',index=False)

In [19]:
# Build table with each player's latest elo ranking

import pandas as pd

# Assuming tet_df is your matches dataframe
# Convert tourney_date to datetime
test_df = df
test_df['tourney_date'] = pd.to_datetime(test_df['tourney_date'], format='%Y%m%d')

# Create a function to get latest Elo for each player
def get_latest_elos(df):
    # Create two dataframes for winners and losers
    winners = df[['winner_id', 'winner_elo_pre', 'tourney_date']].rename(
        columns={'winner_id': 'player_id', 'winner_elo_pre': 'elo'}
    )

    losers = df[['loser_id', 'loser_elo_pre', 'tourney_date']].rename(
        columns={'loser_id': 'player_id', 'loser_elo_pre': 'elo'}
    )

    # Combine and sort
    all_players = pd.concat([winners, losers])
    all_players = all_players.sort_values(['player_id', 'tourney_date'])

    # Keep last elo for each player
    latest_elos = all_players.groupby('player_id').last().reset_index()

    return latest_elos[['player_id', 'elo']]

# Get latest Elos
player_elos = get_latest_elos(test_df)



   player_id          elo
0     100001  1513.484392
1     100002  1530.643493
2     100003  1592.905960
3     100004  1507.728780
4     100005  1831.593713


In [23]:
player_elos.to_csv('latest_player_elos.csv',index=False)