In [16]:
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (None disables column truncation).
pd.set_option('display.max_columns', None)

### Funções

In [17]:
# Base Elo K-factor; add_pre_match_elo() scales it per match by event_importance() and sets_factor().
K_BASE = 24.0

def event_importance(level, rnd):
    """Return the K-factor multiplier for a match, given tournament level and round.

    Level codes follow the Sackmann dataset: G=Grand Slam, M=Masters,
    F=Tour Finals, A=ATP main draw (500/250), C=Challenger.
    Tour Finals semi-finals and finals weigh 1.60, earlier Tour Finals rounds 1.25.
    Any unrecognised level falls back to 1.00.
    """
    # Tour Finals is the only level whose weight depends on the round.
    if level == 'F':
        if rnd in {'SF', 'F'}:
            return 1.60
        return 1.25

    # Every other known level has a single flat weight.
    flat_weights = (('G', 1.50), ('M', 1.25), ('A', 1.10), ('C', 1.00))
    for code, weight in flat_weights:
        if level == code:
            return weight
    return 1.00

def sets_factor(best_of):
    """Return the K-factor multiplier for the match format.

    Best-of-five (or longer) matches weigh 1.35; everything else, including
    values that cannot be converted to an integer, weighs 1.00.
    """
    try:
        max_sets = int(best_of)
    except Exception:
        # Missing / malformed format information: treat as a regular match.
        return 1.00
    if max_sets >= 5:
        return 1.35
    return 1.00

def add_pre_match_elo(df_original, initial_rating=1500.0):
    """Return a chronologically sorted copy of ``df_original`` with Elo columns added.

    Matches are replayed in ``tourney_date`` / ``match_num`` order. On every row
    ``player1_id`` is treated as the winner and ``player2_id`` as the loser.
    The per-match K-factor is ``K_BASE * event_importance(...) * sets_factor(...)``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Needs the columns ``tourney_date``, ``match_num``, ``player1_id`` and
        ``player2_id``. ``tourney_level``, ``round`` and ``best_of`` are used
        when present (fallbacks: ``"A"``, ``""`` and ``3``).
    initial_rating : float, default 1500.0
        Rating given to a player the first time they appear.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified), sorted and re-indexed from 0,
        with the extra columns ``pre_elo_1``, ``pre_elo_2``, ``exp_1``,
        ``exp_2``, ``post_elo_1`` and ``post_elo_2``.
    """
    # Stable sort keeps the file order of rows that share date and match number.
    # tourney_date may be int yyyymmdd, a consistent date string or a datetime;
    # all of them sort chronologically.
    df = (
        df_original
        .sort_values(["tourney_date", "match_num"], kind="mergesort")
        .reset_index(drop=True)
    )

    # Always seed with a float so the rating columns never mix int and float.
    seed = float(initial_rating)
    ratings = {}  # player_id -> current Elo

    pre_1, pre_2, exp_1, exp_2, post_1, post_2 = [], [], [], [], [], []

    for r in df.itertuples(index=False):
        winner_id = r.player1_id
        loser_id = r.player2_id

        # Current rating, or the constant seed for a player seen for the first time.
        r_winner = ratings.get(winner_id, seed)
        r_loser = ratings.get(loser_id, seed)
        pre_1.append(r_winner)
        pre_2.append(r_loser)

        # Logistic expected scores; the two always sum to 1.
        e_winner = 1.0 / (1.0 + 10 ** ((r_loser - r_winner) / 400.0))
        e_loser = 1.0 - e_winner
        exp_1.append(e_winner)
        exp_2.append(e_loser)

        # Per-match K: base value scaled by event importance and match format.
        k = K_BASE * event_importance(getattr(r, "tourney_level", "A"), getattr(r, "round", ""))
        k *= sets_factor(getattr(r, "best_of", 3))

        # Actual score is 1 for the winner and 0 for the loser.
        r_winner_new = r_winner + k * (1.0 - e_winner)
        r_loser_new = r_loser + k * (0.0 - e_loser)

        ratings[winner_id] = r_winner_new
        ratings[loser_id] = r_loser_new
        post_1.append(r_winner_new)
        post_2.append(r_loser_new)

    df["pre_elo_1"] = pre_1
    df["pre_elo_2"] = pre_2
    df["exp_1"] = exp_1
    df["exp_2"] = exp_2
    df["post_elo_1"] = post_1
    df["post_elo_2"] = post_2
    return df

def get_players_elo_series(player_id, df):
    """Return the pre-match Elo history of one player.

    Parameters
    ----------
    player_id
        Identifier looked up in ``player1_id`` and ``player2_id``.
    df : pandas.DataFrame
        Needs the columns ``player1_id``, ``player2_id``, ``tourney_date``,
        ``pre_elo_1`` and ``pre_elo_2``.

    Returns
    -------
    pandas.DataFrame
        One row per match the player took part in, indexed by ``date``
        (taken from ``tourney_date``) with a single ``elo`` column holding the
        player's rating before that match. Rows are sorted by date; matches
        sharing a date keep their order from ``df``.
    """
    as_player1 = df['player1_id'] == player_id
    as_player2 = df['player2_id'] == player_id
    played = df.loc[as_player1 | as_player2]

    # Pick the Elo column that belongs to the requested player on each row.
    elo = np.where(
        played['player1_id'] == player_id,
        played['pre_elo_1'],
        played['pre_elo_2'],
    )

    history = pd.DataFrame({'date': played['tourney_date'].to_numpy(), 'elo': elo})
    # mergesort is stable, so same-date matches are not reordered.
    return history.set_index('date').sort_index(kind='mergesort')

def rolling_slope(seq, min_points=2):
    """
    Return the least-squares slope of ``seq`` regressed on its positions 0..n-1.

    Accepts a deque / list / ndarray, including one that has not yet reached
    its maximum length.
    Returns NaN when there are fewer than ``min_points`` values, and 0.0 when
    the positions have zero variance.
    """
    count = len(seq)
    if count < min_points:
        return np.nan

    values = np.asarray(seq, dtype=float)
    positions = np.arange(count, dtype=float)

    # slope = cov(x, y) / var(x), which is cheaper than calling polyfit.
    centered_pos = positions - positions.mean()
    centered_val = values - values.mean()
    spread = np.sum(centered_pos ** 2)
    if spread == 0.0:
        return 0.0
    return np.sum(centered_pos * centered_val) / spread

def safe_mean(seq):
    """Return the arithmetic mean of ``seq``, or NaN when ``seq`` is empty."""
    if len(seq) == 0:
        return np.nan
    return np.mean(seq)

def balance_matches(df: pd.DataFrame, seed: int = 42) -> pd.DataFrame:
    """Randomly swap player1/player2 on half of the rows and add a ``winner`` target.

    The input is expected to list the winner as ``player1_id`` on every row.
    For a random half of the rows (``len(df) // 2`` of them) the two player ids
    are exchanged and every numeric difference column is negated, so the
    resulting target is balanced.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``player1_id`` and ``player2_id``. Difference columns are detected
        by name: they end in ``_diff`` or contain ``differential``
        (case-insensitive). Non-numeric and boolean matches are left untouched.
    seed : int, default 42
        Seed of the random generator that picks the rows to swap.

    Returns
    -------
    pandas.DataFrame
        A modified copy with an extra ``winner`` column: 1 when player1 won
        (row untouched), 0 when player2 won (row swapped).
    """
    balanced = df.copy()
    n_rows = len(balanced)
    rng = np.random.default_rng(seed)

    # Boolean mask selecting the random half of the rows to swap.
    swap_mask = np.zeros(n_rows, dtype=bool)
    order = np.arange(n_rows)
    rng.shuffle(order)
    swap_mask[order[:n_rows // 2]] = True

    # Difference columns, restricted to numeric dtypes. is_numeric_dtype also
    # understands pandas extension dtypes, on which np.issubdtype would raise.
    diff_cols = [
        c for c in balanced.columns
        if (c.lower().endswith('_diff') or 'differential' in c.lower())
        and pd.api.types.is_numeric_dtype(balanced[c])
        and not pd.api.types.is_bool_dtype(balanced[c])
    ]

    # Exchange the two player ids on the selected rows.
    balanced.loc[swap_mask, ['player1_id', 'player2_id']] = (
        balanced.loc[swap_mask, ['player2_id', 'player1_id']].to_numpy()
    )

    # Flip the sign of each difference column on the selected rows. Working one
    # column at a time keeps each column's dtype and is a no-op when none exist.
    for col in diff_cols:
        balanced[col] = balanced[col].where(~swap_mask, -balanced[col])

    # Target: player1 wins = 1, player2 wins = 0.
    balanced['winner'] = np.where(swap_mask, 0, 1)

    return balanced


### Tratamento

In [48]:
# Load one CSV per season (1968 through 2024) and stack them into a single frame.
matches_list = [
    pd.read_csv(f'./data/atp_matches_{year}.csv')
    for year in range(1968, 2025)
]
all_matches = pd.concat(matches_list, ignore_index=True)

In [49]:
# Parse tourney_date using the %Y%m%d format (e.g. 20240115) into pandas datetimes.
all_matches['tourney_date'] = pd.to_datetime(all_matches['tourney_date'], format='%Y%m%d')

In [50]:
# Columns that must be present for a match to be usable: ids, rankings, age,
# height, serve statistics for both players, and the surface.
columns = [
    'winner_id', 'loser_id', 'winner_rank', 'loser_rank',
    'winner_age', 'loser_age', 'winner_ht', 'loser_ht',
    'w_ace', 'w_df', 'w_svpt','w_1stIn', 'w_1stWon', 'w_2ndWon','w_SvGms', 'w_bpSaved', 'w_bpFaced',
    'l_ace', 'l_df', 'l_svpt','l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
    'surface', 'loser_rank_points', 'winner_rank_points'
]
# Drop every match that has a missing value in any of the columns listed above.
all_matches = all_matches.dropna(subset=columns)

In [52]:
def _to_player_column(name):
    """Map a winner/loser column name to its player1/player2 equivalent.

    Only a leading ``winner_`` / ``w_`` (-> ``player1_``) or ``loser_`` / ``l_``
    (-> ``player2_``) is rewritten. Matching on the prefix, rather than replacing
    substrings, keeps names that merely contain ``l_`` or ``w_`` further along
    intact. Any other name is returned unchanged.
    """
    prefix_map = (
        ('winner_', 'player1_'),
        ('w_', 'player1_'),
        ('loser_', 'player2_'),
        ('l_', 'player2_'),
    )
    for old_prefix, new_prefix in prefix_map:
        if name.startswith(old_prefix):
            return new_prefix + name[len(old_prefix):]
    return name


# Winner becomes player1 and loser becomes player2 on every row.
all_matches.columns = [_to_player_column(c) for c in all_matches.columns]

# Columns to keep after the Elo computation: every match column plus the two pre-match Elo columns.
columns = all_matches.columns.to_list()
columns += ['pre_elo_1', 'pre_elo_2']

In [53]:
# Replay all matches in chronological order to attach Elo columns (returns a sorted, re-indexed copy).
eloed_matches = add_pre_match_elo(all_matches)

In [54]:
# Keep the renamed match columns plus pre_elo_1 / pre_elo_2; the other Elo columns are dropped.
eloed_matches = eloed_matches[columns]

In [55]:
from collections import defaultdict, deque

from tqdm import tqdm


# Running head-to-head win counts, keyed by (winner_id, loser_id):
#   h2h_dict[(a, b)]                  -> times a has beaten b so far
#   h2h_surface_dict[surface][(a, b)] -> same count, restricted to one surface
h2h_surface_dict = defaultdict(lambda: defaultdict(int))
h2h_dict = defaultdict(int)

# Per-match features: (wins - losses) of player1 against player2 before the match.
total_h2h_surface = []
total_h2h = []

for w_id, l_id, surface in tqdm(
        zip(eloed_matches['player1_id'],
            eloed_matches['player2_id'],
            eloed_matches['surface']),
        total=len(eloed_matches)):

    # Record the balance first so the current result never leaks into its own feature.
    wins = h2h_dict[(w_id, l_id)]
    losses = h2h_dict[(l_id, w_id)]

    wins_surface = h2h_surface_dict[surface][(w_id, l_id)]
    losses_surface = h2h_surface_dict[surface][(l_id, w_id)]

    total_h2h.append(wins - losses)
    total_h2h_surface.append(wins_surface - losses_surface)

    # player1 is the winner of this row: credit the win after the features are stored.
    h2h_dict[(w_id, l_id)] += 1
    h2h_surface_dict[surface][(w_id, l_id)] += 1


  0%|          | 0/95375 [00:00<?, ?it/s]

100%|██████████| 95375/95375 [00:00<00:00, 392640.68it/s]


In [56]:
# Base feature table: identifiers, match format, and player1-minus-player2 differences.
df_final = eloed_matches[['player1_id', 'player2_id', 'best_of', 'draw_size']].assign(
    atp_points_differential=eloed_matches['player1_rank_points'] - eloed_matches['player2_rank_points'],
    atp_rank_differential=eloed_matches['player1_rank'] - eloed_matches['player2_rank'],
    age_differential=eloed_matches['player1_age'] - eloed_matches['player2_age'],
    ht_differential=eloed_matches['player1_ht'] - eloed_matches['player2_ht'],
    elo_differential=eloed_matches['pre_elo_1'] - eloed_matches['pre_elo_2'],
    # Head-to-head lists are positional: one entry per row of eloed_matches.
    h2h_differential=total_h2h,
    h2h_surface_differential=total_h2h_surface,
)

In [57]:
# Preview the base feature table (rolling-window features are added in a later cell).
df_final

Unnamed: 0,player1_id,player2_id,best_of,draw_size,atp_points_differential,atp_rank_differential,age_differential,ht_differential,elo_differential,h2h_differential,h2h_surface_differential
0,101142,101746,3,32.0,1028.0,-69.0,5.0,5.0,0.000000,0,0
1,100923,100656,3,32.0,-2026.0,52.0,-3.2,-8.0,0.000000,0,0
2,101613,100587,3,32.0,257.0,-126.0,-10.3,-8.0,0.000000,0,0
3,101196,101511,3,32.0,-295.0,24.0,2.9,10.0,0.000000,0,0
4,101179,101601,3,32.0,352.0,-135.0,3.7,0.0,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...
95370,210530,209950,5,8.0,-1862.0,102.0,-1.5,-5.0,-232.034982,-1,-1
95371,209950,210150,5,8.0,1219.0,-28.0,1.3,-8.0,46.529254,0,0
95372,210530,210506,5,8.0,-752.0,81.0,-1.3,-13.0,-157.014800,0,0
95373,211663,209414,5,8.0,-62.0,17.0,-2.2,7.0,80.052474,0,0


In [58]:
# Window sizes (number of previous matches) used for the rolling features.
ROLLING_WINDOWS = [3, 5, 10, 20, 50, 100]

# Internal stat key -> fragment used in the output column name
# ("<fragment>_last<k>_differential"). Order defines the column order in df_final.
SERVE_STAT_COLUMNS = {
    "p_ace": "p_ace",
    "p_df": "p_df",
    "p_1stIn": "p_1st_in",
    "p_1stWon": "p_1st_won",
    "p_2ndWon": "p_2nd_won",
    "p_bpSaved": "p_bp_saved",
}


def _serve_percentages(row, prefix):
    """Return ``{stat_key: percentage}`` for one player of one match.

    ``prefix`` is ``"player1"`` or ``"player2"`` and selects which set of
    columns of ``row`` is read. A stat is omitted when its guard fails, so the
    caller simply does not extend that stat's history for this match.
    """
    svpt = getattr(row, f"{prefix}_svpt")
    first_in = getattr(row, f"{prefix}_1stIn")
    bp_faced = getattr(row, f"{prefix}_bpFaced")

    pct = {}
    # NOTE(review): the `svpt != first_in` guard only protects the second-serve
    # denominator, yet it also skips ace / double-fault / first-serve-in for
    # matches without any second serve. Kept as in the original; confirm intent.
    if (svpt != 0) and (svpt != first_in):
        pct["p_ace"] = 100 * (getattr(row, f"{prefix}_ace") / svpt)
        pct["p_df"] = 100 * (getattr(row, f"{prefix}_df") / svpt)
        pct["p_1stIn"] = 100 * (first_in / svpt)
        pct["p_2ndWon"] = 100 * (getattr(row, f"{prefix}_2ndWon") / (svpt - first_in))
    if first_in != 0:
        pct["p_1stWon"] = 100 * (getattr(row, f"{prefix}_1stWon") / first_in)
    if bp_faced != 0:
        pct["p_bpSaved"] = 100 * (getattr(row, f"{prefix}_bpSaved") / bp_faced)
    return pct


for k in ROLLING_WINDOWS:
    # Histories are rebuilt for every window size; each deque keeps the last k values.
    elo_hist = defaultdict(lambda: deque(maxlen=k))
    k_last_matches = defaultdict(lambda: defaultdict(lambda: deque(maxlen=k)))

    stat_diffs = {stat: [] for stat in SERVE_STAT_COLUMNS}
    elo_grad_k = []

    print(k, end=': ')
    for row in tqdm(eloed_matches.itertuples(index=False), total=len(eloed_matches)):
        player1_id, player2_id = row.player1_id, row.player2_id

        # --------- FEATURES (history strictly before this match) ---------
        hist1, hist2 = k_last_matches[player1_id], k_last_matches[player2_id]
        for stat, diffs in stat_diffs.items():
            diffs.append(safe_mean(hist1[stat]) - safe_mean(hist2[stat]))

        grad_1 = rolling_slope(elo_hist[player1_id])
        grad_2 = rolling_slope(elo_hist[player2_id])
        elo_grad_k.append(grad_1 - grad_2)

        # --------- UPDATE HISTORIES (after computing features!) ---------
        # Elo history stores the current *pre-match* rating of each player.
        elo_hist[player1_id].append(row.pre_elo_1)
        elo_hist[player2_id].append(row.pre_elo_2)

        for pid, prefix in ((player1_id, "player1"), (player2_id, "player2")):
            for stat, value in _serve_percentages(row, prefix).items():
                k_last_matches[pid][stat].append(value)

    # Final data aggregation: one differential column per stat, then the Elo gradient.
    for stat, fragment in SERVE_STAT_COLUMNS.items():
        df_final[fragment + "_last" + str(k) + "_differential"] = stat_diffs[stat]
    df_final[f"elo_gradient_{k}_differential"] = elo_grad_k



3: 

  0%|          | 0/95375 [00:00<?, ?it/s]

100%|██████████| 95375/95375 [00:10<00:00, 8813.33it/s]


5: 

100%|██████████| 95375/95375 [00:10<00:00, 8842.10it/s]


10: 

100%|██████████| 95375/95375 [00:11<00:00, 8631.04it/s]


20: 

100%|██████████| 95375/95375 [00:11<00:00, 8115.67it/s]


50: 

100%|██████████| 95375/95375 [00:13<00:00, 6819.04it/s]


100: 

100%|██████████| 95375/95375 [00:17<00:00, 5430.55it/s]


In [61]:
# Spot-check one randomly drawn row of the full feature table.
df_final.sample()

Unnamed: 0,player1_id,player2_id,best_of,draw_size,atp_points_differential,atp_rank_differential,age_differential,ht_differential,elo_differential,h2h_differential,h2h_surface_differential,p_ace_last3_differential,p_df_last3_differential,p_1st_in_last3_differential,p_1st_won_last3_differential,p_2nd_won_last3_differential,p_bp_saved_last3_differential,elo_gradient_3_differential,p_ace_last5_differential,p_df_last5_differential,p_1st_in_last5_differential,p_1st_won_last5_differential,p_2nd_won_last5_differential,p_bp_saved_last5_differential,elo_gradient_5_differential,p_ace_last10_differential,p_df_last10_differential,p_1st_in_last10_differential,p_1st_won_last10_differential,p_2nd_won_last10_differential,p_bp_saved_last10_differential,elo_gradient_10_differential,p_ace_last20_differential,p_df_last20_differential,p_1st_in_last20_differential,p_1st_won_last20_differential,p_2nd_won_last20_differential,p_bp_saved_last20_differential,elo_gradient_20_differential,p_ace_last50_differential,p_df_last50_differential,p_1st_in_last50_differential,p_1st_won_last50_differential,p_2nd_won_last50_differential,p_bp_saved_last50_differential,elo_gradient_50_differential,p_ace_last100_differential,p_df_last100_differential,p_1st_in_last100_differential,p_1st_won_last100_differential,p_2nd_won_last100_differential,p_bp_saved_last100_differential,elo_gradient_100_differential
5019,101686,101063,3,32.0,-478.0,48.0,-5.4,-2.0,-77.521851,-1,-1,-1.386579,2.90029,-1.794872,-5.155986,-11.850379,-7.154882,-10.043517,-1.579788,2.869545,-8.171534,-3.400434,-10.517635,-11.237374,-10.199921,-0.77898,2.024662,-6.809282,-1.374831,-4.279964,-3.399162,-5.101454,-0.67906,1.105309,-2.154989,-1.177687,-5.399679,-2.993352,2.590626,-0.171984,0.88768,-0.085475,-1.304686,-4.522258,-5.555362,1.089423,-0.173871,0.62359,-0.882124,-1.174967,-3.665279,-4.739163,0.0242


In [62]:
# Swap player1/player2 on a random half of the rows and add the binary `winner` target.
balanced_df = balance_matches(df_final)

In [None]:
# Persist the balanced feature table as CSV, without writing the row index.
balanced_df.to_csv('./data/atp_matches_features_balanced.csv', index=False)