In [1]:
import pandas as pd
import numpy as np
matches = pd.read_csv("../data/all_years_nc_tc_elo/matches.csv")
matches = matches.sort_values(by='Date')

In [2]:
matches['Date'] = pd.to_datetime(matches['Date'], errors='coerce')

In [3]:
def number_of_common_opponent_matches(row, df):
    player_a = row['winner_id']
    player_b = row['loser_id']
    current_date = row['Date']
    
    # Filter for matches that occurred before the current match date
    df = df[df['Date'] < current_date]
    
    # Find common opponents
    player_a_opponents = set(df[(df['winner_id'] == player_a)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_a)]['winner_id']))
    player_b_opponents = set(df[(df['winner_id'] == player_b)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_b)]['winner_id']))

    common_opponents = player_a_opponents.intersection(player_b_opponents)
    return len(common_opponents)

In [4]:
matches[['num_CO_matches']] = matches.apply(
    lambda row: pd.Series(number_of_common_opponent_matches(row, matches)), axis=1
)

In [5]:
def calculate_weighted_mean(matches, column_name, time_column, decay_rate=0.3):
    if matches.empty:
        return None
    # Calculate weights using exponential decay
    matches['weight'] = np.exp(-decay_rate * matches[time_column])
    
    # Apply weights and calculate the weighted mean
    weighted_sum = (matches[column_name] * matches['weight']).sum()
    total_weight = matches['weight'].sum()
    weighted_mean = weighted_sum / total_weight if total_weight > 0 else 0
    return weighted_mean if pd.notnull(weighted_mean) else 0

def calculate_stat_avg(player_win_matches, player_lose_matches, win_column, lose_column):
    win_mean = calculate_weighted_mean(player_win_matches, win_column, 'time_since_match')
    lose_mean = calculate_weighted_mean(player_lose_matches, lose_column, 'time_since_match')
    if win_mean and lose_mean:
        return (win_mean + lose_mean) / 2
    elif win_mean:
        return win_mean
    elif lose_mean:
        return lose_mean
    else:
        return 0

def filter_matches_by_player(df, player_id):
    win_matches = df[df['winner_id'] == player_id]
    lose_matches = df[df['loser_id'] == player_id]
    return win_matches, lose_matches

def calculate_common_opponent_stats(df, player_a, player_b, winner_column, loser_column):
    # Find common opponents
    player_a_opponents = set(df[df['winner_id'] == player_a]['loser_id']).union(
        set(df[df['loser_id'] == player_a]['winner_id'])
    )
    player_b_opponents = set(df[df['winner_id'] == player_b]['loser_id']).union(
        set(df[df['loser_id'] == player_b]['winner_id'])
    )
    common_opponents = player_a_opponents.intersection(player_b_opponents)

    # Filter matches against common opponents
    co_player_a_win_matches = df[(df['winner_id'] == player_a) & (df['loser_id'].isin(common_opponents))]
    co_player_a_lose_matches = df[(df['loser_id'] == player_a) & (df['winner_id'].isin(common_opponents))]
    co_player_b_win_matches = df[(df['winner_id'] == player_b) & (df['loser_id'].isin(common_opponents))]
    co_player_b_lose_matches = df[(df['loser_id'] == player_b) & (df['winner_id'].isin(common_opponents))]

    # Calculate stat avg for players against common opponents
    co_player_a_stat_avg = calculate_stat_avg(co_player_a_win_matches, co_player_a_lose_matches, winner_column, loser_column)
    co_player_b_stat_avg = calculate_stat_avg(co_player_b_win_matches, co_player_b_lose_matches, winner_column, loser_column)
    
    return co_player_a_stat_avg, co_player_b_stat_avg

def calculate_player_stat_difference(row, df, winner_column_name, loser_column_name):
    winner = row['winner_id']
    loser = row['loser_id']
    current_date = row['Date']
    # Filter matches before the current match date
    past_matches = df[df['Date'] < current_date].copy()
    past_matches['time_since_match'] = (current_date - past_matches['Date']).dt.days / 365.0  # Convert to years
    
    # Calculate stat average for Player A and Player B
    winner_won_matches, player_a_lose_matches = filter_matches_by_player(past_matches, winner)
    loser_won_matches, player_b_lose_matches = filter_matches_by_player(past_matches, loser)

    winner_stat_avg = calculate_stat_avg(winner_won_matches, player_a_lose_matches, winner_column_name, loser_column_name)
    loser_stat_avg = calculate_stat_avg(loser_won_matches, player_b_lose_matches, winner_column_name, loser_column_name)

    #stat_avg_difference = player_a_stat_avg - player_b_stat_avg

    # Calculate common opponent stat averages if there were common opponents 
    if row["num_CO_matches"] > 0:
        winner_co_stat_avg, loser_co_stat_avg = calculate_common_opponent_stats(
            past_matches, winner, loser, winner_column_name, loser_column_name
        )
        return winner_co_stat_avg, loser_co_stat_avg, winner_stat_avg, loser_stat_avg
    else:
        return winner_stat_avg, loser_stat_avg, winner_stat_avg, loser_stat_avg


In [6]:
# Apply the function to each row
matches[['winner_CO_1st_serve_in_pct_avg', 'loser_CO_1st_serve_in_pct_avg', 'winner_1st_serve_in_pct_avg', 'loser_1st_serve_in_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_in_pct', 'loser_1st_serve_in_pct')), axis=1
)
matches[['winner_CO_1st_serve_win_pct_avg', 'loser_CO_1st_serve_win_pct_avg', 'winner_1st_serve_win_pct_avg', 'loser_1st_serve_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_win_pct', 'loser_1st_serve_win_pct')), axis=1
)
matches[['winner_CO_2nd_serve_win_pct_avg', 'loser_CO_2nd_serve_win_pct_avg', 'winner_2nd_serve_win_pct_avg', 'loser_2nd_serve_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_2nd_serve_win_pct', 'loser_2nd_serve_win_pct')), axis=1
)
matches[['winner_CO_serve_games_win_pct_avg', 'loser_CO_serve_games_win_pct_avg', 'winner_serve_games_win_pct_avg', 'loser_serve_games_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_service_games_won_pct', 'loser_service_games_won_pct')), axis=1
)
matches[['winner_CO_ace_avg', 'loser_CO_ace_avg', 'winner_ace_avg', 'loser_ace_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_ace', 'l_ace')), axis=1
)
matches[['winner_CO_df_avg', 'loser_CO_df_avg', 'winner_df_avg', 'loser_df_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_df', 'l_df')), axis=1
)
matches[['winner_CO_1st_serve_return_win_pct_avg', 'loser_CO_1st_serve_return_win_pct_avg', 'winner_1st_serve_return_win_pct_avg', 'loser_1st_serve_return_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_return_win_pct', 'loser_1st_serve_return_win_pct')), axis=1
)

In [7]:
new_columns = matches[['winner_name', 'loser_name', 'winner_1st_serve_in_pct', 'winner_CO_1st_serve_in_pct_avg', 'loser_CO_1st_serve_in_pct_avg', 'winner_1st_serve_in_pct_avg', 'loser_1st_serve_in_pct_avg', 
                       'winner_CO_1st_serve_win_pct_avg', 'loser_CO_1st_serve_win_pct_avg', 'winner_1st_serve_win_pct_avg', 'loser_1st_serve_win_pct_avg', 'winner_CO_2nd_serve_win_pct_avg', 'loser_CO_2nd_serve_win_pct_avg', 'winner_2nd_serve_win_pct_avg', 'loser_2nd_serve_win_pct_avg', 'winner_CO_serve_games_win_pct_avg', 'loser_CO_serve_games_win_pct_avg', 'winner_serve_games_win_pct_avg', 'loser_serve_games_win_pct_avg', 'winner_CO_ace_avg', 'loser_CO_ace_avg', 'winner_ace_avg', 'loser_ace_avg', 'winner_CO_df_avg', 'loser_CO_df_avg', 'winner_df_avg', 'loser_df_avg', 'winner_CO_1st_serve_return_win_pct_avg', 'loser_CO_1st_serve_return_win_pct_avg', 'winner_1st_serve_return_win_pct_avg', 'loser_1st_serve_return_win_pct_avg'
                       ]]
new_columns

Unnamed: 0,winner_name,loser_name,winner_1st_serve_in_pct,winner_CO_1st_serve_in_pct_avg,loser_CO_1st_serve_in_pct_avg,winner_1st_serve_in_pct_avg,loser_1st_serve_in_pct_avg,winner_CO_1st_serve_win_pct_avg,loser_CO_1st_serve_win_pct_avg,winner_1st_serve_win_pct_avg,...,winner_ace_avg,loser_ace_avg,winner_CO_df_avg,loser_CO_df_avg,winner_df_avg,loser_df_avg,winner_CO_1st_serve_return_win_pct_avg,loser_CO_1st_serve_return_win_pct_avg,winner_1st_serve_return_win_pct_avg,loser_1st_serve_return_win_pct_avg
0,Alexandr Dolgopolov,Diego Schwartzman,0.579710,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Alex De Minaur,Steve Johnson,0.560606,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
18,Nicolas Jarry,Pablo Andujar,0.600000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
17,Matthew Ebden,Frances Tiafoe,0.666667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
16,Ryan Harrison,Leonardo Mayer,0.597561,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14105,Ugo Humbert,Fabio Fognini,0.631579,0.603621,0.591150,0.608363,0.590666,0.706352,0.663999,0.718999,...,6.822136,3.385170,2.911432,4.204313,2.759831,4.246955,0.279622,0.293068,0.275725,0.304463
14106,Adrian Mannarino,Pavel Kotov,0.732143,0.597421,0.661397,0.601048,0.653978,0.700810,0.689725,0.706350,...,4.858574,3.395839,2.015781,3.274593,2.407648,2.603368,0.265323,0.290758,0.281909,0.281421
14108,Jack Draper,Jan Lennard Struff,0.552239,0.577831,0.573607,0.594940,0.565892,0.754065,0.770241,0.758816,...,7.704442,9.863169,2.904989,4.077242,2.898127,3.534084,0.272075,0.268626,0.288351,0.272784
14109,Ugo Humbert,Alexander Shevchenko,0.644444,0.614535,0.593610,0.608381,0.587859,0.711709,0.668346,0.720022,...,6.829337,4.255086,2.320190,2.419416,2.737763,2.509165,0.293957,0.247295,0.277352,0.276034


In [8]:
# Check for missing values in new columns
missing_values = new_columns.isnull().sum()

columns_with_nan = missing_values[missing_values > 0]

print("Columns with NaN values and their counts:")
print(columns_with_nan)

Columns with NaN values and their counts:
winner_1st_serve_in_pct    100
dtype: int64


In [9]:
matches.to_csv('../data/all_years_nc_tc_elo_aggr_stats/matches.csv', index=False)