In [1]:
import pandas as pd
# Load the match-level table that every feature column below is derived from.
# NOTE(review): no date parsing is requested, so 'Date' keeps whatever dtype
# read_csv infers. The `Date < current_date` filter used later assumes that
# dtype orders chronologically (e.g. ISO-formatted strings) — TODO confirm.
matches = pd.read_csv("../data/all_years_nc_tc_elo/matches.csv")

In [2]:
def _mean_of_side_means(win_matches, lose_matches, winner_column_name, loser_column_name):
    """Average a player's winner-side and loser-side stat means over the sides that have matches.

    Returns 0 when the player has no matches on either side. When only one
    side has matches, that side's mean is returned on its own, because the mean
    of an empty selection is NaN and would otherwise poison the average.
    """
    side_means = []
    if not win_matches.empty:
        side_means.append(win_matches[winner_column_name].mean())
    if not lose_matches.empty:
        side_means.append(lose_matches[loser_column_name].mean())

    if not side_means:
        return 0
    # NOTE(review): with both sides present this is the unweighted mean of the
    # two side means (not the pooled per-match mean); kept as-is so existing
    # feature values do not change.
    return sum(side_means) / len(side_means)


def calculate_player_stat_difference(row, df, winner_column_name, loser_column_name):
    """Compare the historical average of one stat between a match's winner and loser.

    Only rows of ``df`` whose 'Date' is strictly less than ``row['Date']`` are
    used, so the current match and any same-date matches are excluded.
    Assumes 'Date' values compare chronologically with ``<`` — TODO confirm
    against how the frame was loaded.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'winner_id', 'loser_id' and 'Date'.
    df : pandas.DataFrame
        Match history with 'winner_id', 'loser_id', 'Date' and the two stat columns.
    winner_column_name : str
        Column holding the stat for the match winner.
    loser_column_name : str
        Column holding the stat for the match loser.

    Returns
    -------
    tuple
        ``(common_opponent_difference, overall_difference)``, each computed as
        winner average minus loser average. The first element is ``None`` when
        the two players share no prior opponents. A player with no prior
        matches contributes an average of 0.
    """
    player_a = row['winner_id']
    player_b = row['loser_id']

    # Restrict to matches played before the current one.
    history = df[df['Date'] < row['Date']]

    # Every prior match of each player, split by the side they finished on.
    a_wins = history[history['winner_id'] == player_a]
    a_losses = history[history['loser_id'] == player_a]
    b_wins = history[history['winner_id'] == player_b]
    b_losses = history[history['loser_id'] == player_b]

    overall_a = _mean_of_side_means(a_wins, a_losses, winner_column_name, loser_column_name)
    overall_b = _mean_of_side_means(b_wins, b_losses, winner_column_name, loser_column_name)
    wrp_difference = overall_a - overall_b

    # Opponents come from the per-player frames already built above, which
    # avoids re-filtering the whole history.
    player_a_opponents = set(a_wins['loser_id']).union(a_losses['winner_id'])
    player_b_opponents = set(b_wins['loser_id']).union(b_losses['winner_id'])
    common_opponents = player_a_opponents.intersection(player_b_opponents)

    if not common_opponents:
        return None, wrp_difference

    # Keep only the matches played against opponents both players have faced.
    co_a_wins = a_wins[a_wins['loser_id'].isin(common_opponents)]
    co_a_losses = a_losses[a_losses['winner_id'].isin(common_opponents)]
    co_b_wins = b_wins[b_wins['loser_id'].isin(common_opponents)]
    co_b_losses = b_losses[b_losses['winner_id'].isin(common_opponents)]

    co_a = _mean_of_side_means(co_a_wins, co_a_losses, winner_column_name, loser_column_name)
    co_b = _mean_of_side_means(co_b_wins, co_b_losses, winner_column_name, loser_column_name)

    co_wrp_difference = co_a - co_b
    return co_wrp_difference, wrp_difference


In [3]:
# One entry per derived feature pair:
# (common-opponent output column, overall output column,
#  winner-side stat column, loser-side stat column).
STAT_DIFF_FEATURES = [
    ('CO_1st_serve_in_p_diff', '1st_serve_in_p_diff',
     'winner_1st_serve_in_pct', 'loser_1st_serve_in_pct'),
    ('CO_1st_serve_win_p_diff', '1st_serve_win_p_diff',
     'winner_1st_serve_win_pct', 'loser_1st_serve_win_pct'),
    ('CO_2nd_serve_win_p_diff', '2nd_serve_win_p_diff',
     'winner_2nd_serve_win_pct', 'loser_2nd_serve_win_pct'),
    ('CO_serve_games_win_p_diff', 'serve_games_win_p_diff',
     'winner_service_games_won_pct', 'loser_service_games_won_pct'),
    ('CO_avg_ace_per_match_diff', 'avg_ace_per_match_diff',
     'w_ace', 'l_ace'),
    ('CO_avg_df_per_match_diff', 'avg_df_per_match_diff',
     'w_df', 'l_df'),
    ('CO_1st_serve_wrp_diff', '1st_serve_wrp_diff',
     'winner_1st_serve_return_win_pct', 'loser_1st_serve_return_win_pct'),
]

# Apply the function to each row, once per stat. Every pass is a row-wise
# apply in which each row filters the whole frame again, so this cell is the
# expensive part of the notebook.
for co_column, overall_column, winner_stat, loser_stat in STAT_DIFF_FEATURES:
    matches[[co_column, overall_column]] = matches.apply(
        lambda row: pd.Series(
            calculate_player_stat_difference(row, matches, winner_stat, loser_stat)
        ),
        axis=1,
    )

In [5]:
# Show the player names next to every difference feature computed above.
matches[
    [
        'winner_name',
        'loser_name',
        'CO_1st_serve_in_p_diff',
        '1st_serve_in_p_diff',
        'CO_1st_serve_win_p_diff',
        '1st_serve_win_p_diff',
        'CO_2nd_serve_win_p_diff',
        '2nd_serve_win_p_diff',
        'CO_serve_games_win_p_diff',
        'serve_games_win_p_diff',
        'CO_avg_ace_per_match_diff',
        'avg_ace_per_match_diff',
        'CO_avg_df_per_match_diff',
        'avg_df_per_match_diff',
        'CO_1st_serve_wrp_diff',
        '1st_serve_wrp_diff',
    ]
]

Unnamed: 0,winner_name,loser_name,CO_1st_serve_in_p_diff,1st_serve_in_p_diff,CO_1st_serve_win_p_diff,1st_serve_win_p_diff,CO_2nd_serve_win_p_diff,2nd_serve_win_p_diff,CO_serve_games_win_p_diff,serve_games_win_p_diff,CO_avg_ace_per_match_diff,avg_ace_per_match_diff,CO_avg_df_per_match_diff,avg_df_per_match_diff,CO_1st_serve_wrp_diff,1st_serve_wrp_diff
0,Marcos Giron,Richard Gasquet,0.020819,0.017228,0.013445,0.012872,0.412234,0.056280,0.004182,0.021159,-0.509341,0.248231,-0.847622,-1.034559,-0.041533,-0.033253
1,Mackenzie Mcdonald,Daniel Elahi Galan,0.061716,0.020702,0.038482,0.015915,0.009754,-0.166301,0.073671,0.038542,1.966346,1.543370,0.560096,1.291288,-0.033029,-0.016278
2,Miomir Kecmanovic,Christopher Oconnell,-0.010899,-0.023359,-0.106047,-0.044856,0.005892,0.153931,-0.140633,-0.055942,-7.618056,-4.659332,-0.506944,0.015621,0.023443,0.024988
3,Yoshihito Nishioka,Holger Rune,0.030987,0.033271,-0.068443,-0.073122,-0.003036,-0.019666,-0.055229,-0.073900,-2.666532,-2.186826,-1.738923,-2.009373,-0.035566,-0.027436
4,Alexei Popyrin,Felix Auger Aliassime,-0.023659,-0.032583,0.005078,-0.000925,-0.007384,-0.009488,0.027285,0.014171,0.092428,-0.085016,-0.157466,-0.351460,-0.037688,-0.032529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,Novak Djokovic,Marin Cilic,0.093789,0.090000,-0.062689,-0.053809,-0.147401,-0.154277,-0.037705,-0.005515,-5.989177,-5.249510,-0.872294,-0.557097,0.031339,0.033625
14107,Roger Federer,Kei Nishikori,0.001433,0.014717,0.084541,0.092904,-0.167201,-0.023264,0.104516,0.103578,8.742647,7.093651,1.164706,0.411111,0.040079,0.023222
14108,Karen Khachanov,Dominic Thiem,0.070168,0.069711,-0.023562,0.027395,-0.001519,-0.080463,0.026620,0.057839,1.511957,2.262487,-1.981522,-1.471780,0.001248,-0.033747
14109,Novak Djokovic,Roger Federer,0.050819,0.052107,-0.068381,-0.074497,-0.076850,-0.016345,-0.048978,-0.048079,-5.197917,-5.093815,1.716667,0.243685,-0.011870,0.028102


In [6]:
# Write the frame, including the difference columns added above, to a separate
# output folder; index=False keeps the row index out of the file.
# NOTE(review): assumes the target directory already exists — confirm, since
# to_csv is not asked to create it here.
matches.to_csv("../data/all_years_nc_tc_elo_aggr_stats/matches.csv", index=False)