In [1]:
import pandas as pd
matches = pd.read_csv("../data/all_years_nc_tc_elo/matches.csv")

In [2]:
def check_if_common_opponents_exist(row, df):
    player_a = row['winner_id']
    player_b = row['loser_id']
    current_date = row['Date']
    
    # Filter for matches that occurred before the current match date
    df = df[df['Date'] < current_date]
    
    # Find common opponents
    player_a_opponents = set(df[(df['winner_id'] == player_a)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_a)]['winner_id']))
    player_b_opponents = set(df[(df['winner_id'] == player_b)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_b)]['winner_id']))

    common_opponents = player_a_opponents.intersection(player_b_opponents)

    if not common_opponents:
        return 0
    else:
        return 1

In [3]:
matches[['CO_active']] = matches.apply(
    lambda row: pd.Series(check_if_common_opponents_exist(row, matches)), axis=1
)

In [4]:
import pandas as pd

def calculate_mean(matches, column_name):
    """Calculate mean excluding NaN, or return 0 if DataFrame is empty or only contains NaN."""
    if matches.empty:
        return 0
    column_mean = matches[column_name].dropna().mean()
    return column_mean if pd.notnull(column_mean) else 0

def calculate_wrp(player_win_matches, player_lose_matches, win_column, lose_column):
    """Calculate weighted return percentage (WRP) for a player."""
    win_mean = calculate_mean(player_win_matches, win_column)
    lose_mean = calculate_mean(player_lose_matches, lose_column)
    return (win_mean + lose_mean) / 2

def filter_matches_by_player(df, player_id):
    """Filter matches for a player as winner or loser."""
    win_matches = df[df['winner_id'] == player_id]
    lose_matches = df[df['loser_id'] == player_id]
    return win_matches, lose_matches

def calculate_common_opponent_stats(df, player_a, player_b, winner_column, loser_column):
    """Calculate WRP difference for common opponents."""
    # Find common opponents
    player_a_opponents = set(df[df['winner_id'] == player_a]['loser_id']).union(
        set(df[df['loser_id'] == player_a]['winner_id'])
    )
    player_b_opponents = set(df[df['winner_id'] == player_b]['loser_id']).union(
        set(df[df['loser_id'] == player_b]['winner_id'])
    )
    common_opponents = player_a_opponents.intersection(player_b_opponents)

    # Filter matches against common opponents
    co_player_a_win_matches = df[(df['winner_id'] == player_a) & (df['loser_id'].isin(common_opponents))]
    co_player_a_lose_matches = df[(df['loser_id'] == player_a) & (df['winner_id'].isin(common_opponents))]
    co_player_b_win_matches = df[(df['winner_id'] == player_b) & (df['loser_id'].isin(common_opponents))]
    co_player_b_lose_matches = df[(df['loser_id'] == player_b) & (df['winner_id'].isin(common_opponents))]

    # Calculate WRP for players against common opponents
    co_player_a_wrp = calculate_wrp(co_player_a_win_matches, co_player_a_lose_matches, winner_column, loser_column)
    co_player_b_wrp = calculate_wrp(co_player_b_win_matches, co_player_b_lose_matches, winner_column, loser_column)

    # Return the difference
    return co_player_a_wrp - co_player_b_wrp

def calculate_player_stat_difference(row, df, winner_column_name, loser_column_name):
    player_a = row['winner_id']
    player_b = row['loser_id']
    current_date = row['Date']

    # Filter matches before the current match date
    past_matches = df[df['Date'] < current_date]

    # Calculate WRP for Player A and Player B
    player_a_win_matches, player_a_lose_matches = filter_matches_by_player(past_matches, player_a)
    player_b_win_matches, player_b_lose_matches = filter_matches_by_player(past_matches, player_b)

    player_a_wrp = calculate_wrp(player_a_win_matches, player_a_lose_matches, winner_column_name, loser_column_name)
    player_b_wrp = calculate_wrp(player_b_win_matches, player_b_lose_matches, winner_column_name, loser_column_name)

    wrp_difference = player_a_wrp - player_b_wrp

    # Calculate common opponent WRP difference if active
    if row["CO_active"] == 1:
        co_wrp_difference = calculate_common_opponent_stats(
            past_matches, player_a, player_b, winner_column_name, loser_column_name
        )
        return co_wrp_difference, wrp_difference
    else:
        return wrp_difference, wrp_difference


In [5]:
# Apply the function to each row
matches[['CO_1st_serve_in_p_diff', '1st_serve_in_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_in_pct', 'loser_1st_serve_in_pct')), axis=1
)
matches[['CO_1st_serve_win_p_diff', '1st_serve_win_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_win_pct', 'loser_1st_serve_win_pct')), axis=1
)
matches[['CO_2nd_serve_win_p_diff', '2nd_serve_win_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_2nd_serve_win_pct', 'loser_2nd_serve_win_pct')), axis=1
)
matches[['CO_serve_games_win_p_diff', 'serve_games_win_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_service_games_won_pct', 'loser_service_games_won_pct')), axis=1
)
matches[['CO_avg_ace_per_match_diff', 'avg_ace_per_match_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_ace', 'l_ace')), axis=1
)
matches[['CO_avg_df_per_match_diff', 'avg_df_per_match_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_df', 'l_df')), axis=1
)
matches[['CO_1st_serve_wrp_diff', '1st_serve_wrp_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_return_win_pct', 'loser_1st_serve_return_win_pct')), axis=1
)

In [6]:
new_columns = matches[['winner_name', 'loser_name', 'CO_1st_serve_in_p_diff', '1st_serve_in_p_diff', 'CO_1st_serve_win_p_diff', '1st_serve_win_p_diff', 'CO_2nd_serve_win_p_diff', '2nd_serve_win_p_diff', 'CO_serve_games_win_p_diff', 'serve_games_win_p_diff', 'CO_avg_ace_per_match_diff', 'avg_ace_per_match_diff', 'CO_avg_df_per_match_diff', 'avg_df_per_match_diff', 'CO_1st_serve_wrp_diff', '1st_serve_wrp_diff']]
new_columns

Unnamed: 0,winner_name,loser_name,CO_1st_serve_in_p_diff,1st_serve_in_p_diff,CO_1st_serve_win_p_diff,1st_serve_win_p_diff,CO_2nd_serve_win_p_diff,2nd_serve_win_p_diff,CO_serve_games_win_p_diff,serve_games_win_p_diff,CO_avg_ace_per_match_diff,avg_ace_per_match_diff,CO_avg_df_per_match_diff,avg_df_per_match_diff,CO_1st_serve_wrp_diff,1st_serve_wrp_diff
0,Marcos Giron,Richard Gasquet,0.003666,0.003357,-0.005308,-0.002319,0.398151,0.044779,-0.013747,0.006206,-0.509341,0.248231,-0.847622,-1.034559,-0.018945,-0.015093
1,Mackenzie Mcdonald,Daniel Elahi Galan,0.025972,0.013268,-0.001941,0.006563,0.009754,-0.170104,0.026875,0.027006,1.966346,1.543370,0.560096,1.291288,0.002895,-0.009121
2,Miomir Kecmanovic,Christopher Oconnell,-0.010899,-0.023359,-0.106047,-0.044856,0.005892,0.153931,-0.140633,-0.055942,-7.618056,-4.659332,-0.506944,0.015621,0.023443,0.024988
3,Yoshihito Nishioka,Holger Rune,0.018062,0.030227,-0.083067,-0.077687,-0.003036,-0.016432,-0.072883,-0.080092,-2.666532,-2.186826,-1.738923,-2.009373,-0.022720,-0.025177
4,Alexei Popyrin,Felix Auger Aliassime,-0.028370,-0.034825,-0.000771,-0.003670,-0.007384,-0.009488,0.020650,0.011054,0.092428,-0.085016,-0.157466,-0.351460,-0.032933,-0.030256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,Novak Djokovic,Marin Cilic,0.068520,0.082432,-0.097994,-0.064537,-0.163816,-0.158663,-0.077037,-0.017370,-5.989177,-5.249510,-0.872294,-0.557097,0.060492,0.042248
14107,Roger Federer,Kei Nishikori,0.001433,0.021468,0.084541,0.101806,-0.167201,-0.022359,0.104516,0.113635,8.742647,7.093651,1.164706,0.411111,0.040079,0.016111
14108,Karen Khachanov,Dominic Thiem,0.070168,0.054033,-0.023562,0.007711,-0.001519,-0.094590,0.026620,0.038743,1.511957,2.262487,-1.981522,-1.471780,0.001248,-0.012370
14109,Novak Djokovic,Roger Federer,0.032738,0.045524,-0.092351,-0.083243,-0.083743,-0.017212,-0.076261,-0.057940,-5.197917,-5.093815,1.716667,0.243685,0.006929,0.035041


In [11]:
# new_columns = matches[['winner_name', 'loser_name', 'CO_1st_serve_in_p_diff', '1st_serve_in_p_diff', "CO_active"]]
# new_columns

Unnamed: 0,winner_name,loser_name,CO_1st_serve_in_p_diff,1st_serve_in_p_diff,CO_active
0,Marcos Giron,Richard Gasquet,0.003666,0.003357,1
1,Mackenzie Mcdonald,Daniel Elahi Galan,0.025972,0.013268,1
2,Miomir Kecmanovic,Christopher Oconnell,-0.010899,-0.023359,1
3,Yoshihito Nishioka,Holger Rune,0.018062,0.030227,1
4,Alexei Popyrin,Felix Auger Aliassime,-0.028370,-0.034825,1
...,...,...,...,...,...
14106,Novak Djokovic,Marin Cilic,0.068520,0.082432,1
14107,Roger Federer,Kei Nishikori,0.001433,0.021468,1
14108,Karen Khachanov,Dominic Thiem,0.070168,0.054033,1
14109,Novak Djokovic,Roger Federer,0.032738,0.045524,1


In [7]:
# Check for missing values in the DataFrame
missing_values = new_columns.isnull().sum()

# Filter columns with missing values
columns_with_nan = missing_values[missing_values > 0]

# Display the columns and their corresponding NaN counts
print("Columns with NaN values and their counts:")
print(columns_with_nan)

Columns with NaN values and their counts:
Series([], dtype: int64)


In [8]:
matches.to_csv('../data/all_years_nc_tc_elo_aggr_stats/matches.csv', index=False)