In [1]:
import pandas as pd
matches = pd.read_csv("../data/all_years_nc_tc_elo/matches.csv")
matches = matches.sort_values(by='Date')

In [2]:
def check_if_common_opponents_exist(row, df):
    player_a = row['winner_id']
    player_b = row['loser_id']
    current_date = row['Date']
    
    # Filter for matches that occurred before the current match date
    df = df[df['Date'] < current_date]
    
    # Find common opponents
    player_a_opponents = set(df[(df['winner_id'] == player_a)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_a)]['winner_id']))
    player_b_opponents = set(df[(df['winner_id'] == player_b)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_b)]['winner_id']))

    common_opponents = player_a_opponents.intersection(player_b_opponents)

    if not common_opponents:
        return 0
    else:
        return 1

In [3]:
matches[['CO_active']] = matches.apply(
    lambda row: pd.Series(check_if_common_opponents_exist(row, matches)), axis=1
)

In [4]:
import pandas as pd

def calculate_mean(matches, column_name):
    """Calculate mean excluding NaN, or return 0 if DataFrame is empty or only contains NaN."""
    if matches.empty:
        return 0
    column_mean = matches[column_name].dropna().mean()
    return column_mean if pd.notnull(column_mean) else 0

def calculate_wrp(player_win_matches, player_lose_matches, win_column, lose_column):
    """Calculate weighted return percentage (WRP) for a player."""
    win_mean = calculate_mean(player_win_matches, win_column)
    lose_mean = calculate_mean(player_lose_matches, lose_column)
    return (win_mean + lose_mean) / 2

def filter_matches_by_player(df, player_id):
    """Filter matches for a player as winner or loser."""
    win_matches = df[df['winner_id'] == player_id]
    lose_matches = df[df['loser_id'] == player_id]
    return win_matches, lose_matches

def calculate_common_opponent_stats(df, player_a, player_b, winner_column, loser_column):
    """Calculate WRP difference for common opponents."""
    # Find common opponents
    player_a_opponents = set(df[df['winner_id'] == player_a]['loser_id']).union(
        set(df[df['loser_id'] == player_a]['winner_id'])
    )
    player_b_opponents = set(df[df['winner_id'] == player_b]['loser_id']).union(
        set(df[df['loser_id'] == player_b]['winner_id'])
    )
    common_opponents = player_a_opponents.intersection(player_b_opponents)

    # Filter matches against common opponents
    co_player_a_win_matches = df[(df['winner_id'] == player_a) & (df['loser_id'].isin(common_opponents))]
    co_player_a_lose_matches = df[(df['loser_id'] == player_a) & (df['winner_id'].isin(common_opponents))]
    co_player_b_win_matches = df[(df['winner_id'] == player_b) & (df['loser_id'].isin(common_opponents))]
    co_player_b_lose_matches = df[(df['loser_id'] == player_b) & (df['winner_id'].isin(common_opponents))]

    # Calculate WRP for players against common opponents
    co_player_a_wrp = calculate_wrp(co_player_a_win_matches, co_player_a_lose_matches, winner_column, loser_column)
    co_player_b_wrp = calculate_wrp(co_player_b_win_matches, co_player_b_lose_matches, winner_column, loser_column)

    # Return the difference
    return co_player_a_wrp - co_player_b_wrp

def calculate_player_stat_difference(row, df, winner_column_name, loser_column_name):
    player_a = row['winner_id']
    player_b = row['loser_id']
    current_date = row['Date']

    # Filter matches before the current match date
    past_matches = df[df['Date'] < current_date]

    # Calculate WRP for Player A and Player B
    player_a_win_matches, player_a_lose_matches = filter_matches_by_player(past_matches, player_a)
    player_b_win_matches, player_b_lose_matches = filter_matches_by_player(past_matches, player_b)

    player_a_wrp = calculate_wrp(player_a_win_matches, player_a_lose_matches, winner_column_name, loser_column_name)
    player_b_wrp = calculate_wrp(player_b_win_matches, player_b_lose_matches, winner_column_name, loser_column_name)

    wrp_difference = player_a_wrp - player_b_wrp

    # Calculate common opponent WRP difference if active
    if row["CO_active"] == 1:
        co_wrp_difference = calculate_common_opponent_stats(
            past_matches, player_a, player_b, winner_column_name, loser_column_name
        )
        return co_wrp_difference, wrp_difference
    else:
        return wrp_difference, wrp_difference


In [5]:
# Apply the function to each row
matches[['CO_1st_serve_in_p_diff', '1st_serve_in_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_in_pct', 'loser_1st_serve_in_pct')), axis=1
)
matches[['CO_1st_serve_win_p_diff', '1st_serve_win_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_win_pct', 'loser_1st_serve_win_pct')), axis=1
)
matches[['CO_2nd_serve_win_p_diff', '2nd_serve_win_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_2nd_serve_win_pct', 'loser_2nd_serve_win_pct')), axis=1
)
matches[['CO_serve_games_win_p_diff', 'serve_games_win_p_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_service_games_won_pct', 'loser_service_games_won_pct')), axis=1
)
matches[['CO_avg_ace_per_match_diff', 'avg_ace_per_match_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_ace', 'l_ace')), axis=1
)
matches[['CO_avg_df_per_match_diff', 'avg_df_per_match_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_df', 'l_df')), axis=1
)
matches[['CO_1st_serve_wrp_diff', '1st_serve_wrp_diff']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_return_win_pct', 'loser_1st_serve_return_win_pct')), axis=1
)

In [6]:
new_columns = matches[['winner_name', 'loser_name', 'CO_1st_serve_in_p_diff', '1st_serve_in_p_diff', 'CO_1st_serve_win_p_diff', '1st_serve_win_p_diff', 'CO_2nd_serve_win_p_diff', '2nd_serve_win_p_diff', 'CO_serve_games_win_p_diff', 'serve_games_win_p_diff', 'CO_avg_ace_per_match_diff', 'avg_ace_per_match_diff', 'CO_avg_df_per_match_diff', 'avg_df_per_match_diff', 'CO_1st_serve_wrp_diff', '1st_serve_wrp_diff']]
new_columns

Unnamed: 0,winner_name,loser_name,CO_1st_serve_in_p_diff,1st_serve_in_p_diff,CO_1st_serve_win_p_diff,1st_serve_win_p_diff,CO_2nd_serve_win_p_diff,2nd_serve_win_p_diff,CO_serve_games_win_p_diff,serve_games_win_p_diff,CO_avg_ace_per_match_diff,avg_ace_per_match_diff,CO_avg_df_per_match_diff,avg_df_per_match_diff,CO_1st_serve_wrp_diff,1st_serve_wrp_diff
0,Alexandr Dolgopolov,Diego Schwartzman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Alex De Minaur,Steve Johnson,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
18,Nicolas Jarry,Pablo Andujar,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
17,Matthew Ebden,Frances Tiafoe,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
16,Ryan Harrison,Leonardo Mayer,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14105,Ugo Humbert,Fabio Fognini,0.011360,0.016711,0.050499,0.059635,-0.143898,-0.105533,0.065474,0.086896,3.649643,3.632892,-0.937790,-1.148325,-0.018654,-0.027384
14106,Adrian Mannarino,Pavel Kotov,-0.056370,-0.045295,0.017660,0.017611,0.011988,0.285805,0.017668,0.012119,1.297580,1.614628,-0.997992,-0.131560,-0.029539,-0.000017
14108,Jack Draper,Jan Lennard Struff,0.006466,0.028850,-0.026672,-0.008361,-0.616090,-0.501805,-0.015245,0.015352,-4.380027,-2.384875,-1.185859,-0.591883,-0.000607,0.011495
14109,Ugo Humbert,Alexander Shevchenko,0.024793,0.032655,0.044067,0.049131,0.003156,0.036281,0.044298,0.063574,0.759712,2.999714,-0.051692,0.345579,0.046557,0.000318


In [7]:
# new_columns = matches[['winner_name', 'loser_name', 'CO_1st_serve_in_p_diff', '1st_serve_in_p_diff', "CO_active"]]
# new_columns

In [8]:
# Check for missing values in the DataFrame
missing_values = new_columns.isnull().sum()

# Filter columns with missing values
columns_with_nan = missing_values[missing_values > 0]

# Display the columns and their corresponding NaN counts
print("Columns with NaN values and their counts:")
print(columns_with_nan)

Columns with NaN values and their counts:
Series([], dtype: int64)


In [9]:
matches.to_csv('../data/all_years_nc_tc_elo_aggr_stats/matches.csv', index=False)