In [1]:
import pandas as pd
matches = pd.read_csv("../data/all_years_nc_tc_elo/matches.csv")
matches = matches.sort_values(by='Date')

In [2]:
def number_of_common_opponent_matches(row, df):
    player_a = row['winner_id']
    player_b = row['loser_id']
    current_date = row['Date']
    
    # Filter for matches that occurred before the current match date
    df = df[df['Date'] < current_date]
    
    # Find common opponents
    player_a_opponents = set(df[(df['winner_id'] == player_a)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_a)]['winner_id']))
    player_b_opponents = set(df[(df['winner_id'] == player_b)]['loser_id']).union(
                         set(df[(df['loser_id'] == player_b)]['winner_id']))

    common_opponents = player_a_opponents.intersection(player_b_opponents)
    return len(common_opponents)

In [3]:
matches[['num_CO_matches']] = matches.apply(
    lambda row: pd.Series(number_of_common_opponent_matches(row, matches)), axis=1
)

In [4]:
import pandas as pd

def calculate_mean(matches, column_name):
    if matches.empty:
        return None
    column_mean = matches[column_name].dropna().mean()
    return column_mean if pd.notnull(column_mean) else 0

def calculate_stat_avg(player_win_matches, player_lose_matches, win_column, lose_column):
    win_mean = calculate_mean(player_win_matches, win_column)
    lose_mean = calculate_mean(player_lose_matches, lose_column)
    if win_mean and lose_mean:
        return (win_mean + lose_mean) / 2
    elif win_mean:
        return win_mean
    elif lose_mean:
        return lose_mean
    else:
        return 0

def filter_matches_by_player(df, player_id):
    win_matches = df[df['winner_id'] == player_id]
    lose_matches = df[df['loser_id'] == player_id]
    return win_matches, lose_matches

def calculate_common_opponent_stats(df, player_a, player_b, winner_column, loser_column):
    # Find common opponents
    player_a_opponents = set(df[df['winner_id'] == player_a]['loser_id']).union(
        set(df[df['loser_id'] == player_a]['winner_id'])
    )
    player_b_opponents = set(df[df['winner_id'] == player_b]['loser_id']).union(
        set(df[df['loser_id'] == player_b]['winner_id'])
    )
    common_opponents = player_a_opponents.intersection(player_b_opponents)

    # Filter matches against common opponents
    co_player_a_win_matches = df[(df['winner_id'] == player_a) & (df['loser_id'].isin(common_opponents))]
    co_player_a_lose_matches = df[(df['loser_id'] == player_a) & (df['winner_id'].isin(common_opponents))]
    co_player_b_win_matches = df[(df['winner_id'] == player_b) & (df['loser_id'].isin(common_opponents))]
    co_player_b_lose_matches = df[(df['loser_id'] == player_b) & (df['winner_id'].isin(common_opponents))]

    # Calculate stat avg for players against common opponents
    co_player_a_stat_avg = calculate_stat_avg(co_player_a_win_matches, co_player_a_lose_matches, winner_column, loser_column)
    co_player_b_stat_avg = calculate_stat_avg(co_player_b_win_matches, co_player_b_lose_matches, winner_column, loser_column)
    
    return co_player_a_stat_avg, co_player_b_stat_avg
    #return co_player_a_stat_avg - co_player_b_stat_avg

def calculate_player_stat_difference(row, df, winner_column_name, loser_column_name):
    winner = row['winner_id']
    loser = row['loser_id']
    current_date = row['Date']

    # Filter matches before the current match date
    past_matches = df[df['Date'] < current_date]

    # Calculate stat average for Player A and Player B
    winner_won_matches, player_a_lose_matches = filter_matches_by_player(past_matches, winner)
    loser_won_matches, player_b_lose_matches = filter_matches_by_player(past_matches, loser)

    winner_stat_avg = calculate_stat_avg(winner_won_matches, player_a_lose_matches, winner_column_name, loser_column_name)
    loser_stat_avg = calculate_stat_avg(loser_won_matches, player_b_lose_matches, winner_column_name, loser_column_name)

    #stat_avg_difference = player_a_stat_avg - player_b_stat_avg

    # Calculate common opponent stat averages if there were common opponents 
    if row["num_CO_matches"] > 0:
        winner_co_stat_avg, loser_co_stat_avg = calculate_common_opponent_stats(
            past_matches, winner, loser, winner_column_name, loser_column_name
        )
        return winner_co_stat_avg, loser_co_stat_avg, winner_stat_avg, loser_stat_avg
    else:
        return winner_stat_avg, loser_stat_avg, winner_stat_avg, loser_stat_avg


In [5]:
# Apply the function to each row
matches[['winner_CO_1st_serve_in_pct_avg', 'loser_CO_1st_serve_in_pct_avg', 'winner_1st_serve_in_pct_avg', 'loser_1st_serve_in_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_in_pct', 'loser_1st_serve_in_pct')), axis=1
)
matches[['winner_CO_1st_serve_win_pct_avg', 'loser_CO_1st_serve_win_pct_avg', 'winner_1st_serve_win_pct_avg', 'loser_1st_serve_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_win_pct', 'loser_1st_serve_win_pct')), axis=1
)
matches[['winner_CO_2nd_serve_win_pct_avg', 'loser_CO_2nd_serve_win_pct_avg', 'winner_2nd_serve_win_pct_avg', 'loser_2nd_serve_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_2nd_serve_win_pct', 'loser_2nd_serve_win_pct')), axis=1
)
matches[['winner_CO_serve_games_win_pct_avg', 'loser_CO_serve_games_win_pct_avg', 'winner_serve_games_win_pct_avg', 'loser_serve_games_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_service_games_won_pct', 'loser_service_games_won_pct')), axis=1
)
matches[['winner_CO_ace_avg', 'loser_CO_ace_avg', 'winner_ace_avg', 'loser_ace_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_ace', 'l_ace')), axis=1
)
matches[['winner_CO_df_avg', 'loser_CO_df_avg', 'winner_df_avg', 'loser_df_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'w_df', 'l_df')), axis=1
)
matches[['winner_CO_1st_serve_return_win_pct_avg', 'loser_CO_1st_serve_return_win_pct_avg', 'winner_1st_serve_return_win_pct_avg', 'loser_1st_serve_return_win_pct_avg']] = matches.apply(
    lambda row: pd.Series(calculate_player_stat_difference(row, matches, 'winner_1st_serve_return_win_pct', 'loser_1st_serve_return_win_pct')), axis=1
)

In [6]:
new_columns = matches[['winner_name', 'loser_name', 'winner_1st_serve_in_pct_avg', 'winner_CO_1st_serve_in_pct_avg', 'loser_CO_1st_serve_in_pct_avg', 'winner_1st_serve_in_pct_avg', 'loser_1st_serve_in_pct_avg', 'winner_CO_1st_serve_win_pct_avg', 'loser_CO_1st_serve_win_pct_avg', 'winner_1st_serve_win_pct_avg', 'loser_1st_serve_win_pct_avg', 'winner_CO_2nd_serve_win_pct_avg', 'loser_CO_2nd_serve_win_pct_avg', 'winner_2nd_serve_win_pct_avg', 'loser_2nd_serve_win_pct_avg', 'winner_CO_serve_games_win_pct_avg', 'loser_CO_serve_games_win_pct_avg', 'winner_serve_games_win_pct_avg', 'loser_serve_games_win_pct_avg', 'winner_CO_ace_avg', 'loser_CO_ace_avg', 'winner_ace_avg', 'loser_ace_avg', 'winner_CO_df_avg', 'loser_CO_df_avg', 'winner_df_avg', 'loser_df_avg', 'winner_CO_1st_serve_return_win_pct_avg', 'loser_CO_1st_serve_return_win_pct_avg', 'winner_1st_serve_return_win_pct_avg', 'loser_1st_serve_return_win_pct_avg']]
new_columns

Unnamed: 0,winner_name,loser_name,winner_1st_serve_in_pct_avg,winner_CO_1st_serve_in_pct_avg,loser_CO_1st_serve_in_pct_avg,winner_1st_serve_in_pct_avg.1,loser_1st_serve_in_pct_avg,winner_CO_1st_serve_win_pct_avg,loser_CO_1st_serve_win_pct_avg,winner_1st_serve_win_pct_avg,...,winner_ace_avg,loser_ace_avg,winner_CO_df_avg,loser_CO_df_avg,winner_df_avg,loser_df_avg,winner_CO_1st_serve_return_win_pct_avg,loser_CO_1st_serve_return_win_pct_avg,winner_1st_serve_return_win_pct_avg,loser_1st_serve_return_win_pct_avg
0,Alexandr Dolgopolov,Diego Schwartzman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Alex De Minaur,Steve Johnson,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
18,Nicolas Jarry,Pablo Andujar,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
17,Matthew Ebden,Frances Tiafoe,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
16,Ryan Harrison,Leonardo Mayer,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14105,Ugo Humbert,Fabio Fognini,0.618706,0.617726,0.606366,0.618706,0.601995,0.724222,0.673723,0.734086,...,7.186170,3.553279,3.066667,4.004456,2.893501,4.041826,0.280857,0.299511,0.277544,0.304929
14106,Adrian Mannarino,Pavel Kotov,0.603324,0.599911,0.656281,0.603324,0.648619,0.708408,0.690748,0.707702,...,4.945183,3.330556,2.151580,3.149573,2.446217,2.577778,0.265265,0.294804,0.284617,0.284634
14108,Jack Draper,Jan Lennard Struff,0.592027,0.575839,0.569373,0.592027,0.563177,0.751200,0.777871,0.756438,...,7.650000,10.034875,2.915789,4.101648,2.934444,3.526328,0.274165,0.274771,0.288529,0.277035
14109,Ugo Humbert,Alexander Shevchenko,0.618692,0.616401,0.591608,0.618692,0.586036,0.709557,0.665490,0.734618,...,7.188858,4.189145,2.394737,2.446429,2.880119,2.534539,0.294567,0.248010,0.278437,0.278119


In [7]:
# Check for missing values in new columns
missing_values = new_columns.isnull().sum()

columns_with_nan = missing_values[missing_values > 0]

print("Columns with NaN values and their counts:")
print(columns_with_nan)

Columns with NaN values and their counts:
Series([], dtype: int64)


In [8]:
matches.to_csv('../data/all_years_nc_tc_elo_aggr_stats/matches.csv', index=False)