In [1]:
import pandas as pd

In [2]:
# Load the cleaned match table and order its rows by the 'Date' column.
matches = pd.read_csv("../../data/Merged_year_data/matches_cleaned.csv").sort_values(by='Date')

### First, we derive the necessary match statistics from the available data

In [3]:
# Derived count stored as `*_2ndIn`: service points minus first serves in minus
# `*_df` (presumably double faults), for the winner (w_) and the loser (l_).
matches['w_2ndIn'] = matches['w_svpt'].sub(matches['w_1stIn']).sub(matches['w_df'])
matches['l_2ndIn'] = matches['l_svpt'].sub(matches['l_1stIn']).sub(matches['l_df'])

In [4]:
def calculate_1st_serve_in_percentage(row):
    """Return the winner's first-serve-in rate, ``w_1stIn / w_svpt``.

    The value is a fraction (it is not multiplied by 100). ``None`` is
    returned when either input is missing and ``0`` when ``w_svpt`` is not
    positive.
    """
    if any(pd.isna(row[field]) for field in ('w_svpt', 'w_1stIn')):
        return None
    service_points = row['w_svpt']
    if service_points <= 0:
        return 0
    return row['w_1stIn'] / service_points

def calculate_1st_serve_win_percentage(row):
    """Return the winner's first-serve win rate, ``w_1stWon / w_1stIn``.

    Yields ``None`` if either field is missing and ``0`` if ``w_1stIn`` is
    not positive. The result is a fraction, not a value out of 100.
    """
    if any(pd.isna(row[field]) for field in ('w_1stIn', 'w_1stWon')):
        return None
    first_serves_in = row['w_1stIn']
    return row['w_1stWon'] / first_serves_in if first_serves_in > 0 else 0

def calculate_2nd_serve_in_percentage(row):
    """Return the winner's second-serve-in rate.

    Computed as ``w_2ndIn / (w_svpt - w_1stIn)``, i.e. second serves in over
    the points on which the first serve was not in. This is the same formula
    the loser-side function uses, so the two columns are comparable.

    Returns ``None`` when any input is missing and ``0`` when there were no
    second-serve points. The result is a fraction, not a value out of 100.
    """
    if pd.isna(row['w_svpt']) or pd.isna(row['w_1stIn']) or pd.isna(row['w_2ndIn']):
        return None
    # Denominator is second-serve attempts, not all service points.
    second_serve_attempts = row['w_svpt'] - row['w_1stIn']
    if second_serve_attempts > 0:
        return row['w_2ndIn'] / second_serve_attempts
    return 0

def calculate_2nd_serve_win_percentage(row):
    """Return the winner's second-serve win rate, ``w_2ndWon / w_2ndIn``.

    The previous denominator, ``w_2ndIn - w_1stIn``, is usually negative
    (which made the function return 0) and never matched the loser-side
    formula ``l_2ndWon / l_2ndIn``. Both sides now use the same definition.

    Returns ``None`` when an input is missing and ``0`` when ``w_2ndIn`` is
    not positive. The result is a fraction, not a value out of 100.
    """
    if pd.isna(row['w_2ndIn']) or pd.isna(row['w_2ndWon']):
        return None
    if row['w_2ndIn'] > 0:
        return row['w_2ndWon'] / row['w_2ndIn']
    return 0

def calculate_service_games_won_percentage(row):
    """Return the winner's share of service games held.

    Unsaved break points (``w_bpFaced - w_bpSaved``) are subtracted from
    ``w_SvGms`` and the remainder is divided by ``w_SvGms``. ``None`` is
    returned when an input is missing and ``0`` when ``w_SvGms`` is not
    positive.
    """
    if any(pd.isna(row[field]) for field in ('w_SvGms', 'w_bpFaced', 'w_bpSaved')):
        return None
    games_served = row['w_SvGms']
    if not games_served > 0:
        return 0
    unsaved_break_points = row['w_bpFaced'] - row['w_bpSaved']
    return (games_served - unsaved_break_points) / games_served
    

def calculate_1st_serve_in_percentage_loser(row):
    """Return the loser's first-serve-in rate, ``l_1stIn / l_svpt``.

    The value is a fraction (it is not multiplied by 100). ``None`` is
    returned when either input is missing and ``0`` when ``l_svpt`` is not
    positive.
    """
    if any(pd.isna(row[field]) for field in ('l_svpt', 'l_1stIn')):
        return None
    service_points = row['l_svpt']
    if service_points <= 0:
        return 0
    return row['l_1stIn'] / service_points

def calculate_1st_serve_win_percentage_loser(row):
    """Return the loser's first-serve win rate, ``l_1stWon / l_1stIn``.

    Yields ``None`` if either field is missing and ``0`` if ``l_1stIn`` is
    not positive. The result is a fraction, not a value out of 100.
    """
    if any(pd.isna(row[field]) for field in ('l_1stIn', 'l_1stWon')):
        return None
    first_serves_in = row['l_1stIn']
    return row['l_1stWon'] / first_serves_in if first_serves_in > 0 else 0

def calculate_2nd_serve_in_percentage_loser(row):
    """Return the loser's second-serve-in rate, ``l_2ndIn / (l_svpt - l_1stIn)``.

    ``None`` is returned when any input is missing; ``0`` when
    ``l_svpt - l_1stIn`` is not positive. The result is a fraction.
    """
    if any(pd.isna(row[field]) for field in ('l_svpt', 'l_1stIn', 'l_2ndIn')):
        return None
    second_serve_attempts = row['l_svpt'] - row['l_1stIn']
    if not second_serve_attempts > 0:
        return 0
    return row['l_2ndIn'] / (row['l_svpt'] - row['l_1stIn'])

def calculate_2nd_serve_win_percentage_loser(row):
    """Return the loser's second-serve win rate, ``l_2ndWon / l_2ndIn``.

    ``None`` is returned when either input is missing and ``0`` when
    ``l_2ndIn`` is not positive. The result is a fraction.
    """
    if any(pd.isna(row[field]) for field in ('l_2ndIn', 'l_2ndWon')):
        return None
    second_serves_in = row['l_2ndIn']
    return row['l_2ndWon'] / second_serves_in if second_serves_in > 0 else 0

def calculate_service_games_won_percentage_loser(row):
    """Return the loser's share of service games held.

    Unsaved break points (``l_bpFaced - l_bpSaved``) are subtracted from
    ``l_SvGms`` and the remainder is divided by ``l_SvGms``. ``None`` is
    returned when an input is missing and ``0`` when ``l_SvGms`` is not
    positive.
    """
    if any(pd.isna(row[field]) for field in ('l_SvGms', 'l_bpFaced', 'l_bpSaved')):
        return None
    games_served = row['l_SvGms']
    if not games_served > 0:
        return 0
    unsaved_break_points = row['l_bpFaced'] - row['l_bpSaved']
    return (games_served - unsaved_break_points) / games_served

In [5]:
# Each derived serve-stat column paired with the row-wise function that fills it;
# columns are added to `matches` in the order listed.
serve_stat_functions = {
    'winner_1st_serve_in_pct': calculate_1st_serve_in_percentage,
    'winner_1st_serve_win_pct': calculate_1st_serve_win_percentage,
    'winner_2nd_serve_in_pct': calculate_2nd_serve_in_percentage,
    'winner_2nd_serve_win_pct': calculate_2nd_serve_win_percentage,
    'winner_service_games_won_pct': calculate_service_games_won_percentage,
    'loser_1st_serve_in_pct': calculate_1st_serve_in_percentage_loser,
    'loser_1st_serve_win_pct': calculate_1st_serve_win_percentage_loser,
    'loser_2nd_serve_in_pct': calculate_2nd_serve_in_percentage_loser,
    'loser_2nd_serve_win_pct': calculate_2nd_serve_win_percentage_loser,
    'loser_service_games_won_pct': calculate_service_games_won_percentage_loser,
}
for stat_column, stat_function in serve_stat_functions.items():
    matches[stat_column] = matches.apply(stat_function, axis=1)

In [6]:
def calculate_1st_serve_return_win_percentage(row):
    """Return the winner's first-serve return win rate.

    Computed as ``1 - loser_1st_serve_win_pct``; ``None`` when that value is
    missing.
    """
    opponent_serve_win = row['loser_1st_serve_win_pct']
    return None if pd.isna(opponent_serve_win) else 1 - opponent_serve_win

def calculate_2nd_serve_return_win_percentage(row):
    """Return the winner's second-serve return win rate.

    Computed as ``1 - loser_2nd_serve_win_pct``; ``None`` when that value is
    missing.
    """
    opponent_serve_win = row['loser_2nd_serve_win_pct']
    return None if pd.isna(opponent_serve_win) else 1 - opponent_serve_win

def calculate_return_games_win_percentage(row):
    """Return the winner's return-games win rate.

    Computed as ``1 - loser_service_games_won_pct``; ``None`` when that value
    is missing.
    """
    opponent_hold_rate = row['loser_service_games_won_pct']
    return None if pd.isna(opponent_hold_rate) else 1 - opponent_hold_rate

def calculate_1st_serve_return_win_percentage_loser(row):
    """Return the loser's first-serve return win rate.

    Computed as ``1 - winner_1st_serve_win_pct``; ``None`` when that value is
    missing.
    """
    opponent_serve_win = row['winner_1st_serve_win_pct']
    return None if pd.isna(opponent_serve_win) else 1 - opponent_serve_win

def calculate_2nd_serve_return_win_percentage_loser(row):
    """Return the loser's second-serve return win rate.

    Computed as ``1 - winner_2nd_serve_win_pct``; ``None`` when that value is
    missing.
    """
    opponent_serve_win = row['winner_2nd_serve_win_pct']
    return None if pd.isna(opponent_serve_win) else 1 - opponent_serve_win

def calculate_return_games_win_percentage_loser(row):
    """Return the loser's return-games win rate.

    Computed as ``1 - winner_service_games_won_pct``; ``None`` when that
    value is missing.
    """
    opponent_hold_rate = row['winner_service_games_won_pct']
    return None if pd.isna(opponent_hold_rate) else 1 - opponent_hold_rate

In [7]:
# Return-stat columns, each derived from the opponent's serve-stat column by
# the paired row-wise function; columns are added in the order listed.
return_stat_functions = {
    'winner_1st_serve_return_win_pct': calculate_1st_serve_return_win_percentage,
    'winner_2nd_serve_return_win_pct': calculate_2nd_serve_return_win_percentage,
    'winner_return_games_win_pct': calculate_return_games_win_percentage,
    'loser_1st_serve_return_win_pct': calculate_1st_serve_return_win_percentage_loser,
    'loser_2nd_serve_return_win_pct': calculate_2nd_serve_return_win_percentage_loser,
    'loser_return_games_win_pct': calculate_return_games_win_percentage_loser,
}
for stat_column, stat_function in return_stat_functions.items():
    matches[stat_column] = matches.apply(stat_function, axis=1)

In [8]:
def calculate_percentage_of_break_points_won_on_opponents_serve(row):
    """Return the winner's break-point conversion rate on the loser's serve.

    Computed as ``(l_bpFaced - l_bpSaved) / l_bpFaced``. ``None`` is returned
    when either input is missing and ``0`` when ``l_bpFaced`` is not positive.
    """
    if any(pd.isna(row[field]) for field in ('l_bpFaced', 'l_bpSaved')):
        return None
    break_points = row['l_bpFaced']
    if not break_points > 0:
        return 0
    return (break_points - row['l_bpSaved']) / break_points

def calculate_percentage_of_break_points_won_on_opponents_serve_loser(row):
    """Return the loser's break-point conversion rate on the winner's serve.

    Computed as ``(w_bpFaced - w_bpSaved) / w_bpFaced``. ``None`` is returned
    when either input is missing and ``0`` when ``w_bpFaced`` is not positive.
    """
    if any(pd.isna(row[field]) for field in ('w_bpFaced', 'w_bpSaved')):
        return None
    break_points = row['w_bpFaced']
    if not break_points > 0:
        return 0
    return (break_points - row['w_bpSaved']) / break_points

def calculate_percentage_of_break_points_saved(row):
    """Return the winner's break-point save rate, ``w_bpSaved / w_bpFaced``.

    ``None`` is returned when either input is missing and ``0`` when
    ``w_bpFaced`` is not positive.
    """
    if any(pd.isna(row[field]) for field in ('w_bpFaced', 'w_bpSaved')):
        return None
    break_points_faced = row['w_bpFaced']
    return row['w_bpSaved'] / break_points_faced if break_points_faced > 0 else 0

def calculate_percentage_of_break_points_saved_loser(row):
    """Return the loser's break-point save rate, ``l_bpSaved / l_bpFaced``.

    ``None`` is returned when either input is missing and ``0`` when
    ``l_bpFaced`` is not positive.
    """
    if any(pd.isna(row[field]) for field in ('l_bpFaced', 'l_bpSaved')):
        return None
    break_points_faced = row['l_bpFaced']
    return row['l_bpSaved'] / break_points_faced if break_points_faced > 0 else 0

In [9]:
# Break-point columns paired with the row-wise function that fills each one;
# columns are added in the order listed.
break_point_functions = {
    'winner_bp_won_pct': calculate_percentage_of_break_points_won_on_opponents_serve,
    'loser_bp_won_pct': calculate_percentage_of_break_points_won_on_opponents_serve_loser,
    'winner_bp_saved_pct': calculate_percentage_of_break_points_saved,
    'loser_bp_saved_pct': calculate_percentage_of_break_points_saved_loser,
}
for stat_column, stat_function in break_point_functions.items():
    matches[stat_column] = matches.apply(stat_function, axis=1)

### Now we can aggregate these statistics using the common-opponents method and time discounting

In [10]:
# Convert 'Date' to datetimes; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
parsed_dates = pd.to_datetime(matches['Date'], errors='coerce')
matches['Date'] = parsed_dates

In [11]:
def number_of_common_opponent_matches(row, df):
    """Count the distinct opponents both players of a match have already faced.

    Only rows of ``df`` whose 'Date' is strictly earlier than ``row['Date']``
    are considered. Despite the function's name, the value returned is the
    number of shared opponents, not the number of matches played against them.
    """
    earlier = df[df['Date'] < row['Date']]

    def opponents_faced(player_id):
        # Everyone the player has beaten plus everyone they have lost to.
        beaten = earlier[earlier['winner_id'] == player_id]['loser_id']
        lost_to = earlier[earlier['loser_id'] == player_id]['winner_id']
        return set(beaten) | set(lost_to)

    shared = opponents_faced(row['winner_id']) & opponents_faced(row['loser_id'])
    return len(shared)

In [12]:
def _common_opponent_count_as_series(match_row):
    """Wrap the common-opponent count for one match in a one-element Series."""
    return pd.Series(number_of_common_opponent_matches(match_row, matches))

# One count per match, computed against the full `matches` table.
matches[['num_CO_matches']] = matches.apply(_common_opponent_count_as_series, axis=1)

In [13]:
import numpy as np

# Time Discounting
def calculate_weighted_mean(matches, column_name, time_column, decay_rate=0.3):
    """Return the exponentially time-discounted mean of one column.

    Each row is weighted by ``exp(-decay_rate * matches[time_column])``, so
    rows with a larger value in ``time_column`` contribute less.

    Parameters
    ----------
    matches : pandas.DataFrame
        Rows to average. The frame is not modified.
    column_name : str
        Column holding the statistic to average.
    time_column : str
        Column holding the elapsed time used for discounting.
    decay_rate : float, default 0.3
        Rate of the exponential decay.

    Returns
    -------
    None
        If ``matches`` is empty.
    0
        If no row has both a value and a weight, or the total weight is not
        positive.
    float
        The weighted mean over the rows that have a value.
    """
    if matches.empty:
        return None

    values = matches[column_name]
    # Keep the weights local instead of writing a 'weight' column: callers pass
    # filtered slices, so assigning to them mutates caller data and triggers
    # pandas' SettingWithCopyWarning.
    weights = np.exp(-decay_rate * matches[time_column])

    # A row with a missing statistic must not add its weight to the denominator,
    # otherwise the mean is pulled toward zero.
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()
    if not total_weight > 0:
        return 0

    weighted_mean = (values[usable] * weights[usable]).sum() / total_weight
    return weighted_mean if pd.notnull(weighted_mean) else 0

def calculate_stat_avg(player_win_matches, player_lose_matches, win_column, lose_column):
    """Combine a player's time-discounted stat from wins and from losses.

    ``win_column`` is averaged over ``player_win_matches`` and ``lose_column``
    over ``player_lose_matches``, both discounted by 'time_since_match'.
    A mean that is ``None`` or ``0`` is treated as unavailable: if both are
    available their plain average is returned, if only one is available that
    one is returned, otherwise ``0``.
    """
    win_mean = calculate_weighted_mean(player_win_matches, win_column, 'time_since_match')
    lose_mean = calculate_weighted_mean(player_lose_matches, lose_column, 'time_since_match')

    if not win_mean:
        return lose_mean if lose_mean else 0
    if not lose_mean:
        return win_mean
    return (win_mean + lose_mean) / 2

def filter_matches_by_player(df, player_id):
    """Split ``df`` into the rows a player won and the rows they lost.

    Returns a ``(win_matches, lose_matches)`` tuple, selected on 'winner_id'
    and 'loser_id' respectively.
    """
    won_mask = df['winner_id'] == player_id
    lost_mask = df['loser_id'] == player_id
    return df[won_mask], df[lost_mask]

def calculate_common_opponent_stats(df, player_a, player_b, winner_column, loser_column):
    """Average a stat for two players over matches against shared opponents.

    The shared opponents are those that appear opposite both ``player_a`` and
    ``player_b`` anywhere in ``df``. For each player, the matches against
    those opponents are split into wins and losses and passed to
    ``calculate_stat_avg`` with ``winner_column`` / ``loser_column``.

    Returns a ``(player_a_avg, player_b_avg)`` tuple.
    """
    def opponents_of(player_id):
        # Players beaten by, or victorious over, the given player.
        beaten = df[df['winner_id'] == player_id]['loser_id']
        lost_to = df[df['loser_id'] == player_id]['winner_id']
        return set(beaten) | set(lost_to)

    common_opponents = opponents_of(player_a) & opponents_of(player_b)

    def average_against_common(player_id):
        # Restrict to matches where the other side is a shared opponent.
        wins = df[(df['winner_id'] == player_id) & (df['loser_id'].isin(common_opponents))]
        losses = df[(df['loser_id'] == player_id) & (df['winner_id'].isin(common_opponents))]
        return calculate_stat_avg(wins, losses, winner_column, loser_column)

    return average_against_common(player_a), average_against_common(player_b)

def calculate_player_stat_difference(row, df, winner_column_name, loser_column_name):
    """Compute time-discounted stat averages for both players of one match.

    Only rows of ``df`` dated strictly before ``row['Date']`` are used; each
    gets a 'time_since_match' value in years (days / 365.0).

    Returns a 4-tuple
    ``(winner_CO_avg, loser_CO_avg, winner_avg, loser_avg)``. Despite the
    function's name, no difference is taken. When ``row['num_CO_matches']``
    is not positive, the first two entries repeat the overall averages.
    """
    match_date = row['Date']

    # Work on a copy so the extra column never touches the caller's frame.
    history = df[df['Date'] < match_date].copy()
    elapsed = match_date - history['Date']
    history['time_since_match'] = elapsed.dt.days / 365.0  # Convert to years

    def overall_average(player_id):
        won, lost = filter_matches_by_player(history, player_id)
        return calculate_stat_avg(won, lost, winner_column_name, loser_column_name)

    winner_id, loser_id = row['winner_id'], row['loser_id']
    winner_stat_avg = overall_average(winner_id)
    loser_stat_avg = overall_average(loser_id)

    if not row["num_CO_matches"] > 0:
        # No shared opponents: fall back to the overall averages.
        return winner_stat_avg, loser_stat_avg, winner_stat_avg, loser_stat_avg

    winner_co_stat_avg, loser_co_stat_avg = calculate_common_opponent_stats(
        history, winner_id, loser_id, winner_column_name, loser_column_name
    )
    return winner_co_stat_avg, loser_co_stat_avg, winner_stat_avg, loser_stat_avg


In [14]:
# For every statistic: the four output columns (common-opponent averages for
# winner and loser, then overall averages for winner and loser) and the pair
# of per-match source columns they are computed from.
aggregate_specs = [
    (['winner_CO_1st_serve_in_pct_avg', 'loser_CO_1st_serve_in_pct_avg', 'winner_1st_serve_in_pct_avg', 'loser_1st_serve_in_pct_avg'],
     'winner_1st_serve_in_pct', 'loser_1st_serve_in_pct'),
    (['winner_CO_1st_serve_win_pct_avg', 'loser_CO_1st_serve_win_pct_avg', 'winner_1st_serve_win_pct_avg', 'loser_1st_serve_win_pct_avg'],
     'winner_1st_serve_win_pct', 'loser_1st_serve_win_pct'),
    (['winner_CO_2nd_serve_win_pct_avg', 'loser_CO_2nd_serve_win_pct_avg', 'winner_2nd_serve_win_pct_avg', 'loser_2nd_serve_win_pct_avg'],
     'winner_2nd_serve_win_pct', 'loser_2nd_serve_win_pct'),
    (['winner_CO_serve_games_win_pct_avg', 'loser_CO_serve_games_win_pct_avg', 'winner_serve_games_win_pct_avg', 'loser_serve_games_win_pct_avg'],
     'winner_service_games_won_pct', 'loser_service_games_won_pct'),
    (['winner_CO_ace_avg', 'loser_CO_ace_avg', 'winner_ace_avg', 'loser_ace_avg'],
     'w_ace', 'l_ace'),
    (['winner_CO_df_avg', 'loser_CO_df_avg', 'winner_df_avg', 'loser_df_avg'],
     'w_df', 'l_df'),
    (['winner_CO_1st_serve_return_win_pct_avg', 'loser_CO_1st_serve_return_win_pct_avg', 'winner_1st_serve_return_win_pct_avg', 'loser_1st_serve_return_win_pct_avg'],
     'winner_1st_serve_return_win_pct', 'loser_1st_serve_return_win_pct'),
]

# Apply the function to each row, one statistic at a time
for output_columns, winner_source, loser_source in aggregate_specs:
    matches[output_columns] = matches.apply(
        lambda row: pd.Series(calculate_player_stat_difference(row, matches, winner_source, loser_source)), axis=1
    )

In [23]:
# Columns to export: the match key, the common-opponent count, and every
# aggregated average computed above.
export_columns = [
    'match_id', 'num_CO_matches',
    'winner_CO_1st_serve_in_pct_avg', 'loser_CO_1st_serve_in_pct_avg',
    'winner_1st_serve_in_pct_avg', 'loser_1st_serve_in_pct_avg',
    'winner_CO_1st_serve_win_pct_avg', 'loser_CO_1st_serve_win_pct_avg',
    'winner_1st_serve_win_pct_avg', 'loser_1st_serve_win_pct_avg',
    'winner_CO_2nd_serve_win_pct_avg', 'loser_CO_2nd_serve_win_pct_avg',
    'winner_2nd_serve_win_pct_avg', 'loser_2nd_serve_win_pct_avg',
    'winner_CO_serve_games_win_pct_avg', 'loser_CO_serve_games_win_pct_avg',
    'winner_serve_games_win_pct_avg', 'loser_serve_games_win_pct_avg',
    'winner_CO_ace_avg', 'loser_CO_ace_avg',
    'winner_ace_avg', 'loser_ace_avg',
    'winner_CO_df_avg', 'loser_CO_df_avg',
    'winner_df_avg', 'loser_df_avg',
    'winner_CO_1st_serve_return_win_pct_avg', 'loser_CO_1st_serve_return_win_pct_avg',
    'winner_1st_serve_return_win_pct_avg', 'loser_1st_serve_return_win_pct_avg',
]
new_columns = matches[export_columns]

In [24]:
# Count missing values per exported column and report only the columns that
# have at least one.
missing_values = new_columns.isna().sum()
columns_with_nan = missing_values.loc[missing_values > 0]

print("Columns with NaN values and their counts:")
print(columns_with_nan)

Columns with NaN values and their counts:
Series([], dtype: int64)


In [25]:
# Write the aggregated columns to CSV without the index.
# NOTE(review): the input is read from "../../data/..." while this writes under
# "../data/" - confirm the output directory is the intended one.
output_path = "../data/aggregate_player_match_stats.csv"
new_columns.to_csv(output_path, index=False)