In [2]:
import pandas as pd
import numpy as np

In [3]:
# Concatenate all ATP matches from 2000 to 2017 into a single file

all_matches = pd.DataFrame()

tour_years = range(1991,2023)
qual_chal_years = range(2010,2023)

for year in tour_years:
    tour_matches = pd.read_csv(f'../tennis_atp/atp_matches_{year}.csv')
    all_matches = pd.concat([all_matches,tour_matches], ignore_index=True)

for year in qual_chal_years:
    qual_chal_matches = pd.read_csv(f'../tennis_atp/atp_matches_qual_chall_{year}.csv')
    all_matches = pd.concat([all_matches,qual_chal_matches], ignore_index=True)

all_matches.to_csv('../data/atp_matches_1991-2023.csv', index=False, encoding='utf-8')

In [26]:
def get_recent_player_stats(pid):
    total_first_serve_in = 0
    total_first_serve_won = 0
    total_second_serve_won = 0
    total_double_faults = 0
    total_aces = 0
    total_break_points_saved = 0
    total_break_points_faced = 0
    total_service_games = 0
    total_service_points = 0

    total_other_serve_pts = 0
    total_other_first_serve_won = 0
    total_other_second_serve_won = 0
    total_other_break_points_faced = 0
    total_other_break_points_saved = 0
    total_other_service_games = 0

    player_matches = all_matches[(all_matches['winner_id'] == pid) | (all_matches['loser_id'] == pid)]
    
    last_ten_matches = all_matches[(all_matches['winner_id'] == row['winner_id']) | (all_matches['loser_id'] == row['winner_id'])].tail(10)

    for row in last_ten_matches:
        total_first_serve_in += row['w_1stIn']
        total_first_serve_won += row['w_1stWon']
        total_second_serve_won += row['w_2ndWon']
        total_double_faults += row['w_df']
        total_aces += row['w_ace']
        total_break_points_saved += row['w_bpSaved']
        total_break_points_faced += row['w_bpFaced']
        total_service_games += row['w_SvGms']
        total_service_points += row['w_svpt']
        total_other_serve_pts += row['l_svpt']
        total_other_first_serve_won += row['l_1stWon']
        total_other_second_serve_won += row['l_2ndWon']
        total_other_break_points_faced += row['l_bpFaced']
        total_other_break_points_saved += row['l_bpSaved']
        total_other_service_games += row['l_SvGms']
    
    first_serve_pt = total_first_serve_in / total_service_points
    first_serve_won = total_first_serve_won / total_first_serve_in
    second_serve_won = total_second_serve_won / (total_service_points - total_first_serve_in - total_double_faults)
    double_faults = total_double_faults / total_service_points
    aces = total_aces / total_service_points
    break_points_saved = total_break_points_saved / total_break_points_faced
    break_points_faced = total_break_points_faced / total_service_games
    return_first_serve_pt_won = 1 - (total_other_first_serve_won / total_other_serve_pts)
    return_second_serve_won = 1 - (total_other_second_serve_won / (total_other_serve_pts - total_other_first_serve_won))
    bp_converted = (total_other_break_points_faced - total_other_break_points_saved) / total_other_break_points_faced
    bp_opportunities = total_other_break_points_faced / total_other_service_games

    return [first_serve_pt, first_serve_won, second_serve_won, double_faults, aces, break_points_saved, break_points_faced, return_first_serve_pt_won, return_second_serve_won, bp_converted, bp_opportunities]

# all_matches = pd.read_csv('../data/atp_matches_1991-2023.csv')
all_ids = pd.concat([all_matches['winner_id'], all_matches['loser_id']])
player_counts = all_ids.value_counts()

print(all_ids.shape)
print(len(all_ids.unique()))

player_10_counts = player_counts[player_counts > 10]
player_5_counts = player_counts[player_counts > 1]
print(player_5_counts.shape)
players = player_counts.index.unique().tolist()

print(len(players))
filtered_df = all_matches[all_matches['winner_id'].isin(players) | all_matches['loser_id'].isin(players)]
print(filtered_df.shape)
# for player in players
# all_matches['winner_id'].value_counts()

# ineligble_players = 

(400250,)
6627
(2420,)
2420
(199088, 49)


In [95]:
# Encode data

encoded_matches = pd.read_csv('../data/atp_matches_1991-2023.csv')

encoded_matches.dropna(inplace=True, subset=['winner_hand', 'loser_hand', 'surface', 'round', 'tourney_level', 'tourney_date', 'winner_ioc', 'loser_ioc'])

# Update hand to a boolean value (0 = R, 1 = L, 2= unknown)
encoded_matches['winner_hand'] = encoded_matches['winner_hand'].map({'R': 0, 'L': 1, 'U': 2})
encoded_matches['loser_hand'] = encoded_matches['loser_hand'].map({'R': 0, 'L': 1, 'U':2})

# Update surface to a one hot encoding
encoded_matches = pd.get_dummies(encoded_matches, columns=['surface'])

# Update round to a label encoding
encoded_matches['round'] = encoded_matches['round'].map({'R128': 1, 'R64': 2, 'R32': 3, 'R16': 4, 'QF': 5, 'SF': 6, 'F': 7})

# Update tourney_level to a one hot encoding
encoded_matches = pd.get_dummies(encoded_matches, columns=['tourney_level'])

# Columns that need extra processing
# tourney_date
# First, convert tourney_date to a datetime object
encoded_matches['tourney_date'] = pd.to_datetime(encoded_matches['tourney_date'], format='%Y%m%d')

encoded_matches['day'] = encoded_matches.tourney_date.dt.day
encoded_matches['month'] = encoded_matches.tourney_date.dt.month
encoded_matches['year'] = encoded_matches.tourney_date.dt.year

# Change this to just encode the day out of 365, and the year as an integer

def encode_dates(df):
    df['day_sin'] = np.sin(2 * np.pi * df['day']/31)
    df['day_cos'] = np.cos(2 * np.pi * df['day']/31)
    df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
    df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
    df['year_sin'] = np.sin(2 * np.pi * df['year']/2023)
    df['year_cos'] = np.cos(2 * np.pi * df['year']/2023)
    return df

encoded_matches = encode_dates(encoded_matches)

#Drop the original columns
encoded_matches = encoded_matches.drop(['day', 'month', 'year'], axis=1)


encoded_matches.to_csv('../data/atp_matches_1991-2023_processed.csv', index=False, encoding='utf-8')

encoded_matches.isna().sum()

tourney_id      0
tourney_name    0
draw_size       0
tourney_date    0
match_num       0
               ..
day_cos         0
month_sin       0
month_cos       0
year_sin        0
year_cos        0
Length: 63, dtype: int64

In [97]:
# Clean the data

dropped_columns = [
    'tourney_name',
    'draw_size',
    'winner_seed',
    'winner_entry',
    'winner_name',
    "match_num",
    'loser_name',
    'loser_rank_points',
    'loser_rank_points',
    'loser_entry',
    'loser_seed',
    'loser_entry',
    'score'
]

matches = pd.read_csv('../data/atp_matches_1991-2023_processed.csv')
matches_length = len(matches)
print(matches.value_counts())

all_columns = matches.columns
essential_columns = [col for col in all_columns if col not in dropped_columns]

# This removes 53734 rows which have NAs in the essential columns
print(len(matches))
print(essential_columns)
matches_cleaned =  matches.dropna(subset=essential_columns)


# Remove any matches where and opponent retired
matches_cleaned = matches_cleaned[matches_cleaned['score'].str.contains('RET') == False]

dropped_columns.remove('match_num')
matches_cleaned = matches_cleaned.drop(columns=dropped_columns)
print(matches_cleaned.isna().sum())

matches_cleaned.to_csv('../data/atp_matches_1991-2023_cleaned.csv', index=False, encoding='utf-8')

Series([], Name: count, dtype: int64)
200035
['tourney_id', 'tourney_date', 'winner_id', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'loser_id', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'best_of', 'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank', 'winner_rank_points', 'loser_rank', 'surface_Carpet', 'surface_Clay', 'surface_Grass', 'surface_Hard', 'tourney_level_A', 'tourney_level_C', 'tourney_level_D', 'tourney_level_F', 'tourney_level_G', 'tourney_level_M', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'year_sin', 'year_cos']
tourney_id            0
tourney_date          0
match_num             0
winner_id             0
winner_hand           0
winner_ht             0
winner_ioc            0
winner_age            0
loser_id              0
loser_hand            0
loser_ht          

In [99]:
# Anononymize and balance the data
df = pd.read_csv('../data/atp_matches_1991-2023_cleaned.csv')

# Randomly select a winner for each match
length = len(df)
num_zeros = length // 2
num_ones = length - num_zeros

array_zeros = np.zeros(num_zeros, dtype=int)
array_ones = np.ones(num_ones, dtype=int)

# Concatenate the arrays
winners = np.concatenate((array_zeros, array_ones))
np.random.shuffle(winners)

df['winner'] = winners

# Shuffle the combined array

index = 0
player_cols = ['id', 'hand', 'ht','ioc', 'age', 'rank']
stat_cols = ['ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']

columns_dict = {}

player0_wins = df[df['winner'] == 0]
player1_wins = df[df['winner'] == 1]

# Rename the columns
player0_wins = player0_wins.rename(columns={f'winner_{col}': f'player0_{col}' for col in player_cols})
player0_wins = player0_wins.rename(columns={f'w_{col}': f'player0_{col}' for col in stat_cols})
player0_wins = player0_wins.rename(columns={f'loser_{col}': f'player1_{col}' for col in player_cols})
player0_wins = player0_wins.rename(columns={f'l_{col}': f'player1_{col}' for col in stat_cols})

player1_wins = player1_wins.rename(columns={f'winner_{col}': f'player1_{col}' for col in player_cols})
player1_wins = player1_wins.rename(columns={f'w_{col}': f'player1_{col}' for col in stat_cols})
player1_wins = player1_wins.rename(columns={f'loser_{col}': f'player0_{col}' for col in player_cols})
player1_wins = player1_wins.rename(columns={f'l_{col}': f'player0_{col}' for col in stat_cols})

anonymized_df = pd.concat([player0_wins, player1_wins])
anonymized_df.sort_values(by=['tourney_date', 'match_num'], inplace=True)
anonymized_df.to_csv('../data/atp_matches_1991-2023_processed.csv', index=False, encoding='utf-8')
anonymized_df.isna().sum()


tourney_id            0
tourney_date          0
match_num             0
player0_id            0
player0_hand          0
player0_ht            0
player0_ioc           0
player0_age           0
player1_id            0
player1_hand          0
player1_ht            0
player1_ioc           0
player1_age           0
best_of               0
round                 0
minutes               0
player0_ace           0
player0_df            0
player0_svpt          0
player0_1stIn         0
player0_1stWon        0
player0_2ndWon        0
player0_SvGms         0
player0_bpSaved       0
player0_bpFaced       0
player1_ace           0
player1_df            0
player1_svpt          0
player1_1stIn         0
player1_1stWon        0
player1_2ndWon        0
player1_SvGms         0
player1_bpSaved       0
player1_bpFaced       0
player0_rank          0
winner_rank_points    0
player1_rank          0
surface_Carpet        0
surface_Clay          0
surface_Grass         0
surface_Hard          0
tourney_level_A 

In [134]:
# Split the data into training and testing sets
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.utils import compute_class_weight, resample
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

matches = pd.read_csv('../data/atp_matches_1991-2023_processed.csv')

train, test = train_test_split(matches, test_size=0.2)

skf = KFold(n_splits=5, random_state=42, shuffle=True)

x = train.iloc[:, :-1].drop(columns=['tourney_date', 'tourney_id'])
y = train.iloc[:, -1]

x_test = test.iloc[:, :-1].drop(columns=['tourney_date', 'tourney_id'])
y_test = test.iloc[:, -1]

model = LogisticRegression(max_iter=2000, class_weight='balanced')

# print(cross_val_score(model, x, y, cv=5, scoring='accuracy'))

model.fit(x, y)

pred = model.predict(x_test)
pred_proba = model.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, pred))

print(model)

# lasso = linear_model.Lasso()

# print(len(x))
# print(cross_val_score(model, x, y, cv=5, scoring='accuracy'))


# split_iterator = skf.split(x,y)

# for train_index, test_index in split_iterator:
#     x_train, x_test = x.iloc[train_index], x.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#     print(x_train.value_counts())

#     # Train a logistic regression model
#     model.fit(x_train, y_train)

    

Accuracy: 0.9342174422835887
LogisticRegression(class_weight='balanced', max_iter=2000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
