In [35]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt

from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise.model_selection.validation import cross_validate
from surprise import accuracy

In [36]:
tennis_df = pd.read_excel("tennis.xlsx")

In [37]:
# Create aces dictionary function. 
# This function will be used to reformat the original data.
# We want a separate row for each server instead of a seperate row for each match

def aces_dict_add(row):
    aces_dict_data["server"].append(row["winner_name"])
    aces_dict_data["returner"].append(row["loser_name"])
    aces_dict_data["X"].append(row["X"])
    aces_dict_data["match_num"].append(row["match_num"])
    aces_dict_data["WorL"].append("W")
    aces_dict_data["aces"].append(row["winner_aces"])
    aces_dict_data["male_or_female"].append(row["gender"])

    aces_dict_data["server"].append(row["loser_name"])
    aces_dict_data["returner"].append(row["winner_name"])
    aces_dict_data["X"].append(row["X"])
    aces_dict_data["match_num"].append(row["match_num"])
    aces_dict_data["WorL"].append("L")
    aces_dict_data["aces"].append(row["loser_aces"])
    aces_dict_data["male_or_female"].append(row["gender"])

    return aces_dict_data

In [38]:
# Create the aces dictionary:

aces_dict_data = {"server":[],
                  "returner":[],
                  "X":[],
                  "match_num":[],
                  "WorL":[],
                  "aces":[],
                  "male_or_female":[]
                 }

tennis_df.apply(aces_dict_add, axis=1)

# Transform the data back into a dataframe with the structure we want:
aces_df = pd.DataFrame.from_dict(aces_dict_data)

## Aces among men and women 
Male and female tennis players do not play against each other in professional competitions. There are also big differences in the prevalence of aces in mens and women's tennis. Thus, when building a model to predict the number of aces that would occur between two opponents if makes sense to address this difference. 

In this instance, we will train and test our model on women tennis players. 

In [39]:
# Select only the female tennis players:

aces_df_female = aces_df[aces_df["male_or_female"]=="female"][["server","returner","aces"]]
aces_df_female.shape

(4572, 3)

In [40]:
# Handle duplicates by averaging the ratings. 
# The surprise package expects a unique value for each server/ returner pair:

aces_df_female = aces_df_female.groupby(["server","returner"])["aces"].mean().reset_index()

In [41]:
# Make sure we get rid of aces values that are NaN because this will impact 
# our model's ability to make predictions

aces_df_female = aces_df_female[aces_df_female["aces"].notna()]

In [42]:
# min and max ratings will be used to normalize the ratings later
min_aces = min(aces_df_female["aces"])
max_aces = max(aces_df_female["aces"])

print(f"min aces are {min_aces} and max aces are {max_aces}")

min aces are 0.0 and max aces are 24.0


## Creating a utility matrix

Because I am using the surprise package I need the data to be in a format equivalent to a "User-Item-Rating Triplet" which is common to recommender systems. In this case it will be a Server, Returner, Aces triplet. 

The Surprise library often works with data in the form of triplets: (user, item, rating). Each row represents the number of aces a server achieved in a match against a particular returner.

In [43]:
# In this step we make sure the data is in the correct format for the surprise package

reader = Reader(rating_scale=(0.0, 24.0))
surprise_data = Dataset.load_from_df(aces_df_female[["server","returner","aces"]], reader)

In [44]:
# Divide the surprise data into a trainset and a testset. 

trainset, testset = train_test_split(surprise_data, test_size=.1, random_state=42)

In [45]:
def add_matches_to_testset(player1, player2):
    aces = aces_df_female[(aces_df_female["server"] == player2) & (aces_df_female["returner"] == player1)]["aces"].tolist()[0]
    output = (player2, player1, aces)
    return output

In [46]:
# find the testset "matches" and add them to the testset
testset_additions = []
for item in testset:
    testset_additions.append(add_matches_to_testset(item[0], item[1]))

testset += testset_additions

In [47]:
svd = SVD(n_epochs=3)

cross_validate(
    svd, 
    surprise_data, 
    measures=['RMSE', 'MAE'], 
    cv=3, 
    verbose=True
)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    2.5135  2.5240  2.4353  2.4909  0.0395  
MAE (testset)     1.8815  1.9262  1.8375  1.8817  0.0362  
Fit time          0.01    0.01    0.00    0.01    0.00    
Test time         0.01    0.00    0.00    0.00    0.00    


{'test_rmse': array([2.51354281, 2.52395806, 2.43534485]),
 'test_mae': array([1.88149804, 1.92615306, 1.83745569]),
 'fit_time': (0.007533073425292969,
  0.006671905517578125,
  0.004078865051269531),
 'test_time': (0.006331920623779297,
  0.004106998443603516,
  0.003287076950073242)}

In [48]:
# predicting the test data
# NOTE: This is the error associated with predicting the number of aces a particular 
# server gets when playing agains a particular opponent

predictions = svd.test(testset)
rmse_ = round(accuracy.rmse(predictions), 3)
rmse_

RMSE: 2.3372


2.337

In [49]:
def generate_ace_prediction(model, server, returner):
    pred_set = [(server, returner, 0)] #Establish the data format
    prediction = model.test(pred_set)

    for pred in prediction:
        return pred.est

In [50]:
def total_match_aces_prediction(player1, player2):
    return generate_ace_prediction(svd, player1, player2) + generate_ace_prediction(svd, player2, player1)

In [51]:
## Testing the model:

In [52]:
total_match_aces_prediction('Misaki Doi', 'Ajla Tomljanovic')

5.961506345776405

In [53]:
def total_match_aces_true(player1, player2):
    condition1 = ((aces_df_female["server"] == player1) & (aces_df_female["returner"] == player2))
    condition2 = ((aces_df_female["server"] == player2) & (aces_df_female["returner"] == player1))
    return aces_df_female[condition1 | condition2]["aces"].sum()

In [54]:
total_match_aces_true('Misaki Doi', 'Ajla Tomljanovic')

7.0

In [55]:
def prediction_vs_reality(player1, player2):
    pred = total_match_aces_prediction(player1, player2)
    true_val = total_match_aces_true(player1, player2)
    return f"The model predicts {pred} aces between these players when in reality there are on average {true_val} aces"

In [56]:
prediction_vs_reality('Camila Giorgi', 'Iga Swiatek')

'The model predicts 5.518324058736841 aces between these players when in reality there are on average 4.0 aces'

In [34]:
player1 = 'Camila Giorgi'
player2 = 'Iga Swiatek'

condition1 = ((aces_df_female["server"] == player1) & (aces_df_female["returner"] == player2))
condition2 = ((aces_df_female["server"] == player2) & (aces_df_female["returner"] == player1))
aces_df_female[condition1 | condition2]

Unnamed: 0,server,returner,aces
816,Camila Giorgi,Iga Swiatek,0.0
1698,Iga Swiatek,Camila Giorgi,4.0


In [19]:
aces_df_female.head()

Unnamed: 0,server,returner,aces
0,Ajla Tomljanovic,Alize Cornet,2.0
1,Ajla Tomljanovic,Anastasia Pavlyuchenkova,2.0
2,Ajla Tomljanovic,Anastasija Sevastova,2.0
3,Ajla Tomljanovic,Anett Kontaveit,3.0
4,Ajla Tomljanovic,Angelique Kerber,2.0


In [63]:
testset_df = pd.DataFrame(testset, columns=["server","returner","aces"])
testset_df.iloc[415]

server             Sara Bejlek
returner    Barbora Krejcikova
aces                       0.0
Name: 415, dtype: object

In [64]:
testset_df.head()

Unnamed: 0,server,returner,aces
0,Barbora Krejcikova,Sara Bejlek,2.0
1,Tamara Zidansek,Na Lae Han,2.0
2,Ons Jabeur,Aliaksandra Sasnovich,9.0
3,Tereza Martincova,Aryna Sabalenka,2.0
4,Elsa Jacquemot,Lesia Tsurenko,2.0


In [83]:
testset_df_1 = testset_df.iloc[:415]
testset_df_2 = testset_df.iloc[415:]
testset_df_2 = testset_df_2.rename(columns={"server":"server2", "returner":"returner2", "aces":"aces2"}).reset_index(drop=True)

In [86]:
test = pd.merge(testset_df_1, testset_df_2, left_index=True, right_index=True)
test["total_aces"] = test["aces"] + test["aces2"]

In [87]:
test.head()

Unnamed: 0,server,returner,aces,server2,returner2,aces2,total_aces
0,Barbora Krejcikova,Sara Bejlek,2.0,Sara Bejlek,Barbora Krejcikova,0.0,2.0
1,Tamara Zidansek,Na Lae Han,2.0,Na Lae Han,Tamara Zidansek,1.0,3.0
2,Ons Jabeur,Aliaksandra Sasnovich,9.0,Aliaksandra Sasnovich,Ons Jabeur,1.0,10.0
3,Tereza Martincova,Aryna Sabalenka,2.0,Aryna Sabalenka,Tereza Martincova,3.0,5.0
4,Elsa Jacquemot,Lesia Tsurenko,2.0,Lesia Tsurenko,Elsa Jacquemot,0.0,2.0


In [90]:
def apply_predictor(row):
    return total_match_aces_prediction(row["server"], row["returner"])

In [92]:
test["predicted_aces"] = test.apply(apply_predictor, axis=1)

In [98]:
test.head(20)

Unnamed: 0,server,returner,aces,server2,returner2,aces2,total_aces,predicted_aces
0,Barbora Krejcikova,Sara Bejlek,2.0,Sara Bejlek,Barbora Krejcikova,0.0,2.0,5.721427
1,Tamara Zidansek,Na Lae Han,2.0,Na Lae Han,Tamara Zidansek,1.0,3.0,5.057231
2,Ons Jabeur,Aliaksandra Sasnovich,9.0,Aliaksandra Sasnovich,Ons Jabeur,1.0,10.0,5.242869
3,Tereza Martincova,Aryna Sabalenka,2.0,Aryna Sabalenka,Tereza Martincova,3.0,5.0,6.7404
4,Elsa Jacquemot,Lesia Tsurenko,2.0,Lesia Tsurenko,Elsa Jacquemot,0.0,2.0,4.716881
5,Claire Liu,Liudmila Samsonova,1.0,Liudmila Samsonova,Claire Liu,6.0,7.0,5.731043
6,Misaki Doi,Petra Martic,0.0,Petra Martic,Misaki Doi,2.0,2.0,5.670087
7,Kristina Mladenovic,Anna Karolina Schmiedlova,0.0,Anna Karolina Schmiedlova,Kristina Mladenovic,1.0,1.0,5.435125
8,Sorana Cirstea,Johanna Konta,4.0,Johanna Konta,Sorana Cirstea,3.5,7.5,5.858589
9,Bianca Andreescu,Viktorija Golubic,7.0,Viktorija Golubic,Bianca Andreescu,0.0,7.0,5.1893


In [96]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [97]:
MSE = mean_squared_error(test["total_aces"], test["predicted_aces"])
RMSE = np.sqrt(MSE)
RMSE

3.5019261067572147

In [100]:
average_aces = test["total_aces"].mean()
average_aces

5.150602409638554

In [102]:
average_aces_list = [average_aces] * 415

In [104]:
# BASELINE COMPARISON:
MSE_baseline = mean_squared_error(test["total_aces"], average_aces_list)
RMSE_baseline = np.sqrt(MSE_baseline)
RMSE_baseline

3.876612197146171