In [None]:
import pandas as pd

# Load the match-level data.

# Read the .xls export rather than the .csv or .xlsx versions, which usually have errors.
df_matches = pd.read_excel("charting-m-matches.xls")

# Data cleaning.
# Name-like columns: trim, lower-case, and join words with "_" (e.g. "Roger Federer" -> "roger_federer").
for name_column in ["Player 1", "Player 2", "Tournament"]:
    tidy_names = df_matches[name_column].str.strip().str.lower()
    df_matches[name_column] = tidy_names.replace(to_replace=r"\s+", value="_", regex=True)

# Short categorical columns: trim and lower-case only.
for category_column in ["Pl 1 hand", "Pl 2 hand", "Surface"]:
    df_matches[category_column] = df_matches[category_column].str.strip().str.lower()

# Dates are kept as text.
df_matches = df_matches.astype({"Date": "str"})

# Replace missing values with an explicit placeholder.
missing_value_defaults = {
    "Surface": "unknown",
    "Tournament": "unknown",
    "Best of": 0,
    "Player 1": "unknown",
    "Player 2": "unknown",
    "Pl 1 hand": "unknown",
    "Pl 2 hand": "unknown",
    "Round": "unknown",
}
for column_name, placeholder in missing_value_defaults.items():
    df_matches[column_name] = df_matches[column_name].fillna(placeholder)

df_matches.info()
print()

print("There are %d matches in the database." % len(df_matches))

# Keep a cleaned copy on disk.
df_matches.to_excel("charting-m-matches-cleaned.xls")

In [None]:
# Number of matches per court surface; dropna=False also lists missing values as their own row.
df_matches["Surface"].value_counts(dropna=False)

In [None]:
# How many matches for each player in the database?
# Stack the "Player 1" and "Player 2" columns so a player is counted no matter
# which side of the match they were listed on.
# pd.concat is used because Series.append was removed in pandas 2.0.
players = pd.concat([df_matches["Player 1"], df_matches["Player 2"]])
player_match_count = players.value_counts()

print("Match count per player in the database.")
player_match_count.head(20)

In [None]:
# Test handedness
# print(df_matches.loc[df_matches["Player 1"] == "corentin_moutet"]["Pl 1 hand"].unique())
# print(df_matches.loc[df_matches["Player 2"] == "corentin_moutet"]["Pl 2 hand"].unique())

In [None]:
# Get a list of unique players.
# `players` stacks the "Player 1" and "Player 2" columns, so unique() de-duplicates
# names across both columns.
unique_players = players.unique()
# type(unique_players)

print("There are %d players in the database." % len(unique_players))

In [None]:
# Clean the handedness columns because there are errors, and create a table of
# players and their handedness.
#
# For each player the handedness values from BOTH columns ("Pl 1 hand" when listed as
# Player 1, "Pl 2 hand" when listed as Player 2) are pooled and the most frequent value
# is taken as the correct one. Pooling keeps the two columns consistent with each other.
# The "unknown" placeholder only wins when no real value was ever recorded, so it can
# never overwrite a known handedness.
handedness_records = []

for player in unique_players:
    is_player1 = df_matches["Player 1"] == player
    is_player2 = df_matches["Player 2"] == player

    # Every handedness value recorded for this player, whichever side they were listed on.
    recorded_hands = pd.concat([df_matches.loc[is_player1, "Pl 1 hand"],
                                df_matches.loc[is_player2, "Pl 2 hand"]])
    hand_counts = recorded_hands.value_counts()

    # Majority vote, preferring real values over the "unknown" placeholder.
    known_hand_counts = hand_counts.drop(labels="unknown", errors="ignore")
    if len(known_hand_counts) > 0:
        correct_handedness = known_hand_counts.idxmax()
    elif len(hand_counts) > 0:
        correct_handedness = hand_counts.idxmax()
    else:
        correct_handedness = None

    # More than one distinct value means at least one match has it wrong:
    # report the conflict and overwrite every entry with the majority value.
    if len(hand_counts) > 1:
        print(player)
        print(hand_counts)
        df_matches.loc[is_player1, "Pl 1 hand"] = correct_handedness
        df_matches.loc[is_player2, "Pl 2 hand"] = correct_handedness

    handedness_records.append({"player": player, "handedness": correct_handedness})

# Build the table once from the collected records (DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic).
df_players_handedness = pd.DataFrame(handedness_records, columns=["player", "handedness"])

# df_players_handedness now has the correct handedness for each player.
print(df_players_handedness)

In [None]:
# Save the results to XLS files for use later.

# Drop rows that contain a missing value before exporting.
df_players_handedness.dropna(inplace=True)
# NOTE(review): writing the legacy .xls format needs an .xls writer engine, which recent
# pandas releases may no longer provide - confirm, or switch to .xlsx.
df_players_handedness.to_excel("player-handedness.xls")

In [None]:
# Find out how many right-handed and left-handed players are in the database.

unique_handed_players = df_players_handedness["handedness"]
unique_handed_players_value_counts = unique_handed_players.value_counts()
print(len(unique_handed_players))

# .get(..., 0) avoids a KeyError when no player has the given handedness.
print("There are " + str(unique_handed_players_value_counts.get("r", 0)) + " right-handed players and " +
      str(unique_handed_players_value_counts.get("l", 0)) + " left-handed players.")

# Cross-check with explicit counts, and list every value that is neither "r" nor "l".
right_handed_count = int((unique_handed_players == "r").sum())
left_handed_count = int((unique_handed_players == "l").sum())

oddball_handedness = unique_handed_players[~unique_handed_players.isin(["r", "l"])]
for value in oddball_handedness:
    print(value)
other_count = len(oddball_handedness)

print("There are " + str(right_handed_count) + " right-handed players and " +
      str(left_handed_count) + " left-handed players, with " + str(other_count) + " oddball handedness.")

In [None]:
# Find the number of matches per tournament.
# dropna=False also counts missing tournament names; only the 40 most frequent are shown.
tournaments = df_matches["Tournament"]
tournaments_value_counts = tournaments.value_counts(dropna=False)
print(tournaments_value_counts.head(40))

In [None]:
# Count the matches between roger_federer and rafael_nadal, regardless of which of
# the two is listed as Player 1 and which as Player 2.
len(df_matches.loc[(((df_matches["Player 1"] == "roger_federer") & (df_matches["Player 2"] == "rafael_nadal")) | 
                  ((df_matches["Player 1"] == "rafael_nadal") & (df_matches["Player 2"] == "roger_federer")))])


In [None]:
# Select matches based on different conditions
def select_matches(data, player1=None, player2=None, tournament=None, surface=None, 
                   player1_handedness=None, player2_handedness=None, best_of=None, date=None):
    """Return the matches in ``data`` that satisfy every filter that was given.

    All text filters are regular-expression searches (``Series.str.contains``), so a
    partial name such as "nadal" is enough. Missing values never match a filter.

    Parameters:
        data: match table with the columns "Player 1", "Player 2", "Tournament",
            "Surface", "Pl 1 hand", "Pl 2 hand", "Best of" and "Date".
        player1, player2: name patterns; a match is kept when the pattern is found in
            either player column. Giving both yields their head-to-head matches.
        tournament, surface: patterns searched in the respective column.
        player2_handedness: only used together with ``player1``; keeps matches in which
            the opponent of ``player1`` has exactly this handedness.
        player1_handedness: only used together with ``player2``; keeps matches in which
            the opponent of ``player2`` has exactly this handedness.
        best_of: exact value of the "Best of" column.
        date: pattern that the "Date" string must start with (e.g. a year).

    Returns:
        A new DataFrame with the selected rows; ``data`` itself is not modified.
    """
    def _contains(frame, column, pattern):
        # na=False: rows with a missing value count as "no match" instead of producing
        # a NaN mask, which boolean indexing rejects.
        return frame[column].str.contains(pattern, na=False)

    df_results = data.copy()

    if player1 is not None:
        df_results = df_results.loc[_contains(df_results, "Player 1", player1) |
                                    _contains(df_results, "Player 2", player1)]

    if player2 is not None:
        df_results = df_results.loc[_contains(df_results, "Player 1", player2) |
                                    _contains(df_results, "Player 2", player2)]

    if tournament is not None:
        df_results = df_results.loc[_contains(df_results, "Tournament", tournament)]

    if surface is not None:
        df_results = df_results.loc[_contains(df_results, "Surface", surface)]

    # The handedness filters apply to the OPPONENT of the named player, so the column to
    # check depends on which side the named player was listed on.
    if player1 is not None and player2_handedness is not None:
        df_results = df_results.loc[
            (_contains(df_results, "Player 1", player1) & (df_results["Pl 2 hand"] == player2_handedness)) |
            (_contains(df_results, "Player 2", player1) & (df_results["Pl 1 hand"] == player2_handedness))]

    if player2 is not None and player1_handedness is not None:
        df_results = df_results.loc[
            (_contains(df_results, "Player 1", player2) & (df_results["Pl 2 hand"] == player1_handedness)) |
            (_contains(df_results, "Player 2", player2) & (df_results["Pl 1 hand"] == player1_handedness))]

    if best_of is not None:
        df_results = df_results.loc[df_results["Best of"] == best_of]

    if date is not None:
        # "^" anchors the pattern at the start of the date string.
        df_results = df_results.loc[_contains(df_results, "Date", "^" + date)]

    return df_results
    

In [None]:
# Find the number of matches for a player.
# A partial name is enough: select_matches() searches for it in both player columns.
player_name = "nadal"

len(select_matches(df_matches, player1=player_name))

In [None]:
# Find the head-to-head match count for two players.
# Passing both names keeps only the matches in which both patterns are found.
player1_name = "nadal"
player2_name = "federer"

len(select_matches(df_matches, player1=player1_name, player2=player2_name))

In [None]:
# Find the number of head-to-head match count for any two players in the database



## Analyzing Point-by-Point statistics

In [None]:
# When a very large CSV file cannot be read after downloading, open it in Excel and
# save it in "CSV UTF-8(Comma delimited)" format; the whole spreadsheet can then be read.
df_points = pd.read_csv("charting-m-points_1.csv", low_memory=False)

df_points = df_points.astype({"rallyLen": "float64"})

# The score columns contain month abbreviations where numbers are expected
# (presumably spreadsheet date auto-formatting - TODO confirm). Turn every month back
# into its number, then collapse "00" to "0". The order of the replacements matters.
score_text_fixes = [
    ("Jan", "1"), ("Feb", "2"), ("Mar", "3"), ("Apr", "4"),
    ("May", "5"), ("Jun", "6"), ("Jul", "7"), ("Aug", "8"),
    ("Sep", "9"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
    ("00", "0"),
]

for score_column in ["Pts", "PtsAfter"]:
    for garbled_text, fixed_text in score_text_fixes:
        df_points[score_column] = df_points[score_column].str.replace(garbled_text, fixed_text, case=False)

In [None]:
def select_players_points(point_data, match_data):
    """Return the points that belong to the given matches.

    A point is kept when its "match_id" appears in the "match_id" column of
    ``match_data``. The result is an independent copy, so adding columns to it later
    does not trigger pandas' "value is trying to be set on a copy of a slice" warning.
    """
    belongs_to_selected_match = point_data["match_id"].isin(match_data["match_id"])
    return point_data[belongs_to_selected_match].copy()

In [None]:
#the purpose is to extract the digit in the parentheses
#but just doing it straight up causes ambiguity, so I need to add a \ in front of the parentheses
#to indicate that this parentheses is just a character, not a special symbol (capture group)
# The following code does not work because in some matches, "Gm#" only has game index, not point index. 
# df_points_selected["serve_side"] = df_points_selected["Gm#"].str.extract(r".\((\d+)\)").astype("int64")
# df_points_selected["serve_side"] = df_points_selected["serve_side"] % 2
# df_points_selected["serve_side"].replace({0: "ad", 1: "deuce"}, inplace=True)

# Identify serve side for each point

import re

def identify_serve_side(data):
    """Return a copy of ``data`` with a "serve_side" column ("deuce", "ad" or None).

    The side is derived from the score before the serve ("Pts") and the tiebreak flag
    ("TB?"):

    * regular game ("TB?" is 0): looked up from the score, e.g. "15-0" -> "ad";
    * tiebreak ("TB?" is 1): "deuce" when the two scores add up to an even number,
      "ad" when the sum is odd.

    The flag is accepted as text or as a number ("0", 0 and 0.0 are equivalent).
    Points whose flag or score cannot be interpreted get None instead of raising or
    silently reusing the previous point's score.
    """
    data = data.copy()

    # All the possible scores (before serve) in a regular game and their serve sides.
    dict_serve_side = {"0-0": "deuce",
                       "0-15": "ad",
                       "15-0": "ad",
                       "15-15": "deuce",
                       "30-0": "deuce",
                       "0-30": "deuce",
                       "30-15": "ad",
                       "15-30": "ad",
                       "40-0": "ad",
                       "0-40": "ad",
                       "40-15": "deuce",
                       "15-40": "deuce",
                       "30-30": "deuce",
                       "40-30": "ad",
                       "30-40": "ad",
                       "40-40": "deuce",
                       "40-AD": "ad",
                       "AD-40": "ad"
                       }

    def _normalise_flag(value):
        # "0", " 0", 0 and 0.0 all become "0".
        text = str(value).strip()
        return text[:-2] if text.endswith(".0") else text

    def _serve_side(tb_flag, score):
        flag = _normalise_flag(tb_flag)
        if flag == "0":
            # Unknown or missing scores simply have no serve side.
            return dict_serve_side.get(score)
        if flag == "1" and isinstance(score, str):
            # Both tiebreak scores must parse from THIS point's score.
            tb_score = re.search(r"^(\d+)-(\d+)", score)
            if tb_score:
                points_played = int(tb_score.group(1)) + int(tb_score.group(2))
                return "deuce" if points_played % 2 == 0 else "ad"
        return None

    # One pass over the two input columns, assigned in a single step; far cheaper than
    # per-cell .loc access inside iterrows().
    data["serve_side"] = pd.Series(
        [_serve_side(tb_flag, score) for tb_flag, score in zip(data["TB?"], data["Pts"])],
        index=data.index, dtype=object)

    return data

In [None]:
# Separate serve direction from serve outcome
# Need to identify +

def identify_serve_direction_outcome(data):
    """Return a copy of ``data`` with serve direction and outcome columns added.

    For each of the serve columns "Sv1" and "Sv2" two columns are created:

    * "<serve>_direction": decoded from the leading digit of the serve code
      (4 -> "wide", 5 -> "body", 6 -> "t"); anything else, including a missing
      serve, becomes "unknown".
    * "<serve>_outcome": the text after the leading digit, with single-character
      codes spelled out (e.g. "*" -> "ace"); longer codes are kept unchanged and a
      serve without such text stays missing.

    Columns are assigned back explicitly rather than modified with ``inplace=True`` on
    a column selection, which does not update the frame under pandas Copy-on-Write.
    """
    data = data.copy()

    direction_names = {"4": "wide", "5": "body", "6": "t"}
    outcome_names = {"n": "net", "d": "deep", "*": "ace", "w": "wide", "#": "unreturnable",
                     "x": "wide_and_deep", "+": "serve_and_volley_"}

    for serve in ("Sv1", "Sv2"):
        # The leading digit is the direction code; whatever is inside the parentheses
        # of the pattern is what gets captured.
        direction_code = data[serve].str.extract(r"^(\d)", expand=False)
        # Codes other than 4/5/6 (and missing serves) map to NaN, reported as "unknown".
        data[serve + "_direction"] = direction_code.map(direction_names).fillna("unknown")

        # Everything after the leading digit describes the outcome of the serve.
        outcome_code = data[serve].str.extract(r"^\d(.+)", expand=False)
        data[serve + "_outcome"] = outcome_code.replace(outcome_names)

    return data

In [None]:
# Identify server name
def identify_server_name(data):
    """Return a copy of ``data`` with "server_name" and "returner_name" columns added.

    The players of each point's match are looked up in the global ``df_matches`` table
    by "match_id". When "Svr" is 1, Player 1 is the server and Player 2 the returner;
    when "Svr" is 2 the roles are swapped. Points with any other "Svr" value keep None.

    Match ids that occur more than once in ``df_matches`` are reported and their points
    are left without names; match ids that are not in ``df_matches`` also stay without
    names instead of raising.
    """
    data = data.copy()

    data["server_name"] = None
    data["returner_name"] = None

    # Only the matches that actually occur in the point data are relevant.
    candidate_matches = df_matches.loc[df_matches["match_id"].isin(data["match_id"]),
                                       ["match_id", "Player 1", "Player 2"]]

    # A match id must identify exactly one match; report the ones that do not.
    ambiguous_ids = candidate_matches.loc[candidate_matches["match_id"].duplicated(), "match_id"].unique()
    for ambiguous_id in ambiguous_ids:
        print("Error: There are more than one " + str(ambiguous_id))

    players_by_match = (candidate_matches
                        .drop_duplicates(subset="match_id", keep=False)
                        .set_index("match_id"))

    # One lookup per column instead of filtering df_matches once per point.
    player1_name = data["match_id"].map(players_by_match["Player 1"])
    player2_name = data["match_id"].map(players_by_match["Player 2"])

    for server_code, server, returner in ((1, player1_name, player2_name),
                                          (2, player2_name, player1_name)):
        served = data["Svr"] == server_code
        # to_numpy(): assign by position so the result does not depend on index alignment.
        data.loc[served, "server_name"] = server[served].to_numpy()
        data.loc[served, "returner_name"] = returner[served].to_numpy()

    return data

In [None]:
def create_contingency_table(data, player1, player2):
    """Count serve directions per server, serve side and serve number.

    Parameters:
        data: point table with the columns "server_name", "serve_side",
            "Sv1_direction" and "Sv2_direction".
        player1, player2: name patterns searched in "server_name".

    Returns:
        A DataFrame with one row per (server, serve_side, serve_sequence) combination,
        eight rows in total, and the columns "server", "serve_side", "serve_sequence",
        "wide", "body", "t". A direction that never occurs is counted as 0.
    """
    players = [player1, player2]
    serve_sides = ["deuce", "ad"]
    serve_seqs = ["Sv1_direction", "Sv2_direction"]
    directions = ["wide", "body", "t"]

    rows = []
    for player in players:
        # na=False: points without a server name belong to nobody.
        served_by_player = data["server_name"].str.contains(player, na=False)
        for serve_side in serve_sides:
            on_this_side = served_by_player & (data["serve_side"] == serve_side)
            for serve_seq in serve_seqs:
                row = data.loc[on_this_side, serve_seq].value_counts().to_dict()
                row["server"] = player
                row["serve_side"] = serve_side
                row["serve_sequence"] = serve_seq
                rows.append(row)

    # Build the frame once (DataFrame.append was removed in pandas 2.0). reindex keeps
    # only the wanted columns and creates any direction column that never occurred.
    serve_stats = pd.DataFrame(rows).reindex(columns=["server", "serve_side", "serve_sequence"] + directions)
    serve_stats[directions] = serve_stats[directions].fillna(0).astype(int)

    return serve_stats

In [None]:
# import seaborn as sns

from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Chi-square independence test
def chi_square(data, print_table = False):
    """Run a chi-square test of independence on a contingency table and print the verdict.

    The verdict is reported twice: once by comparing the test statistic with the
    critical value at 95% probability, and once by comparing the p-value with the
    matching significance level. With ``print_table=True`` the table is printed last.
    Nothing is returned.
    """
    observed = data.copy()

    statistic, p_value, degrees_of_freedom, _expected = chi2_contingency(observed)
    print("chi-squared test results:")
    print("chi2 test stats = %f" % statistic)
    print("p = %f" % p_value)
    print('dof = %d ' % degrees_of_freedom)

    dependent_verdict = 'Dependent (reject H0)'
    independent_verdict = 'Independent (fail to reject H0)'

    # Verdict from the test statistic.
    confidence = 0.95
    critical_value = chi2.ppf(confidence, degrees_of_freedom)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (confidence, critical_value, statistic))
    print(dependent_verdict if abs(statistic) >= critical_value else independent_verdict)

    # Verdict from the p-value.
    significance = 1.0 - confidence
    print('significance=%.3f, p=%.3f' % (significance, p_value))
    print(dependent_verdict if p_value <= significance else independent_verdict)

    if print_table:
        print(observed)

In [None]:
def compare_players_chi_square(table):
    """Compare the servers' direction counts with a chi-square test per serve situation.

    ``table`` has the columns "server", "serve_side", "serve_sequence", "wide", "body"
    and "t". For every combination of serve side and serve sequence, the rows of that
    combination form a contingency table (one row per server, one column per
    direction) that is passed to chi_square(), which prints the result.
    """
    serve_sides = table["serve_side"].unique()
    serve_sequences = table["serve_sequence"].unique()

    for serve_side in serve_sides:
        for serve_sequence in serve_sequences:
            in_this_situation = (table["serve_side"] == serve_side) & (table["serve_sequence"] == serve_sequence)
            # The server name labels the rows; as the index it stays out of the numeric
            # counts that the test works on.
            my_data = table.loc[in_this_situation, ["server", "wide", "body", "t"]].set_index("server")

            print("serve_side: " + serve_side)
            print(serve_sequence)
            chi_square(data=my_data, print_table=True)
            print()

In [None]:
from scipy.stats import chisquare
def serve_direction_test_of_even_distribution(point_data, match_data, player, opponent=None):
    """Test whether a player's serves are spread evenly over wide, body and T.

    A chi-square goodness-of-fit test is printed for every combination of serve number
    (first/second), serve side (ad/deuce) and surface (hard/clay/grass).

    Parameters:
        point_data: point-by-point table.
        match_data: match table used to pick the player's matches per surface.
        player: name pattern of the server to analyse.
        opponent: optional name pattern restricting the matches to one opponent.

    Only the three real directions are tested: serves with an "unknown" direction are
    left out, and a direction that never occurs counts as 0 rather than being dropped.
    """
    surfaces = ["hard", "clay", "grass"]
    serve_sides = ["ad", "deuce"]
    serve_sequences = ["Sv1_direction", "Sv2_direction"]
    directions = ["wide", "body", "t"]

    # The prepared points depend only on the surface, so they are built once per
    # surface and reused for every serve number / serve side combination.
    serves_by_surface = {}

    def _prepare_serves(surface):
        matches_by_surface = select_matches(data=match_data, player1=player, player2=opponent, surface=surface)
        selected_points = select_players_points(point_data, matches_by_surface)
        print("finished selecting points")

        if selected_points.empty:
            return None

        selected_points = identify_serve_side(selected_points)
        print("finished identifying serve side")
        selected_points = identify_serve_direction_outcome(selected_points)
        print("finished identifying serve direction and outcome")
        selected_points = identify_server_name(selected_points)
        print("finished identifying server name")
        # na=False: points without a server name are not the player's serves.
        return selected_points.loc[selected_points["server_name"].str.contains(player, na=False)]

    for serve_sequence in serve_sequences:
        print(serve_sequence)
        for serve_side in serve_sides:
            print(serve_side)
            for surface in surfaces:
                print(surface)
                if surface not in serves_by_surface:
                    serves_by_surface[surface] = _prepare_serves(surface)
                my_data = serves_by_surface[surface]

                if my_data is None:
                    print("No matches found")
                    continue

                # Count the three real directions only, with 0 for a direction never used.
                serve_frequency = (my_data.loc[my_data["serve_side"] == serve_side, serve_sequence]
                                   .value_counts()
                                   .reindex(directions, fill_value=0))
                print(serve_frequency)

                if serve_frequency.sum() == 0:
                    # Nothing to test: the statistic is undefined without observations.
                    print("No serves found")
                    continue

                chisq, p = chisquare(serve_frequency.tolist())
                print("chi-square: " + str(chisq))
                print("p: " + str(p))
                if p < 0.05:
                    print("Reject null hypothesis, not evenly distributed.\n")
                else:
                    print("Cannot reject null hypothesis, possibly evenly distributed.\n")


In [None]:
# Run the even-distribution test for one player against all opponents (opponent=None).
# Some players may generate errors due to NaN existing in the "Surface" column.
serve_direction_test_of_even_distribution(point_data=df_points, match_data=df_matches, player="federer", opponent=None)
# nadal_matches = select_matches(data=df_matches, player1="nadal")
# nadal_matches["Surface"].value_counts(dropna=False)
# possible even distribution:
# goffin, shapovalov, tsitsipas, wawrinka, pouille

In [None]:
# Check if the serve direction patterns vary from match to match on the same surface and same serve side (use MANOVA)

In [None]:
# Check if there is a difference in serve directions between ad and deuce sides
def compare_players_serves_on_different_sides(player, first_serve=True, surface=None, opponent=None):
    """Test whether a player's serve directions differ between the serve sides.

    Uses the global ``df_matches`` and ``df_points`` tables. The player's serves are
    counted per serve side and direction (wide/body/t), the table is printed and
    passed to chi_square().

    Parameters:
        player: name pattern of the server to analyse.
        first_serve: True analyses first serves ("Sv1_direction"), otherwise second
            serves ("Sv2_direction").
        surface: optional surface pattern restricting the matches.
        opponent: optional name pattern restricting the matches to one opponent.
    """
    directions = ["wide", "body", "t"]

    selected_matches = select_matches(data=df_matches, player1=player, player2=opponent, surface=surface)
    selected_points = select_players_points(df_points, selected_matches)

    if first_serve == True:
        serve_sequence = "Sv1_direction"
    else:
        serve_sequence = "Sv2_direction"

    if selected_points.empty:
        print("No matches found")
        return

    selected_points = identify_serve_side(selected_points)
    selected_points = identify_serve_direction_outcome(selected_points)
    selected_points = identify_server_name(selected_points)
    # na=False: points without a server name are not the player's serves.
    my_data = selected_points.loc[selected_points["server_name"].str.contains(player, na=False)]

    # One Series of direction counts per serve side; points without a side are skipped.
    counts_by_side = {}
    for serve_side in my_data["serve_side"].dropna().unique():
        counts_by_side[serve_side] = my_data.loc[my_data["serve_side"] == serve_side, serve_sequence].value_counts()

    if not counts_by_side:
        print("No serves found")
        return

    # Rows are serve sides, columns the three real directions; a direction that never
    # occurs is a count of 0. (DataFrame.append was removed in pandas 2.0.)
    contingency_table = (pd.DataFrame(counts_by_side).T
                         .reindex(columns=directions)
                         .fillna(0)
                         .astype(int))
    contingency_table.index.name = "serve_side"

    print(contingency_table)
    chi_square(contingency_table, print_table = False)

In [None]:
# Compare one player's first-serve directions between the serve sides, on hard courts, against all opponents.
compare_players_serves_on_different_sides("federer", first_serve=True, surface="hard", opponent=None)

In [None]:
# Check if there is a difference in serve directions between surfaces (tested separately for each serve side)
def compare_players_serves_on_different_surfaces(player, first_serve=True, opponent=None):
    """Test whether a player's serve directions differ between surfaces.

    Uses the global ``df_matches`` and ``df_points`` tables. For each serve side
    ("ad", "deuce") the player's serves are counted per surface (hard/clay/grass) and
    direction (wide/body/t); the table is printed and passed to chi_square().

    Parameters:
        player: name pattern of the server to analyse.
        first_serve: True analyses first serves ("Sv1_direction"), otherwise second
            serves ("Sv2_direction").
        opponent: optional name pattern restricting the matches to one opponent.
    """
    surfaces = ["hard", "clay", "grass"]
    serve_sides = ["ad", "deuce"]
    directions = ["wide", "body", "t"]

    if first_serve == True:
        serve_sequence = "Sv1_direction"
    else:
        serve_sequence = "Sv2_direction"

    # The prepared points depend only on the surface: build them once per surface
    # instead of once per (serve side, surface) pair.
    serves_by_surface = {}
    for surface in surfaces:
        matches_by_surface = select_matches(data=df_matches, player1=player, player2=opponent, surface=surface)
        selected_points = select_players_points(df_points, matches_by_surface)
        if selected_points.empty:
            continue

        selected_points = identify_serve_side(selected_points)
        selected_points = identify_serve_direction_outcome(selected_points)
        selected_points = identify_server_name(selected_points)
        # na=False: points without a server name are not the player's serves.
        serves_by_surface[surface] = selected_points.loc[
            selected_points["server_name"].str.contains(player, na=False)]

    for serve_side in serve_sides:
        print(serve_side)

        counts_by_surface = {}
        for surface in surfaces:
            if surface not in serves_by_surface:
                print("No matches found")
                continue
            my_data = serves_by_surface[surface]
            counts_by_surface[surface] = my_data.loc[my_data["serve_side"] == serve_side, serve_sequence].value_counts()

        if not counts_by_surface:
            # No surface has any data, so there is nothing to compare.
            continue

        # Rows are surfaces, columns the three real directions; a direction that never
        # occurs is a count of 0. (DataFrame.append was removed in pandas 2.0.)
        contingency_table = (pd.DataFrame(counts_by_surface).T
                             .reindex(columns=directions)
                             .fillna(0)
                             .astype(int))
        contingency_table.index.name = "surface"

        print(contingency_table)
        chi_square(contingency_table, print_table = False)

In [None]:
# Compare one player's first-serve directions across surfaces, against all opponents.
compare_players_serves_on_different_surfaces("federer", first_serve=True)

In [None]:
# Compare players' serves when they play each other
player1 = "kyrgios"
player2 = "federer"

# select_players_points() only takes the point data and a match table, so first narrow
# the matches down to the ones between the two players.
head_to_head_matches = select_matches(data=df_matches, player1=player1, player2=player2)
df_points_selected = select_players_points(point_data=df_points, match_data=head_to_head_matches)

if not df_points_selected.empty:
    df_points_selected = identify_serve_side(df_points_selected)
    df_points_selected = identify_serve_direction_outcome(df_points_selected)
    df_points_selected = identify_server_name(df_points_selected)
    contingency_table = create_contingency_table(df_points_selected, player1, player2)
    compare_players_chi_square(contingency_table)

else:
    print("No results found")

In [None]:
# To-do: Compare players' serves when they play the same player
# (for now this compares the two players' serves in the matches they played against each other)
player1 = "federer"
player2 = "nishikori"

# select_players_points() only takes the point data and a match table, so first narrow
# the matches down to the ones between the two players.
head_to_head_matches = select_matches(data=df_matches, player1=player1, player2=player2)
df_points_selected = select_players_points(point_data=df_points, match_data=head_to_head_matches)

if not df_points_selected.empty:
    df_points_selected = identify_serve_side(df_points_selected)
    df_points_selected = identify_serve_direction_outcome(df_points_selected)
    df_points_selected = identify_server_name(df_points_selected)
    contingency_table = create_contingency_table(df_points_selected, player1, player2)
    compare_players_chi_square(contingency_table)

else:
    print("No results found")

To-do

1. Serve direction analysis
The characteristics of serves for one player
    - Comparison by serve sides(use chi-square)
    - Comparison of first and second serves (use chi-square, ind=serves, dep=serve_direction)
    - Comparison by surfaces (control by opponent)(use chi-square)
    - Comparison by opponents (different serve patterns for different opponents?) (use chi-square)
    - Comparison by age (early years vs later years) (chi-square, ind=different ages, dep=serve_directions)
    - Comparison by tournament (grand slam vs other tournaments) (chi-square)
    - Show the change of serve directions during a match (draw a line chart show the counts of each serve direction over time)
    - Serve patterns at critical moments vs non-critical moments (1 point from winning, 1 point from losing, etc.)
    - Serve pattern based on tiredness, group serves by the count of points played (early, middle, or later) and compare between groups.
    - Serve patterns when leading vs when trailing
    - Serve patterns when first serve rate is high (high confidence) vs when the rate is low (low confidence)
    - Serve patterns when winning several points in a row (high confidence), serve patterns when losing several points in a row (low confidence, frustration). 
    - Serve after unforced errors (one or more). Serve after the opponent hitting a winner. Serve after hitting winners. 
    - Serve right after long points. 
    - Serve right after aces (is there a difference? Is he likely to serve a different direction?)
    - Serve right after double faults (is there a difference?)
    - Serve right after repeated first serve faults
    - Does the success or failure of certain serve directions (e.g., aces, short points won, double faults) early in a match influence later serve decisions? (anchor effect) Any short-term or long-term effects?
    - Can we find any subtle bias in serve selection?
    
Compare the serves of two players
    - When they play each other ...
    - When they play the same opponent ...
    - When they are tired
    - When they are leading or losing
    - At critical moments ...
    - At high or low confidence level 

Compare the serves for three or more players?
    - Is it useful?
    
2. Serve error analysis
    - Frequency of errors at critical moments (double fault, first serve fault)
    - Types of errors correlated with tension, confidence, etc. 


In [None]:
# (Stray expression `t` removed: the name is not defined anywhere in the notebook, so the cell raised NameError on Run All.)