In [1]:
import pandas as pd

# Get matches data

#use xls, not .csv or .xlsx, these usually have errors
df_matches = pd.read_excel("charting-m-matches.xls")

# Data cleaning
df_matches["Player 1"] = df_matches["Player 1"].str.strip()
df_matches["Player 2"] = df_matches["Player 2"].str.strip()
df_matches["Player 1"] = df_matches["Player 1"].replace(to_replace=r"\s+", value="_", regex=True)
df_matches["Player 2"] = df_matches["Player 2"].replace(to_replace=r"\s+", value="_", regex=True)
df_matches["Player 1"] = df_matches["Player 1"].str.lower()
df_matches["Player 2"] = df_matches["Player 2"].str.lower()
df_matches["Pl 1 hand"] = df_matches["Pl 1 hand"].str.strip()
df_matches["Pl 2 hand"] = df_matches["Pl 2 hand"].str.strip()
df_matches["Pl 1 hand"] = df_matches["Pl 1 hand"].str.lower()
df_matches["Pl 2 hand"] = df_matches["Pl 2 hand"].str.lower()
df_matches["Tournament"] = df_matches["Tournament"].str.strip()
df_matches["Tournament"] = df_matches["Tournament"].str.lower()
df_matches["Tournament"] = df_matches["Tournament"].replace(to_replace=r"\s+", value="_", regex=True)
df_matches["Surface"] = df_matches["Surface"].str.strip()
df_matches["Surface"] = df_matches["Surface"].str.lower()
df_matches = df_matches.astype({"Date": "str"})
# df_matches = df_matches.astype({"Best of": "int32"})

df_matches.info()
print()

print("There are %d matches in the database." % len(df_matches))
# df_matches.head(30)

df_matches.to_excel("charting-m-matches-cleaned.xls")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   match_id    2354 non-null   object 
 1   Player 1    2353 non-null   object 
 2   Player 2    2354 non-null   object 
 3   Pl 1 hand   2354 non-null   object 
 4   Pl 2 hand   2354 non-null   object 
 5   Gender      2354 non-null   object 
 6   Date        2354 non-null   object 
 7   Tournament  2354 non-null   object 
 8   Round       2354 non-null   object 
 9   Time        1406 non-null   object 
 10  Court       1807 non-null   object 
 11  Surface     2351 non-null   object 
 12  Umpire      1504 non-null   object 
 13  Best of     2353 non-null   float64
 14  Final TB?   2353 non-null   object 
 15  Charted by  2353 non-null   object 
dtypes: float64(1), object(15)
memory usage: 294.4+ KB

There are 2354 matches in the database.


In [13]:
# How many matches for each player in the database?
players = df_matches["Player 1"].append(df_matches["Player 2"])
player_match_count = players.value_counts()

print("Match count per player in the database.")
player_match_count.head(20)

Match count per player in the database.


roger_federer            405
rafael_nadal             249
novak_djokovic           234
andy_murray              131
stefan_edberg            130
pete_sampras             109
andre_agassi             105
lleyton_hewitt            96
juan_martin_del_potro     82
ivan_lendl                66
boris_becker              65
stanislas_wawrinka        55
gael_monfils              53
andy_roddick              51
alexander_zverev          50
david_ferrer              47
karim_mohamed_maamoun     46
dominic_thiem             46
daniil_medvedev           40
tomas_berdych             40
dtype: int64

In [14]:
# Test handedness
# print(df_matches.loc[df_matches["Player 1"] == "corentin_moutet"]["Pl 1 hand"].unique())
# print(df_matches.loc[df_matches["Player 2"] == "corentin_moutet"]["Pl 2 hand"].unique())

In [79]:
# Get a list of unique players
unique_players = pd.Series(players.unique())

# There may be an nan in the list
unique_players = unique_players.dropna()
unique_players = unique_players.reset_index(drop=True)
print(unique_players)

# print("There are %d players in the database." % len(unique_players))

0                 ernests_gulbis
1               alexander_bublik
2              mikhail_kukushkin
3                 novak_djokovic
4                    marin_cilic
                 ...            
511    christophe_roger_vasselin
512                brian_teacher
513                  jiri_hrebec
514              adriano_panatta
515                 ken_rosewall
Length: 516, dtype: object


In [16]:
# clean the handedness columns because there are errors. Create a table of players and their handednesses.
df_players_handedness = pd.DataFrame()

for player in unique_players:
    
    #these two lines select the columns "Pl 1 hand" and "Pl 2 hand" from df_matches for each player
    selected_player1_hand = df_matches[(df_matches["Player 1"] == player)]["Pl 1 hand"]
    selected_player2_hand = df_matches[(df_matches["Player 2"] == player)]["Pl 2 hand"]
    
    #correct_handedness needs to have an initial value
    #so that if there is no error in handedness,
    #we can just use the given handedness
    #if there is an error, it will display later in the if function
    if len(selected_player1_hand) > 0:
        # Use the majority of the 
        correct_handedness = selected_player1_hand.value_counts().idxmax()
    elif len(selected_player2_hand) > 0:
        correct_handedness = selected_player2_hand.value_counts().idxmax()
    else:
        correct_handedness = None
    
    #assign variable so that yoou do not have to continuously use the value_counts() function
    selected_player_hand_value_counts = selected_player1_hand.value_counts()
    
    #if the player has more than one handedness in Pl 1 hand column,
    #replace the wrong hand with correct hand
    if (len(selected_player_hand_value_counts) > 1):
        print(player)
        print(selected_player_hand_value_counts)
        
        #finding which hand occurs more frequently for the player
        #this is also the correct handedness for that player
        correct_handedness = selected_player_hand_value_counts.idxmax()
        
        #finding which hand occurs less frequently for the player
        #this is also the wrong handedness for that player
        wrong_handedness = selected_player_hand_value_counts.idxmin()
        
        #must save back to the original column and replace wrong_hand with correct_hand
        df_matches.loc[(df_matches["Player 1"] == player), "Pl 1 hand"] = \
        df_matches.loc[(df_matches["Player 1"] == player), "Pl 1 hand"].replace(wrong_handedness, correct_handedness)
        
    #same as above, only for the "Pl 2 hand" column
    selected_player_hand_value_counts = selected_player2_hand.value_counts()
    if (len(selected_player_hand_value_counts) > 1):
        print(player)
        print(selected_player_hand_value_counts)
        correct_handedness = selected_player_hand_value_counts.idxmax()
        wrong_handedness = selected_player_hand_value_counts.idxmin()
        df_matches.loc[(df_matches["Player 2"] == player), "Pl 2 hand"] = \
        df_matches.loc[(df_matches["Player 2"] == player), "Pl 2 hand"].replace(wrong_handedness, correct_handedness)
    
    row = {"player": player, "handedness": correct_handedness}
    df_players_handedness = df_players_handedness.append(row, ignore_index=True)
#     print(player)

# df_players_handedness now has the correct handedness for each player. 
print(df_players_handedness)

stanislas_wawrinka
r    20
l     1
Name: Pl 2 hand, dtype: int64
rafael_nadal
l    87
r     1
Name: Pl 1 hand, dtype: int64
rafael_nadal
l    160
r      1
Name: Pl 2 hand, dtype: int64
kyle_edmund
r    2
l    1
Name: Pl 2 hand, dtype: int64
nick_kyrgios
r    8
l    1
Name: Pl 1 hand, dtype: int64
thomas_muster
l    11
r     1
Name: Pl 2 hand, dtype: int64
    handedness                     player
0            r             ernests_gulbis
1            r           alexander_bublik
2            r          mikhail_kukushkin
3            r             novak_djokovic
4            r                marin_cilic
..         ...                        ...
512          r  christophe_roger_vasselin
513          r              brian_teacher
514          r                jiri_hrebec
515          r            adriano_panatta
516          r               ken_rosewall

[517 rows x 2 columns]


In [17]:
# Save the results to XLS files for use later.

df_players_handedness.dropna(inplace=True)
df_players_handedness.to_excel("player-handedness.xls")

In [18]:
# Find out how many right handed and left handed players are in the database

unique_handed_players = df_players_handedness["handedness"]
unique_handed_players_value_counts = unique_handed_players.value_counts()
unique_handed_players_value_counts.isnull().values.any()
print(len(unique_handed_players))
print("There are " + str(unique_handed_players_value_counts["r"]) + " right-handed players and " + 
      str(unique_handed_players_value_counts["l"]) + " left-handed players.")
right_handed_count = 0
left_handed_count = 0
other_count = 0
for index,value in unique_handed_players.items():
    #must represent with quotation marks
    #to indicate it is a string
    if value == "r":
        right_handed_count += 1
    elif value == "l":
        left_handed_count += 1
    else:
        print(value)
        other_count += 1
        
print("There are " + str(right_handed_count) + " right-handed players and " + 
      str(left_handed_count) + " left-handed players, with " + str(other_count) + " oddball handedness.")

516
There are 441 right-handed players and 75 left-handed players.
There are 441 right-handed players and 75 left-handed players, with 0 oddball handedness.


In [19]:
# Find the number of matches per tournament
tournaments = df_matches["Tournament"]
tournaments_value_counts = tournaments.value_counts(dropna=False)
print(tournaments_value_counts.head(40))

australian_open            198
wimbledon                  180
us_open                    164
indian_wells_masters       121
roland_garros              119
tour_finals                100
miami_masters               93
cincinnati_masters          66
monte_carlo_masters         65
paris_masters               62
rome_masters                60
canada_masters              58
madrid_masters              45
shanghai_masters            43
dubai                       39
rotterdam                   33
queens_club                 29
olympics                    29
halle                       27
doha                        27
masters_cup                 25
hamburg_masters             24
brisbane                    22
washington                  22
basel                       20
barcelona                   20
sydney                      17
rio_de_janeiro              14
stockholm                   14
davis_cup_world_group_f     13
atp_cup                     13
tokyo                       13
acapulco

In [20]:
#prints how many times roger_federer appears in Player 1
len(df_matches.loc[(((df_matches["Player 1"] == "roger_federer") & (df_matches["Player 2"] == "rafael_nadal")) | 
                  ((df_matches["Player 1"] == "rafael_nadal") & (df_matches["Player 2"] == "roger_federer")))])


34

In [21]:
# Select matches based on different conditions
def select_matches(data, player1=None, player2=None, tournament=None, surface=None, 
                   player1_handedness=None, player2_handedness=None, best_of=None, date=None):
    df_results = data.copy()
    
    if player1 != None:
        df_results = df_results.loc[(df_results["Player 1"].str.contains(player1)) | (df_results["Player 2"].str.contains(player1))]
    
    if player2 != None:
        df_results = df_results.loc[(df_results["Player 1"].str.contains(player2)) | (df_results["Player 2"].str.contains(player2))]
        
    if tournament != None:
        df_results = df_results.loc[(df_results["Tournament"].str.contains(tournament))]
        
    if surface != None:
        df_results = df_results.loc[(df_results["Surface"].str.contains(surface))]
    
    if (player1 != None) & (player2_handedness != None):
        df_results = df_results.loc[((df_results["Player 1"].str.contains(player1)) & (df_results["Pl 2 hand"] == player2_handedness)) | 
                                   ((df_results["Player 2"].str.contains(player1)) & (df_results["Pl 1 hand"] == player2_handedness))]
    
    if (player2 != None) & (player1_handedness != None):
        df_results = df_results.loc[((df_results["Player 1"].str.contains(player2)) & (df_results["Pl 2 hand"] == player1_handedness)) | 
                                   ((df_results["Player 2"].str.contains(player2)) & (df_results["Pl 1 hand"] == player1_handedness))]
        
    if (best_of != None):
        df_results = df_results.loc[(df_results["Best of"] == best_of)]
        
    if (date != None):
        df_results = df_results.loc[(df_results["Date"].str.contains("^" + date))]
                                    
    return df_results
    

In [22]:
# Find the number of matches for a player
player_name = "nadal"

len(select_matches(df_matches, player1=player_name))

249

In [23]:
# Find the head-to-head match count for two players
player1_name = "nadal"
player2_name = "federer"

len(select_matches(df_matches, player1=player1_name, player2=player2_name))

34

In [65]:
type(unique_players)

numpy.ndarray

In [83]:
# Find the number of head-to-head match count for any two players in the database
# This will take a long time to run. 

head_to_head_match_count = pd.DataFrame()

for i in range(0, len(unique_players)): 
    for j in range(i+1, len(unique_players)):
        if ((unique_players[i] != None) & (unique_players[j] != None)): 
            my_matches = select_matches(df_matches, player1=unique_players[i], player2=unique_players[j])
            if len(my_matches) > 0:
                entry = pd.DataFrame({"player1": [unique_players[i]], 
                                     "player2": [unique_players[j]],
                                     "match_count": [len(my_matches)]})
                head_to_head_match_count = head_to_head_match_count.append(entry, ignore_index = True)

# head_to_head_match_count.sort()

In [90]:
head_to_head_match_count.sort_values(by="match_count", ascending=False).head(30)

head_to_head_match_count.to_excel("head_to_head_match_count.xls")

## Analyzing Point-by-Point statistics

In [3]:
#when you download very large csv files and the files can't be read,
#you can use excel to save it as "CSV UTF-8(Comma delimited)" format
#then you can read the entire spreadsheet
df_points = pd.read_csv("charting-m-points_1.csv", low_memory=False)

df_points = df_points.astype({"rallyLen": "float64"})

# Clean up tiebreak Pts scores because they are mistkenly treated as dates by Excel. 
df_points["Pts"] = df_points["Pts"].str.replace("Jan", "1", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Feb", "2", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Mar", "3", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Apr", "4", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("May", "5", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Jun", "6", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Jul", "7", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Aug", "8", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Sep", "9", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Oct", "10", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Nov", "11", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("Dec", "12", case=False)
df_points["Pts"] = df_points["Pts"].str.replace("00", "0", case=False)

df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Jan", "1", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Feb", "2", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Mar", "3", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Apr", "4", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("May", "5", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Jun", "6", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Jul", "7", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Aug", "8", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Sep", "9", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Oct", "10", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Nov", "11", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("Dec", "12", case=False)
df_points["PtsAfter"] = df_points["PtsAfter"].str.replace("00", "0", case=False)

In [None]:
# df_points_selected = df_points_selected.sort_values(by=["match_id", "Pt"])
# df_points_selected.reset_index(drop=True, inplace=True)
# df_points_selected["Gm#"].value_counts()

In [None]:
# pd.set_option("display.max_rows", None, "display.min_rows", None)
# print(df_points["rallyCount"].value_counts(dropna=False))

In [None]:
# rallyCount is sometimes > rallyLen by 1
# other times, it equals rallyLen
# but never less than rallyLen
# maybe because rallyCount includes the serve, and rallyLen does not
# df_points.loc[df_points["rallyCount"] > df_points["rallyLen"], ["rallyCount", "rallyLen"]]

In [None]:
# The person who charts the match only needs to fill 3 cells: 1st, 2nd, Notes; the rest are computer-generated through Excel macro
# a "+" sign means the player comes to the net
# when there is a "+" sign in column "1st", the column "1stSV" will equal 1. The player tried to serve and volley.
# otherwise it will equal zero. It means the playe
# same principle goes for "2nd" and "2ndSV"
# "Sv1" and "Sv2" equal the serve directions themselves
# "Rally" is the entire point without the serve direction
# "Gm1.1" is the number of games the first player won, resets for every new set
# "Gm2.1" is the number of games the second player won, resets for every new set
# "Set1.1" is the number of sets the first player won
# "Set2.1" is the number of sets the second player won
# "isSvrWinner" indicates whether the server won the point(0) or lost the point(1)
# "SetW" indicates the winner of the current set. It only occurs after someone wins a set point; otherwise, the value is zero.
# "GameW" indicates the winner of the current game. It only occurs after someone wins a game point; otherwise, the value is zero.
# "isRally1st" and "isRally2nd" indicate whether the rally occured after the first serve or second serve 
# (e.x if it was after the first serve, a '1' would occur in isRally1st column)
# if there are 0s in both columns, either there was an ace, unreturnable serve, or double fault.
# "#" is equal to unreturnable ball; "*" is equal to an ace
# df_points_selected[["1st", "2nd", "Pts", "TB?", "TBpt"]]

# df_points_selected["serve_side"] = df_points_selected["Gm#"].str.extract(r"^.+\((\d+)\)")
# df_points_selected["serve_side"].value_counts(dropna=False)
# df_points_selected["Gm#"].value_counts(dropna=False)

In [5]:
#the purpose is to extract the digit in the parentheses
#but just doing it straight up causes ambiguity, so I need to add a \ in front of the parentheses
#to indicate that this parentheses is just a character, not a special symbol (capture group)
# The following code does not work because in some matches, "Gm#" only has game index, not point index. 
# df_points_selected["serve_side"] = df_points_selected["Gm#"].str.extract(r".\((\d+)\)").astype("int64")
# df_points_selected["serve_side"] = df_points_selected["serve_side"] % 2
# df_points_selected["serve_side"].replace({0: "ad", 1: "deuce"}, inplace=True)

# Identify serve side for each point

import re

def identify_serve_side(data):
    data = data.copy()
    # List all the possible scores (before serve) and their corresponding serve sides. 
    dict_serve_side = {"0-0": "deuce", 
                       "0-15": "ad", 
                      "15-0": "ad",
                      "15-15": "deuce",
                      "30-0": "deuce",
                      "0-30": "deuce",
                      "30-15": "ad",
                      "15-30": "ad",
                      "40-0": "ad",
                      "0-40": "ad",
                      "40-15": "deuce",
                      "15-40": "deuce",
                       "30-30": "deuce",
                       "40-30": "ad",
                       "30-40": "ad",
                       "40-40": "deuce",
                       "40-AD": "ad",
                       "AD-40": "ad"
                      }

    data["serve_side"] = None

    for index, row in data.iterrows():
        if data.loc[index, "TB?"] == "0":
            # Identify serve sides based on the score (before serve)
            data.loc[index, "serve_side"] = dict_serve_side[data.loc[index, "Pts"]]
        elif data.loc[index, "TB?"] == "1":
            # For tiebreak points, if the sum of the two scores (before serve) are even, it's on the deuce side.
            # If the sum of the two scores (before serve) are odd, it's on the ad side. 
    #         print(data.loc[index, "Pts"])

            # Retrieve the first score
            tb_point1_str = re.search("^(\d+)-", data.loc[index, "Pts"])
            if tb_point1_str:
                tb_point1 = int(tb_point1_str.group(1))
            # Retrieve the second score
            tb_point2_str = re.search("-(\d+)", data.loc[index, "Pts"])
            if tb_point2_str:
                tb_point2 = int(tb_point2_str.group(1))

            if ((tb_point1 + tb_point2) % 2) == 0:
                data.loc[index, "serve_side"] = "deuce"
            elif ((tb_point1 + tb_point2) % 2) == 1:
                data.loc[index, "serve_side"] = "ad"

    # data[["Pts", "TB?", "serve_side"]]
    return data

In [6]:
# Separate serve direction from serve outcome
# Need to identify +

def identify_serve_direction_outcome(data):
    data = data.copy()
    
    data["Sv1_direction"] = data["Sv1"].str.extract(r"^(\d)")
    # Replace numeric code with a word. May need to keep the numbers for stats analysis. 
    data["Sv1_direction"].replace({"4": "wide", "5": "body", "6": "t"}, inplace=True)

    #whatever is inside the parentheses is what is being captured/retrieved
    data["Sv1_outcome"] = data["Sv1"].str.extract(r"^\d(.+)")
    data["Sv1_outcome"].replace({"n": "net", "d": "deep", "*": "ace", "w": "wide", "#": "unreturnable", 
                                               "x": "wide_and_deep", "+": "serve_and_volley_"}, inplace=True)

    data["Sv2_direction"] = data["Sv2"].str.extract(r"^(\d)")
    data["Sv2_direction"].replace({"4": "wide", "5": "body", "6": "t"}, inplace=True)

    data["Sv2_outcome"] = data["Sv2"].str.extract(r"^\d(.+)")
    data["Sv2_outcome"].replace({"n": "net", "d": "deep", "*": "ace", "w": "wide", "#": "unreturnable", 
                                               "x": "wide_and_deep", "+": "serve_and_volley_"}, inplace=True)

    # data[["Pts", "serve_side", "Sv1", "Sv2", "Sv1_direction", "Sv2_direction", "Sv1_outcome", "Sv2_outcome"]]

    # print(data["Sv1_outcome"].value_counts())
    # data.info()
    return data

In [7]:
# Identify server name
def identify_server_name(data):
    data = data.copy()
    
    data["server_name"] = None
    data["returner_name"] = None

    for index, row in data.iterrows():
        my_match = df_matches[df_matches["match_id"] == row["match_id"]]
        if len(my_match) > 1:
            print("Error: There are more than one " + row["match_id"])
            break

        if row["Svr"] == 1:
            # Save data directly to the data frame.
            # Do not save data to row["Svr"] because it will not be saved to the data frame.
            # Note that my_match is a data frame, not a series, even through there is only one row. 
            # So we must retrieve the first row from my_match. 
            data.loc[index, "server_name"] = my_match.iloc[0]["Player 1"]
            data.loc[index, "returner_name"] = my_match.iloc[0]["Player 2"]
        if row["Svr"] == 2:
            data.loc[index, "server_name"] = my_match.iloc[0]["Player 2"]
            data.loc[index, "returner_name"] = my_match.iloc[0]["Player 1"]

    # data[["server_name", "Svr", "Serving", "Pts", "serve_side", "Sv1", "Sv2", "Sv1_direction", "Sv2_direction", "Sv1_outcome", "Sv2_outcome"]]

    # data.to_csv("fed_nadal_points.csv")
    return data

In [27]:
# Identify serve side
df_points = identify_serve_side(df_points)

# Separate serve direction and serve outcome (ace or error)
df_points = identify_serve_direction_outcome(df_points)

# Identify server and returner names
df_points = identify_server_name(df_points)

KeyboardInterrupt: 

In [4]:
# Select all the points based on the list of matches provided. 
def select_players_points(point_data, match_data):

    # Use a copy to avoid "A value is trying to be set on a copy of a slice from a DataFrame" warning. 
    df_points_selected = point_data.loc[point_data["match_id"].isin(match_data["match_id"])].copy()

    # df_points_selected.info()
    return df_points_selected

In [8]:
def create_contingency_table(data, player1, player2):
    data = data.copy()
    
    players = [player1, player2]
    serve_sides = ["deuce", "ad"]
    serve_seqs = ["Sv1_direction", "Sv2_direction"]

    serve_stats = pd.DataFrame() 

    for player in players:
        for serve_side in serve_sides:
            for serve_seq in serve_seqs:
                serve_dir_counts = data.loc[(data["server_name"].str.contains(player)) & 
                                             (data["serve_side"] == serve_side), serve_seq].value_counts()
    #             print(serve_dir_counts)
                serve_dir_counts["server"] = player
                serve_dir_counts["serve_side"] = serve_side
                serve_dir_counts["serve_sequence"] = serve_seq
                serve_stats = serve_stats.append(serve_dir_counts, ignore_index=True)

    # federer_nadal_serve_stats = serve_stats[["server", "serve_side", "serve_sequence", "wide", "body", "t"]]
    serve_stats = serve_stats[["server", "serve_side", "serve_sequence", "wide", "body", "t"]]
#     print(federer_djokovic_serve_stats) 
#     print(federer_nadal_serve_stats)

    # serve_stats.to_csv("fed_nadal_saerve_dir.csv")
    return serve_stats

In [9]:
# Chi-square independence test

from scipy.stats import chi2_contingency
from scipy.stats import chi2

# chi-squared test
def chi_square(data, print_table = False):

    table = data.copy()

    table = table.set_index("server")

    test_stats, p, dof, expected = chi2_contingency(table)
    print("chi-squared test results:")
    print("chi2 test stats = %f" % test_stats)
    print("p = %f" % p)
    print('dof = %d ' % dof)

     # interpret test-statistic
    prob = 0.95
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob,
    critical, test_stats))
    if abs(test_stats) >= critical:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')

     # interpret p-value
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')

    if print_table:
        print(table)

In [11]:

def compare_players_chi_square(table):
    servers = table["server"].unique()
    serve_sides = table["serve_side"].unique()
    serve_sequences = table["serve_sequence"].unique()
    
    for serve_side in serve_sides:
        for serve_sequence in serve_sequences:
            my_data = table.loc[(table["serve_side"] == serve_side) & (table["serve_sequence"] == serve_sequence), 
                               ["server", "wide", "body", "t"]]
            
            print("serve_side: " + serve_side)
            print(serve_sequence)
            chi_square(data=my_data, print_table=True)
            print()

In [25]:
# Compare players' serves when they play each other
player1 = "kyrgios"
player2 = "federer"
df_points_selected = select_players_points(point_data=df_points, match_data=df_matches, player1=player1, player2=player2)
if df_points_selected.empty == False:    
    df_points_selected = identify_serve_side(df_points_selected)
    df_points_selected = identify_serve_direction_outcome(df_points_selected)
    df_points_selected = identify_server_name(df_points_selected)
    contingency_table = create_contingency_table(df_points_selected, player1, player2)
    compare_players_chi_square(contingency_table)
    
else:
    print("No results found")

serve_side: deuce
Sv1_direction
chi-squared test results:
chi2 test stats = 25.022304
p = 0.000004
dof = 2 
probability=0.950, critical=5.991, stat=25.022
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
          wide  body      t
server                     
kyrgios   73.0  21.0  173.0
federer  113.0  17.0  102.0

serve_side: deuce
Sv2_direction
chi-squared test results:
chi2 test stats = 2.270078
p = 0.321410
dof = 2 
probability=0.950, critical=5.991, stat=2.270
Independent (fail to reject H0)
significance=0.050, p=0.321
Independent (fail to reject H0)
         wide  body     t
server                   
kyrgios  22.0  32.0  35.0
federer  27.0  27.0  24.0

serve_side: ad
Sv1_direction
chi-squared test results:
chi2 test stats = 3.989642
p = 0.136038
dof = 2 
probability=0.950, critical=5.991, stat=3.990
Independent (fail to reject H0)
significance=0.050, p=0.136
Independent (fail to reject H0)
          wide  body      t
server                     
kyrgios  118

In [26]:
# To-do: Compare players' serves when they play the same player
player1 = "federer"
player2 = "nishikori"
df_points_selected = select_players_points(point_data=df_points, match_data=df_matches, player1=player1, player2=player2)
if df_points_selected.empty == False:    
    df_points_selected = identify_serve_side(df_points_selected)
    df_points_selected = identify_serve_direction_outcome(df_points_selected)
    df_points_selected = identify_server_name(df_points_selected)
    contingency_table = create_contingency_table(df_points_selected, player1, player2)
    compare_players_chi_square(contingency_table)
    
else:
    print("No results found")

serve_side: deuce
Sv1_direction
chi-squared test results:
chi2 test stats = 21.821817
p = 0.000018
dof = 2 
probability=0.950, critical=5.991, stat=21.822
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
           wide  body     t
server                     
federer    70.0   5.0  77.0
nishikori  68.0  38.0  83.0

serve_side: deuce
Sv2_direction
chi-squared test results:
chi2 test stats = 10.267942
p = 0.005893
dof = 2 
probability=0.950, critical=5.991, stat=10.268
Dependent (reject H0)
significance=0.050, p=0.006
Dependent (reject H0)
           wide  body     t
server                     
federer    19.0  13.0  17.0
nishikori  10.0  21.0  40.0

serve_side: ad
Sv1_direction
chi-squared test results:
chi2 test stats = 25.099605
p = 0.000004
dof = 2 
probability=0.950, critical=5.991, stat=25.100
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
           wide  body     t
server                     
federer    73.0   2.0  60.0
nishikori  9

To-do

1. Serve direction analysis
The characteristics of serves for one player
    - Comparison by serve sides(use chi-square)
    - Comparison first and second serves(use chi-square, ind=serves, dep=serve_direction)
    - Comparison by surfaces (control by opponent)(use chi-square)
    - Comparison by opponents (different serve patterns for different opponents?) (use chi-square)
    - Comparison by age (early years vs later years) (chi-square, ind=different ages, dep=serve_directions)
    - Comparison by tournament (grand slam vs other tournaments) (chi-square)
    - Show the change of serve directions during a match (draw a line chart show the counts of each serve direction over time)
    - Serve patterns at critical moments vs non-critical moments (1 points from winning, 1 points from losing, etc.)
    - Serve pattern based on tiredness, group serves by the count of points played (early, middle, or later) and compare between groups.
    - Serve patterns when leading vs when trailing
    - Serve patterns when first serve rate is high (high confidence) vs when the rate is low (low confidence)
    - Serve patterns when winning several points in a row (high confidence), serve patterns when losing several points in a row (low confidence, frustration). 
    - Serve after unforced errors (one or more). Serve after the opponent hitting a winner. Serve after hitting winners. 
    - Serve right after long points. 
    - Serve right after aces (is there a difference? Is he likely to serve a different direction?)
    - Serve right after double faults (is there a difference?)
    - Serve right after repeated first serve faults
    - Is the success or failure of certain serve directions (e.g., aces, short points won, double faults) in the beginning influence the later serve decisions? (anchor effect) Any short term or long term effects?
    - Can we find any subtle bias in serve selection?
    
Compare the serves of two players
    - When they play each other ...
    - When they play the same opponent ...
    - When they are tired
    - When they are leading or losing
    - At critical moments ...
    - At high or low confidence level 

Compare the serves for three or more players?
    - Is it useful?
    
2. Serve error analysis
    - Frequency of errors at critical moments (double fault, first serve fault)
    - Types of errors correlated with tension, confidence, etc. 


In [None]:
t