In [1]:
import pandas as pd
import numpy as np

# Dataset description
Each tournament has two datasets associated with it, one is named *year*-*tourney name*-matches.csv, which holds the metadata for the matches between players.
The other dataset is named *year*-*tourney name*-points.csv, which holds the point-by-point data of the matches.
In this EDA we will be looking at the 2018 US Open dataset:

In [2]:
# Read both 2018 US Open files, discarding any column that holds no values at all.
us18_matches = pd.read_csv('tennis_data/2018-usopen-matches.csv').dropna(axis=1, how='all')

us18_points = pd.read_csv('tennis_data/2018-usopen-points.csv').dropna(axis=1, how='all')

In [3]:
# List every column name, then preview the first rows of the point-by-point table.
print(us18_points.columns)
us18_points.head()

Index(['match_id', 'ElapsedTime', 'SetNo', 'P1GamesWon', 'P2GamesWon',
       'SetWinner', 'GameNo', 'GameWinner', 'PointNumber', 'PointWinner',
       'PointServer', 'Speed_KMH', 'P1Score', 'P2Score', 'P1Momentum',
       'P2Momentum', 'P1PointsWon', 'P2PointsWon', 'P1Ace', 'P2Ace',
       'P1Winner', 'P2Winner', 'P1DoubleFault', 'P2DoubleFault', 'P1UnfErr',
       'P2UnfErr', 'P1NetPoint', 'P2NetPoint', 'P1NetPointWon',
       'P2NetPointWon', 'P1BreakPoint', 'P2BreakPoint', 'P1BreakPointWon',
       'P2BreakPointWon', 'History', 'Speed_MPH', 'P1BreakPointMissed',
       'P2BreakPointMissed', 'ServeIndicator', 'ServeNumber', 'WinnerType',
       'WinnerShotType', 'P1DistanceRun', 'P2DistanceRun', 'RallyCount',
       'ServeWidth', 'ServeDepth', 'ReturnDepth'],
      dtype='object')


Unnamed: 0,match_id,ElapsedTime,SetNo,P1GamesWon,P2GamesWon,SetWinner,GameNo,GameWinner,PointNumber,PointWinner,...,ServeIndicator,ServeNumber,WinnerType,WinnerShotType,P1DistanceRun,P2DistanceRun,RallyCount,ServeWidth,ServeDepth,ReturnDepth
0,2018-usopen-1101,0:00:00,1,0,0,0,1,0,0X,0,...,0,0,0,0,0.0,0.0,0,,,
1,2018-usopen-1101,0:00:00,1,0,0,0,1,0,0Y,0,...,0,0,0,0,0.0,0.0,0,,,
2,2018-usopen-1101,0:00:00,1,0,0,0,1,0,1,1,...,2,1,0,0,5.951,2.944,2,W,CTL,
3,2018-usopen-1101,0:00:19,1,0,0,0,1,0,2,2,...,2,1,0,0,2.875,2.285,1,,,ND
4,2018-usopen-1101,0:00:37,1,0,0,0,1,0,3,2,...,2,2,0,0,3.886,3.986,1,BW,NCTL,D


In [4]:
# Check which distinct values the P1DoubleFault column takes.
us18_points.P1DoubleFault.unique()

array([0, 1])

### ServeWidth

B: Body
BC: Body/Center
BW: Body/Wide
C: Center ["down the T"]
W: Wide

### ServeDepth

CTL: Close To Line
NCTL: Not Close To Line

### ReturnDepth

D: Deep
ND: Not Deep

### ServeIndicator

1: 1st serve
2: 2nd serve

In [5]:
# List every column name, then preview the first rows of the match metadata table.
print(us18_matches.columns)
us18_matches.head()

Index(['match_id', 'year', 'slam', 'match_num', 'player1', 'player2'], dtype='object')


Unnamed: 0,match_id,year,slam,match_num,player1,player2
0,2018-usopen-1101,2018,usopen,1101,Rafael Nadal,David Ferrer
1,2018-usopen-1105,2018,usopen,1105,Jack Sock,Guido Andreozzi
2,2018-usopen-1108,2018,usopen,1108,Paolo Lorenzi,Kyle Edmund
3,2018-usopen-1109,2018,usopen,1109,Dominic Thiem,Mirza Basic
4,2018-usopen-1111,2018,usopen,1111,Mischa Zverev,Taylor Fritz


In [6]:
# First figure: total number of cells in the points table (rows x columns), not the row count.
print(us18_points.size)
# Second figure: how many distinct matches the points table covers.
print(len(us18_points['match_id'].unique()))

1528848
178


In [7]:
name = 'Dominic Thiem'
# Keep every match in which this player is listed as either player1 or player2.
listed_first = us18_matches['player1'] == name
listed_second = us18_matches['player2'] == name
us18_matches[listed_first | listed_second]

Unnamed: 0,match_id,year,slam,match_num,player1,player2
3,2018-usopen-1109,2018,usopen,1109,Dominic Thiem,Mirza Basic
31,2018-usopen-1205,2018,usopen,1205,Dominic Thiem,Steve Johnson
57,2018-usopen-1303,2018,usopen,1303,Dominic Thiem,Taylor Fritz
72,2018-usopen-1402,2018,usopen,1402,Dominic Thiem,Kevin Anderson
79,2018-usopen-1501,2018,usopen,1501,Rafael Nadal,Dominic Thiem


# Comments on data
Our sample size of matches isn't terribly large, but considering we have each point made in four grand slams a year from 2011 to present, there is a substantial amount of data to go through!

# Preliminary player Stat aggregation


In [8]:
# Get all player names from a tourney

def get_player_names(matches):
    """Return the sorted, de-duplicated names found in a matches table.

    Parameters
    ----------
    matches : DataFrame with ``player1`` and ``player2`` columns.

    Returns
    -------
    numpy.ndarray of the unique values across both columns, in sorted order.
    """
    first_column = matches['player1'].values
    second_column = matches['player2'].values
    every_name = np.concatenate((first_column, second_column))
    return np.unique(every_name)

# Given a player name and tourney, get their matches
def get_player_points(player, matches, points):
    """Return the point rows belonging to every match a player took part in.

    Parameters
    ----------
    player : name to look for in the ``player1`` / ``player2`` columns.
    matches : DataFrame with ``match_id``, ``player1`` and ``player2`` columns.
    points : DataFrame with a ``match_id`` column.

    Returns
    -------
    The rows of ``points`` whose ``match_id`` belongs to one of the player's
    matches, with their original index labels.
    """
    listed_first = matches['player1'] == player
    listed_second = matches['player2'] == player
    wanted_ids = matches.loc[listed_first | listed_second, 'match_id']

    in_wanted_match = points['match_id'].isin(wanted_ids)
    return points.loc[in_wanted_match]

# Example: every point row from the 2018 US Open matches that list Rafael Nadal.
get_player_points('Rafael Nadal', us18_matches, us18_points)

Unnamed: 0,match_id,ElapsedTime,SetNo,P1GamesWon,P2GamesWon,SetWinner,GameNo,GameWinner,PointNumber,PointWinner,...,ServeIndicator,ServeNumber,WinnerType,WinnerShotType,P1DistanceRun,P2DistanceRun,RallyCount,ServeWidth,ServeDepth,ReturnDepth
0,2018-usopen-1101,0:00:00,1,0,0,0,1,0,0X,0,...,0,0,0,0,0.000,0.000,0,,,
1,2018-usopen-1101,0:00:00,1,0,0,0,1,0,0Y,0,...,0,0,0,0,0.000,0.000,0,,,
2,2018-usopen-1101,0:00:00,1,0,0,0,1,0,1,1,...,2,1,0,0,5.951,2.944,2,W,CTL,
3,2018-usopen-1101,0:00:19,1,0,0,0,1,0,2,2,...,2,1,0,0,2.875,2.285,1,,,ND
4,2018-usopen-1101,0:00:37,1,0,0,0,1,0,3,2,...,2,2,0,0,3.886,3.986,1,BW,NCTL,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18530,2018-usopen-1601,1:57:52,2,2,5,0,8,0,132,1,...,1,1,0,0,13.600,13.936,3,W,CTL,ND
18531,2018-usopen-1601,1:58:17,2,2,5,0,8,0,133,1,...,1,1,0,0,19.441,20.650,5,W,NCTL,ND
18532,2018-usopen-1601,1:58:43,2,2,5,0,8,0,134,2,...,1,1,0,0,12.195,11.350,4,BW,CTL,D
18533,2018-usopen-1601,1:59:11,2,2,5,0,8,0,135,2,...,1,1,0,0,10.293,11.543,4,BW,NCTL,D


# The tennis playstyles

In order to categorize what type of playstyle each player has I plan to use K-means clustering. 
But what statistics should be gathered to cluster upon? Good question!
I have some general ideas on what to use, such as what is on 'Ultimate Tennis Statistics', but by laying out the descriptions of the four general playstyles identified by professionals who have had decades of experience in the sport, we can home in on which stats or characteristics are really important in capturing playstyle (e.g. a pusher will make few mistakes).

## Aggressive Baseliner
- Stick to baseline (duh)
- Have strong forehand and backhand and hit winners when given a shot at the baseline
- Avoid volleying / aren't very good at it
- Rely on getting a rhythm going
- Occasionally make unforced errors because of aggressive playstyle

### Defeated by:
- Players who hit deep and vary their shots
- Players who bring them up to the net, which is their weakness

## The Serve-and-Volleyer
- Has a killer serve
- Makes a strong first serve and usually rushes to the net to make a well placed volley and possibly finish off the point
- Always looking to hit deep and wide to rush to net when returning
- Weak / inconsistent groundstrokes

### Defeated by:
- players who can break their serve and return on the rise
- players who can hit down the line, lob, and have sharply angled topspin shots
- players who can keep them on the baseline by keeping shots deep and lob when they come in

## The Counterpuncher / Pusher
- Consistent defense, almost never makes unforced errors
- Usually are fast and have good court coverage
- Wear opponents down until they make a mistake
- Hit deep shots and lobs, and place the ball effectively

### Defeated by:
- Players who are skilled enough to have a strong enough style to consistently beat them
- Players who can aggress them by rushing the net
- On that note players who have strong overhead shots
- Players that hit behind them / move them up and down the court 

## All-Court Player
- Adapts to opponent and is comfortable using a wide variety of shots
- Jack of all trades, does not have one specific weapon

### Defeated by:
- Players who stick to their strengths
- Players who keep opponent at baseline with looping topspins (you can't hit a variety of shots from there)
- Players that are aggressive and good at it

# Create Game outcomes DF

In [9]:
def who_won(points, p1, p2):
    """Decide which player won a match from its point-by-point rows.

    Parameters
    ----------
    points : DataFrame holding the point rows of a single match; must have a
        ``SetWinner`` column.
    p1, p2 : names of player 1 and player 2. Accepted so existing callers keep
        working, but not used in the computation.

    Returns
    -------
    int
        ``1`` when player 1 won more sets than player 2, otherwise ``-1``.
        ``-1`` is also returned when the set counts are tied, which includes
        a match with no point rows at all, so such matches are reported as
        player-2 wins.
    """
    # Assumes SetWinner is 0 on ordinary points and 1 / 2 on the point that
    # decides a set for player 1 / player 2 (the same coding PointWinner
    # shows) -- TODO confirm against the dataset's data dictionary.
    decided = points.SetWinner[points.SetWinner != 0]

    # Count sets for BOTH players. Looking only at rows with SetWinner == 1
    # would never see player 2's sets, so a single set taken by player 1
    # would be enough to report the whole match as a player-1 win.
    p1SetsWon = int((decided == 1).sum())
    p2SetsWon = int((decided == 2).sum())

    return 1 if p1SetsWon > p2SetsWon else -1
    

In [12]:
colnames = ['Player1', 'Player2', 'Outcome']

tours = ['ausopen', 'frenchopen', 'usopen', 'wimbledon']

# Collect one [player1, player2, outcome] record per match and build the frame
# once at the end. Growing a DataFrame row by row re-allocates on every insert,
# and seeding it with a placeholder row leaves an all-NaN entry behind.
outcome_rows = []

# NOTE(review): np.arange excludes its stop value, so this covers 2011-2017
# and skips the 2018 files -- confirm that is intended.
for year in np.arange(2011, 2018):
    for tour in tours:
        matches = pd.read_csv('tennis_data/' + str(year) + '-' + tour + '-matches.csv')
        points = pd.read_csv('tennis_data/' + str(year) + '-' + tour + '-points.csv')

        for match_id, player1, player2 in zip(matches.match_id, matches.player1, matches.player2):
            # Point rows of this match only; empty when the match has no point data.
            match_points = points[points.match_id == match_id]

            outcome = who_won(match_points, player1, player2)
            outcome_rows.append([player1, player2, outcome])

# Rows are numbered 0..N-1 in the order the matches were read.
Outcomes = pd.DataFrame(outcome_rows, columns=colnames)

            

# Subset Mens Top 100 tennis players
Unfortunately the data available for men's and women's tennis is unequal in detail and availability, so I'll be subsetting to the top 100 ATP tennis players as of 2018 (where the point-by-point data ends).

In [13]:
# The men's ATP rankings file is used as a reference list of names: only
# matches where BOTH players appear in it are kept.
atp_ranks = pd.read_csv('ATP_Rankings.csv')

atp_mens = atp_ranks['name']

# pd.concat replaces Series.append, which was removed in pandas 2.0.
players = pd.concat([Outcomes.Player1, Outcomes.Player2]).unique()

# Build the lookup once instead of converting the Series to a list for every player.
atp_names = set(atp_mens)

# Keeps the first-seen order of `players`. Despite the name, nothing here
# caps the list at 100 entries; its length depends on the rankings file.
atp100 = [player for player in players if player in atp_names]

Outcomes = Outcomes[Outcomes.Player1.isin(atp100) & Outcomes.Player2.isin(atp100)]

In [16]:
# Drop any row that still has a missing value, then save the filtered outcomes.
Outcomes = Outcomes.dropna()
Outcomes.to_csv('match_outcomes.csv')

In [17]:
# Preview the filtered outcomes; the index keeps each match's original row number.
Outcomes.head()

Unnamed: 0,Player1,Player2,Outcome
3,Donald Young,Marin Cilic,-1
12,Dudi Sela,Juan Martin Del Potro,-1
17,Fabio Fognini,Kei Nishikori,1
28,Lukas Lacko,Roger Federer,-1
33,Mikhail Youzhny,Blaz Kavcic,1


## Ten players aren't caught by this search; look into this later

In [18]:
# Square matrix of zeros with one row and one column per retained player.
matchups = pd.DataFrame(0, index = atp100, columns= atp100)
matchups.head()

Unnamed: 0,Rafael Nadal,John Isner,Donald Young,Dudi Sela,Tomas Berdych,Fabio Fognini,Lukas Lacko,Mikhail Youzhny,Milos Raonic,Andreas Seppi,...,Andrey Rublev,Karen Khachanov,Daniil Medvedev,Marius Copil,Nicolas Kicker,Tennys Sandgren,Denis Shapovalov,Marton Fucsovics,Stefanos Tsitsipas,Victor Estrella
Rafael Nadal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
John Isner,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Donald Young,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dudi Sela,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tomas Berdych,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Tally every recorded match. The loop must cover all rows of Outcomes:
# bounding it by len(atp100) (the number of players) would either stop early
# or index past the end of the table.
# NOTE: re-running this cell adds the counts again; re-create `matchups` first.
for p1, p2, outcome in zip(Outcomes.Player1, Outcomes.Player2, Outcomes.Outcome):
    if outcome == 1:
        # Player 1 won: the count goes in row p2 (loser), column p1 (winner).
        # NOTE(review): confirm this row = loser / column = winner orientation is intended.
        matchups.loc[p2, p1] += 1
    else:
        # Player 2 won: row p1 (loser), column p2 (winner).
        matchups.loc[p1, p2] += 1


In [20]:
# Not the last expression in the cell, so this preview is not rendered.
matchups.head()

# Save the matchup matrix; player names are written as the row labels (to_csv default).
matchups.to_csv('matchups_atp100.csv')