In [14]:
import pandas as pd
from IPython.display import Markdown as md

In [2]:
# read data into DataFrame
data = pd.read_csv('PGA_Raw_Data_2017-2022.csv')
data = data.drop(['hole_DKP', 'hole_FDP', 'hole_SDP', 
                  'streak_DKP', 'streak_FDP', 'streak_SDP', 
                  'n_rounds', 'finish_DKP', 'finish_FDP', 
                  'finish_SDP', 'total_DKP', 'total_FDP', 
                  'total_SDP', 'Unnamed: 2', 'Unnamed: 3', 
                  'Unnamed: 4', 'purse', 'no_cut'],
                 axis=1
                );  # the semicolon supresses output, w/o it the df would be printed here

In [3]:
# take a look at the data
data.head(3)

Unnamed: 0,Player_initial_last,tournament id,player id,hole_par,strokes,made_cut,pos,player,tournament name,course,date,season,Finish,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total
0,A. Ancer,401353224,9261,288,289,1,32.0,Abraham Ancer,The Memorial Tournament pres. by Nationwide,"Muirfield Village Golf Club - Dublin, OH",2022-06-05,2022,T32,0.2,-0.13,-0.08,0.86,0.65,0.85
1,A. Hadwin,401353224,5548,288,286,1,18.0,Adam Hadwin,The Memorial Tournament pres. by Nationwide,"Muirfield Village Golf Club - Dublin, OH",2022-06-05,2022,T18,0.36,0.75,0.31,0.18,1.24,1.6
2,A. Lahiri,401353224,4989,144,147,0,,Anirban Lahiri,The Memorial Tournament pres. by Nationwide,"Muirfield Village Golf Club - Dublin, OH",2022-06-05,2022,CUT,-0.56,0.74,-1.09,0.37,0.02,-0.54


## Data Explanation

**Player_initial_last**  
Initial of the players first name and their full last name.  
**tournament id**  
Unique id for each tournament.  
**player id**  
Unique id for each player.  
**hole_par**  
Number of strokes to score a par overall.  
**strokes**  
Number of strokes the player needed for the tournament.  
**made_cut**  
0: did not make cut  
1: made the cut  
**pos**  
Position of the player in the tournament.  
**player**  
The player's full name.  
**tournament name**
Name of the tournament.  
**course**  
Golf course on which was played.  
**date**  
Date of the tournament, format YYYY-MM-DD  
**season**  
Year of the season (2017 - 2022). Keep in mind that season 2017 started in 2016 and so on.  
**Finish**  
Finishing position of the player, if the player was cut the 'Finish' is 'CUT'.  
**sg_put**  
Shots gained while putting.  
**sg_arg**  
Shots gained around the green.  
**sg_app**  
Shots gained during approach.  
**sg_ott**  
Shots gained off the tee.  
**sg_t2g**  
Shots gained tee to green.  
**sg_total**  
Shots gained in total.

In [4]:
data.describe()

Unnamed: 0,tournament id,player id,hole_par,strokes,made_cut,pos,season,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total
count,28525.0,28525.0,28525.0,28525.0,28525.0,16167.0,28525.0,23523.0,23523.0,23523.0,23523.0,23523.0,23524.0
mean,301348000.0,102279.9,224.712428,223.119755,0.598422,33.883714,2019.428466,-0.125924,-0.044617,-0.10822,-0.049563,-0.202325,-0.323437
std,173443800.0,652390.1,70.446659,66.811562,0.490226,22.518739,1.66258,1.12112,0.728476,1.117115,0.803879,1.638853,1.972278
min,2689.0,5.0,70.0,66.0,0.0,1.0,2017.0,-5.99,-5.41,-7.51,-7.74,-13.95,-13.67
25%,401025200.0,1264.0,143.0,146.0,0.0,15.0,2018.0,-0.78,-0.45,-0.75,-0.46,-1.1,-1.41
50%,401056600.0,4449.0,280.0,271.0,1.0,32.0,2019.0,-0.04,-0.01,-0.01,0.04,-0.03,-0.18
75%,401243000.0,7001.0,287.0,281.0,1.0,51.0,2021.0,0.62,0.41,0.635,0.48,0.91,1.04
max,401366900.0,4845309.0,292.0,313.0,1.0,999.0,2022.0,4.3,3.17,4.14,2.47,5.57,7.01


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28525 entries, 0 to 28524
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player_initial_last  28525 non-null  object 
 1   tournament id        28525 non-null  int64  
 2   player id            28525 non-null  int64  
 3   hole_par             28525 non-null  int64  
 4   strokes              28525 non-null  int64  
 5   made_cut             28525 non-null  int64  
 6   pos                  16167 non-null  float64
 7   player               28525 non-null  object 
 8   tournament name      28525 non-null  object 
 9   course               28525 non-null  object 
 10  date                 28525 non-null  object 
 11  season               28525 non-null  int64  
 12  Finish               23524 non-null  object 
 13  sg_putt              23523 non-null  float64
 14  sg_arg               23523 non-null  float64
 15  sg_app               23523 non-null 

In [9]:
data.isna()

Unnamed: 0,Player_initial_last,tournament id,player id,hole_par,strokes,made_cut,pos,player,tournament name,course,date,season,Finish,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28520,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True
28521,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True
28522,False,False,False,False,False,False,True,False,False,False,False,False,True,True,True,True,True,True,True
28523,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True


## Taking a look at Justin Thomas

In [10]:
# store data regarding Justin Thomas in jt_data
jt_data = data[data['player']=='Justin Thomas']

In [11]:
jt_data.describe()

Unnamed: 0,tournament id,player id,hole_par,strokes,made_cut,pos,season,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total
count,119.0,119.0,119.0,119.0,119.0,104.0,119.0,91.0,91.0,91.0,91.0,91.0,91.0
mean,293291200.0,4848.0,264.932773,257.142857,0.865546,14.173077,2019.294118,0.028571,0.295824,0.769121,0.353187,1.418242,1.460659
std,178625000.0,0.0,49.258008,45.22885,0.342582,14.171928,1.679149,0.997236,0.503799,0.938631,0.538896,1.110451,1.618174
min,2689.0,4848.0,140.0,141.0,0.0,1.0,2017.0,-2.91,-1.35,-2.41,-1.47,-3.06,-3.94
25%,3790.0,4848.0,280.0,268.0,1.0,3.0,2018.0,-0.51,-0.05,0.235,0.05,0.955,0.725
50%,401056500.0,4848.0,284.0,274.0,1.0,9.0,2019.0,0.19,0.36,0.88,0.3,1.53,1.67
75%,401233400.0,4848.0,288.0,279.0,1.0,21.0,2021.0,0.765,0.645,1.45,0.71,2.035,2.465
max,401366900.0,4848.0,292.0,292.0,1.0,75.0,2022.0,1.81,1.69,2.7,1.75,3.61,4.38


In [24]:
jt_cut = jt_data[jt_data['made_cut']==0]
jt_cut = jt_cut['made_cut'].count()  # number of times JT has been cut

jt_not_cut = jt_data[jt_data['made_cut']==1]
jt_not_cut = jt_not_cut['made_cut'].count()  # number of times JT made the cut

print(f'jt_not_cut: {jt_not_cut}\njt_cut: {jt_cut}')

jt_not_cut: 103
jt_cut: 16


#### Cuts made by Justin Thomas
Justin Thomas made 103 cuts out of 119 between 2017 and 2022.

#### In which part of his game does Justin Thomas gain the most or least shots?

In [38]:
# tee to green and total can be ignored here, because they would include (almost) every part of the game
jt_avg_sg_putt = jt_data['sg_putt'].mean()  # mean shots gained putting
jt_avg_sg_arg = jt_data['sg_arg'].mean()  # mean shots gained around the green
jt_avg_sg_app = jt_data['sg_app'].mean()  # mean shots gained on approach
jt_avg_sg_ott = jt_data['sg_ott'].mean()  # mean shots gained off the tee

# dict with sg
jt_dict_mean_sg = {'Putting':jt_avg_sg_putt, 
                   'Around the Green':jt_avg_sg_arg, 
                   'Approach':jt_avg_sg_app, 
                   'Off the Tee':jt_avg_sg_ott}

# get variable with max and min value
jt_max_sg_var = max(jt_dict_mean_sg, key=jt_dict_mean_sg.get)
jt_min_sg_var = min(jt_dict_mean_sg, key=jt_dict_mean_sg.get)

print(f'Mean:\nSG Putting: {jt_avg_sg_putt}\nSG ARG: {jt_avg_sg_arg}\nSG App: {jt_avg_sg_app}\nSG OTT: {jt_avg_sg_ott}')
print(f'Most SG: {jt_max_sg_var}\nLeast SG: {jt_min_sg_var}')

Mean:
SG Putting: 0.028571428571428557
SG ARG: 0.29582417582417575
SG App: 0.7691208791208792
SG OTT: 0.3531868131868132
Most SG: Approach
Least SG: Putting


Justin Thomas has the highest amount of shots gained during his approach while he struggles with gaining shots whilst putting.