# How do the accuracies of random forests and decision tree models compare?
1) Build a decision tree. <br>
2) Try to match decision tree with the simplest random forest you can. <br>
3) Measure simplicity with runtime. Compare with decision tree. <br>


In [36]:
# Import modules.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn import tree
from IPython.display import Image

In [37]:
# Load data.
df = pd.read_csv('~/src/data/unit3/golf-stats-2018.csv', encoding='windows-1252')
print(df.shape)
print(df.columns)
df.head()

(967, 33)
Index(['Player_Name', 'Country', 'Rank', 'SG_TeeToGreen', 'SG_OffTheTee',
       'DrivingDistance_AllDrives', 'Driving_Accuracy',
       'Good_Drive_Percentage', 'SG_ApproachTheGreen', 'GIR_Percentage',
       'GIR_Percentage_FromFairway', 'GIR_Percentage_FromFairwayBunker',
       'GIR_Percentage_OtherThanFairway', 'Fairway_Proximity',
       'Proximity_To_Hole', 'SG_AroundTheGreen', 'Sand_Save_Percentage',
       'Scrambling', 'Scrambling_FromTheSand', 'Scrambling_FromTheRough',
       'Scrambling_Other', 'Proximity_ARG', 'SG_Putting', 'OnePutt_Percentage',
       'Three_Putt_Avoidance', 'Putts_Per_Round',
       'BirdieOrBetter_ConversionPercentage', 'SG_Total', 'Scoring_Average',
       'Par3_ScoringAverage', 'Par4_ScoringAverage', 'Par5_ScoringAverage',
       'Major_Winner'],
      dtype='object')


Unnamed: 0,Player_Name,Country,Rank,SG_TeeToGreen,SG_OffTheTee,DrivingDistance_AllDrives,Driving_Accuracy,Good_Drive_Percentage,SG_ApproachTheGreen,GIR_Percentage,...,OnePutt_Percentage,Three_Putt_Avoidance,Putts_Per_Round,BirdieOrBetter_ConversionPercentage,SG_Total,Scoring_Average,Par3_ScoringAverage,Par4_ScoringAverage,Par5_ScoringAverage,Major_Winner
0,Dustin Johnson,USA,1,2.036,1.036,305.1,57.67,77.97,0.763,69.31,...,38.16,1.45,28.19,38.15,2.587,68.751,2.99,3.94,4.47,1
1,Justin Rose,ENG,2,1.456,0.562,291.0,64.12,77.53,0.521,69.7,...,38.43,2.55,28.45,35.82,2.082,69.081,3.05,3.93,4.57,1
2,Justin Thomas,USA,3,1.564,0.506,302.1,57.74,78.12,0.845,68.83,...,39.27,2.97,28.41,35.84,1.821,69.415,3.08,3.97,4.5,1
3,Brooks Koepka,USA,4,0.469,0.446,299.0,55.26,71.48,-0.158,67.01,...,36.42,3.55,28.72,34.03,0.738,69.587,3.02,4.0,4.64,1
4,Jon Rahm,ESP,5,1.14,0.984,307.6,58.85,79.23,-0.036,70.37,...,38.64,2.9,28.79,37.36,1.345,70.117,3.1,3.96,4.55,0


## Clean the data

In [38]:
df.dtypes

Player_Name                             object
Country                                 object
Rank                                     int64
SG_TeeToGreen                          float64
SG_OffTheTee                           float64
DrivingDistance_AllDrives              float64
Driving_Accuracy                       float64
Good_Drive_Percentage                  float64
SG_ApproachTheGreen                    float64
GIR_Percentage                         float64
GIR_Percentage_FromFairway             float64
GIR_Percentage_FromFairwayBunker       float64
GIR_Percentage_OtherThanFairway        float64
Fairway_Proximity                       object
Proximity_To_Hole                       object
SG_AroundTheGreen                      float64
Sand_Save_Percentage                   float64
Scrambling                             float64
Scrambling_FromTheSand                 float64
Scrambling_FromTheRough                float64
Scrambling_Other                       float64
Proximity_ARG

In [39]:
# The dataset includes the top 1000 ranked players. However, statistics are only measured for
# top ~200 players within each category. Additionally, there were 33 player rows with either/or 
# 'Player_Name' & 'Country' missing from the website. These 33 players were omitted from the dataset. 
print(len(df))
df.isnull().sum()

967


Player_Name                              0
Country                                  0
Rank                                     0
SG_TeeToGreen                          764
SG_OffTheTee                           764
DrivingDistance_AllDrives              764
Driving_Accuracy                       764
Good_Drive_Percentage                  764
SG_ApproachTheGreen                    764
GIR_Percentage                         764
GIR_Percentage_FromFairway             766
GIR_Percentage_FromFairwayBunker       764
GIR_Percentage_OtherThanFairway        764
Fairway_Proximity                      764
Proximity_To_Hole                      764
SG_AroundTheGreen                      764
Sand_Save_Percentage                   764
Scrambling                             764
Scrambling_FromTheSand                 764
Scrambling_FromTheRough                764
Scrambling_Other                       764
Proximity_ARG                          764
SG_Putting                             764
OnePutt_Per

In [40]:
# For analysis, we will drop the rows (players) where at
# least one element is missing.
df = df.dropna(axis='rows')
print(len(df))
#df.isnull().sum()

201


In [41]:
df.tail()

Unnamed: 0,Player_Name,Country,Rank,SG_TeeToGreen,SG_OffTheTee,DrivingDistance_AllDrives,Driving_Accuracy,Good_Drive_Percentage,SG_ApproachTheGreen,GIR_Percentage,...,OnePutt_Percentage,Three_Putt_Avoidance,Putts_Per_Round,BirdieOrBetter_ConversionPercentage,SG_Total,Scoring_Average,Par3_ScoringAverage,Par4_ScoringAverage,Par5_ScoringAverage,Major_Winner
669,Roberto Díaz,MEX,670,-0.986,-0.155,280.0,63.78,81.78,-0.573,63.53,...,38.08,2.87,29.23,25.88,-0.889,71.799,3.11,4.08,4.73,0
690,K.J. Choi,KOR,691,-1.08,-0.378,269.6,70.81,83.99,-0.603,62.87,...,41.08,3.22,28.97,28.37,-1.032,72.0,3.04,4.11,4.72,0
699,Zecheng Dou,CHN,700,-0.566,-0.157,286.3,55.02,76.4,-0.276,63.33,...,36.42,4.81,29.96,26.03,-1.241,72.855,3.11,4.14,4.79,0
779,Ernie Els,RSA,780,-0.192,-0.197,284.5,54.69,74.67,-0.241,62.81,...,34.21,2.19,29.36,23.15,-0.265,72.096,3.1,4.14,4.77,0
800,Brett Stegmaier,USA,801,-0.337,-0.153,286.5,58.78,82.44,0.068,70.2,...,36.97,3.13,29.51,28.53,0.06,71.244,2.99,4.06,4.64,0


In [42]:
df_stage = df.drop(['Fairway_Proximity', 'Proximity_To_Hole', 'Proximity_ARG'], axis=1)
df = df_stage

## Set our target and features

In [43]:
X = df.drop(['Player_Name', 'Country', 'Rank', 'Major_Winner'], axis=1)
X = pd.get_dummies(X)
Y = df['Major_Winner']

## Decision tree

In [44]:
# Packages for rendering our tree.
import pydotplus
#import graphviz

# Initialize and train our tree.
maxFeatures = 1 #number of features used per node.
maxDepth = 4 #number of decision levels below the root for our classification.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=maxFeatures,
    max_depth=maxDepth
)
decision_tree.fit(X, Y)
print(cross_val_score(decision_tree, X, Y, cv=10))

'''
# Render our tree.

dot_data = tree.export_graphviz(
    decision_tree, out_file=None,
    feature_names=customers.columns,
    class_names=['Not Returning', 'Returning'],
    filled=True
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
'''

# Run-time.
decision_tree_time_start = time.time()
print('Decision tree runtime: {}'.format(time.time() - decision_tree_time_start))

[0.76190476 0.8        0.95       0.9        0.95       0.85
 0.9        0.9        0.9        0.9       ]
Decision tree runtime: 0.0


In [45]:
df.columns

Index(['Player_Name', 'Country', 'Rank', 'SG_TeeToGreen', 'SG_OffTheTee',
       'DrivingDistance_AllDrives', 'Driving_Accuracy',
       'Good_Drive_Percentage', 'SG_ApproachTheGreen', 'GIR_Percentage',
       'GIR_Percentage_FromFairway', 'GIR_Percentage_FromFairwayBunker',
       'GIR_Percentage_OtherThanFairway', 'SG_AroundTheGreen',
       'Sand_Save_Percentage', 'Scrambling', 'Scrambling_FromTheSand',
       'Scrambling_FromTheRough', 'Scrambling_Other', 'SG_Putting',
       'OnePutt_Percentage', 'Three_Putt_Avoidance', 'Putts_Per_Round',
       'BirdieOrBetter_ConversionPercentage', 'SG_Total', 'Scoring_Average',
       'Par3_ScoringAverage', 'Par4_ScoringAverage', 'Par5_ScoringAverage',
       'Major_Winner'],
      dtype='object')

# Standardize

In [46]:
from sklearn.preprocessing import StandardScaler
df = pd.DataFrame(StandardScaler().fit_transform(df.drop(
    ['Player_Name', 'Country', 'Rank'], axis=1)))
df.columns = ['SG_TeeToGreen', 'SG_OffTheTee',
       'DrivingDistance_AllDrives', 'Driving_Accuracy',
       'Good_Drive_Percentage', 'SG_ApproachTheGreen', 'GIR_Percentage',
       'GIR_Percentage_FromFairway', 'GIR_Percentage_FromFairwayBunker',
       'GIR_Percentage_OtherThanFairway', 'SG_AroundTheGreen',
       'Sand_Save_Percentage', 'Scrambling', 'Scrambling_FromTheSand',
       'Scrambling_FromTheRough', 'Scrambling_Other', 'SG_Putting',
       'OnePutt_Percentage', 'Three_Putt_Avoidance', 'Putts_Per_Round',
       'BirdieOrBetter_ConversionPercentage', 'SG_Total', 'Scoring_Average',
       'Par3_ScoringAverage', 'Par4_ScoringAverage', 'Par5_ScoringAverage',
       'Major_Winner']
df.head()

Unnamed: 0,SG_TeeToGreen,SG_OffTheTee,DrivingDistance_AllDrives,Driving_Accuracy,Good_Drive_Percentage,SG_ApproachTheGreen,GIR_Percentage,GIR_Percentage_FromFairway,GIR_Percentage_FromFairwayBunker,GIR_Percentage_OtherThanFairway,...,OnePutt_Percentage,Three_Putt_Avoidance,Putts_Per_Round,BirdieOrBetter_ConversionPercentage,SG_Total,Scoring_Average,Par3_ScoringAverage,Par4_ScoringAverage,Par5_ScoringAverage,Major_Winner
0,2.352689,2.292061,2.652016,-0.70135,-0.613449,1.628711,0.963889,0.735113,-1.047017,1.929892,...,0.123935,-2.215183,-1.785507,3.428337,2.762593,-2.635849,-1.758409,-2.311261,-2.574533,2.9277
1,1.626167,1.191536,0.647408,0.541325,-0.738792,1.058383,1.081669,1.333071,0.764177,0.742138,...,0.232226,-0.560531,-1.280169,2.474868,2.174306,-2.240398,-0.495371,-2.535147,-1.172145,2.9277
2,1.761451,1.061516,2.225504,-0.687864,-0.570718,1.821963,0.818929,0.975671,0.412288,1.745472,...,0.56913,0.071245,-1.357913,2.483053,1.87026,-1.840154,0.136148,-1.639603,-2.153817,2.9277
3,0.389828,0.922209,1.784774,-1.165668,-2.462256,-0.541837,0.26929,-0.141206,0.484736,0.77507,...,-0.573939,0.943698,-0.755395,1.742376,0.608647,-1.63404,-1.12689,-0.967945,-0.190474,2.9277
4,1.230338,2.171328,3.007443,-0.474008,-0.254512,-0.254316,1.284009,2.254065,1.747397,1.13952,...,0.316452,-0.034051,-0.619343,3.105058,1.315756,-0.998923,0.557161,-1.863489,-1.452623,-0.341565


## RFC

In [47]:
rfc = ensemble.RandomForestClassifier()
print(cross_val_score(rfc, X, Y, cv=10)) #cv= number of cross-val-scores

# Run-time.
rfc_time_start = time.time()
print('rfc runtime: {}'.format(time.time() - rfc_time_start))

[0.76190476 0.9        0.95       0.9        0.95       0.95
 0.9        0.95       0.9        0.9       ]
rfc runtime: 0.0


Runtime both models 0.0 seconds due to short length of dataset.