In [33]:
#February 16, 2020
#Chapter 3
#Predicting Sports Winners with Decision Trees

In [34]:
#Loads the NBA 2015-2016 season data using pandas, read_csv function
import pandas as pd
data_filename = "basketball.csv"
dataset = pd.read_csv(data_filename)

In [35]:
#prints out full data frame
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Tue Oct 27 2015,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,Tue Oct 27 2015,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,Tue Oct 27 2015,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,Wed Oct 28 2015,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,
4,Wed Oct 28 2015,7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,


In [36]:
#Changes the heading after loading the file to clean up the display of the data
dataset = pd.read_csv(data_filename, parse_dates=["Date"])

dataset.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type", "Attendance", "Notes"]

In [37]:
#Uses pandas to check the data types of the data, the date is in datetime64[ns] type , any number is int64, any string of words is object
print(dataset.dtypes)

Date            datetime64[ns]
Start (ET)              object
Visitor Team            object
VisitorPts               int64
Home Team               object
HomePts                  int64
OT?                     object
Score Type              object
Attendance               int64
Notes                   object
dtype: object


In [38]:
#Specifies the dataset, computes the winner of the match, using a comparision of points : Home Wins when visitor points < home point
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]

In [39]:
#Displays the data, HomeWin shown using boolean of true or false, according to previous calculation
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attendance,Notes,HomeWin
0,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,,False
1,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,,True
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,,True
3,2015-10-28,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,,False
4,2015-10-28,7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,,True


In [40]:
#Copies the values of dataset["HomeWin"] into a NumPy array called y_true to use for scikit-learn classifiers, work around as there is no clean integration with pandas and scikit-learn
y_true = dataset["HomeWin"].values

In [41]:
#Calculates the mean to predict the "Home Field Advantage"
dataset["HomeWin"].mean()

0.5942249240121581

In [42]:
#Creates a default dictionary to store the team's last result
from collections import defaultdict
won_last = defaultdict(int)

In [43]:
#Creates two new features on the data set 
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0

In [44]:
#For sorted dataset, in chronological order, iterates through the rows in the dictionary and updates each row with the latest result
#The last two lines update the features in the dictionary with a 1 if the home team wins, or a 0 if the visiting team wins
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    dataset.at[index, "HomeLastWin"] = won_last[home_team] #dataset.set_value(index, "HomeLastWin", won_last[home_team])
    dataset.at[index, "VisitorLastWin"] = won_last[visitor_team] #dataset.set_value(index, "VisitorLastWin", won_last[visitor_team])
    
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])

In [45]:
#Displays the first six rows of the data set
dataset.head(6)

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attendance,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,,False,0,0
1,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,,True,0,0
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,,True,0,0
3,2015-10-28,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,,False,0,0
4,2015-10-28,7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,,True,0,0
5,2015-10-28,7:30p,Charlotte Hornets,94,Miami Heat,104,Box Score,,19724,,True,0,0


In [46]:
#Uses panda's indexer to examine parts of the data set, specifically .loc for label based indexing  
dataset.loc[1000:1005]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attendance,Notes,HomeWin,HomeLastWin,VisitorLastWin
1000,2016-03-15,8:30p,Los Angeles Clippers,87,San Antonio Spurs,108,Box Score,,18418,,True,1,0
1001,2016-03-15,10:30p,Sacramento Kings,106,Los Angeles Lakers,98,Box Score,,18997,,False,0,0
1002,2016-03-16,7:00p,Oklahoma City Thunder,130,Boston Celtics,109,Box Score,,18624,,False,0,1
1003,2016-03-16,7:00p,Orlando Magic,99,Charlotte Hornets,107,Box Score,,16148,,True,0,1
1004,2016-03-16,7:00p,Dallas Mavericks,98,Cleveland Cavaliers,99,Box Score,,20562,,True,0,1
1005,2016-03-16,7:00p,Chicago Bulls,96,Washington Wizards,117,Box Score,,19556,,True,1,1


In [47]:
#Creams a dataset using the last win values for the home and visiting team, will be used to specify the columns we use and extract data to use with scikit-learn
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values

In [48]:
#Imports DecisionTree Classifier and creates a Decision Tree using 14 as the random_State point 
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

In [49]:
#Imports cross_val_score method to find the average
from sklearn.model_selection import cross_val_score
import numpy as np

In [50]:
#Uses scikit-learn to find the average
scores = cross_val_score(clf, X_previouswins, y_true,
scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.4%




In [51]:
#Import the 2015 NBA standings for comparision using CSV
import os
standings_filename = os.path.join("standings.csv")

standings = pd.read_csv(standings_filename, skiprows=1)

In [52]:
#Prints out the table for standings
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,67-15,39-2,28-13,25-5,42-10,9-1,7-3,9-1,...,25-6,5-3,45-9,1-0,13-2,11-3,12-3,8-3,16-2,6-2
1,2,Atlanta Hawks,60-22,35-6,25-16,38-14,22-8,12-6,14-4,12-4,...,17-11,6-4,30-10,0-1,9-5,14-2,17-0,7-4,9-7,4-3
2,3,Houston Rockets,56-26,30-11,26-15,23-7,33-19,9-1,8-2,6-4,...,20-9,8-4,31-14,2-0,11-4,9-5,11-6,7-3,10-6,6-2
3,4,Los Angeles Clippers,56-26,30-11,26-15,19-11,37-15,7-3,6-4,6-4,...,21-7,3-5,33-9,2-0,9-5,11-6,11-4,5-6,11-5,7-0
4,5,Memphis Grizzlies,55-27,31-10,24-17,20-10,35-17,8-2,5-5,7-3,...,16-13,9-3,26-13,2-0,13-2,8-6,12-4,7-4,9-8,4-3


In [53]:
#Iterate over the rows, and looks up the standings for the home and away team to get a sense of their records
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank) 
    dataset.iloc[index] = row
    #dataset.set_value(index, "HomeTeamRanksHigher", int(home_rank < visitor_rank))

In [54]:
#Extracts the dataset
X_homehigher = dataset[[ "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values

In [55]:
#Uses cross_val_score function to test the result
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 60.9%




In [56]:
#Tests out which team won the last head-to-head match for the purpose of predicting certain matchup results, as some teams can play better again others
#Creates a dictionary to store the feature in the dataframe, creates a tuple called teams, stores the result in the dictionary, during game time 
#Recreates tuple and examines the previous result
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Set in the row, who won the last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    dataset.at[index, "HomeTeamWonLast"] = home_team_won_last #dataset.set_value(index, "HomeTeamWonLast", home_team_won_last)
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

In [57]:
#Prints out the given rows of the dataset
dataset.loc[400:450]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attendance,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher,HomeTeamWonLast
400,2015-12-19,8:00p,Los Angeles Clippers,97,Houston Rockets,107,Box Score,,18212,,True,1,0,0,1
401,2015-12-19,8:00p,Indiana Pacers,84,Memphis Grizzlies,96,Box Score,,18119,,True,0,1,0,1
402,2015-12-20,1:00p,Minnesota Timberwolves,100,Brooklyn Nets,85,Box Score,,14552,,False,0,1,0,0
403,2015-12-20,1:00p,Portland Trail Blazers,109,Miami Heat,116,Box Score,,19600,,True,0,0,1,0
404,2015-12-20,3:30p,Philadelphia 76ers,86,Cleveland Cavaliers,108,Box Score,,20562,,True,1,0,0,1
405,2015-12-20,5:00p,Milwaukee Bucks,101,Phoenix Suns,95,Box Score,,16859,,False,1,0,1,0
406,2015-12-20,6:00p,Atlanta Hawks,103,Orlando Magic,100,Box Score,,16982,,False,1,1,1,0
407,2015-12-20,6:00p,Sacramento Kings,104,Toronto Raptors,94,Box Score,,19800,,False,1,0,0,0
408,2015-12-20,8:00p,New Orleans Pelicans,130,Denver Nuggets,125,Box Score,,13857,,False,0,0,1,1
409,2015-12-21,7:00p,Sacramento Kings,99,Washington Wizards,113,Box Score,,15124,,True,1,1,0,0


In [58]:
#Evaluates as before but includes the new features
X_lastwinner = dataset[[ "HomeTeamWonLast", "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 62.2%




In [59]:
#Uses LabelEncoder Transformer to convert string values to assigned integer values, as scikit-learn requires features to be encoded as numbers here
#This is used for both home and visiting team values for consistency
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
encoding.fit(dataset["Home Team"].values)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

#In order to fix one inconsistency with continuous features, use OneHotEncoder transformer to encode as number of binary values
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

#Runs the Decision Tree on new dataset 
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 63.8%


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [60]:
#Use Estimator interface to use the cross-fold validation
#Ensemble is used to make note of random errors and cancel them out, examines the Variance
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Accuracy: 64.2%




In [61]:
#Throws more features and runs the classifiers again using RandomForest
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Accuracy: 63.5%




In [62]:
#Runs classifier using n_estimators, which dictates how many decision trees are built
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14, n_estimators=250)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Accuracy: 64.1%


In [63]:
#Uses other parameters with GridSearchCV to improve the accuracy
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))



Accuracy: 68.0%


In [64]:
#Prints out the best model with the grid search
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=14, verbose=0,
                       warm_start=False)
