In [1]:
import pandas as pd

data_filename = "data/basketball.csv"
dataset = pd.read_csv(data_filename)

# I. Import and Clean dataset

In [2]:
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Tue Oct 27 2015,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
1,Tue Oct 27 2015,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
2,Tue Oct 27 2015,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,Wed Oct 28 2015,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,
4,Wed Oct 28 2015,7:30p,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,18624,


In [3]:
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
dataset = dataset.drop('Attend.', axis='columns') 
dataset.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type", "Notes"]

In [4]:
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Notes
0,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,
1,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,
3,2015-10-28,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,
4,2015-10-28,7:30p,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,


In [5]:
dataset.dtypes

Date            datetime64[ns]
Start (ET)              object
Visitor Team            object
VisitorPts               int64
Home Team               object
HomePts                  int64
OT?                     object
Score Type              object
Notes                   object
dtype: object

# Extract new features

We specify our class as 1 if hte home team wins and 0 if the visitor team wins

In [6]:
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]

In [7]:
y_true = dataset["HomeWin"].values
y_true

array([ True, False,  True, ..., False,  True, False])

In [8]:
dataset["HomeWin"].mean()

0.5942249240121581

The resulting value, around 0.59, indicates that the home team wins 59 percent of games on average. ***This means that if we our ML model always choose our Home team as the winning team. It has 59.42% of being correct***

***Important Note***: We **can't** really use the features already present in their current form to do a prediction. We wouldn't know the scores of a game before we would need to predict the outcome of the game, so we can not use them as features.

The first 2 features we want ot create to help us predict which team will win are whehter either of those two teams won their previous game. This would roughly approximate which team is currently playing well.

In [9]:
from collections import defaultdict

# Key of this dictionary will be the team and the value will be whether they won their previous game
won_last = defaultdict(int)
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0

# Assume that our dataset is already in ascending order of "Date"
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    dataset.at[index, "HomeLastWin"] = won_last[home_team]
    dataset.at[index, "VisitorLastWin"] = won_last[visitor_team]
    
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])

Currently, `dataset` gives us a false values to all teams when they are first seen. We couls improve this features uing the previous year's data, but for simplicit, we will not do that in this chapter.

The dataset will use as features is `X_previouswins`

In [10]:
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values

# III. Decision Trees

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true,
scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.4%


This scores 59.4 percent: we are better than choosing randomly! However, we aren't beating our other baseline of just choosing the home team.

# IV. Add a new feature from a new dataset

In [12]:
standings_filename = "data/standings.csv"
standings = pd.read_csv(standings_filename, skiprows=1)
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,67-15,39-2,28-13,25-5,42-10,9-1,7-3,9-1,...,25-6,5-3,45-9,1-0,13-2,11-3,12-3,8-3,16-2,6-2
1,2,Atlanta Hawks,60-22,35-6,25-16,38-14,22-8,12-6,14-4,12-4,...,17-11,6-4,30-10,0-1,9-5,14-2,17-0,7-4,9-7,4-3
2,3,Houston Rockets,56-26,30-11,26-15,23-7,33-19,9-1,8-2,6-4,...,20-9,8-4,31-14,2-0,11-4,9-5,11-6,7-3,10-6,6-2
3,4,Los Angeles Clippers,56-26,30-11,26-15,19-11,37-15,7-3,6-4,6-4,...,21-7,3-5,33-9,2-0,9-5,11-6,11-4,5-6,11-5,7-0
4,5,Memphis Grizzlies,55-27,31-10,24-17,20-10,35-17,8-2,5-5,7-3,...,16-13,9-3,26-13,2-0,13-2,8-6,12-4,7-4,9-8,4-3


We will create a feature that tells us if the home team is generally better than the visitors. High ranked team is more likely to win.

In [15]:
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    dataset.at[index, "HomeTeamRanksHigher"] = int(home_rank < visitor_rank)

In [16]:
X_homehigher = dataset[[ "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values

In [17]:
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 61.8%


Next, we will add another feature: which of the two teams won their last match against each other. The reason behind this is some teams may have strategies or players that work agianst specific teams really well.

In [19]:
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Set in the row, who won the last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    dataset.at[index, "HomeTeamWonLast"] = home_team_won_last
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

Test the Decision Tree algo on the new dataset.

In [20]:
X_lastwinner = dataset[[ "HomeTeamWonLast", "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 62.4%


Our result is getting better and better.

# V. Add another feature

In [25]:
# Encode categorical data into integer values
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
encoding.fit(dataset["Home Team"].values)

# Use the same transformer for encoding both the home team and visitor teams so that the same team gets the same integer
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

'''
The integers stored in `X_teams` can be fed into the Decision Tree but they will still be interpreted as continuous features 
by `DecisionTreeClassifier`. For example, teams may be allocated integers from 0 to 16. The algo will see teams 1 and 2 as 
being similar, while team 1 and 10 will be very different. This doesn't make sense since 2 teams are either the same 
or different. 

To fix this inconsistency, we use the `OnceHotEncoder` to encode these integers into a number of binary features. Each binary 
feature will be a single value for the feature. For example, if the Chicago Bulls is allocated as integer 7 by the LabelEncoder,
then the seventh feature returned by the `OneHotEncoder` will be a 1 if the team is Chicago Bulls nad 0 for all other features.
---> this might result in a much larger dataset.
'''

from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

`X_teams` is a dataset containing information on what teams are playing.

In [27]:
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 63.7%


The accuracy score is better than before even though the info given is just the teams playing. It is possible that the larger # of features were not handled properly by the decision trees.

# VI. Use Random Forests

Use Random Forests algo on only what teams playing

In [44]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 64.7%


Use Random Forests on all features we have so far

In [51]:
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 64.4%


Bring the number of trees to 250

In [52]:
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14, n_estimators=250)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 64.4%


***Important***: Changing the random state value will have more of an impact on the accuracy than the slight difference etween these feature sets. You should run many tests with different random states to get a good sense of the mean and spread of accuracy values.

In [54]:
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

Accuracy: 67.7%


To see what parameters used, we can print out the best model that was found in the grid search

In [55]:
grid.best_estimator_

RandomForestClassifier(criterion='entropy', min_samples_leaf=4,
                       n_estimators=200, random_state=14)

# VII. Ideas for more features
- How many days has it been since each team's previous match? Teams may be tired if they play too many games in a short time frame
- How many games of the last five did each team win? This will give a more stable form of the HomeLastWin and VisitorLastWin features we extracted earlier (and can be extracted in a very similar way).
- Do teams have a good record when visiting certain other teams? For instance, one team may play well in a particular stadium, even if they are the visitors.