# Visualizing NBA Stats

Examples of visualizations of stats from the 2017-18 NBA season using the interactive visualization library Bokeh. 

----
## Imports and Setups

#### Data Visualization

In [2]:
from bokeh.plotting import figure, show, save
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, FactorRange

#### Data Handling

In [3]:
import pandas as pd
import numpy as np

----
## Data Acquisition

The data referenced below was souced from **[Kaggle - NBA Enhanced Box Score and Standings Stats](https://www.kaggle.com/pablote/nba-enhanced-stats/home)**

### Player Box Score Stats
This dataset contains game-by-game statistics for individual **players**.

The data can be found at: **```2017-18_playerBoxScore.csv```**

In [4]:
# Import the data
playerStats = pd.read_csv('2017-18_playerBoxScore.csv')

In [5]:
# Print out the column names
print(playerStats.columns.values)

['gmDate' 'gmTime' 'seasTyp' 'playLNm' 'playFNm' 'teamAbbr' 'teamConf'
 'teamDiv' 'teamLoc' 'teamRslt' 'teamDayOff' 'offLNm1' 'offFNm1' 'offLNm2'
 'offFNm2' 'offLNm3' 'offFNm3' 'playDispNm' 'playStat' 'playMin' 'playPos'
 'playHeight' 'playWeight' 'playBDate' 'playPTS' 'playAST' 'playTO'
 'playSTL' 'playBLK' 'playPF' 'playFGA' 'playFGM' 'playFG%' 'play2PA'
 'play2PM' 'play2P%' 'play3PA' 'play3PM' 'play3P%' 'playFTA' 'playFTM'
 'playFT%' 'playORB' 'playDRB' 'playTRB' 'opptAbbr' 'opptConf' 'opptDiv'
 'opptLoc' 'opptRslt' 'opptDayOff']


### Team Box Score Stats
This dataset contains game-by-game statistics for individual **teams**.

The data can be found at: **```2017-18_teamBoxScore.csv```**

In [6]:
# Import the data
teamStats = pd.read_csv('2017-18_teamBoxScore.csv')

In [7]:
# Print out the column names
print(teamStats.columns.values)

['gmDate' 'gmTime' 'seasTyp' 'offLNm1' 'offFNm1' 'offLNm2' 'offFNm2'
 'offLNm3' 'offFNm3' 'teamAbbr' 'teamConf' 'teamDiv' 'teamLoc' 'teamRslt'
 'teamMin' 'teamDayOff' 'teamPTS' 'teamAST' 'teamTO' 'teamSTL' 'teamBLK'
 'teamPF' 'teamFGA' 'teamFGM' 'teamFG%' 'team2PA' 'team2PM' 'team2P%'
 'team3PA' 'team3PM' 'team3P%' 'teamFTA' 'teamFTM' 'teamFT%' 'teamORB'
 'teamDRB' 'teamTRB' 'teamPTS1' 'teamPTS2' 'teamPTS3' 'teamPTS4'
 'teamPTS5' 'teamPTS6' 'teamPTS7' 'teamPTS8' 'teamTREB%' 'teamASST%'
 'teamTS%' 'teamEFG%' 'teamOREB%' 'teamDREB%' 'teamTO%' 'teamSTL%'
 'teamBLK%' 'teamBLKR' 'teamPPS' 'teamFIC' 'teamFIC40' 'teamOrtg'
 'teamDrtg' 'teamEDiff' 'teamPlay%' 'teamAR' 'teamAST/TO' 'teamSTL/TO'
 'opptAbbr' 'opptConf' 'opptDiv' 'opptLoc' 'opptRslt' 'opptMin'
 'opptDayOff' 'opptPTS' 'opptAST' 'opptTO' 'opptSTL' 'opptBLK' 'opptPF'
 'opptFGA' 'opptFGM' 'opptFG%' 'oppt2PA' 'oppt2PM' 'oppt2P%' 'oppt3PA'
 'oppt3PM' 'oppt3P%' 'opptFTA' 'opptFTM' 'opptFT%' 'opptORB' 'opptDRB'
 'opptTRB' 'opptPTS1' 'oppt

### Team Standings
This dataset contains daily snapshots of the **league standings**.

The data can be found at: **```2017-18_standings.csv```**

In [8]:
# Import the data
standings = pd.read_csv('2017-18_standings.csv')

In [9]:
# Print out the column names
print(standings.columns.values)

['stDate' 'teamAbbr' 'rank' 'rankOrd' 'gameWon' 'gameLost' 'stk' 'stkType'
 'stkTot' 'gameBack' 'ptsFor' 'ptsAgnst' 'homeWin' 'homeLoss' 'awayWin'
 'awayLoss' 'confWin' 'confLoss' 'lastFive' 'lastTen' 'gamePlay'
 'ptsScore' 'ptsAllow' 'ptsDiff' 'opptGmPlay' 'opptGmWon' 'opptOpptGmPlay'
 'opptOpptGmWon' 'sos' 'rel%Indx' 'mov' 'srs' 'pw%' 'pyth%13.91'
 'wpyth13.91' 'lpyth13.91' 'pyth%16.5' 'wpyth16.5' 'lpyth16.5']


------
## Basic Charts

### Bar/Column Charts

Bar and column charts are useful in comparing categories.

A basic bar or column chart could compare the final winning percentage of the various divisions.

In [10]:
# Each row contains a cumulative win and loss total, so need to find the latest date in the dataset for each team
standings.groupby('teamAbbr').max().loc[:, ['stDate']]

Unnamed: 0_level_0,stDate
teamAbbr,Unnamed: 1_level_1
ATL,2018-04-11
BKN,2018-04-11
BOS,2018-04-11
CHA,2018-04-11
CHI,2018-04-11
CLE,2018-04-11
DAL,2018-04-11
DEN,2018-04-11
DET,2018-04-11
GS,2018-04-11


In [11]:
# Isolate the final wins and losses by team
teamFinalWL = standings[standings['stDate'] == '2018-04-11'].loc[:, ['teamAbbr','gameWon','gameLost']]

In [12]:
# Each team's division is in the Team Box Score dataset
teamDivision = teamStats.groupby('teamAbbr').max().loc[:, 'teamDiv']

In [13]:
# Merge the team win/loss Dataframe and the team division Series
teamFinalWL = pd.merge(teamFinalWL, teamDivision.reset_index(), on='teamAbbr', how='inner', left_index=False, right_index=False)

In [14]:
teamFinalWL.head()

Unnamed: 0,teamAbbr,gameWon,gameLost,teamDiv
0,ATL,24,58,Southeast
1,BKN,28,54,Atlantic
2,BOS,55,27,Atlantic
3,CHA,36,46,Southeast
4,CHI,27,55,Central


In [15]:
# Aggregate the resulting DataFrame by division name
divisionWL = teamFinalWL.groupby('teamDiv').sum().loc[:, ['gameWon', 'gameLost']]

In [16]:
# Calculate a winning percentage column
divisionWL['winPct'] = divisionWL['gameWon'] / (divisionWL['gameWon'] + divisionWL['gameLost'])

In [17]:
divisionWL.head()

Unnamed: 0_level_0,gameWon,gameLost,winPct
teamDiv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Atlantic,223,187,0.543902
Central,208,202,0.507317
Northwest,238,172,0.580488
Pacific,183,227,0.446341
Southeast,172,238,0.419512


In [18]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Assign the x (division) and y (winning percentage) data
xData = divisionWL.index.values
yData = divisionWL['winPct'].values

# Create the figure
fig = figure(x_range=xData,
             y_range=(0.2, 0.6),
             plot_width=500,
             plot_height=300,
             title='Division Winning Percentages, 2017-18',
             toolbar_location=None)

# Create a column chart 
fig.vbar(x=xData, top=yData, width=0.5)

# # Configure to output within the notebook and visualize
output_notebook()
show(fig)

The _column_ chart above can easily be converted to a _bar_ chart.

In [19]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Assign the x (division) and y (winning percentage) data
yData = divisionWL.index.values
xData = divisionWL['winPct'].values

# Create the figure
fig = figure(x_range=(0.2, 0.6),
             y_range=yData,
             plot_height=300,
             plot_width=500,
             title='Division Winning Percentages, 2017-18',
             toolbar_location=None)

# Create a column chart 
fig.hbar(y=yData, right=xData, height=0.5)

# # Configure to output within the notebook and visualize
output_notebook()
show(fig)

It would be nice to group the divisions into their respective conferences: 
* Eastern Conference: Atlantic, Central, Southeast
* Western Conference: Northwest, Pacific, Southwest

In [20]:
# Assign the conference name to the teamFinalWL DataFrame
def assignConference(division):
    if division in ['Atlantic', 'Central', 'Southeast']:
        return 'Eastern'
    else:
        return 'Western'
    
teamFinalWL['teamConf'] = teamFinalWL['teamDiv'].apply(assignConference)

In [21]:
# Re-aggregate the divisionWL DataFrame and calculate winning percentage
divConfWL = teamFinalWL.groupby(['teamConf','teamDiv']).sum().loc[:, ['gameWon', 'gameLost']]
divConfWL['winPct'] = divConfWL['gameWon'] / (divConfWL['gameWon'] + divConfWL['gameLost'])

In [22]:
divConfWL

Unnamed: 0_level_0,Unnamed: 1_level_0,gameWon,gameLost,winPct
teamConf,teamDiv,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eastern,Atlantic,223,187,0.543902
Eastern,Central,208,202,0.507317
Eastern,Southeast,172,238,0.419512
Western,Northwest,238,172,0.580488
Western,Pacific,183,227,0.446341
Western,Southwest,206,204,0.502439


In [24]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, FactorRange

# Assign the x (conference/division) and y (winning percentage) data
xData = divConfWL.index.values
yData = divConfWL['winPct'].values

# Store the data in a ColumnDataSource
source = ColumnDataSource(data=dict(x=xData, y=yData))

# Create the figure
fig = figure(x_range=FactorRange(*xData),
             y_range=(0.2, 0.6),
             plot_width=500,
             plot_height=300,
             title='Division Winning Percentages, 2017-18',
             toolbar_location=None)

# Configure the spacing and remove the vertical grid lines
fig.x_range.range_padding = 0.1
fig.xgrid.grid_line_color = None

# Create a column chart 
fig.vbar(x='x', top='y', width=0.75, source=source)

# Configure to output within the notebook and visualize
output_notebook()
show(fig)
save(fig, 'division_winning_percentage_201718.png')

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/lsdangio/Desktop/Python/Python-Documentation/Bokeh/RealPython_Tutorial/data/division_winning_percentage_201718.png'

### Pie Charts

Pie charts are useful in showing the percentage distribution of parts within a category. 

An example would be the percent breakdown of a teams total points by player. 

In [108]:
# Let's choose the defending champion Golden State Warriors
gswPts = playerStats[playerStats['teamAbbr'] == 'GS'].loc[:,['playLNm','playFNm','playPTS']]

In [109]:
# Aggregate by first and last name, and sum up the points
gswPts = gswPts.groupby(['playLNm','playFNm']).sum().sort_values('playPTS', ascending=False).reset_index()
gswPts.head()

Unnamed: 0,playLNm,playFNm,playPTS
0,Durant,Kevin,1792
1,Thompson,Klay,1459
2,Curry,Wardell,1346
3,Green,Draymond,775
4,Young,Nick,581


In [110]:
# Consolidate the first and last names into a single column (first letter of first name, last name)
gswPts['name'] = [gswPts.iloc[ix].playFNm[0] + '. ' + gswPts.iloc[ix].playLNm for ix in range(len(gswPts))]
gswPts.head()

Unnamed: 0,playLNm,playFNm,playPTS,name
0,Durant,Kevin,1792,K. Durant
1,Thompson,Klay,1459,K. Thompson
2,Curry,Wardell,1346,W. Curry
3,Green,Draymond,775,D. Green
4,Young,Nick,581,N. Young


In [111]:
# Isolate the full name and the total points
gswPts = gswPts.loc[:, ['name','playPTS']]
gswPts.head()

Unnamed: 0,name,playPTS
0,K. Durant,1792
1,K. Thompson,1459
2,W. Curry,1346
3,D. Green,775
4,N. Young,581


In [221]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import cividis
from bokeh.transform import cumsum
from bokeh.models import ColumnDataSource, HoverTool

# Convert the points to angles and assign a color to each player from the palette
gswPts['angle'] = gswPts['playPTS'] / gswPts['playPTS'].sum() * 2*np.pi
gswPts['color'] = cividis(len(gswPts))

# Store data in ColumnDataSource
source = ColumnDataSource(gswPts[gswPts['playPTS'] > 0])

# Format tooltips
tooltips=[('Player','@name'), 
          ('Points','@playPTS'),]

# Create the figure
fig = figure(plot_height=400,
            title='Golden State Warriors - Points Breakdown, 2017-18',
            title_location='above',
            toolbar_location=None,
            tools='hover',
            tooltips=tooltips
            )

# Use wedge glyphs to create the pie chart
fig.wedge(x=0,
          y=1,
          radius=0.5,
          start_angle=cumsum('angle', include_zero=True),
          end_angle=cumsum('angle'),
          line_color='yellow',
          fill_color='color',
          source=source)

# Remove all extraneous the axes and labels from figure
fig.axis.axis_label=None
fig.axis.visible=False
fig.grid.grid_line_color=None

# Configure to output within the notebook and visualize
output_notebook()
show(fig)

### Line Charts

Line charts are best used for showing trends in data over time. 

For instance, trending the win totals of the top 2 teams in the Western Conference - the Houston Rockets and Golden State Warriors - over the course of the season.

In [173]:
# Get the standings for the three teams referenced above
westTop2 = standings[(standings['teamAbbr'] == 'HOU') |
                     (standings['teamAbbr'] == 'GS')]\
           .loc[:,['stDate', 'teamAbbr', 'gameWon']]\
           .sort_values(['teamAbbr','stDate'])

# Convert stDate column to datetime
westTop2['stDate'] = pd.to_datetime(westTop2['stDate'])
        
westTop2.head()

Unnamed: 0,stDate,teamAbbr,gameWon
9,2017-10-17,GS,0
39,2017-10-18,GS,0
69,2017-10-19,GS,0
99,2017-10-20,GS,1
129,2017-10-21,GS,1


In [178]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, DataRange1d

# Split the data into a ColumnDataSource for each team
rockets = ColumnDataSource(westTop2[westTop2['teamAbbr'] == 'HOU'])
warriors = ColumnDataSource(westTop2[westTop2['teamAbbr'] == 'GS'])

# Create and configure the figure
fig = figure(x_axis_type="datetime", 
             plot_height=300,
             plot_width=600,
             title="Western Conference Top 2 Teams Wins Race, 2017-18",
             toolbar_location=None)
fig.xaxis.axis_label = 'Date'
fig.yaxis.axis_label = 'Wins'

# Add Lines
fig.line('stDate', 'gameWon', color='#CC0000', legend='Rockets', source=rockets)
fig.line('stDate', 'gameWon', color='#223E91', legend='Warriors', source=warriors)

# Place Legend
fig.legend.location = "top_left"

# Configure to output within the notebook and visualize
output_notebook()
show(fig)

### Scatter Plots

Scatter plots are helpful in showing possible relationships between variables within a dataset.

For example, a scatter plot can use to see where various players fall in terms of how many three point shots they _take_ versus how many they _make_. 

In [191]:
# First, let's find players who took at least 1 three-point shot during the season
threeTakers = playerStats[playerStats['play3PA'] > 0]

In [199]:
# First let's clean up the player names, placing them in a single column
threeTakers['name'] = [threeTakers.iloc[ix]['playFNm'] + ' ' + threeTakers.iloc[ix]['playLNm'] for ix in range(len(threeTakers))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [201]:
# Aggregate the total three-point attempts and makes for each player over the course of the season
threeTakers = threeTakers.groupby('name').sum().loc[:,['play3PA', 'play3PM']].sort_values('play3PA', ascending=False)

In [205]:
# Filter out anyone who didn't take at least 100 three-point shots
threeTakers = threeTakers[threeTakers['play3PA'] >= 100].reset_index()

In [206]:
# Add a column with a calculated three-point percentage (made/attempted)
threeTakers['pct3PM'] = threeTakers['play3PM'] / threeTakers['play3PA']

In [222]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models import NumeralTickFormatter

# Store the data in a ColumnDataSource
source = ColumnDataSource(threeTakers)

# Format tooltips
tooltips = [
            ('Player','@name'),
            ('3PM', '@play3PM'),
            ('3PA', '@play3PA'),
            ('3P%','@pct3PM{00.0%}'),
           ]

# Create the figure
fig = figure(plot_height=400,
             plot_width=600,
             x_axis_label='Three-Point Shots Attempted',
             y_axis_label='Percentage Made',
             title='3PT Shots Attempted vs. Percentage Made (min. 100 3PA), 2017-18',
             toolbar_location=None,
             tools='hover',
             tooltips=tooltips)

# Format the y-axis tick labels as percenages
fig.yaxis[0].formatter = NumeralTickFormatter(format="00.0%")

# Add circle representing each player
fig.circle('play3PA', 'pct3PM', source=source)

# Configure to output within the notebook and visualize
output_notebook()
show(fig)

----
## Appendix

#### Basic output_file() example

In [26]:
# Bokeh Libraries
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show

# The figure will be rendered in a static HTML file called output_file_test.html
output_file('output_file_test.html', title='Empty Bokeh Figure')

# Set up a generic figure() object
fig = figure()

# See what it looks like
show(fig)

W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='d5abcef2-3377-48b1-857e-7b13ae21cc1d', ...)


W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='b7fd0749-2285-4915-8f2a-6a7c10bd183c', ...)
W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='d5abcef2-3377-48b1-857e-7b13ae21cc1d', ...)


#### output_notebook() Example

In [27]:
# Bokeh Libraries
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show

# The figure will be right in my Jupyter Notebook
output_notebook()

# Set up a generic figure() object
fig = figure()

# See what it looks like
show(fig)

W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='f0f387f5-8b44-4e2f-890b-619fa242e5a7', ...)


W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='b7fd0749-2285-4915-8f2a-6a7c10bd183c', ...)
W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='d5abcef2-3377-48b1-857e-7b13ae21cc1d', ...)
W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='f0f387f5-8b44-4e2f-890b-619fa242e5a7', ...)


#### Customized figure()

In [60]:
from bokeh.plotting import reset_output
reset_output()

# Bokeh Libraries
from bokeh.io import output_notebook
from bokeh.plotting import figure, show

# The figure will be right in my Jupyter Notebook
output_notebook()

# Example figure
fig = figure(
    background_fill_color='gray',
    background_fill_alpha=0.5,
    border_fill_color='blue',
    border_fill_alpha=0.25,
    plot_height=300,
    plot_width=500,
    h_symmetry=True,
    x_axis_label='X Label',
    x_axis_type='datetime',
    x_axis_location='above',
    x_range=(0,1),
    y_axis_label='Y Label',
    y_axis_type='linear',
    y_axis_location='left',
    y_range=(0,100),
    y_minor_ticks=2,
    title='Example Figure',
    title_location='right',
    toolbar_location='below',
    tools='save')

# See what it looks like 
show(fig)

W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='7926d538-e59b-4743-8d2c-23f826907074', ...)


In [61]:
# Remove the gridlines from the figure() object
fig.grid.grid_line_color = None

# Observe changes
show(fig)

W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure(id='7926d538-e59b-4743-8d2c-23f826907074', ...)


#### First Glyphs

In [70]:
reset_output()

# Bokeh Libraries
from bokeh.io import output_notebook
from bokeh.plotting import figure, show

# My x-y coordinate data
x = [1, 2, 1]
y = [1, 1, 2]

# Output the visualization directly in the notebook
output_notebook()

# Create a figure called 'My Coordinates' with no toolbar and axis ranges of [0,3]
fig = figure(
    title='My Coordinates',
    plot_height=300,
    plot_width=300,
    x_range=(0,3),
    y_range=(0,3),
    toolbar_location=None)

# Draw the coordinates as circles
fig.circle(
    x=x,
    y=y,
    color='green',
    size=10,
    alpha=0.5)

# Show plot
show(fig)

#### Another glyph example

In [92]:
reset_output()

In [101]:
# Pandas and Numpy for generating my dummy data
import pandas as pd
import numpy as np

# Bokeh Libraries
from bokeh.io import output_notebook
from bokeh.plotting import figure, show

# My word count data
dates = pd.date_range('2018-09-21', '2018-09-30')
dailyWords = [450, 628, 488, 210, 287, 791, 508, 639, 397, 943]
cumWords = np.cumsum(dailyWords)

# Output the visualization directly in the notebook
output_notebook()
#output_file('tutorial_progress.html')

# Create a figure with a datetime type x-axis
fig = figure(
    title='My Tutorial Progress',
    plot_height=400,
    plot_width=700,
    x_axis_label='Date',
    x_axis_type='datetime',
    y_axis_label='Words Written',
    toolbar_location=None)

# The daily words will be represented as vertical bars (columns)
fig.vbar(
    x=dates, 
    bottom=0, 
    top=dailyWords, 
    color='#51B2E8', 
    width=0.75,
    legend='Daily')

# The cumulative sum will be a trend line
#fig.line(
#    x=dates,
#    y=cumWords, 
#    line_width=1,
#    color='#23223E',
#    legend='Cumulative')

# Put the legend in the upper left corner
fig.legend.location = 'top_left'

# Let's check it out
show(fig)

In [88]:
len(dailyWords)

10

In [99]:
dates

DatetimeIndex(['2018-09-21', '2018-09-22', '2018-09-23', '2018-09-24',
               '2018-09-25', '2018-09-26', '2018-09-27', '2018-09-28',
               '2018-09-29', '2018-09-30'],
              dtype='datetime64[ns]', freq='D')

Timestamp('2018-09-21 00:00:00', freq='D')